diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index 475f75b5bf19b..ced418e0f4cc9 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -196,7 +196,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "bc0d2e35909b8456abe32f3b30a49bb0c125e8b7",
+          "commitHash": "9c69a24bc2e20c8a511a4e6b06fd49639ec5300a",
           "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git"
         },
         "comments": "onnx_tensorrt"
diff --git a/cmake/deps.txt b/cmake/deps.txt
index ed41ad5b0ceb1..cb5a5910fb3d0 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -36,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z
 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.17.0.zip;13a60ac5217c104139ce0fd024f48628e7bcf5bc
-# Use the latest commit of 10.6-GA-ORT-DDS
-onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bc0d2e35909b8456abe32f3b30a49bb0c125e8b7.zip;f233ae871ad82c023da62e5dd620639f00bc2d15
+# Use the latest commit of 10.7-GA
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/9c69a24bc2e20c8a511a4e6b06fd49639ec5300a.zip;ff1fe9af78eb129b4a4cdcb7450b7390b4436dd3
 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
 protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
 protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 732c0511d400f..d72b61a0859b2 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -77,6 +77,7 @@ if(WIN32)
   onnxruntime_add_shared_library(onnxruntime
     ${SYMBOL_FILE}
     "${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc"
+    "${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc"
    "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc"
   )
 elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake
index 376d895be34a9..355575be3bcf7 100644
--- a/cmake/onnxruntime_nodejs.cmake
+++ b/cmake/onnxruntime_nodejs.cmake
@@ -60,15 +60,26 @@ else()
   endif()
 endif()

+# a list of DLLs that the Node.js binding depends on
+set(NODEJS_DLL_DEPS)
+
 # setup providers
 if (onnxruntime_USE_CUDA)
   set(NODEJS_BINDING_USE_CUDA "--use_cuda")
 endif()
 if (onnxruntime_USE_DML)
   set(NODEJS_BINDING_USE_DML "--use_dml")
+  list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/DirectML.dll")
 endif()
 if (onnxruntime_USE_WEBGPU)
   set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu")
+  if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+    list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/dxil.dll")
+    list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/dxcompiler.dll")
+  endif()
+  if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+    list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE:dawn::webgpu_dawn>")
+  endif()
 endif()
 if (onnxruntime_USE_TENSORRT)
   set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt")
@@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL

 add_custom_target(nodejs_binding_wrapper ALL
   COMMAND ${NPM_CLI} ci
-  COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR}
-  --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT}
-  ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
+  COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}"
+          --config=${CMAKE_BUILD_TYPE}
+          "--onnxruntime-generator=${CMAKE_GENERATOR}"
+          "--dll_deps=${NODEJS_DLL_DEPS}"
+          --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU}
+          ${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
   WORKING_DIRECTORY ${JS_NODE_ROOT}
   COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")
diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake
index fea5964f0dda9..e527d538d8757 100644
--- a/cmake/onnxruntime_providers_webgpu.cmake
+++ b/cmake/onnxruntime_providers_webgpu.cmake
@@ -23,19 +23,18 @@
   onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)

+  set(onnxruntime_providers_webgpu_dll_deps)
+
   if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
     target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)
-    if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
-      list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
-    endif()
+    if (WIN32)
+      if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
+        list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
+      endif()

-    # Copy webgpu_dawn.dll to the output directory
-    add_custom_command(
-      TARGET onnxruntime_providers_webgpu
-      POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:dawn::webgpu_dawn>" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
-      VERBATIM )
+      list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE:dawn::webgpu_dawn>")
+    endif()
   else()
     if (NOT onnxruntime_USE_EXTERNAL_DAWN)
       target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
@@ -43,4 +42,23 @@
     target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
   endif()

+  if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+    # Ensure dxil.dll and dxcompiler.dll exist in the output directory $<TARGET_FILE_DIR:onnxruntime_providers_webgpu>
+    add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll)
+    add_dependencies(onnxruntime_providers_webgpu dxcompiler)
+
+    list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>/dxil.dll")
+    list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>/dxcompiler.dll")
+  endif()
+
+  if (onnxruntime_providers_webgpu_dll_deps)
+    # Copy dependency DLLs to the output directory
+    add_custom_command(
+      TARGET onnxruntime_providers_webgpu
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different "${onnxruntime_providers_webgpu_dll_deps}" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
+      COMMAND_EXPAND_LISTS
+      VERBATIM )
+  endif()
+
   set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index e822f0a3655fc..9e3ab4d41f416 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC
 set (onnxruntime_webgpu_external_dawn_test_SRC
   ${TEST_SRC_DIR}/webgpu/external_dawn/main.cc)

+set (onnxruntime_webgpu_delay_load_test_SRC
+  ${TEST_SRC_DIR}/webgpu/delay_load/main.cc)
+
 # tests from lowest level library up.
 # the order of libraries should be maintained, with higher libraries being added first in the list

@@ -1864,4 +1867,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN)
   onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers)
 endif()

+if (onnxruntime_USE_WEBGPU AND WIN32 AND onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD)
+  AddTest(DYN
+          TARGET onnxruntime_webgpu_delay_load_test
+          SOURCES ${onnxruntime_webgpu_delay_load_test_SRC}
+          LIBS ${SYS_PATH_LIB}
+          DEPENDS ${all_dependencies}
+  )
+endif()
+
 include(onnxruntime_fuzz_test.cmake)
diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt
index d79a82c572dc2..c78b40a3e7429 100644
--- a/js/node/CMakeLists.txt
+++ b/js/node/CMakeLists.txt
@@ -113,10 +113,12 @@ endif()
 if (WIN32)
   file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll
        DESTINATION ${dist_folder})
-  if (USE_DML)
-    file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll
-         DESTINATION ${dist_folder})
-  endif ()
+  if (ORT_NODEJS_DLL_DEPS)
+    foreach(dll ${ORT_NODEJS_DLL_DEPS})
+      file(COPY ${dll} DESTINATION ${dist_folder})
+    endforeach()
+  endif()
+
 elseif (APPLE)
   file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib
        DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN)
diff --git a/js/node/script/build.ts b/js/node/script/build.ts
index dcdcb93377b4c..b557368ed58c6 100644
--- a/js/node/script/build.ts
+++ b/js/node/script/build.ts
@@ -39,6 +39,8 @@ const USE_TENSORRT = !!buildArgs.use_tensorrt;
 const USE_COREML = !!buildArgs.use_coreml;
 // --use_qnn
 const USE_QNN = !!buildArgs.use_qnn;
+// --dll_deps=
+const DLL_DEPS = buildArgs.dll_deps;

 // build path
 const ROOT_FOLDER = path.join(__dirname, '..');
@@ -82,6 +84,9 @@ if (USE_COREML) {
 if (USE_QNN) {
   args.push('--CDUSE_QNN=ON');
 }
+if (DLL_DEPS) {
+  args.push(`--CDORT_NODEJS_DLL_DEPS=${DLL_DEPS}`);
+}

 // set CMAKE_OSX_ARCHITECTURES for macOS build
 if (os.platform() === 'darwin') {
diff --git a/js/node/src/directml_load_helper.cc b/js/node/src/directml_load_helper.cc
deleted file mode 100644
index 6aafe4d5fa788..0000000000000
--- a/js/node/src/directml_load_helper.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#ifdef _WIN32
-#include "common.h"
-#include "windows.h"
-
-void LoadDirectMLDll(Napi::Env env) {
-  DWORD pathLen = MAX_PATH;
-  std::wstring path(pathLen, L'\0');
-  HMODULE moduleHandle = nullptr;
-
-  GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
-                    reinterpret_cast<LPCWSTR>(&LoadDirectMLDll), &moduleHandle);
-
-  DWORD getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast<wchar_t*>(path.c_str()), pathLen);
-  while (getModuleFileNameResult == 0 || getModuleFileNameResult == pathLen) {
-    int ret = GetLastError();
-    if (ret == ERROR_INSUFFICIENT_BUFFER && pathLen < 32768) {
-      pathLen *= 2;
-      path.resize(pathLen);
-      getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast<wchar_t*>(path.c_str()), pathLen);
-    } else {
-      ORT_NAPI_THROW_ERROR(env, "Failed getting path to load DirectML.dll, error code: ", ret);
-    }
-  }
-
-  path.resize(path.rfind(L'\\') + 1);
-  path.append(L"DirectML.dll");
-  HMODULE libraryLoadResult = LoadLibraryW(path.c_str());
-
-  if (!libraryLoadResult) {
-    int ret = GetLastError();
-    ORT_NAPI_THROW_ERROR(env, "Failed loading bundled DirectML.dll, error code: ", ret);
-  }
-}
-#endif
diff --git a/js/node/src/directml_load_helper.h b/js/node/src/directml_load_helper.h
deleted file mode 100644
index 074a4f95ed476..0000000000000
--- a/js/node/src/directml_load_helper.h
+++ /dev/null
@@ -1,6 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#if defined(USE_DML) && defined(_WIN32)
-void LoadDirectMLDll(Napi::Env env);
-#endif
diff --git a/js/node/src/inference_session_wrap.cc b/js/node/src/inference_session_wrap.cc
index 23d859351f426..04ab71dc48ec2 100644
--- a/js/node/src/inference_session_wrap.cc
+++ b/js/node/src/inference_session_wrap.cc
@@ -4,7 +4,6 @@
 #include "onnxruntime_cxx_api.h"

 #include "common.h"
-#include "directml_load_helper.h"
 #include "inference_session_wrap.h"
 #include "run_options_helper.h"
 #include "session_options_helper.h"
@@ -19,9 +18,6 @@ Napi::FunctionReference& InferenceSessionWrap::GetTensorConstructor() {
 }

 Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
-#if defined(USE_DML) && defined(_WIN32)
-  LoadDirectMLDll(env);
-#endif
   // create ONNX runtime env
   Ort::InitApi();
   ORT_NAPI_THROW_ERROR_IF(
diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc
index 27eb9b65c62d3..12b1a79793ff3 100644
--- a/js/node/src/tensor_helper.cc
+++ b/js/node/src/tensor_helper.cc
@@ -53,24 +53,24 @@ constexpr size_t DATA_TYPE_ELEMENT_SIZE_MAP[] = {
 static_assert(sizeof(DATA_TYPE_ELEMENT_SIZE_MAP) == sizeof(size_t) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT,
               "definition not matching");

-constexpr napi_typedarray_type DATA_TYPE_TYPEDARRAY_MAP[] = {
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported
-    napi_float32_array,          // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT
-    napi_uint8_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8
-    napi_int8_array,             // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8
-    napi_uint16_array,           // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16
-    napi_int16_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16
-    napi_int32_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32
-    napi_bigint64_array,         // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported
-    napi_uint8_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL
-    napi_uint16_array,           // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array
-    napi_float64_array,          // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE
-    napi_uint32_array,           // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32
-    napi_biguint64_array,        // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported
-    (napi_typedarray_type)(-1)   // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported
+constexpr std::underlying_type_t<napi_typedarray_type> DATA_TYPE_TYPEDARRAY_MAP[] = {
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported
+    napi_float32_array,    // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT
+    napi_uint8_array,      // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8
+    napi_int8_array,       // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8
+    napi_uint16_array,     // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16
+    napi_int16_array,      // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16
+    napi_int32_array,      // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32
+    napi_bigint64_array,   // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported
+    napi_uint8_array,      // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL
+    napi_uint16_array,     // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array
+    napi_float64_array,    // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE
+    napi_uint32_array,     // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32
+    napi_biguint64_array,  // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported
+    std::underlying_type_t<napi_typedarray_type>(-1)   // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported
+};
 static_assert(sizeof(DATA_TYPE_TYPEDARRAY_MAP) == sizeof(napi_typedarray_type) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT,
               "definition not matching");
@@ -98,7 +98,20 @@ static_assert(sizeof(DATA_TYPE_ID_TO_NAME_MAP) == sizeof(const char*) * ONNX_TEN
               "definition not matching");

 const std::unordered_map<std::string, ONNXTensorElementDataType> DATA_TYPE_NAME_TO_ID_MAP = {
-    {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT}, {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8}, {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8}, {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16}, {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16}, {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32}, {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64}, {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING}, {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL}, {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16}, {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE}, {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32}, {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}};
+    {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT},
+    {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8},
+    {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8},
+    {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16},
+    {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16},
+    {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32},
+    {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64},
+    {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING},
+    {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL},
+    {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16},
+    {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE},
+    {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32},
+    {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64},
+};

 // currently only support tensor
 Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo* cpu_memory_info, OrtMemoryInfo* webgpu_memory_info) {
@@ -181,7 +194,7 @@ Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo*
                               "Tensor.data must be a typed array for numeric tensor.");

   auto tensorDataTypedArray = tensorDataValue.As<Napi::TypedArray>();
-  auto typedArrayType = tensorDataValue.As<Napi::TypedArray>().TypedArrayType();
+  std::underlying_type_t<napi_typedarray_type> typedArrayType = tensorDataValue.As<Napi::TypedArray>().TypedArrayType();
   ORT_NAPI_THROW_TYPEERROR_IF(DATA_TYPE_TYPEDARRAY_MAP[elemType] != typedArrayType, env,
                               "Tensor.data must be a typed array (", DATA_TYPE_TYPEDARRAY_MAP[elemType], ") for ",
                               tensorTypeString, " tensors, but got typed array (", typedArrayType, ").");
@@ -294,7 +307,7 @@ Napi::Value OrtValueToNapiValue(Napi::Env env, Ort::Value&& value) {
   }
   napi_value typedArrayData;
   napi_status status =
-      napi_create_typedarray(env, DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData);
+      napi_create_typedarray(env, (napi_typedarray_type)DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData);
   NAPI_THROW_IF_FAILED(env, status, Napi::Value);

   // new Tensor(type, typedArrayData, dims)
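A note on the tensor_helper.cc hunks above: `napi_typedarray_type` is an unscoped enum without a fixed underlying type, so a `(napi_typedarray_type)(-1)` sentinel in a constexpr initializer falls outside the enum's value range and can be rejected by stricter compilers; storing the underlying integer type sidesteps that while keeping the table layout identical (hence the unchanged static_assert). A minimal sketch of the pattern, using a stand-in enum rather than the real N-API type:

    #include <type_traits>

    enum sample_enum { kA, kB };  // stand-in for napi_typedarray_type

    // the old form: -1 is outside the enum's value range, so this sentinel is
    // not reliably usable in a constant expression:
    //   constexpr sample_enum kBad = (sample_enum)(-1);

    // the new form: the sentinel lives in the enum's underlying integer type,
    // which is always well-formed; convert back explicitly at the use site:
    constexpr std::underlying_type_t<sample_enum> kUnsupported =
        std::underlying_type_t<sample_enum>(-1);
    static_assert(sizeof(kUnsupported) == sizeof(sample_enum), "same storage size");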
diff --git a/onnxruntime/core/dll/delay_load_hook.cc b/onnxruntime/core/dll/delay_load_hook.cc
new file mode 100644
index 0000000000000..23fc8bca7368e
--- /dev/null
+++ b/onnxruntime/core/dll/delay_load_hook.cc
@@ -0,0 +1,83 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// == workaround for delay loading of dependencies of onnxruntime.dll ==
+//
+// Problem:
+//
+// When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using LoadLibraryEx,
+// which searches the directory of the process (.exe) instead of that of this library (onnxruntime.dll). This is a
+// problem for usages of the Node.js binding and the Python binding, because Windows will try to find the
+// dependencies in the directory of node.exe or python.exe, which is not the directory of onnxruntime.dll.
+//
+// Solution:
+//
+// By using the delay load hook `__pfnDliNotifyHook2`, we can intervene in the loading procedure by loading from an
+// absolute path. The absolute path is constructed by appending the name of the DLL to load to the directory of
+// onnxruntime.dll. This way, we can ensure that the dependencies are loaded from the same directory as
+// onnxruntime.dll.
+//
+// See also:
+// - https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions
+// - https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#alternate-search-order-for-unpackaged-apps
+//
+// The DLL delay load hook is only enabled when the compiler is MSVC and at least one of the following is true:
+// - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined
+// - USE_DML is defined
+//
+#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL (defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY))
+#define ORT_DELAY_LOAD_DIRECTML_DLL defined(USE_DML)
+#if defined(_MSC_VER) && (ORT_DELAY_LOAD_WEBGPU_DAWN_DLL || ORT_DELAY_LOAD_DIRECTML_DLL)
+
+#include <Windows.h>
+#include <delayimp.h>
+#include <stdlib.h>
+#include <string>
+
+#include "core/platform/env.h"
+
+namespace {
+
+#define DEFINE_KNOWN_DLL(name) {#name ".dll", L#name L".dll"}
+
+constexpr struct {
+  const char* str;
+  const wchar_t* wstr;
+} known_dlls[] = {
+#if ORT_DELAY_LOAD_WEBGPU_DAWN_DLL
+    DEFINE_KNOWN_DLL(webgpu_dawn),
+#endif
+#if ORT_DELAY_LOAD_DIRECTML_DLL
+    DEFINE_KNOWN_DLL(DirectML),
+#endif
+};
+}  // namespace
+
+FARPROC WINAPI delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) {
+  if (dliNotify == dliNotePreLoadLibrary) {
+    for (size_t i = 0; i < _countof(known_dlls); ++i) {
+      if (_stricmp(pdli->szDll, known_dlls[i].str) == 0) {
+        // Try to load the DLL from the same directory as onnxruntime.dll
+
+        // First, get the path to onnxruntime.dll
+        auto path = Env::Default().GetRuntimePath();
+        if (path.empty()) {
+          // Failed to get the path to onnxruntime.dll. In this case, we will just return NULL and let the system
+          // search for the DLL in the default search order.
+          return NULL;
+        }
+
+        // Append the name of the DLL. Now `path` is the absolute path to the DLL to load.
+        path.append(known_dlls[i].wstr);
+
+        // Load the DLL
+        return FARPROC(LoadLibraryExW(path.c_str(), NULL,
+                                      LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR));
+      }
+    }
+  }
+  return NULL;
+}
+
+extern "C" const PfnDliHook __pfnDliNotifyHook2 = delay_load_hook;
+
+#endif
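For readers unfamiliar with MSVC delay loading, here is a minimal standalone sketch (illustrative names, not ORT code) of the contract delay_load_hook.cc plugs into. A binary linked with /DELAYLOAD:some_dep.dll and delayimp.lib defers loading until first use; the delay-load helper then calls the notify hook with dliNotePreLoadLibrary before its own LoadLibrary, and a non-NULL return value is used as the module handle instead:

    #include <Windows.h>
    #include <delayimp.h>

    static FARPROC WINAPI my_delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) {
      if (dliNotify == dliNotePreLoadLibrary) {
        // pdli->szDll carries the DLL name from the import table; returning an
        // HMODULE (cast to FARPROC) short-circuits the helper's default lookup.
        return reinterpret_cast<FARPROC>(
            LoadLibraryExW(L"C:\\example\\some_dep.dll", NULL,
                           LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR));
      }
      return NULL;  // any other notification: fall back to the default behavior
    }

    // the delay-load helper discovers the hook through this well-known symbol:
    extern "C" const PfnDliHook __pfnDliNotifyHook2 = my_delay_load_hook;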
diff --git a/onnxruntime/core/dll/dllmain.cc b/onnxruntime/core/dll/dllmain.cc
index 2e7bdafd0599f..ac5dcd9c96084 100644
--- a/onnxruntime/core/dll/dllmain.cc
+++ b/onnxruntime/core/dll/dllmain.cc
@@ -13,7 +13,7 @@
 #pragma GCC diagnostic pop
 #endif

-// dllmain.cpp : Defines the entry point for the DLL application.
+// dllmain.cc : Defines the entry point for the DLL application.
 BOOL APIENTRY DllMain(HMODULE /*hModule*/,
                       DWORD ul_reason_for_call,
                       LPVOID /*lpReserved*/
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index d66c2a79d28a8..c85a15017659c 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -10,6 +10,8 @@
 #endif

 #include "core/common/common.h"
+#include "core/common/path_string.h"
+#include "core/platform/env.h"

 #include "core/providers/webgpu/compute_context.h"
 #include "core/providers/webgpu/webgpu_context.h"
@@ -50,6 +52,30 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info

   // Initialization.Step.2 - Create wgpu::Adapter
   if (adapter_ == nullptr) {
+#if !defined(__EMSCRIPTEN__) && defined(_MSC_VER) && defined(DAWN_ENABLE_D3D12) && !defined(USE_EXTERNAL_DAWN)
+    // If we are using the D3D12 backend on Windows and the build does not use external Dawn, dxil.dll and
+    // dxcompiler.dll are required.
+    //
+    // Dawn will try to load them later, but if they are in a different directory from the executable, it may fail
+    // to find them. To avoid this issue, we try to load them from the same directory as the current module
+    // (usually onnxruntime.dll).
+    auto runtime_path = Env::Default().GetRuntimePath();
+    if (!runtime_path.empty()) {
+      Status status;
+      void* module_handle = nullptr;
+
+      PathString dxil_path = runtime_path + ToPathString(L"dxil.dll");
+      status = Env::Default().LoadDynamicLibrary(dxil_path, false, &module_handle);
+      if (status.IsOK() && module_handle != nullptr) {
+        modules_.Add(dxil_path, module_handle);
+      }
+
+      PathString dxcompiler_path = runtime_path + ToPathString(L"dxcompiler.dll");
+      status = Env::Default().LoadDynamicLibrary(dxcompiler_path, false, &module_handle);
+      if (status.IsOK() && module_handle != nullptr) {
+        modules_.Add(dxcompiler_path, module_handle);
+      }
+    }
+#endif
+
     wgpu::RequestAdapterOptions req_adapter_options = {};
     wgpu::DawnTogglesDescriptor adapter_toggles_desc = {};
     req_adapter_options.nextInChain = &adapter_toggles_desc;
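The preload above leans on a Windows loader property: once a module is loaded (here, by absolute path next to onnxruntime.dll), a later request for the bare base name resolves to the already-loaded module instead of walking the search order. A minimal sketch of that behavior (illustrative path, not ORT code):

    #include <Windows.h>

    int main() {
      // load by absolute path first, as WebGpuContext::Initialize does for dxil.dll
      HMODULE preloaded = LoadLibraryW(L"C:\\example\\bin\\dxil.dll");
      // a later by-name load maps to the same loaded module, with no path search
      HMODULE by_name = LoadLibraryW(L"dxil.dll");
      return (preloaded != NULL && preloaded == by_name) ? 0 : 1;
    }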
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h
index be05b06523b9c..c41ef3e211264 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.h
@@ -13,6 +13,7 @@
 #include <webgpu/webgpu_cpp.h>

 #include "core/common/common.h"
+#include "core/framework/library_handles.h"
 #include "core/providers/webgpu/webgpu_execution_provider.h"
 #include "core/providers/webgpu/buffer_manager.h"
 #include "core/providers/webgpu/program_manager.h"
@@ -153,6 +154,8 @@ class WebGpuContext final {

   std::once_flag init_flag_;

+  LibraryHandles modules_;
+
   wgpu::Instance instance_;
   wgpu::Adapter adapter_;
   wgpu::Device device_;
diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
index 3ebc33c02592d..541dc4978dad1 100644
--- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
+++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
@@ -15,10 +15,10 @@ from typing import List, Optional

 TRT_DOCKER_FILES = {
-    "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6",
-    "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6",
-    "10.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10",
-    "10.5.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10",
+    "8.6_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6",
+    "8.6_cuda12.3_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6",
+    "10.7_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10",
+    "10.7_cuda12.5_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10",
     "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin",
 }
diff --git a/onnxruntime/test/webgpu/delay_load/main.cc b/onnxruntime/test/webgpu/delay_load/main.cc
new file mode 100644
index 0000000000000..f909b4a6916b4
--- /dev/null
+++ b/onnxruntime/test/webgpu/delay_load/main.cc
@@ -0,0 +1,142 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <Windows.h>
+#include <filesystem>
+#include <iostream>
+#include <string>
+#define ORT_API_MANUAL_INIT
+#include "core/session/onnxruntime_cxx_api.h"
+
+// This program is to test the delay loading of onnxruntime.dll.
+//
+// To verify the delay loading actually works, we need to do the test in 2 steps:
+//
+// 1. Prepare a folder structure like below:
+//
+//    ├── webgpu_delay_load_test_root (newly created folder)
+//    │   ├── dlls
+//    │   │   ├── onnxruntime.dll
+//    │   │   ├── webgpu_dawn.dll
+//    │   │   ├── dxil.dll
+//    │   │   └── dxcompiler.dll
+//    │   └── test.exe
+//    └── onnxruntime_webgpu_delay_load_test.exe (this binary)
+//
+//    This folder structure ensures no DLLs are in the same folder as the executable (test.exe).
+//
+// 2. Launch the test binary from the root folder of the above structure.
+//
+// So, there are 2 modes of this program:
+// 1. "Prepare" mode: Do the step 1 above. (default)
+// 2. "Test" mode: Do the step 2 above. (specified by --test argument)
+
+int prepare_main();
+int test_main();
+
+int wmain(int argc, wchar_t* argv[]) {
+  if (argc == 2 && wcscmp(argv[1], L"--test") == 0) {
+    return test_main();
+  } else {
+    return prepare_main();
+  }
+}
+
+int prepare_main() {
+  std::wstring path_str(32768, L'\0');
+  GetModuleFileNameW(NULL, path_str.data(), static_cast<DWORD>(path_str.size()));
+
+  namespace fs = std::filesystem;
+  fs::path exe_full_path{path_str};                                    // <build dir>/onnxruntime_webgpu_delay_load_test.exe
+  fs::path test_dir = exe_full_path.parent_path();                     // <build dir>/
+  fs::path exe_name = exe_full_path.filename();                        // onnxruntime_webgpu_delay_load_test.exe
+  fs::path root_folder = test_dir / L"webgpu_delay_load_test_root\\";  // <build dir>/webgpu_delay_load_test_root/
+  fs::path dlls_folder = root_folder / L"dlls\\";                      // <build dir>/webgpu_delay_load_test_root/dlls/
+
+  // ensure the test folder exists and is empty
+  if (fs::exists(root_folder)) {
+    fs::remove_all(root_folder);
+  }
+  fs::create_directories(dlls_folder);
+
+  fs::current_path(test_dir);
+
+  // copy the required DLLs to the dlls folder
+  fs::copy_file(L"onnxruntime.dll", dlls_folder / L"onnxruntime.dll");
+  fs::copy_file(L"dxil.dll", dlls_folder / L"dxil.dll");
+  fs::copy_file(L"dxcompiler.dll", dlls_folder / L"dxcompiler.dll");
+  if (fs::exists(L"webgpu_dawn.dll")) {
+    fs::copy_file(L"webgpu_dawn.dll", dlls_folder / L"webgpu_dawn.dll");
+  }
+
+  // copy the test binary to the root folder
+  fs::copy_file(exe_full_path, root_folder / L"test.exe");
+
+  // run "test.exe --test" from the test root folder
+  fs::current_path(root_folder);
+  return _wsystem(L"test.exe --test");
+}
+
+int run() {
+  Ort::Env env{nullptr};
+  int retval = 0;
+  try {
+    env = Ort::Env{ORT_LOGGING_LEVEL_WARNING, "Default"};
+
+    // model is https://github.com/onnx/onnx/blob/v1.15.0/onnx/backend/test/data/node/test_abs/model.onnx
+    constexpr uint8_t MODEL_DATA[] = {8, 7, 18, 12, 98, 97, 99, 107, 101, 110,
+                                      100, 45, 116, 101, 115, 116, 58, 73, 10, 11,
+                                      10, 1, 120, 18, 1, 121, 34, 3, 65, 98,
+                                      115, 18, 8, 116, 101, 115, 116, 95, 97, 98,
+                                      115, 90, 23, 10, 1, 120, 18, 18, 10, 16,
+                                      8, 1, 18, 12, 10, 2, 8, 3, 10, 2,
+                                      8, 4, 10, 2, 8, 5, 98, 23, 10, 1,
+                                      121, 18, 18, 10, 16, 8, 1, 18, 12, 10,
+                                      2, 8, 3, 10, 2, 8, 4, 10, 2, 8,
+                                      5, 66, 4, 10, 0, 16, 13};
+
+    Ort::SessionOptions session_options;
+    session_options.DisableMemPattern();
+    std::unordered_map<std::string, std::string> provider_options;
+    session_options.AppendExecutionProvider("WebGPU", provider_options);
+    Ort::Session session{env, MODEL_DATA, sizeof(MODEL_DATA), session_options};
+
+    // successfully initialized
+    std::cout << "Successfully initialized WebGPU EP." << std::endl;
+    retval = 0;
+  } catch (const std::exception& ex) {
+    std::cerr << ex.what() << std::endl;
+
+    std::cerr << "Unexpected exception." << std::endl;
+    retval = -1;
+  }
+
+  return retval;
+}
+
+int test_main() {
+  HMODULE hModule = LoadLibraryA("dlls\\onnxruntime.dll");
+  if (hModule == NULL) {
+    std::cout << "Failed to load dlls\\onnxruntime.dll" << std::endl;
+    return 1;
+  }
+
+  int retval = 0;
+
+  using OrtGetApiBaseFunction = decltype(&OrtGetApiBase);
+  auto fnOrtGetApiBase = (OrtGetApiBaseFunction)GetProcAddress(hModule, "OrtGetApiBase");
+  if (fnOrtGetApiBase == NULL) {
+    std::cout << "Failed to get OrtGetApiBase" << std::endl;
+    retval = 1;
+    goto cleanup;
+  }
+  Ort::InitApi(fnOrtGetApiBase()->GetApi(ORT_API_VERSION));
+
+  retval = run();
+
+cleanup:
+  if (hModule != NULL) {
+    FreeLibrary(hModule);
+  }
+  return retval;
+}
diff --git a/onnxruntime/test/webgpu/external_dawn/main.cc b/onnxruntime/test/webgpu/external_dawn/main.cc
index ed8d2eab94ce9..1cb22b131d76b 100644
--- a/onnxruntime/test/webgpu/external_dawn/main.cc
+++ b/onnxruntime/test/webgpu/external_dawn/main.cc
@@ -1,5 +1,4 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
 // Licensed under the MIT License.

 #include
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 3527a89ca7a7b..53dcdc6e0c6fa 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -260,6 +260,7 @@ def convert_arg_line_to_args(self, arg_line):
     )

     parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable CUDA NHWC ops in build.")
+    parser.add_argument("--enable_cuda_minimal_build", action="store_true", help="Enable CUDA minimal build.")

     # Python bindings
     parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.")
@@ -1093,6 +1094,7 @@ def generate_build_tree(
         "-Donnxruntime_DISABLE_FLOAT8_TYPES=" + ("ON" if disable_float8_types else "OFF"),
         "-Donnxruntime_DISABLE_SPARSE_TENSORS=" + ("ON" if disable_sparse_tensors else "OFF"),
         "-Donnxruntime_DISABLE_OPTIONAL_TYPE=" + ("ON" if disable_optional_type else "OFF"),
+        "-Donnxruntime_CUDA_MINIMAL=" + ("ON" if args.enable_cuda_minimal_build else "OFF"),
     ]

     if args.rv64:
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
new file mode 100644
index 0000000000000..2a32dd1a62408
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
@@ -0,0 +1,108 @@
+##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
+### please do rerun set-trigger-rules.py ###
+trigger:
+  branches:
+    include:
+      - main
+      - rel-*
+  paths:
+    exclude:
+      - docs/**
+      - README.md
+      - CONTRIBUTING.md
+      - BUILD.md
+      - 'js/web'
+      - 'onnxruntime/core/providers/js'
+pr:
+  branches:
+    include:
+      - main
+      - rel-*
+  paths:
+    exclude:
+      - docs/**
+      - README.md
+      - CONTRIBUTING.md
+      - BUILD.md
+      - 'js/web'
+      - 'onnxruntime/core/providers/js'
+#### end trigger ####
+parameters:
+  - name: CudaVersion
+    displayName: CUDA version
+    type: string
+    default: '12.2'
+    values:
+      - 11.8
+      - 12.2
+
+variables:
+  - template: templates/common-variables.yml
+  - name: docker_base_image
+    ${{ if eq(parameters.CudaVersion, '11.8') }}:
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3
+    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3
+  - name: linux_trt_version
+    ${{ if eq(parameters.CudaVersion, '11.8') }}:
+      value: ${{ variables.linux_trt_version_cuda11 }}
+    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      value: ${{ variables.linux_trt_version_cuda12 }}
+
+jobs:
+- job: Linux_Build
+  timeoutInMinutes: 180
+  variables:
+    skipComponentGovernanceDetection: true
+    ALLOW_RELEASED_ONNX_OPSET_ONLY: '1'
+    ORT_CACHE_DIR: '$(Agent.TempDirectory)/ort/ccache'
+    TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
+  workspace:
+    clean: all
+  pool: onnxruntime-tensorrt-linuxbuild-T4
+  steps:
+  - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
+    displayName: 'Clean Agent Directories'
+    condition: always()
+
+  - checkout: self
+    clean: true
+    submodules: none
+
+  - template: templates/get-docker-image-steps.yml
+    parameters:
+      Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
+      Context: tools/ci_build/github/linux/docker
+      DockerBuildArgs: "
+        --network=host
+        --build-arg BASEIMAGE=${{ variables.docker_base_image }}
+        --build-arg TRT_VERSION=${{ variables.linux_trt_version }}
+        --build-arg BUILD_UID=$( id -u )
+        "
+      Repository: onnxruntimetensorrtcudaminimalbuild
+
+  - template: templates/linux-build-step-with-cache.yml
+    parameters:
+      WithCache: true
+      Today: $(TODAY)
+      AdditionalKey: gpu_tensorrt_cuda_minimal
+      CacheDir: '$(ORT_CACHE_DIR)'
+      BuildStep:
+        - task: CmdLine@2
+          inputs:
+            script: |
+              docker run --gpus all --rm \
+                --volume /data/onnx:/data/onnx:ro \
+                --volume $(Build.SourcesDirectory):/onnxruntime_src \
+                --volume $(Build.BinariesDirectory):/build \
+                --volume /data/models:/build/models:ro \
+                --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
+                --volume $(ORT_CACHE_DIR):/cache \
+                -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
+                -e NIGHTLY_BUILD \
+                -e BUILD_BUILDNUMBER \
+                -e CCACHE_DIR=/cache -w /onnxruntime_src \
+                onnxruntimetensorrtcudaminimalbuild tools/ci_build/github/linux/build_tensorrt_ci.sh --cuda_minimal=ON
+            workingDirectory: $(Build.SourcesDirectory)
+
+  - template: templates/explicitly-defined-final-tasks.yml
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
index 83cf26614a285..9286b5a54ac27 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@@ -8,12 +8,12 @@ parameters:
 - name: TrtVersion
   displayName: TensorRT Version
   type: string
-  default: 10.5.cuda_12_5_cudnn_9
+  default: 10.7_cuda12.5_cudnn9
   values:
-  - 8.6.cuda_11_8_cudnn_8
-  - 8.6.cuda_12_3_cudnn_9
-  - 10.5.cuda_11_8_cudnn_8
-  - 10.5.cuda_12_5_cudnn_9
+  - 8.6_cuda11.8_cudnn8
+  - 8.6_cuda12.3_cudnn9
+  - 10.7_cuda11.8_cudnn8
+  - 10.7_cuda12.5_cudnn9
   - BIN

 - name: UseTensorrtOssParser
@@ -198,4 +198,4 @@ jobs:
     parameters :
       condition : 'succeeded'

-  - template: templates/clean-agent-build-directory-step.yml
+  - template: templates/clean-agent-build-directory-step.yml
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml
index 9296928ad97e0..cf434e4eadf0d 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml
@@ -19,6 +19,6 @@ stages:
       python_wheel_suffix: '_gpu'
       timeout: 480
       docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3
-      trt_version: '10.6.0.26-1.cuda11.8'
+      trt_version: '10.7.0.23-1.cuda11.8'
       cuda_version: '11.8'
diff --git a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml
index d35bed69ee409..3d4e5326ae7c6 100644
--- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml
@@ -1,5 +1,5 @@
 variables:
-  common_trt_version: '10.6.0.26'
+  common_trt_version: '10.7.0.23'
   # As for Debian installation, replace '-1.' by '-1+' when assigning trt version below
   linux_trt_version_cuda11: ${{ variables.common_trt_version }}-1.cuda11.8
  linux_trt_version_cuda12: ${{ variables.common_trt_version }}-1.cuda12.6
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
index ae54b3849a862..14b9c378bec14 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
@@ -13,10 +13,10 @@ parameters:
   - 12.2
 - name: TrtVersion
   type: string
-  default: '10.6.0.26'
+  default: '10.7.0.23'
   values:
   - 8.6.1.6
-  - 10.6.0.26
+  - 10.7.0.23

 steps:
 - ${{ if eq(parameters.DownloadCUDA, true) }}:
@@ -42,7 +42,7 @@ steps:
     - powershell: |
        Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0"
      displayName: Set trtCudaVersion
-  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.6.0.26')) }}:
+  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.7.0.23')) }}:
     - powershell: |
        Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.6"
      displayName: Set trtCudaVersion
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
index dfaf237a711fe..45572416350c3 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
@@ -15,10 +15,10 @@ parameters:
     default: '11.8'
   - name: win_trt_folder_cuda11
     type: string
-    default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8'
+    default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8'
   - name: win_trt_folder_cuda12
     type: string
-    default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6'
+    default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6'

 steps:
 - ${{ if eq(parameters.DownloadCUDA, 'true') }}:
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
new file mode 100644
index 0000000000000..c68ba01485db2
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
@@ -0,0 +1,86 @@
+##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
+### please do rerun set-trigger-rules.py ###
+trigger:
+  branches:
+    include:
+      - main
+      - rel-*
+  paths:
+    exclude:
+      - docs/**
+      - README.md
+      - CONTRIBUTING.md
+      - BUILD.md
+      - 'js/web'
+      - 'onnxruntime/core/providers/js'
+pr:
+  branches:
+    include:
+      - main
+      - rel-*
+  paths:
+    exclude:
+      - docs/**
+      - README.md
+      - CONTRIBUTING.md
+      - BUILD.md
+      - 'js/web'
+      - 'onnxruntime/core/providers/js'
+#### end trigger ####
+parameters:
+- name: CudaVersion
+  displayName: CUDA version
+  type: string
+  default: '12.2'
+  values:
+    - 11.8
+    - 12.2
+
+variables:
+  - template: templates/common-variables.yml
+  - name: win_trt_folder
+    ${{ if eq(parameters.CudaVersion, '11.8') }}:
+      value: ${{ variables.win_trt_folder_cuda11 }}
+    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      value: ${{ variables.win_trt_folder_cuda12 }}
+
+jobs:
+- job: 'build'
+  pool: 'onnxruntime-Win2022-GPU-A10'
+  variables:
+    MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary'
+    EnvSetupScript: setup_env_trt.bat
+    skipComponentGovernanceDetection: true
+    TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
+  timeoutInMinutes: 150
+  workspace:
+    clean: all
+  steps:
+  - template: templates/jobs/win-ci-prebuild-steps.yml
+    parameters:
+      EnvSetupScript: $(EnvSetupScript)
+      DownloadCUDA: true
+      DownloadTRT: true
+      BuildArch: 'x64'
+      BuildConfig: RelWithDebInfo
+      MachinePool: 'onnxruntime-Win2022-GPU-A10'
+      WithCache: true
+      Today: $(Today)
+
+  - template: templates/jobs/win-ci-build-steps.yml
+    parameters:
+      WithCache: True
+      Today: $(TODAY)
+      AdditionalKey: "gpu_tensorrt_cuda_minimal | RelWithDebInfo"
+      BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --enable_cuda_minimal_build'
+      MsbuildArguments: $(MsbuildArguments)
+      BuildArch: 'x64'
+      Platform: 'x64'
+      BuildConfig: RelWithDebInfo
+
+  - task: PythonScript@0
+    displayName: 'Build wheel'
+    inputs:
+      scriptPath: '$(Build.SourcesDirectory)\setup.py'
+      arguments: 'bdist_wheel'
+      workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo'
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
index 06f374afca57a..8460df2ec3799 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
@@ -48,7 +48,7 @@ stages:
           --enable_pybind
           --build_nodejs
           --use_webgpu
-          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=ON
         msbuildPlatform: x64
         isX86: false
         job_name_suffix: x64_RelWithDebInfo
diff --git a/tools/ci_build/github/linux/build_tensorrt_ci.sh b/tools/ci_build/github/linux/build_tensorrt_ci.sh
index 5b206bc0a92d9..ccf7a6f4ea630 100755
--- a/tools/ci_build/github/linux/build_tensorrt_ci.sh
+++ b/tools/ci_build/github/linux/build_tensorrt_ci.sh
@@ -21,6 +21,19 @@ BUILD_ARGS=('--config' 'Release'
   "CMAKE_CUDA_ARCHITECTURES=75"
   "onnxruntime_BUILD_UNIT_TESTS=ON"
   "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON")
+
+# Parse external args
+for arg in "$@"; do
+  case $arg in
+    --cuda_minimal=ON)
+      # Replace onnxruntime_BUILD_UNIT_TESTS=ON with OFF
+      BUILD_ARGS=("${BUILD_ARGS[@]/onnxruntime_BUILD_UNIT_TESTS=ON/onnxruntime_BUILD_UNIT_TESTS=OFF}")
+      BUILD_ARGS+=("--enable_cuda_minimal_build")
+      BUILD_ARGS+=("--skip_tests")
+      ;;
+  esac
+done
+
 if [ -x "$(command -v ninja)" ]; then
   BUILD_ARGS+=('--cmake_generator' 'Ninja')
 fi
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
index c2bae5fd7ee59..df5112dc38af4 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8
-ARG TRT_VERSION=10.6.0.26-1.cuda12.6
+ARG TRT_VERSION=10.7.0.23-1.cuda12.6
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
 ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
index 2ecc6d1918b1a..fef95b8574520 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
-ARG TRT_VERSION=10.6.0.26-1.cuda11.8
+ARG TRT_VERSION=10.7.0.23-1.cuda11.8
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
 ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
index 81aeada6a4a46..e91f14ff955b9 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
-ARG TRT_VERSION=10.6.0.26-1+cuda11.8
+ARG TRT_VERSION=10.7.0.23-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg
index 4298dd53e4c66..0b08d4b3024b8 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-ARG TRT_VERSION=10.6.0.26-1+cuda11.8
+ARG TRT_VERSION=10.7.0.23-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv
index 1312475ceca3a..3a7e064686ae5 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-ARG TRT_VERSION=10.6.0.26-1+cuda11.8
+ARG TRT_VERSION=10.7.0.23-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
index 22d5e3b0248a8..01f08ff41e2cc 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
@@ -31,7 +31,7 @@ RUN pip install --upgrade pip
 RUN pip install psutil setuptools>=68.2.2

 # Install TensorRT
-RUN TRT_VERSION="10.6.0.26-1+cuda11.8" &&\
+RUN TRT_VERSION="10.7.0.23-1+cuda11.8" &&\
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
     apt-get update &&\
     apt-get install -y \
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
index 819d9bab7be75..781f0647a084b 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
@@ -31,7 +31,7 @@ RUN pip install --upgrade pip
 RUN pip install setuptools>=68.2.2 psutil

 # Install TensorRT
-RUN TRT_VERSION="10.6.0.26-1+cuda12.6" &&\
+RUN TRT_VERSION="10.7.0.23-1+cuda12.6" &&\
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
     apt-get update &&\
     apt-get install -y \
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
index a69b98f86ba1b..5f10607b11626 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
@@ -5,7 +5,7 @@
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
 FROM $BASEIMAGE
-ARG TRT_VERSION=10.6.0.26-1.cuda11.8
+ARG TRT_VERSION=10.7.0.23-1.cuda11.8

 #Install TensorRT only if TRT_VERSION is not empty
 RUN if [ -n "${TRT_VERSION}" ]; then \
diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat
index 34ddd75da16fc..4e2bd8f8386e2 100644
--- a/tools/ci_build/github/windows/setup_env_gpu.bat
+++ b/tools/ci_build/github/windows/setup_env_gpu.bat
@@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
 ) else (
   set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH%

 @REM The default version is still cuda v12.2, because set cuda v11.8 after it
-set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8\lib
+set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8\lib
 if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
   set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
 ) else (
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 03734293be5c4..6a602e46661e7 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
 ) else (
   set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
 set CUDA_MODULE_LOADING=LAZY