diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 07dff50f9a3bd..ad4195f31aa7c 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -196,7 +196,7 @@ "component": { "type": "git", "git": { - "commitHash": "bc0d2e35909b8456abe32f3b30a49bb0c125e8b7", + "commitHash": "9c69a24bc2e20c8a511a4e6b06fd49639ec5300a", "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index d2fe7e7457983..febefff6756e7 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -130,8 +130,7 @@ option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF) # When loading a delay loaded DLL, Windows searches the main EXE's folder first. -# In a Python process, it searches where python.exe lives, but it doesn't search the python package's installation folder. Therefore we cannot enable this flag when Python is enabled. -cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM;NOT onnxruntime_ENABLE_PYTHON" OFF) +cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM" OFF) option(onnxruntime_USE_DML "Build with DirectML support" OFF) option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF) option(onnxruntime_USE_WINML "Build with WinML support" OFF) diff --git a/cmake/deps.txt b/cmake/deps.txt index 21f9ee1701c46..04a306e0ee657 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -36,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c -# Use the latest commit of 10.6-GA-ORT-DDS -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bc0d2e35909b8456abe32f3b30a49bb0c125e8b7.zip;f233ae871ad82c023da62e5dd620639f00bc2d15 +# Use the latest commit of 10.7-GA +onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/9c69a24bc2e20c8a511a4e6b06fd49639ec5300a.zip;ff1fe9af78eb129b4a4cdcb7450b7390b4436dd3 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 3b76aff829be2..5adacdc393da8 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -77,6 +77,7 @@ if(WIN32) onnxruntime_add_shared_library(onnxruntime ${SYMBOL_FILE} "${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc" + 
"${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc" "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc" ) elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake index 376d895be34a9..355575be3bcf7 100644 --- a/cmake/onnxruntime_nodejs.cmake +++ b/cmake/onnxruntime_nodejs.cmake @@ -60,15 +60,26 @@ else() endif() endif() +# a list of DLLs that the Node.js binding depends on +set(NODEJS_DLL_DEPS) + # setup providers if (onnxruntime_USE_CUDA) set(NODEJS_BINDING_USE_CUDA "--use_cuda") endif() if (onnxruntime_USE_DML) set(NODEJS_BINDING_USE_DML "--use_dml") + list(APPEND NODEJS_DLL_DEPS "$/DirectML.dll") endif() if (onnxruntime_USE_WEBGPU) set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu") + if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + list(APPEND NODEJS_DLL_DEPS "$/dxil.dll") + list(APPEND NODEJS_DLL_DEPS "$/dxcompiler.dll") + endif() + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + list(APPEND NODEJS_DLL_DEPS "$") + endif() endif() if (onnxruntime_USE_TENSORRT) set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt") @@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL add_custom_target(nodejs_binding_wrapper ALL COMMAND ${NPM_CLI} ci - COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR} - --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT} - ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN} + COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}" + --config=${CMAKE_BUILD_TYPE} + "--onnxruntime-generator=${CMAKE_GENERATOR}" + "--dll_deps=${NODEJS_DLL_DEPS}" + --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} + ${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN} WORKING_DIRECTORY ${JS_NODE_ROOT} COMMENT "Using cmake-js to build OnnxRuntime Node.js binding") diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake index fea5964f0dda9..e527d538d8757 100644 --- a/cmake/onnxruntime_providers_webgpu.cmake +++ b/cmake/onnxruntime_providers_webgpu.cmake @@ -23,19 +23,18 @@ onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface) + set(onnxruntime_providers_webgpu_dll_deps) + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn) - if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS) - list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll") - endif() + if (WIN32) + if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS) + list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll") + endif() - # Copy webgpu_dawn.dll to the output directory - add_custom_command( - TARGET onnxruntime_providers_webgpu - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "$" - VERBATIM ) + list(APPEND onnxruntime_providers_webgpu_dll_deps "$") + endif() else() if (NOT onnxruntime_USE_EXTERNAL_DAWN) target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native) @@ -43,4 +42,23 @@ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc) endif() + if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + # Ensure dxil.dll and dxcompiler.dll exist in the output directory $ + 
add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll) + add_dependencies(onnxruntime_providers_webgpu dxcompiler) + + list(APPEND onnxruntime_providers_webgpu_dll_deps "$/dxil.dll") + list(APPEND onnxruntime_providers_webgpu_dll_deps "$/dxcompiler.dll") + endif() + + if (onnxruntime_providers_webgpu_dll_deps) + # Copy dependency DLLs to the output directory + add_custom_command( + TARGET onnxruntime_providers_webgpu + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${onnxruntime_providers_webgpu_dll_deps}" "$" + COMMAND_EXPAND_LISTS + VERBATIM ) + endif() + set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime") diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index c19a18ef15089..17ee0e9c4e15a 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC set (onnxruntime_webgpu_external_dawn_test_SRC ${TEST_SRC_DIR}/webgpu/external_dawn/main.cc) +set (onnxruntime_webgpu_delay_load_test_SRC + ${TEST_SRC_DIR}/webgpu/delay_load/main.cc) + # tests from lowest level library up. # the order of libraries should be maintained, with higher libraries being added first in the list @@ -1863,4 +1866,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN) onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers) endif() +if (onnxruntime_USE_WEBGPU AND WIN32 AND onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD) + AddTest(DYN + TARGET onnxruntime_webgpu_delay_load_test + SOURCES ${onnxruntime_webgpu_delay_load_test_SRC} + LIBS ${SYS_PATH_LIB} + DEPENDS ${all_dependencies} + ) +endif() + include(onnxruntime_fuzz_test.cmake) diff --git a/cmake/winml.cmake b/cmake/winml.cmake index ff6b71217ad87..63f356fcf831d 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -782,7 +782,7 @@ add_dependencies(winml_dll winml_api_native) add_dependencies(winml_dll winml_api_native_internal) # Link libraries -target_link_libraries(winml_dll PRIVATE re2) +target_link_libraries(winml_dll PRIVATE re2::re2) target_link_libraries(winml_dll PRIVATE ${WIL_TARGET}) target_link_libraries(winml_dll PRIVATE winml_lib_api) if (NOT winml_is_inbox) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 6ea3f93cdea12..2290030073e5c 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -1625,7 +1625,7 @@ This version of the operator has been available since version 1 of the 'com.micr #### Type Constraints
-T : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double)
+T : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
 Constrain input and output types.
diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index 07625c38d8474..375f0a4dc8dd2 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -7,6 +7,7 @@ // It is safe to include the below header even if SHARED_PROVIDER macro is enabled // as it doesn't include any pb headers. +#include "core/framework/buffer_deleter.h" #include "core/framework/prepacked_weights_container.h" #ifndef SHARED_PROVIDER diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index eb9581e8018d1..7798394b045dc 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -3,14 +3,15 @@ #pragma once +#include #include #include #include +#include #include #include #include #include -#include #include "core/common/flatbuffers.h" @@ -19,13 +20,14 @@ #include "core/common/common.h" #include "core/common/path_string.h" #include "core/common/const_pointer_container.h" +#include "core/common/inlined_containers_fwd.h" #if !defined(ORT_MINIMAL_BUILD) #include "core/common/inlined_containers.h" #endif -#include "core/common/inlined_containers_fwd.h" #include "core/common/span_utils.h" #include "core/common/status.h" #include "core/common/logging/logging.h" +#include "core/framework/prepacked_weights_container.h" #include "core/graph/onnx_protobuf.h" #include "core/graph/basic_types.h" #include "core/graph/constants.h" @@ -41,6 +43,7 @@ namespace onnxruntime { class Graph; struct IndexedSubGraph; class Model; +struct ModelSavingOptions; class OpSignature; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1153,29 +1156,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi const ONNX_NAMESPACE::GraphProto& ToGraphProto(); ONNX_NAMESPACE::GraphProto ToGraphProto() const; - // Options to align external initializer offset. - // For models running on CPU, ORT will try to use mmap to load external initializers. - // To use mmap, external initializer need to be offset aligned. - // ORT saves external initializers into signle data file, each initializer is accessed with - // offset(start position of initializer) and length(byte length of initializer) of the data file. - // To use mmap, each offset need to be aligned which means offset need to divisible by - // allocation granularity(64KB for windows and 4K for other OSes). - // With align_offset to true, ORT will align offset for large initializer when - // save ONNX model with external data file. - struct OffsetAlignmentInfo { - // Offset will always be page aligned and allocation granularity aligned for mmap support. - // This is done by padding previous tensor data with zeros keeping same length. - bool align_offset = false; - // Alignment threshold for size of data. - // Having a low threshold will waste file space for small initializers. - // Only when tensor's data size is > the page_align_threshold it will be force aligned. - // Default to 1MB. - int64_t align_threshold = 1048576; - // The allocation Granularity for mmap() support. - // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB. - int64_t allocation_granularity = 65536; - }; - /** Gets the GraphProto representation of this Graph @param external_file_path File path of the binary file to use for initializers. @param model_file_path path of the model file. 
@@ -1186,15 +1166,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi */ ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, const std::filesystem::path& model_file_path, - size_t initializer_size_threshold, - const OffsetAlignmentInfo& align_info) const; - - ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, - const std::filesystem::path& model_file_path, - size_t initializer_size_threshold) const { - OffsetAlignmentInfo default_options; - return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options); - } + const ModelSavingOptions& model_saving_options) const; /** Gets the ISchemaRegistry instances being used with this Graph. */ IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const; @@ -1400,6 +1372,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi #endif // !defined(ORT_MINIMAL_BUILD) + // This function constructs PrepackedSharedContainer in the root graph only + // and initializes a reference to it in all (sub)graphs + void ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on); + + const PrepackedWeightsForGraph& GetPrepacked() const noexcept { + return *prepacked_weights_for_graph_; + } + + PrepackedWeightsForGraph& GetPrepacked() noexcept { + return *prepacked_weights_for_graph_; + } + /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */ const Node* ParentNode() const { return parent_node_; } @@ -1519,6 +1503,31 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto, std::optional new_name); + /// + /// This function traverses the graph bottom up and externalizes + /// constant initializers along with their pre-packed blobs from different + /// kernels. Writes constant initializers to the external file with any pre-packed + /// blobs (if enabled and produced for this initializer) and then modifies TensorProto + /// entry with external data references. + /// + /// model file path from Model + /// a binary file path for relative to the model file path + /// where the initializers data is written + /// model file folder path with external file path appended + /// model saving options including alignment and pre-packs + /// The graph proto to be modified + /// external file stream + /// current external file offset updated with each write + /// Status instance + Status AddExternalInitializersToGraphProtoImpl( + const std::filesystem::path& model_path, + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_external_file_path, + const ModelSavingOptions& model_saving_options, + ONNX_NAMESPACE::GraphProto& output_graph_proto, + std::ostream& external_stream, + int64_t& external_offset) const; + #endif Version IrVersion() const noexcept { @@ -1703,6 +1712,21 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi std::hash, std::equal_to> sparse_tensor_names_; + // Prepacked blobs container that stored pre-packed initializers + // data that is: + // - mem-mapped from disk + // - shared within the session + // - shared across sessions by transferring the ownership of loaded data entries to + // SessionState::PrepackedWeightsContainer* if one is present. 
+ // This container is optional because it is present only in the root graph. + std::optional prepacked_key_to_blobs_; + + // This container contains a reference to the root prepacked_key_to_blobs_ + // and also (in the save mode) records association between the initializer + // names and their pre-packed blobs (via keys). + // This is optional due to delayed construction. + std::optional prepacked_weights_for_graph_; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // Runtime optimization storage. // Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h new file mode 100644 index 0000000000000..924799f15b247 --- /dev/null +++ b/include/onnxruntime/core/graph/model_saving_options.h @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +namespace onnxruntime { + +class PrepackedWeightsForGraph; + +// These options affect how the model initializers are written to the external file. +// This includes options to align external initializer offset. +// For models running on CPU, ORT will try to use mmap to load external +// initializers. To use mmap, external initializer need to be offset aligned. +// ORT saves external initializers into single data file, each initializer is +// accessed with offset(start position of initializer) and length(byte length of +// initializer) of the data file. To use mmap, each offset need to be aligned +// which means offset need to divisible by allocation granularity(64KB for +// windows and 4K for other OSes). With align_offset to true, ORT will align +// offset for large initializer when save ONNX model with external data file. +struct ModelSavingOptions { + explicit ModelSavingOptions(size_t size_threshold) + : initializer_size_threshold(size_threshold) {} + + // Mimimal initializer size in bytes to be externalized on disk + size_t initializer_size_threshold; + // Offset will always be page aligned and allocation granularity aligned for + // mmap support. This is done by padding previous tensor data with zeros + // keeping same length. + bool align_offset = false; + // Alignment threshold for size of data. + // Having a low threshold will waste file space for small initializers. + // Only when tensor's data size is > the page_align_threshold it will be force + // aligned. Default to 1MB. + int64_t align_threshold = 1048576; + // The allocation Granularity for mmap() support. + // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB. +#ifdef _WIN32 + int64_t allocation_granularity = 65536; +#else + int64_t allocation_granularity = 4096; +#endif +}; + +} // namespace onnxruntime diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 8f1bc98ce7b49..64a4dd19c12b0 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -250,6 +250,17 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes = "session.optimized_model_external_initializers_min_size_in_bytes"; +// Use this config when saving pre-packed constant initializers to an external data file. 
+// This allows you to memory map pre-packed initializers on model load and leave it to +// to the OS the amount of memory consumed by the pre-packed initializers. Otherwise, +// pre-packed data resides on the heap. +// +// - "0": Default is not save pre-packed initializers to a data file. +// - "1": Save pre-packed constant initializers to an external data file. +// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1") +static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers = + "session.save_external_prepacked_constant_initializers"; + // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file. // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead. // "0": disable. (default) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index 15d89b536b39a..e11537492d3a7 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -737,7 +737,6 @@ public void testCoreML() throws OrtException { runProvider(OrtProvider.CORE_ML); } - @Disabled("DirectML Java API hasn't been supported yet") @Test @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java index fa0b6fd0ef9d9..57c4eb3577fd0 100644 --- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java +++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java @@ -27,7 +27,6 @@ import java.util.HashMap; import java.util.Map; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.condition.DisabledIfSystemProperty; import org.junit.jupiter.api.condition.EnabledIfSystemProperty; public class ProviderOptionsTest { @@ -35,7 +34,6 @@ public class ProviderOptionsTest { @Test @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1") - @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1") public void testCUDAOptions() throws OrtException { // Test standard options OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0); @@ -63,7 +61,6 @@ public void testCUDAOptions() throws OrtException { @Test @EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1") - @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1") public void testTensorRT() throws OrtException { // Test standard options OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0); diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt index d79a82c572dc2..c78b40a3e7429 100644 --- a/js/node/CMakeLists.txt +++ b/js/node/CMakeLists.txt @@ -113,10 +113,12 @@ endif() if (WIN32) file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll DESTINATION ${dist_folder}) - if (USE_DML) - file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll - DESTINATION ${dist_folder}) - endif () + if (ORT_NODEJS_DLL_DEPS) + foreach(dll ${ORT_NODEJS_DLL_DEPS}) + file(COPY ${dll} DESTINATION ${dist_folder}) + endforeach() + endif() + elseif (APPLE) file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN) diff --git a/js/node/script/build.ts b/js/node/script/build.ts index dcdcb93377b4c..b557368ed58c6 100644 --- a/js/node/script/build.ts +++ b/js/node/script/build.ts @@ -39,6 
+39,8 @@ const USE_TENSORRT = !!buildArgs.use_tensorrt; const USE_COREML = !!buildArgs.use_coreml; // --use_qnn const USE_QNN = !!buildArgs.use_qnn; +// --dll_deps= +const DLL_DEPS = buildArgs.dll_deps; // build path const ROOT_FOLDER = path.join(__dirname, '..'); @@ -82,6 +84,9 @@ if (USE_COREML) { if (USE_QNN) { args.push('--CDUSE_QNN=ON'); } +if (DLL_DEPS) { + args.push(`--CDORT_NODEJS_DLL_DEPS=${DLL_DEPS}`); +} // set CMAKE_OSX_ARCHITECTURES for macOS build if (os.platform() === 'darwin') { diff --git a/js/node/src/directml_load_helper.cc b/js/node/src/directml_load_helper.cc deleted file mode 100644 index 6aafe4d5fa788..0000000000000 --- a/js/node/src/directml_load_helper.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifdef _WIN32 -#include "common.h" -#include "windows.h" - -void LoadDirectMLDll(Napi::Env env) { - DWORD pathLen = MAX_PATH; - std::wstring path(pathLen, L'\0'); - HMODULE moduleHandle = nullptr; - - GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - reinterpret_cast(&LoadDirectMLDll), &moduleHandle); - - DWORD getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast(path.c_str()), pathLen); - while (getModuleFileNameResult == 0 || getModuleFileNameResult == pathLen) { - int ret = GetLastError(); - if (ret == ERROR_INSUFFICIENT_BUFFER && pathLen < 32768) { - pathLen *= 2; - path.resize(pathLen); - getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast(path.c_str()), pathLen); - } else { - ORT_NAPI_THROW_ERROR(env, "Failed getting path to load DirectML.dll, error code: ", ret); - } - } - - path.resize(path.rfind(L'\\') + 1); - path.append(L"DirectML.dll"); - HMODULE libraryLoadResult = LoadLibraryW(path.c_str()); - - if (!libraryLoadResult) { - int ret = GetLastError(); - ORT_NAPI_THROW_ERROR(env, "Failed loading bundled DirectML.dll, error code: ", ret); - } -} -#endif diff --git a/js/node/src/directml_load_helper.h b/js/node/src/directml_load_helper.h deleted file mode 100644 index 074a4f95ed476..0000000000000 --- a/js/node/src/directml_load_helper.h +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#if defined(USE_DML) && defined(_WIN32) -void LoadDirectMLDll(Napi::Env env); -#endif diff --git a/js/node/src/inference_session_wrap.cc b/js/node/src/inference_session_wrap.cc index 23d859351f426..04ab71dc48ec2 100644 --- a/js/node/src/inference_session_wrap.cc +++ b/js/node/src/inference_session_wrap.cc @@ -4,7 +4,6 @@ #include "onnxruntime_cxx_api.h" #include "common.h" -#include "directml_load_helper.h" #include "inference_session_wrap.h" #include "run_options_helper.h" #include "session_options_helper.h" @@ -19,9 +18,6 @@ Napi::FunctionReference& InferenceSessionWrap::GetTensorConstructor() { } Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) { -#if defined(USE_DML) && defined(_WIN32) - LoadDirectMLDll(env); -#endif // create ONNX runtime env Ort::InitApi(); ORT_NAPI_THROW_ERROR_IF( diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc index 27eb9b65c62d3..12b1a79793ff3 100644 --- a/js/node/src/tensor_helper.cc +++ b/js/node/src/tensor_helper.cc @@ -53,24 +53,24 @@ constexpr size_t DATA_TYPE_ELEMENT_SIZE_MAP[] = { static_assert(sizeof(DATA_TYPE_ELEMENT_SIZE_MAP) == sizeof(size_t) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT, "definition not matching"); -constexpr napi_typedarray_type DATA_TYPE_TYPEDARRAY_MAP[] = { - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported - napi_float32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT - napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8 - napi_int8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 - napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 - napi_int16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 - napi_int32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 - napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported - napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL - napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array - napi_float64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE - napi_uint32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 - napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported - (napi_typedarray_type)(-1) // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported +constexpr std::underlying_type_t DATA_TYPE_TYPEDARRAY_MAP[] = { + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported + napi_float32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT + napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8 + napi_int8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 + napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 + napi_int16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 + napi_int32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 + napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported + napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL + napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array + napi_float64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE + napi_uint32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 + napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported + 
std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported + std::underlying_type_t(-1) // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported }; static_assert(sizeof(DATA_TYPE_TYPEDARRAY_MAP) == sizeof(napi_typedarray_type) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT, "definition not matching"); @@ -98,7 +98,20 @@ static_assert(sizeof(DATA_TYPE_ID_TO_NAME_MAP) == sizeof(const char*) * ONNX_TEN "definition not matching"); const std::unordered_map DATA_TYPE_NAME_TO_ID_MAP = { - {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT}, {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8}, {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8}, {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16}, {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16}, {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32}, {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64}, {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING}, {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL}, {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16}, {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE}, {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32}, {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}}; + {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT}, + {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8}, + {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8}, + {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16}, + {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16}, + {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32}, + {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64}, + {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING}, + {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL}, + {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16}, + {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE}, + {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32}, + {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}, +}; // currently only support tensor Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo* cpu_memory_info, OrtMemoryInfo* webgpu_memory_info) { @@ -181,7 +194,7 @@ Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo* "Tensor.data must be a typed array for numeric tensor."); auto tensorDataTypedArray = tensorDataValue.As(); - auto typedArrayType = tensorDataValue.As().TypedArrayType(); + std::underlying_type_t typedArrayType = tensorDataValue.As().TypedArrayType(); ORT_NAPI_THROW_TYPEERROR_IF(DATA_TYPE_TYPEDARRAY_MAP[elemType] != typedArrayType, env, "Tensor.data must be a typed array (", DATA_TYPE_TYPEDARRAY_MAP[elemType], ") for ", tensorTypeString, " tensors, but got typed array (", typedArrayType, ")."); @@ -294,7 +307,7 @@ Napi::Value OrtValueToNapiValue(Napi::Env env, Ort::Value&& value) { } napi_value typedArrayData; napi_status status = - napi_create_typedarray(env, DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData); + napi_create_typedarray(env, (napi_typedarray_type)DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData); NAPI_THROW_IF_FAILED(env, status, Napi::Value); // new Tensor(type, typedArrayData, dims) diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index f514ae5fa75e6..262503214a50a 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -391,48 +391,48 @@ } ] }, - // { - // "name": "conv - vectorize group - B", - // "operator": "Conv", - // "inputShapeDefinitions": "rankOnly", - // "opset": { "domain": "", "version": 17 }, - // "attributes": [ - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, 
"type": "int" } - // ], - // "cases": [ - // { - // "name": "T[0]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - // 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 - // ], - // "dims": [1, 3, 3, 3], - // "type": "float32" - // }, - // { - // "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // }, - // { - // "data": [0.1, 0.2, 0.3], - // "dims": [3], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], - // "dims": [1, 3, 2, 2], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "conv - vectorize group - B", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], + "dims": [1, 3, 2, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - vectorize group - C", "operator": "Conv", @@ -470,44 +470,44 @@ } ] }, - // { - // "name": "conv - vectorize group - D", - // "operator": "Conv", - // "inputShapeDefinitions": "rankOnly", - // "opset": { "domain": "", "version": 17 }, - // "attributes": [ - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, "type": "int" }, - // { "name": "strides", "data": [2, 2], "type": "ints" } - // ], - // "cases": [ - // { - // "name": "T[0] strides = [2, 2]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - // 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 - // ], - // "dims": [1, 3, 3, 4], - // "type": "float32" - // }, - // { - // "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [34, 54, 386, 438, 1122, 1206], - // "dims": [1, 3, 1, 2], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "conv - vectorize group - D", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "T[0] strides = [2, 2]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 
4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 54, 386, 438, 1122, 1206], + "dims": [1, 3, 1, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - pointwise", "operator": "Conv", diff --git a/js/web/test/data/ops/fused-conv.jsonc b/js/web/test/data/ops/fused-conv.jsonc index ebb0b5d3e1f58..d88c91ebc9de7 100644 --- a/js/web/test/data/ops/fused-conv.jsonc +++ b/js/web/test/data/ops/fused-conv.jsonc @@ -249,44 +249,44 @@ } ] }, - // { - // "name": "NHWC group-conv with HardSigmoid", - // "operator": "Conv", - // "attributes": [ - // { "name": "activation", "data": "HardSigmoid", "type": "string" }, - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, "type": "int" }, - // { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } - // ], - // "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, - // "cases": [ - // { - // "name": "T[0]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, - // 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 - // ], - // "dims": [1, 3, 3, 3], - // "type": "float32" - // }, - // { - // "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], - // "dims": [1, 2, 2, 3], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "NHWC group-conv with HardSigmoid", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "HardSigmoid", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "dims": [1, 2, 2, 3], + "type": "float32" + } + ] + } + ] + }, { "name": "fused group-conv with LeakyRelu", "operator": "FusedConv", @@ -325,44 +325,44 @@ } ] }, - // { - // "name": "NHWC group-conv with LeakyRelu", - // "operator": "Conv", - // "attributes": [ - // { "name": "activation", "data": "LeakyRelu", "type": "string" }, - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, "type": "int" }, - // { "name": "activation_params", "data": [2.0], "type": "floats" } - // ], - // "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, - // "cases": [ - // { - // "name": "T[0]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, - // 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 - // ], - // "dims": [1, 3, 3, 3], - // "type": "float32" - // }, - // { - // "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [-162, 63, -158, 33, 281, 85, 105, 337, 
455, 177, 515, 609], - // "dims": [1, 2, 2, 3], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "NHWC group-conv with LeakyRelu", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "LeakyRelu", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "activation_params", "data": [2.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [-162, 63, -158, 33, 281, 85, 105, 337, 455, 177, 515, 609], + "dims": [1, 2, 2, 3], + "type": "float32" + } + ] + } + ] + }, { "name": "fused conv with LeakyRelu", "operator": "FusedConv", diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index 9a49adf347a29..8abcd78bfff4c 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -60,7 +60,7 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& scales = shader.AddInput("scales", ShaderUsage::UseUniform); const auto& y = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias | ShaderUsage::UseIndicesTypeAlias); - if ((is_intel_ || tile_m_ > 1) && block_size_ == 32) { + if (block_size_ == 32) { const uint32_t workgroup_size = WorkgroupSizeX() * WorkgroupSizeY(); const uint32_t tile_size = WorkgroupSizeX() * components_b_ * 8; // each uint32 has 8 data. const uint32_t a_length_per_tile = tile_size / a.NumComponents(); @@ -408,14 +408,12 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context const uint32_t components_b = GetMaxComponents(blob_size_in_words); uint32_t components = GetMaxComponents(N); - const bool is_intel = context.AdapterInfo().vendor == std::string_view{"intel"} && - context.AdapterInfo().architecture == std::string_view{"gen-12lp"}; const bool has_zero_points = zero_points != nullptr; // TODO: Support output_number > 1. Some cases are failed when output_number > 1. constexpr uint32_t output_number = 1; const uint32_t tile_m = M > kMinMForTileOptimization ? 4 : 1; - MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points, is_intel}; + MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points}; if (M > kMinMForTileOptimization && block_size == 32) { components = 1; constexpr uint32_t workgroup_size = 64; @@ -426,7 +424,7 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context (M + tile_m - 1) / tile_m, batch_count); program.CacheHint("T_M" + std::to_string(tile_m)); - } else if (is_intel && block_size == 32) { + } else if (block_size == 32) { components = 1; constexpr uint32_t workgroup_size = 128; const uint32_t workgroup_y = N % 8 == 0 ? 8 : N % 4 == 0 ? 
4 diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h index 8a4626083419c..57615d3ddabcf 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h @@ -14,13 +14,12 @@ using namespace onnxruntime::webgpu; class MatMulNBitsProgram final : public Program { public: - MatMulNBitsProgram(uint32_t output_number, uint32_t block_size, uint32_t tile_m, int components_b, bool has_zero_points, bool is_intel) : Program{"MatMulNBits"}, - output_number_{output_number}, - block_size_{block_size}, - tile_m_{tile_m}, - components_b_{components_b}, - has_zero_points_{has_zero_points}, - is_intel_{is_intel} { + MatMulNBitsProgram(uint32_t output_number, uint32_t block_size, uint32_t tile_m, int components_b, bool has_zero_points) : Program{"MatMulNBits"}, + output_number_{output_number}, + block_size_{block_size}, + tile_m_{tile_m}, + components_b_{components_b}, + has_zero_points_{has_zero_points} { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -32,7 +31,6 @@ class MatMulNBitsProgram final : public Program { uint32_t tile_m_; int components_b_; bool has_zero_points_; - bool is_intel_; }; class MatMulNBits final : public WebGpuKernel { diff --git a/onnxruntime/core/dll/delay_load_hook.cc b/onnxruntime/core/dll/delay_load_hook.cc new file mode 100644 index 0000000000000..bc5e1aa662721 --- /dev/null +++ b/onnxruntime/core/dll/delay_load_hook.cc @@ -0,0 +1,91 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// == workaround for delay loading of dependencies of onnxruntime.dll == +// +// Problem: +// +// When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using LoadLibraryEx, +// which search the directory of process (.exe) instead of this library (onnxruntime.dll). This is a problem for +// usages of Node.js binding and python binding, because Windows will try to find the dependencies in the directory +// of node.exe or python.exe, which is not the directory of onnxruntime.dll. +// +// Solution: +// +// By using the delay load hook `__pfnDliNotifyHook2`, we can intervene the loading procedure by loading from an +// absolute path. The absolute path is constructed by appending the name of the DLL to load to the directory of +// onnxruntime.dll. This way, we can ensure that the dependencies are loaded from the same directory as onnxruntime.dll. 
+// +// See also: +// - https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions +// - https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#alternate-search-order-for-unpackaged-apps +// +// The DLL DelayLoad hook is only enabled when the compiler is MSVC and at least one of the following is True: +// - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined +// - USE_DML is defined +// +#if defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY) +#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 1 +#else +#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 0 +#endif +#if defined(USE_DML) +#define ORT_DELAY_LOAD_DIRECTML_DLL 1 +#else +#define ORT_DELAY_LOAD_DIRECTML_DLL 0 +#endif +#if defined(_MSC_VER) && (ORT_DELAY_LOAD_WEBGPU_DAWN_DLL || ORT_DELAY_LOAD_DIRECTML_DLL) + +#include +#include +#include +#include + +#include "core/platform/env.h" + +namespace { + +#define DEFINE_KNOWN_DLL(name) {#name ".dll", L#name L".dll"} + +constexpr struct { + const char* str; + const wchar_t* wstr; +} known_dlls[] = { +#if ORT_DELAY_LOAD_WEBGPU_DAWN_DLL + DEFINE_KNOWN_DLL(webgpu_dawn), +#endif +#if ORT_DELAY_LOAD_DIRECTML_DLL + DEFINE_KNOWN_DLL(DirectML), +#endif +}; +} // namespace + +FARPROC WINAPI delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) { + if (dliNotify == dliNotePreLoadLibrary) { + for (size_t i = 0; i < _countof(known_dlls); ++i) { + if (_stricmp(pdli->szDll, known_dlls[i].str) == 0) { + // Try to load the DLL from the same directory as onnxruntime.dll + + // First, get the path to onnxruntime.dll + auto path = onnxruntime::Env::Default().GetRuntimePath(); + if (path.empty()) { + // Failed to get the path to onnxruntime.dll. In this case, we will just return NULL and let the system + // search for the DLL in the default search order. + return NULL; + } + + // Append the name of the DLL. Now `path` is the absolute path to the DLL to load. + path.append(known_dlls[i].wstr); + + // Load the DLL + return FARPROC(LoadLibraryExW(path.c_str(), NULL, + LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR)); + } + } + } + return NULL; +} + +extern "C" const PfnDliHook __pfnDliNotifyHook2 = delay_load_hook; + +#endif diff --git a/onnxruntime/core/dll/dllmain.cc b/onnxruntime/core/dll/dllmain.cc index 2e7bdafd0599f..ac5dcd9c96084 100644 --- a/onnxruntime/core/dll/dllmain.cc +++ b/onnxruntime/core/dll/dllmain.cc @@ -13,7 +13,7 @@ #pragma GCC diagnostic pop #endif -// dllmain.cpp : Defines the entry point for the DLL application. +// dllmain.cc : Defines the entry point for the DLL application. 
BOOL APIENTRY DllMain(HMODULE /*hModule*/, DWORD ul_reason_for_call, LPVOID /*lpReserved*/ diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 406fc1b15effc..b97cf03e3bf59 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -681,7 +681,7 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers context_cache_path, "' exist already."); } - Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList{graph.GetSchemaRegistry()}, graph.DomainToVersionMap(), {}, logger); auto& ep_graph = ep_context_model.MainGraph(); ep_graph.SetDescription(graph.Description()); diff --git a/onnxruntime/core/framework/prepacked_weights.h b/onnxruntime/core/framework/prepacked_weights.h index fbf99b81937ee..9695be1e0554c 100644 --- a/onnxruntime/core/framework/prepacked_weights.h +++ b/onnxruntime/core/framework/prepacked_weights.h @@ -6,7 +6,8 @@ #include #include "core/common/basic_types.h" -#include "core/framework/buffer_deleter.h" +#include "core/common/inlined_containers_fwd.h" +#include "core/framework/allocator.h" #include "core/framework/tensor_shape.h" namespace onnxruntime { @@ -16,11 +17,14 @@ struct PrePackedWeights final { // Hence we hold them in container. It is upto the developer implementing each PrePack() // method to define what gets stored in which position of the container. - std::vector> buffers_; // cache pre-packed buffers associated with the kernel - std::vector buffer_sizes_; // cache sizes of pre-packed buffers (in bytes) + InlinedVector> buffers_; // cache pre-packed buffers associated with the kernel + InlinedVector buffer_sizes_; // cache sizes of pre-packed buffers (in bytes) // Produces a hash of the buffers stored in the given instance of this class HashValue GetHash() const; + + // The function creates a copy with non-owning BufferUniquePtrs. 
+ PrePackedWeights CreateReferringCopy() const; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc index b6d44dd248bdd..7c832a0ac2691 100644 --- a/onnxruntime/core/framework/prepacked_weights_container.cc +++ b/onnxruntime/core/framework/prepacked_weights_container.cc @@ -3,9 +3,21 @@ #include "core/framework/prepacked_weights_container.h" #include "core/framework/allocator_utils.h" +#include "core/graph/graph.h" namespace onnxruntime { +PrePackedWeights PrePackedWeights::CreateReferringCopy() const { + PrePackedWeights copy; + for (const auto& prepacked_buffer : buffers_) { + // No deleter is needed as the buffer is not owned by the unique_ptr + copy.buffers_.emplace_back(prepacked_buffer.get(), [](void*) {}); + } + + copy.buffer_sizes_ = buffer_sizes_; + return copy; +} + AllocatorPtr PrepackedWeightsContainer::GetOrCreateAllocator(const std::string& device_name) { auto iter = allocators_.find(device_name); @@ -49,4 +61,50 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const { return prepacked_weights_map_.size(); } +void PrepackedWeightsForGraph::InsertPrepackedWeights(const std::string& key, PrePackedWeights&& packed_weight) { + // We may have duplicate entries mapped from disk if the same weight is pre-packed from subgraphs and + // up the tree by the same kernel with the same result. The map prevents this from happening. + key_to_blobs_.emplace(key, std::move(packed_weight)); +} + +void PrepackedWeightsForGraph::WritePackedMaybeForSave(const std::string& weight_name, const std::string& key, + PrePackedWeights&& packed_weight) { + key_to_blobs_.insert_or_assign(key, std::move(packed_weight)); + + if (save_mode_on_) { + weight_prepacks_for_saving_[weight_name].insert(key); + } +} + +const PrePackedWeights* PrepackedWeightsForGraph::GetPrepackedWeights(const std::string& key) const { + auto it = key_to_blobs_.find(key); + if (it == key_to_blobs_.end()) { + return nullptr; + } + return &it->second; +} + +std::optional PrepackedWeightsForGraph::ReplaceWithReferenceIfSaving( + const std::string& weight_name, + const std::string& key, + const PrePackedWeights& refer_to_if_absent) { + auto it = key_to_blobs_.find(key); + if (it == key_to_blobs_.end()) { + if (save_mode_on_) { + key_to_blobs_.emplace(key, refer_to_if_absent.CreateReferringCopy()); + weight_prepacks_for_saving_[weight_name].insert(key); + } + return std::nullopt; + } + + PrePackedWeights result = std::move(it->second); + if (save_mode_on_) { + it->second = result.CreateReferringCopy(); + weight_prepacks_for_saving_[weight_name].insert(key); + } else { + key_to_blobs_.erase(it); + } + return result; +} + } // namespace onnxruntime diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h index 37fc01c05f2ae..f48c790eb4126 100644 --- a/onnxruntime/core/framework/prepacked_weights_container.h +++ b/onnxruntime/core/framework/prepacked_weights_container.h @@ -3,19 +3,26 @@ #pragma once -#include -#include -#include -#include - -#include "core/framework/buffer_deleter.h" - +#include "core/common/common.h" #include "core/framework/allocator.h" -#include #include "prepacked_weights.h" +#include +#include +#include +#include +#include +#include +#include + namespace onnxruntime { +#ifndef SHARED_PROVIDER +class Graph; +#else +struct Graph; +#endif + class PrepackedWeightsContainer final { public: PrepackedWeightsContainer() { @@ -66,4 +73,98 @@ 
class PrepackedWeightsContainer final { std::unordered_map prepacked_weights_map_; }; +// Maps a pre-packed weight blob key to PrepackedWeights instance +using PrepackedKeyToBlobMap = std::unordered_map; + +/// +/// This class has a dual purpose. +/// If saving is OFF (IsSaveModeOn() false), it is used to contain the weights memory mapped from disk. +/// Those weights are then moved to the shared container if weight sharing is enabled. +/// If cross-session weight sharing is not enabled, the weights are stored in this container, +/// and shared with the interested kernels. +/// +/// When saving to disk is ON (IsSaveModeOn() true) +/// It records the pre-packed weights blobs and associates them with the weight name. +/// When saving the model with external initializers, the weights are written to disk along +/// with the pre-packed blobs. +/// +/// +class PrepackedWeightsForGraph { + public: + PrepackedWeightsForGraph(PrepackedKeyToBlobMap& key_blobs, bool save_mode_on_) + : key_to_blobs_(key_blobs), save_mode_on_(save_mode_on_) { + } + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PrepackedWeightsForGraph); + + // WeightToPrePacksMap maps weight name to a set of pre-packed + // keys contained in the KeyToBlobMap + using KeysPerWeight = std::unordered_set; // blob keys + using WeightToPrePacksMap = std::unordered_map; + + void InsertPrepackedWeights(const std::string& key, PrePackedWeights&& packed_weight); + + // Overwrites the existing weights and associates key with weight_name + void WritePackedMaybeForSave(const std::string& weight_name, const std::string& key, + PrePackedWeights&& packed_weight); + + const PrePackedWeights* GetPrepackedWeights(const std::string& key) const; + + // The function would add or replace existing entry with references to it. + // If the entry is present, it would replace it with references to the existing entry. + // If the entry is not present, it would add reference to refer_if_absent + // If the entry is present it would return the existing entry otherwise std::nullopt + // Reference in this context means a non-owning smart pointer. Essentially, this function + // replaces the existing entry with the same entry, but transfers the ownership outside + // the container. 
+ std::optional ReplaceWithReferenceIfSaving(const std::string& weight_name, + const std::string& key, + const PrePackedWeights& refer_to_if_absent); + + bool IsSaveModeOn() const noexcept { + return save_mode_on_; + } + + void SetSaveMode(bool value) noexcept { + save_mode_on_ = value; + } + + const KeysPerWeight* GetKeysForWeightForSaving(const std::string& weight_name) const { + auto hit = weight_prepacks_for_saving_.find(weight_name); + if (hit != weight_prepacks_for_saving_.end()) { + return &hit->second; + } + return nullptr; + } + + size_t GetNumberOfWeightsForWriting() const noexcept { + return weight_prepacks_for_saving_.size(); + } + + size_t GetNumberOfKeyedBlobsForWriting() const noexcept { + size_t result = 0; + for (const auto& [_, keys] : weight_prepacks_for_saving_) { + result += keys.size(); + } + return result; + } + + const WeightToPrePacksMap& GetWeightToPrepack() const noexcept { + return weight_prepacks_for_saving_; + } + + PrepackedKeyToBlobMap& GetKeyToBlob() noexcept { + return key_to_blobs_; + } + + const PrepackedKeyToBlobMap& GetKeyToBlob() const noexcept { + return key_to_blobs_; + } + + private: + PrepackedKeyToBlobMap& key_to_blobs_; + bool save_mode_on_; + WeightToPrePacksMap weight_prepacks_for_saving_; +}; + } // namespace onnxruntime diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 0ac2271ba09f1..d7059bf848e83 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -13,6 +13,7 @@ #include "core/framework/node_index_info.h" #include "core/framework/op_kernel.h" #include "core/framework/ort_value_pattern_planner.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/session_state_utils.h" #include "core/framework/utils.h" #include "core/providers/cpu/controlflow/utils.h" @@ -122,7 +123,9 @@ void SessionState::UpdateAllocatorsWithEnvAllocators(const std::vector& SessionState::GetConstantInitializedTen return constant_initialized_tensors_; } +const PrepackedWeightsForGraph& onnxruntime::SessionState::GetPrepackedIniitializersForGraph() const { + return graph_.GetPrepacked(); +} + #if !defined(DISABLE_SPARSE_TENSORS) bool SessionState::IsSparseInitializer(int ort_value_index) const { return sparse_initialized_tensors_.count(ort_value_index) > 0; @@ -396,8 +403,9 @@ static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type, return ss_1.str(); } -Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap& constant_initializers_use_count, - const std::unordered_map& initializers_to_share_map) { +Status SessionState::PrepackConstantInitializedTensors( + InlinedHashMap& constant_initializers_use_count, + const std::unordered_map& initializers_to_share_map) { auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map]( bool should_cache_prepacked_weights_for_shared_initializers) -> Status { for (auto& node : GetGraphViewer().Nodes()) { @@ -407,6 +415,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapExists()) { const std::string& input_name = input_def->Name(); SessionState* st = this; + auto* prepacked_for_graph = &graph_.GetPrepacked(); // subgraph can use the value from outer scope, // so it needs to check if current node uses constant initialized tensor from current and outer graphs do { @@ -423,7 +432,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapGetOrCreateAllocator(CPU); 
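For orientation, the per-graph container introduced above keeps two pieces of state: a key-to-blob map shared with subgraphs, and (only in save mode) a weight-name-to-keys map consulted later when writing blobs next to their initializer. A simplified sketch of that bookkeeping, with `std::string` standing in for a blob and all names hypothetical:

```cpp
#include <cstddef>
#include <string>
#include <unordered_map>
#include <unordered_set>

// Illustrative only: std::string stands in for a pre-packed blob.
class PrepackBook {
 public:
  PrepackBook(std::unordered_map<std::string, std::string>& key_to_blob, bool save_mode)
      : key_to_blob_(key_to_blob), save_mode_(save_mode) {}

  // Insert a blob loaded from disk; duplicates coming from subgraphs are ignored.
  void Insert(const std::string& key, std::string blob) {
    key_to_blob_.emplace(key, std::move(blob));
  }

  // Overwrite and, in save mode, remember which weight the key belongs to.
  void WriteMaybeForSave(const std::string& weight, const std::string& key, std::string blob) {
    key_to_blob_.insert_or_assign(key, std::move(blob));
    if (save_mode_) weight_to_keys_[weight].insert(key);
  }

  const std::unordered_set<std::string>* KeysForWeight(const std::string& weight) const {
    auto it = weight_to_keys_.find(weight);
    return it == weight_to_keys_.end() ? nullptr : &it->second;
  }

  size_t NumKeyedBlobsForWriting() const {
    size_t n = 0;
    for (const auto& kv : weight_to_keys_) n += kv.second.size();
    return n;
  }

 private:
  std::unordered_map<std::string, std::string>& key_to_blob_;  // shared across subgraphs
  bool save_mode_;
  std::unordered_map<std::string, std::unordered_set<std::string>> weight_to_keys_;
};
```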
ORT_ENFORCE(allocator_for_caching.get() != nullptr); @@ -431,16 +441,19 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapPrePack(const_initialized_tensor, input_idx, allocator_for_caching, is_packed, &weights_to_be_filled_in)); if (is_packed) { - // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight to be cached if the weight was pre-packed - ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0, "The kernel corresponding to the node ", node.Name(), + // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight + // to be cached if the weight was pre-packed + ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0, + "The kernel corresponding to the node ", node.Name(), " doesn't have an implementation that can cache computed pre-packed weights"); const auto& op_type = node.OpType(); @@ -452,40 +465,117 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapHasWeight(prepacked_weights_container_key); + bool container_contains_packed_weight = prepacked_weights_container_->HasWeight( + prepacked_weights_container_key); if (container_contains_packed_weight) { - LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: " << input_name - << " used in the node: " << node.Name() << " which is of op type: " << node.OpType(); + LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: " + << input_name + << " used in the node: " << node.Name() << " which is of op type: " + << node.OpType(); + const auto& prepacked_shared = prepacked_weights_container_->GetWeight( + prepacked_weights_container_key); ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, - prepacked_weights_container_->GetWeight(prepacked_weights_container_key), + prepacked_shared, node.Name())); ++used_shared_pre_packed_weights_counter_; - } else { // container doesn't contain the pre-packed weight - so write into it for sharing across kernel instances - if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key, std::move(weights_to_be_filled_in))) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to write the provided PrePackedWeights instance into the container"); + // Write references to what is stored in the shared container + // and release memory mapped entries this container may have loaded from disk + std::ignore = prepacked_for_graph->ReplaceWithReferenceIfSaving(input_name, + prepacked_weights_container_key, + prepacked_shared); + + } else { + // container doesn't contain the pre-packed weight - so write into it for sharing across + // kernel instances + + // Check if we loaded it from disk, then put it into the shared container so + // everybody can share the same memory mapped entry + // the shared container takes ownership of the memory mapped entries + + // The next line replaces the existing entry with references to it + // and returns the container that holds the memory mapped entries + // so we can transfer it to shared container. 
+ // if there is not an entry, we replace it with references to weights_to_be_filled_in + // in saving mode and return std::nullopt + auto prepacked_from_disk = prepacked_for_graph->ReplaceWithReferenceIfSaving( + input_name, + prepacked_weights_container_key, + weights_to_be_filled_in); + + if (prepacked_from_disk.has_value()) { + weights_to_be_filled_in = std::move(*prepacked_from_disk); } + if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key, + std::move(weights_to_be_filled_in))) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, FAIL, + "Unable to write the provided PrePackedWeights instance into the container"); + } + + const auto& shared_prepacked = prepacked_weights_container_->GetWeight( + prepacked_weights_container_key); ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, - prepacked_weights_container_->GetWeight(prepacked_weights_container_key), + shared_prepacked, node.Name())); } } - } else { // caching of pre-packed weights' turned OFF + } else { + // cross session caching of pre-packed weights' turned OFF + // we use serialization container to share weights loaded from disk + // within this session. Or if the weight is not present on disk, + // we store the newly minted pre-packed data. + AllocatorPtr session_cpu_alloc = GetAllocator(kernel->Info().GetDevice(OrtMemType::OrtMemTypeDefault)); - ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, - session_cpu_alloc, // use allocator tied to this session + PrePackedWeights weights_to_be_filled_in; + // The reason we invoke PrePack() before looking into the container for any pre-packed weight + // cached by another instance of the same op_type (for the same constant initializer) is because + // to truly know if we can use a cached pre-packed weight, we would have to compare the cached + // pre-packed weight with the pre-packed weight generated by this instance of the same op_type because + // other static properties of the node like node attributes could play a role in the pre-packed + // weights' contents. + ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, session_cpu_alloc, is_packed, - nullptr // no caching required - )); + &weights_to_be_filled_in)); + + // Some kernels (matmul_nbits and non-CPU related kernels) do not share their pre-packed results + // even though they set is_packed = true so we leave it up to them. + // We can change their behavior if we wish do so in a separate PR + // XXX: Interestingly enough, matmul_nbits does accept shared pre-packs, but does not + // produce them. 
+ if (is_packed && !weights_to_be_filled_in.buffers_.empty()) { + const auto& op_type = node.OpType(); + const std::string prepacked_weights_container_key = GenerateKeyForPrepackedWeightsMap( + op_type, + weights_to_be_filled_in); + + // See if we can use pre-packed data from disk + const auto* weights_to_use = prepacked_for_graph->GetPrepackedWeights( + prepacked_weights_container_key); + + if (weights_to_use == nullptr) { + // In this case pre-packed container owns the data + prepacked_for_graph->WritePackedMaybeForSave(input_name, prepacked_weights_container_key, + std::move(weights_to_be_filled_in)); + weights_to_use = prepacked_for_graph->GetPrepackedWeights(prepacked_weights_container_key); + assert(weights_to_use != nullptr); + } + + ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, + *weights_to_use, + node.Name())); + } } + if (is_packed) { ++number_of_prepacks_counter_; @@ -504,6 +594,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapParent(); + prepacked_for_graph = &st->graph_.GetPrepacked(); } while (st); } input_idx++; @@ -525,7 +616,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap& tensor_inputs) { +static int64_t +CalculateMemoryPatternsKey(const gsl::span& tensor_inputs) { int64_t key = 0; for (const auto& input : tensor_inputs) { for (auto dim : input.Get().Shape().GetDims()) key ^= dim; @@ -1068,9 +1160,12 @@ Status SessionState::CreateSubgraphSessionState() { // Calculate the use count of a constant initialized tensor, including the use in subgraph. // Note: This function doesn't handle the case below: -// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X, which overrides the X from main graph. -// For case like this, the current implementation will calculate the use count as 2, but they could contain completely different values so each should have a use count of 1. -// This is a very rare case. If it happens and X is prepacked, the consequence is that X won't be released and memory usage of X won't be saved. This will be fine. +// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X, +// which overrides the X from main graph. +// For case like this, the current implementation will calculate the use count as 2, but they could contain completely +// different values so each should have a use count of 1. +// This is a very rare case. If it happens and X is prepacked, the consequence is that X won't be released and memory +// usage of X won't be saved. This will be fine. 
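When the cross-session weight container is disabled, the new flow above pre-packs first and then consults the per-graph container, so a blob already memory-mapped from disk (or produced by an earlier kernel instance) wins over the freshly computed one. A reduced sketch of that lookup-or-store-then-share step, with placeholder types rather than the real kernel API:

```cpp
#include <string>
#include <unordered_map>
#include <utility>

// Placeholders for the real types.
struct PackedBlob { std::string bytes; };
using GraphPrepacks = std::unordered_map<std::string, PackedBlob>;

// Returns the blob the kernel should use for this pre-pack key.
const PackedBlob& LookupOrStore(GraphPrepacks& per_graph,
                                const std::string& key,
                                PackedBlob&& freshly_packed) {
  auto it = per_graph.find(key);
  if (it == per_graph.end()) {
    // Nothing on disk / from an earlier kernel instance: keep the new blob.
    it = per_graph.emplace(key, std::move(freshly_packed)).first;
  }
  // Either the memory-mapped blob or the one just stored; the kernel then
  // shares this buffer instead of keeping a private copy.
  return it->second;
}
```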
static void ComputeConstantInitializerUseCount(const Graph& graph, InlinedHashMap& constant_initializers_use_count) { for (const auto& node : graph.Nodes()) { for (const auto* arg : node.InputDefs()) { @@ -1189,7 +1284,30 @@ Status SessionState::FinalizeSessionState(const std::basic_string constant_initializers_use_count; ComputeConstantInitializerUseCount(graph_, constant_initializers_use_count); return FinalizeSessionStateImpl(graph_location, kernel_registry_manager, nullptr, sess_options_, - remove_initializers, constant_initializers_use_count); + remove_initializers, + GetSaveModeForPrepacks(!remove_initializers, saving_ort_format), + constant_initializers_use_count); +} + +bool SessionState::GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format) { + bool save_prepacked_constant_initializers = + sess_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsSavePrePackedConstantInitializers, + "0") == "1"; + + if (save_prepacked_constant_initializers && !saving_model) { + save_prepacked_constant_initializers = false; + LOGS(logger_, WARNING) + << "SavePrePackedConstantInitializers is set to true but the model is not being saved. Ignoring the flag."; + } + + if (save_prepacked_constant_initializers && saving_ort_format) { + save_prepacked_constant_initializers = false; + LOGS(logger_, WARNING) + << "Serializing optimized model in ORT format with external pre-packed constant initializers is not supported." + << " Ignoring the flag."; + } + + return save_prepacked_constant_initializers; } static Status Index(const OrtValueNameIdxMap& ort_value_name_idx_map, @@ -1322,11 +1440,12 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string& constant_initializers_use_count, const InlinedHashMap& outer_scope_node_arg_to_location_map, bool graph_info_already_created) { if (!graph_info_already_created) { - CreateGraphInfo(); + CreateGraphInfo(save_prepacked_initializers); } #if defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1475,21 +1594,20 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string Status { - ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse)); - if (remove_initializers) { - graph_.RemoveInitializedTensor(name); - } - return Status::OK(); - }, - logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options, - memory_profile_func, name_to_buffered_tensor_)); + ORT_RETURN_IF_ERROR(session_state_utils::SaveInitializedTensors( + Env::Default(), graph_location, *graph_viewer_, + GetAllocator(OrtDevice()), + ort_value_name_idx_map_, initializer_allocation_order, *tensor_allocator, + [this, remove_initializers](const std::string& name, int idx, const OrtValue& value, const OrtCallback& d, + bool constant, bool sparse) -> Status { + ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse)); + if (remove_initializers) { + graph_.RemoveInitializedTensor(name); + } + return Status::OK(); + }, + logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options, + memory_profile_func, name_to_buffered_tensor_, graph_.GetPrepacked())); #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) // Record Weight allocation info on device @@ -1537,15 +1655,17 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string subgraph_outer_scope_node_arg_to_location_map; ORT_RETURN_IF_ERROR(OuterScopeNodeArgLocationAccumulator(*p_seq_exec_plan_, GetOrtValueNameIdxMap(), node, subgraph_session_state.GetGraphViewer(), 
subgraph_outer_scope_node_arg_to_location_map)); + ORT_RETURN_IF_ERROR(subgraph_session_state.FinalizeSessionStateImpl( graph_location, kernel_registry_manager, &node, subgraph_session_options, remove_initializers, + save_prepacked_initializers, constant_initializers_use_count, subgraph_outer_scope_node_arg_to_location_map, true)); // setup all the info for handling the feeds and fetches used in subgraph execution diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index e1674ba4b690b..82f520f4a4252 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -164,6 +164,8 @@ class SessionState { */ const std::unordered_map& GetConstantInitializedTensors() const; + const PrepackedWeightsForGraph& GetPrepackedIniitializersForGraph() const; + #if !defined(DISABLE_SPARSE_TENSORS) bool IsSparseInitializer(int ort_value_index) const; #endif @@ -364,11 +366,20 @@ class SessionState { const SessionOptions& GetSessionOptions() const { return sess_options_; } + /// + /// Deduce the flag whether we need to enable or disable + /// saving for pre-packed weights serialization. + /// + /// + /// + /// true of false + bool GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format); + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState); // Populate OrtValueNameIdxMap and create the graph viewer. - void CreateGraphInfo(); + void CreateGraphInfo(bool save_prepacked_on); // create kernels using info in kernel_create_info_map_ Status CreateKernels(const KernelRegistryManager& custom_registry_manager); @@ -399,6 +410,7 @@ class SessionState { _In_opt_ const Node* parent_node, const SessionOptions& session_options, bool remove_initializers, + bool save_prepacked_initializers, InlinedHashMap& constant_initializers_use_count, const InlinedHashMap& outer_scope_node_arg_to_location_map = {}, bool graph_info_already_created = false); diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index 2c74805c57dce..83a353615bc35 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -68,18 +68,19 @@ struct ExtDataValueDeleter { // buffered_tensor is not null, buffered_tensor holds the real buffer pointed // by tensor_proto. buffered_tensor must be the owner of the buffer and deleter // should release the buffer when tensor_proto is released. 
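Related to the GetSaveModeForPrepacks() change above: the flag comes from a session config entry and is demoted with a warning when the optimized model is not actually being written, or is written in ORT format. A sketch of that decision logic under assumptions; the config-key string and the logging call here are placeholders, not the real ORT names:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Placeholder config lookup; the real key string lives in the ORT session options headers.
bool GetSaveModeForPrepacks(const std::unordered_map<std::string, std::string>& config,
                            bool saving_model, bool saving_ort_format) {
  auto it = config.find("save_prepacked_constant_initializers");  // hypothetical key name
  bool save = it != config.end() && it->second == "1";

  if (save && !saving_model) {
    std::cerr << "SavePrePackedConstantInitializers set but the model is not being saved; ignoring.\n";
    save = false;
  }
  if (save && saving_ort_format) {
    std::cerr << "Pre-packed external initializers are not supported with ORT format; ignoring.\n";
    save = false;
  }
  return save;
}
```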
-static inline common::Status ExtDataTensorProtoToTensor(const Env& env, - const std::basic_string& proto_path, - const ONNX_NAMESPACE::TensorProto& tensor_proto, - Tensor& tensor, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor = nullptr) { +static common::Status ExtDataTensorProtoToTensor(const Env& env, + const std::basic_string& proto_path, + const ONNX_NAMESPACE::TensorProto& tensor_proto, + Tensor& tensor, OrtCallback& ext_data_deleter, + PrepackedWeightsForGraph& prepacked_for_graph, + Tensor* buffered_tensor = nullptr) { ORT_ENFORCE(utils::HasExternalData(tensor_proto)); void* ext_data_buf = nullptr; SafeInt ext_data_len = 0; ORT_RETURN_IF_ERROR(utils::GetExtDataFromTensorProto(env, proto_path.c_str(), tensor_proto, ext_data_buf, ext_data_len, ext_data_deleter, - buffered_tensor)); + buffered_tensor, &prepacked_for_graph)); // NB: creating a do-nothing allocator per tensor is wasteful; can perhaps be // avoided if the Tensor class implements the do-nothing behavior when given a @@ -100,6 +101,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st const AllocatorPtr& alloc, const AllocatorPtr& default_cpu_alloc, OrtValue& ort_value, const DataTransferManager& data_transfer_mgr, const ExternalDataLoaderManager& external_data_loader_mgr, + PrepackedWeightsForGraph& prepacked_for_graph, bool use_device_allocator_for_initializers = false, Tensor* buffered_tensor = nullptr) { if (bool(alloc) == (m != nullptr)) { @@ -127,8 +129,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st ORT_RETURN_IF_ERROR(utils::LoadExtDataToTensorFromTensorProto(env, proto_path, tensor_proto, *external_data_loader, *p_tensor)); - auto ml_tensor = DataTypeImpl::GetType(); - ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc()); + Tensor::InitOrtValue(std::move(*p_tensor), ort_value); return common::Status::OK(); } else if (device_type == OrtDevice::CPU) { // for external initializer on CPU we will use mmap for large initializers so don't need to allocate memory in advance @@ -139,7 +140,8 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st // TensorProtoToTensor it would copy the data, causing unnecessary overhead OrtCallback ext_data_deleter; ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_tensor, - ext_data_deleter, buffered_tensor)); + ext_data_deleter, prepacked_for_graph, + buffered_tensor)); ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()}; MLDataType ml_tensor_type = DataTypeImpl::GetType(); @@ -163,8 +165,9 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st OrtCallback ext_data_deleter; std::optional scoped_ort_callback_invoker; ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor, - ext_data_deleter, buffered_tensor)); - scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter); + ext_data_deleter, prepacked_for_graph, + buffered_tensor)); + scoped_ort_callback_invoker.emplace(ext_data_deleter); // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation. 
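A small detail in the hunk above: the ScopedOrtCallbackInvoker held in a std::optional is now constructed with emplace() rather than assigned from a temporary, which avoids requiring the guard type to be movable. A minimal illustration with a hypothetical RAII guard:

```cpp
#include <functional>
#include <optional>
#include <utility>

// Hypothetical RAII guard that runs a callback when destroyed.
class ScopedInvoker {
 public:
  explicit ScopedInvoker(std::function<void()> cb) : cb_(std::move(cb)) {}
  ScopedInvoker(const ScopedInvoker&) = delete;
  ScopedInvoker& operator=(const ScopedInvoker&) = delete;
  ~ScopedInvoker() { if (cb_) cb_(); }
 private:
  std::function<void()> cb_;
};

void Example(bool need_cleanup, std::function<void()> deleter) {
  std::optional<ScopedInvoker> guard;
  if (need_cleanup) {
    // emplace() constructs in place; assignment would require a movable type.
    guard.emplace(std::move(deleter));
  }
  // ... use the externally owned buffer; the deleter runs when 'guard' goes out of scope.
}
```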
return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value); @@ -272,13 +275,14 @@ common::Status SaveInitializedTensors( const ExecutionPlanBase& exec_plan, const SessionOptions& session_options, const MemoryProfileFunction& memory_profile_func, - std::unordered_map>& buffered_tensors) { + std::unordered_map>& buffered_tensors, + PrepackedWeightsForGraph& prepacked_for_graph) { LOGS(logger, INFO) << "Saving initialized tensors."; ORT_ENFORCE(ort_value_name_idx_map.MaxIdx() > -1, "OrtValue indexes should have been populated."); // Determine if an intializer was supplied by the user for the purpose of sharing and if it requires a cross-device // copy. In case a cross-device copy is required, sharing cannot be accomplished since we allocate our own buffer - // for the destn device which cannot be shared between sessions. + // for the destination device which cannot be shared between sessions. auto use_user_supplied_initializer = [&session_options, &exec_plan, &logger, &ort_value_name_idx_map](const std::string& name) -> bool { bool retval = false; @@ -401,6 +405,7 @@ common::Status SaveInitializedTensors( Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, (m.has_value()) ? &*m : nullptr, alloc, default_cpu_alloc, ort_value, data_transfer_mgr, external_data_loader_mgr, + prepacked_for_graph, use_device_allocator_for_initializers, p_tensor); if (!st.IsOK()) { std::ostringstream oss; diff --git a/onnxruntime/core/framework/session_state_utils.h b/onnxruntime/core/framework/session_state_utils.h index af27f5caba0f4..17400c45e5f32 100644 --- a/onnxruntime/core/framework/session_state_utils.h +++ b/onnxruntime/core/framework/session_state_utils.h @@ -9,6 +9,7 @@ #include "core/common/const_pointer_container.h" #include "core/framework/allocator.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensor.h" #include "core/framework/tensor_allocator.h" #include "core/framework/session_options.h" @@ -50,7 +51,8 @@ common::Status SaveInitializedTensors( const ExecutionPlanBase& exec_plan, const SessionOptions& session_options, const MemoryProfileFunction& memory_profile_func, - std::unordered_map>& buffered_tensors); + std::unordered_map>& buffered_tensors, + PrepackedWeightsForGraph& prepacked_for_graph); common::Status AllocateTensor( const onnxruntime::MemBuffer* m, diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc index 93146e66d9f24..ec8b25e9f4afe 100644 --- a/onnxruntime/core/framework/tensor_external_data_info.cc +++ b/onnxruntime/core/framework/tensor_external_data_info.cc @@ -3,8 +3,13 @@ #include "tensor_external_data_info.h" #include "core/common/common.h" +#include "core/common/narrow.h" +#include "core/common/safeint.h" +#include "core/common/string_utils.h" #include "core/platform/path_lib.h" +#include + #ifdef _WIN32 #include #endif @@ -14,8 +19,24 @@ using ::ONNX_NAMESPACE::StringStringEntryProto; namespace onnxruntime { Status ExternalDataInfo::Create(const RepeatedPtrField& input, std::unique_ptr& out) { + auto str_to_int = [](const std::string& s, OFFSET_TYPE& result) -> Status { + char* end; +#ifdef _WIN32 + result = _strtoi64(s.c_str(), &end, 10); +#else + result = OrtStrToPtrDiff(s.c_str(), &end); +#endif + if (end != s.c_str() + s.length()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", s, " failed"); + } + return Status::OK(); + }; + out = std::make_unique(); + PrepackedInfos prepacked_infos; + const int 
input_size = input.size(); + for (int i = 0; i != input_size; ++i) { StringStringEntryProto stringmap = input[i]; if (!stringmap.has_key()) @@ -25,28 +46,112 @@ Status ExternalDataInfo::Create(const RepeatedPtrField& if (stringmap.key() == "location" && !stringmap.value().empty()) { out->rel_path_ = ToWideString(stringmap.value()); } else if (stringmap.key() == "offset" && !stringmap.value().empty()) { - char* end; -#ifdef _WIN32 - out->offset_ = _strtoi64(stringmap.value().c_str(), &end, 10); -#else - out->offset_ = OrtStrToPtrDiff(stringmap.value().c_str(), &end); -#endif - if (end != stringmap.value().c_str() + stringmap.value().length()) - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed"); + ORT_RETURN_IF_ERROR(str_to_int(stringmap.value(), out->offset_)); } else if (stringmap.key() == "length" && !stringmap.value().empty()) { char* end; - out->length_ = static_cast(OrtStrToPtrDiff(stringmap.value().c_str(), &end)); + out->length_ = narrow(OrtStrToPtrDiff(stringmap.value().c_str(), &end)); if (end != stringmap.value().c_str() + stringmap.value().length()) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed"); } else if (stringmap.key() == "checksum" && !stringmap.value().empty()) { out->checksum_ = stringmap.value(); + } else if (stringmap.key().find("prepacked", 0) == 0) { + // Starts with 'prepacked', each has its own key. + // Each prepacked entry may have multiple blobs with the same key + // we output them with the same key + // format = key|offset;length;checksum[|offset;length;checksum] + // We are ignoring invalid entries (should not be any), and rely + // on in memory pre-packs regenerated in this case. + // users can over-write this file with the correct pre-packed info. + const std::string& prepacked = stringmap.value(); + if (!prepacked.empty()) { + auto split_fields = utils::SplitString(prepacked, "|", false); + if (split_fields.size() > 1) { + const std::string key{split_fields[0]}; + auto& blob_infos = prepacked_infos[key]; + for (size_t f = 1; f < split_fields.size(); ++f) { + const auto& blob = split_fields[f]; + auto blob_fields = utils::SplitString(blob, ";", false); + if (blob_fields.size() == 3) { + OFFSET_TYPE offset, len; + ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[0]), offset)); + ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[1]), len)); + blob_infos.push_back(std::make_tuple(offset, narrow(len), std::string(blob_fields[2]))); + } + } + if (blob_infos.empty()) { + prepacked_infos.erase(key); + } + } + } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error!"); } } + if (out->rel_path_.empty()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error! 
Missing 'location'"); } + + if (!prepacked_infos.empty()) { + out->prepacked_infos_ = std::move(prepacked_infos); + } + return Status::OK(); } +void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& external_file_path, + int64_t external_offset, size_t tensor_bytes_size, + ::ONNX_NAMESPACE::TensorProto& proto) { + proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + + auto* location = proto.add_external_data(); + location->set_key("location"); + location->set_value(ToUTF8String(external_file_path.native())); + + auto* offset = proto.add_external_data(); + offset->set_key("offset"); + offset->set_value(std::to_string(external_offset)); + + auto* length = proto.add_external_data(); + length->set_key("length"); + length->set_value(std::to_string(tensor_bytes_size)); +} + +std::ostream& ExternalDataInfo::WritePrepackedToFileAndAddToProto( + const PrepackedWeightsForGraph& prepacked_for_graph, + const InlinedHashSet& blob_keys, bool align, + int64_t align_threshold, int64_t allocation_granularity, + std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto) { + size_t key_count = 0; + for (const auto& key : blob_keys) { + size_t prepack_count = 0; + const auto* prepacked_weights = prepacked_for_graph.GetPrepackedWeights(key); + ORT_ENFORCE(prepacked_weights != nullptr, "Prepacked weights not found for key ", key); + std::stringstream prepacked_entry; + prepacked_entry << key << "|"; + for (size_t i = 0, size = prepacked_weights->buffers_.size(); i < size; ++i) { + const auto size_in_bytes = prepacked_weights->buffer_sizes_[i]; + if (align && static_cast(size_in_bytes) > align_threshold) { + // return early on error + if (!AlignAndPad(os, allocation_granularity, external_offset)) { + return os; + } + } + if (prepack_count++ > 0) { + prepacked_entry << "|"; + } + // Checksum is currently not validated + prepacked_entry << external_offset << ";" << size_in_bytes << ";0"; + if (!os.write(reinterpret_cast(prepacked_weights->buffers_[i].get()), size_in_bytes)) { + return os; + } + external_offset = SafeInt(external_offset) + size_in_bytes; + } + auto* prepacked = proto.add_external_data(); + std::string prepacked_key("prepacked_"); + prepacked_key.append(std::to_string(key_count++)); + prepacked->set_key(std::move(prepacked_key)); + prepacked->set_value(prepacked_entry.str()); + } + return os; +} } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h index afc8fda6c3037..1b185b8c5da7d 100644 --- a/onnxruntime/core/framework/tensor_external_data_info.h +++ b/onnxruntime/core/framework/tensor_external_data_info.h @@ -2,12 +2,21 @@ // Licensed under the MIT License. 
#pragma once +#include +#include +#include #include +#include + +#include +#include "core/common/path_string.h" +#include "core/common/safeint.h" #include "core/common/status.h" +#include "core/framework/prepacked_weights_container.h" #include "core/graph/onnx_protobuf.h" -#include "core/session/onnxruntime_c_api.h" namespace onnxruntime { + class ExternalDataInfo { public: #ifdef _WIN32 @@ -16,7 +25,7 @@ class ExternalDataInfo { using OFFSET_TYPE = off_t; #endif - const std::basic_string& GetRelPath() const { return rel_path_; } + const PathString& GetRelPath() const { return rel_path_; } OFFSET_TYPE GetOffset() const { return offset_; } size_t GetLength() const { return length_; } @@ -29,12 +38,58 @@ class ExternalDataInfo { const ::google::protobuf::RepeatedPtrField<::ONNX_NAMESPACE::StringStringEntryProto>& input, std::unique_ptr& out); + static void SetExternalLocationToProto(const std::filesystem::path& external_file_path, + int64_t offset, + size_t tensor_bytes_size, + ::ONNX_NAMESPACE::TensorProto& proto); + + // Pads the output with zeros according to the specified allocation_granularity + // It updates external_offset for alignment. + // need to do padding before write actual tensor data as we do offset alignment at the begin of + // large tensors (offset need to be page aligned and allocation granularity aligned) like below: + // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX + // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| + static std::ostream& AlignAndPad(std::ostream& stream, int64_t allocation_granularity, int64_t& external_offset) { + // Align to the larger of the page size or the allocation granularity + int64_t alignment_factor = std::max(static_cast(4096), allocation_granularity); + // Align to the next page or alloc granularity boundary + SafeInt safe_external_offset = external_offset; + int64_t new_external_offset = ((safe_external_offset + alignment_factor - 1) / alignment_factor) * + alignment_factor; + + // padding tensor with zeros for alignment + for (int64_t index = external_offset; index != new_external_offset; ++index) { + stream << '\0'; + } + external_offset = new_external_offset; + return stream; + } + + static std::ostream& WritePrepackedToFileAndAddToProto( + const PrepackedWeightsForGraph& prepacked_for_graph, + const InlinedHashSet& blob_keys, + bool align, int64_t align_threshold, int64_t allocation_granularity, + std::ostream& os, + int64_t& external_offset, + ::ONNX_NAMESPACE::TensorProto& proto); + + using PrepackedInfo = std::tuple; + using PrepackedInfos = std::unordered_map>; + + bool HasPrepackedInfo() const noexcept { return !prepacked_infos_.empty(); } + + PrepackedInfos&& TakePrepackedInfos() { return std::move(prepacked_infos_); } + private: - std::basic_string rel_path_; + PathString rel_path_; OFFSET_TYPE offset_ = 0; // 0 means the whole file size_t length_ = 0; std::string checksum_; + + // Pre-packed blobs found associated with this TensorProto if present + // format key, offset, length, checksum + PrepackedInfos prepacked_infos_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 2af9f95ad059e..097ce436f4419 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -234,7 +234,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, const 
std::filesystem::path& tensor_proto_dir, std::basic_string& external_file_path, onnxruntime::FileOffsetType& file_offset, - SafeInt& tensor_byte_size) { + SafeInt& tensor_byte_size, + ExternalDataInfo::PrepackedInfos* prepacked_infos) { ORT_RETURN_IF_NOT(onnxruntime::utils::HasExternalData(tensor_proto), "Tensor does not have external data to read from."); @@ -258,6 +259,10 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, file_offset = external_data_info->GetOffset(); + if (prepacked_infos != nullptr && external_data_info->HasPrepackedInfo()) { + *prepacked_infos = external_data_info->TakePrepackedInfos(); + } + return Status::OK(); } @@ -988,7 +993,8 @@ static Status GetFileContent(const Env& env, const std::filesystem::path& file_p Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& model_path, const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor) { + Tensor* buffered_tensor, + PrepackedWeightsForGraph* prepacked_info) { ORT_ENFORCE(utils::HasExternalData(tensor_proto)); std::basic_string tensor_proto_dir; if (!model_path.empty()) { @@ -997,8 +1003,13 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo std::basic_string external_data_file_path; FileOffsetType file_offset; SafeInt raw_data_safe_len = 0; + std::optional prepacked_infos; + if (prepacked_info != nullptr) { + prepacked_infos.emplace(); + } ORT_RETURN_IF_ERROR( - GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, raw_data_safe_len)); + GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, + raw_data_safe_len, (prepacked_info != nullptr) ? 
&*prepacked_infos : nullptr)); if (external_data_file_path == onnxruntime::utils::kTensorProtoMemoryAddressTag) { // the value in location is the memory address of the data @@ -1042,6 +1053,33 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), file_offset, raw_data_safe_len, ext_data_buf, ext_data_deleter)); ext_data_len = raw_data_safe_len; + + if (prepacked_info != nullptr && !prepacked_infos->empty()) { + for (const auto& [key, blobs] : *prepacked_infos) { + PrePackedWeights prepacked_weights; + prepacked_weights.buffers_.reserve(blobs.size()); + prepacked_weights.buffer_sizes_.reserve(blobs.size()); + for (const auto& blob : blobs) { + const auto blob_offset = std::get<0>(blob); + const auto blob_length = std::get<1>(blob); + SafeInt end_of_blob{blob_offset}; + end_of_blob += blob_length; + ORT_RETURN_IF(blob_offset < 0 || static_cast(end_of_blob) > file_length, + "Pre-packed blob: ", key, " offset: ", blob_offset, " file_length: ", file_length, + " is out of bounds and can not read in full"); + void* data_ptr; + OrtCallback data_deleter; + ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), blob_offset, blob_length, + data_ptr, data_deleter)); + IAllocatorUniquePtr data_ptr_unique{data_ptr, OrtCallbackInvoker(data_deleter)}; + prepacked_weights.buffers_.push_back(std::move(data_ptr_unique)); + prepacked_weights.buffer_sizes_.push_back(blob_length); + } + if (!blobs.empty()) { + prepacked_info->InsertPrepackedWeights(key, std::move(prepacked_weights)); + } + } + } #endif } diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index 262f7adaca1cb..7b9a47842388c 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -3,20 +3,21 @@ #pragma once -#include -#include -#include #include +#include +#include +#include #ifndef SHARED_PROVIDER #include "core/common/common.h" #include "core/common/status.h" #include "core/common/safeint.h" -#include "core/framework/endian_utils.h" #include "core/framework/allocator.h" +#include "core/framework/endian_utils.h" #include "core/framework/external_data_loader.h" -#include "core/framework/ort_value.h" #include "core/framework/mem_buffer.h" +#include "core/framework/ort_value.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensor_external_data_info.h" #include "core/graph/onnx_protobuf.h" #include "core/platform/env.h" @@ -36,7 +37,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, const std::filesystem::path& tensor_proto_dir, std::basic_string& external_file_path, onnxruntime::FileOffsetType& file_offset, - SafeInt& tensor_byte_size); + SafeInt& tensor_byte_size, + ExternalDataInfo::PrepackedInfos* prepacked_infos = nullptr); /** * This function is used to convert the endianess of Tensor data. * Mostly, will be used in big endian system to support the model file @@ -172,7 +174,8 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem:: const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor = nullptr); + Tensor* buffered_tensor = nullptr, + PrepackedWeightsForGraph* prepacked_for_graph = nullptr); // Given a tensor proto with external data obtain a tensor using the specified custom external data loader. 
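The blob re-load path above rejects any offset/length pair falling outside the external file. A tiny sketch of an overflow-safe version of that bounds check (a stand-in for the SafeInt-based code in the diff):

```cpp
#include <cstdint>

// True if [offset, offset + length) lies within a file of file_length bytes.
// Written so that offset + length cannot overflow a signed 64-bit value.
bool BlobInBounds(int64_t offset, int64_t length, int64_t file_length) {
  if (offset < 0 || length < 0 || file_length < 0) return false;
  return length <= file_length && offset <= file_length - length;
}
```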
common::Status LoadExtDataToTensorFromTensorProto(const Env& env, const std::filesystem::path& model_path, diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index c7a0793c4748f..d78fe7111c9be 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3371,7 +3371,8 @@ void RegisterContribSchemas() { "tensor(uint64)", "tensor(float16)", "tensor(float)", - "tensor(double)"}, + "tensor(double)", + "tensor(bfloat16)"}, "Constrain input and output types."); static const char* BitmaskDropout_ver1_doc = R"DOC( diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index e8a5855b36496..0b6610db5e007 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -18,6 +18,7 @@ #include "core/flatbuffers/flatbuffers_utils.h" #include "core/flatbuffers/schema/ort.fbs.h" #include "core/framework/tensor_shape.h" +#include "core/framework/tensor_external_data_info.h" #include "core/framework/tensorprotoutils.h" #include "core/framework/utils.h" #include "core/graph/graph_flatbuffers_utils.h" @@ -25,6 +26,7 @@ #include "core/graph/indexed_sub_graph.h" #include "core/graph/model.h" #include "core/graph/model_load_utils.h" +#include "core/graph/model_saving_options.h" #include "core/graph/node_attr_utils.h" #include "core/graph/op.h" #include "core/graph/runtime_optimization_record_container.h" @@ -1543,6 +1545,17 @@ Status Graph::VerifyNoDuplicateName() { #endif // !defined(ORT_MINIMAL_BUILD) +void Graph::ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on) { + if (parent_graph_ == nullptr) { + prepacked_key_to_blobs_.emplace(); + prepacked_weights_for_graph_.emplace(*prepacked_key_to_blobs_, saving_mode_on); + } else { + // Subgraph + prepacked_weights_for_graph_.emplace(parent_graph_->prepacked_weights_for_graph_->GetKeyToBlob(), + saving_mode_on); + } +} + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) void Graph::AddEdge(NodeIndex src_node_index, NodeIndex dst_node_index, int src_arg_slot, int dst_arg_slot) { if (nodes_.size() <= src_node_index || src_arg_slot < 0 || nodes_.size() <= dst_node_index || dst_arg_slot < 0 || @@ -4084,82 +4097,103 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const { return result; } -ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, - const std::filesystem::path& model_file_path, - size_t initializer_size_threshold, - const OffsetAlignmentInfo& align_info) const { - GraphProto result; - ToGraphProtoInternal(result); - ORT_ENFORCE(external_file_path.is_relative()); - // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could - // be empty. Else, save external data file in same directory as the model. - const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path; +Status Graph::AddExternalInitializersToGraphProtoImpl( + const std::filesystem::path& model_path, + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_external_file_path, + const ModelSavingOptions& model_saving_options, + ONNX_NAMESPACE::GraphProto& output_graph_proto, + std::ostream& external_stream, + int64_t& external_offset) const { + // Process initializers in a subgraph, check their size and + // write to an external file. 
This function also saves pre-packed + // blobs for the initializer being saved to disk, if the initializer has any pre-packs. + // This function is invoked by ToGraphProtoWithExternalInitiallizers() and processes subgraphs + // bottom up. + for (const auto& node : Nodes()) { + if (node.ContainsSubgraph()) { + // Let find this node in the output_graph_proto + auto hit = std::find_if(output_graph_proto.mutable_node()->begin(), + output_graph_proto.mutable_node()->end(), + [&node](const ONNX_NAMESPACE::NodeProto& proto) { + return proto.name() == node.Name(); + }); + ORT_RETURN_IF_NOT(hit != output_graph_proto.mutable_node()->end(), "Node ", node.Name(), + " not found in output_graph_proto"); + auto& result_node = *hit; + for (const auto& e : node.GetAttributeNameToSubgraphMap()) { + const auto& name = e.first; + const auto& subgraph = e.second; + // Lets find this subgraph in the result_node + auto sub_hit = std::find_if(result_node.mutable_attribute()->begin(), + result_node.mutable_attribute()->end(), + [&name](const ONNX_NAMESPACE::AttributeProto& proto) { + return proto.name() == name; + }); + ORT_RETURN_IF_NOT(sub_hit != result_node.mutable_attribute()->end() && utils::HasGraph(*sub_hit), + "Subgraph ", name, " is referred to in GetAttributeNameToSubgraphMap, but not found in node ", + node.Name(), " while attempting to recurse into it."); + auto& result_subgraph = *sub_hit->mutable_g(); + ORT_RETURN_IF_ERROR(subgraph->AddExternalInitializersToGraphProtoImpl( + model_path, external_file_path, + model_external_file_path, model_saving_options, + result_subgraph, + external_stream, external_offset)); + } + } + } - std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary); - ORT_ENFORCE(external_stream.is_open()); - int64_t external_offset = 0; + // Used only when pre-packed weights are serialized + InlinedHashSet processed_weights; + // prepacked_weights_for_graph_ is present only when SessionState is finalized. + const bool process_prepacks = prepacked_weights_for_graph_.has_value() && + prepacked_weights_for_graph_->GetNumberOfWeightsForWriting() > 0; + if (process_prepacks) { + processed_weights.reserve(graph_proto_->initializer_size()); + } // Add the initializers to the result graph. - const auto& model_path = ModelPath(); -#if !defined(DISABLE_SPARSE_TENSORS) - const auto sparse_end = sparse_tensor_names_.end(); -#endif - for (const auto& initializer : graph_proto_->initializer()) { #if !defined(DISABLE_SPARSE_TENSORS) - if (sparse_end != sparse_tensor_names_.find(initializer.name())) { + if (IsSparseInitializer(initializer.name())) { // Sparse tensors are added to the ONNX file. - auto& sparse_initializer = *result.add_sparse_initializer(); + auto& sparse_initializer = *output_graph_proto.add_sparse_initializer(); auto status = utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer); - ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse"); + ORT_RETURN_IF_NOT(status.IsOK(), "Failed to convert dense initializer to sparse"); } else { #endif // Dense tensors larger than the threshold are added to the external file. 
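AddExternalInitializersToGraphProtoImpl() above works bottom-up: for each node that owns subgraphs it finds the matching NodeProto by name in the already-serialized output, then the graph attribute by name, and recurses into it. A structural sketch of that matching step with simplified stand-in types (not protobuf):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Simplified stand-ins for NodeProto / AttributeProto with a graph payload.
struct GraphLike;
struct AttrLike { std::string name; GraphLike* graph = nullptr; };
struct NodeLike { std::string name; std::vector<AttrLike> attributes; };
struct GraphLike { std::vector<NodeLike> nodes; };

// Finds the serialized subgraph corresponding to (node_name, attr_name), or nullptr.
GraphLike* FindSubgraph(GraphLike& out, const std::string& node_name, const std::string& attr_name) {
  auto node_it = std::find_if(out.nodes.begin(), out.nodes.end(),
                              [&](const NodeLike& n) { return n.name == node_name; });
  if (node_it == out.nodes.end()) return nullptr;
  auto attr_it = std::find_if(node_it->attributes.begin(), node_it->attributes.end(),
                              [&](const AttrLike& a) { return a.name == attr_name; });
  return attr_it == node_it->attributes.end() ? nullptr : attr_it->graph;
}
```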
- TensorProto* output_proto = result.add_initializer(); + TensorProto* output_proto = output_graph_proto.add_initializer(); std::vector raw_data; - ORT_THROW_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data)); + ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data)); size_t tensor_bytes_size = raw_data.size(); - if (tensor_bytes_size < initializer_size_threshold) { + if (tensor_bytes_size < model_saving_options.initializer_size_threshold) { *output_proto = initializer; + if (process_prepacks) { + // These pre-packs will reside in memory + processed_weights.insert(initializer.name()); + } continue; } // update external_offset for alignment // need to do padding before write actual tensor data as we do offset alignment at the begin of - // large tensors (offset need to be page aligned and alloction granularity aligned) like below: + // large tensors (offset need to be page aligned and allocation granularity aligned) like below: // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX - // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->| - if (align_info.align_offset && static_cast(tensor_bytes_size) > align_info.align_threshold) { - // Align to the larger of the page size or the allocation granularity - int64_t alignment_factor = std::max(static_cast(4096), align_info.allocation_granularity); - // Align to the next page or alloc granularity boundary - int64_t new_external_offset = static_cast( - std::floor((external_offset + alignment_factor - 1) / alignment_factor)) * - alignment_factor; - - // padding tensor with zeros for alignment - for (int64_t index = external_offset; index != new_external_offset; ++index) { - external_stream << '0'; - } - - external_offset = new_external_offset; + // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| + if (model_saving_options.align_offset && static_cast(tensor_bytes_size) > + model_saving_options.align_threshold) { + ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.allocation_granularity, + external_offset), + "Failed writing external data to: ", model_external_file_path); } - for (size_t index = 0; index != tensor_bytes_size; ++index) { - external_stream << raw_data[index]; - } + ORT_RETURN_IF_NOT(external_stream.write(reinterpret_cast(raw_data.data()), tensor_bytes_size), + "Failed to write external initializers to file: ", model_external_file_path); - output_proto->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); - ONNX_NAMESPACE::StringStringEntryProto* location = output_proto->add_external_data(); - location->set_key("location"); - location->set_value(ToUTF8String(external_file_path.native())); - ONNX_NAMESPACE::StringStringEntryProto* offset = output_proto->add_external_data(); - offset->set_key("offset"); - offset->set_value(std::to_string(external_offset)); - ONNX_NAMESPACE::StringStringEntryProto* length = output_proto->add_external_data(); - length->set_key("length"); - length->set_value(std::to_string(tensor_bytes_size)); + ExternalDataInfo::SetExternalLocationToProto(external_file_path, external_offset, + tensor_bytes_size, *output_proto); output_proto->set_name(initializer.name()); output_proto->set_data_type(initializer.data_type()); @@ -4168,12 +4202,74 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std } 
output_proto->set_doc_string(initializer.doc_string()); - external_offset += tensor_bytes_size; + external_offset = SafeInt(external_offset) + tensor_bytes_size; + + if (process_prepacks) { + // check if this weight was referred to in subgraphs + InlinedHashSet blob_keys_to_external_data; + + // See if this weight has any pre-prepacks referred to in this graph. + const auto* blobs_keys_for_weight = prepacked_weights_for_graph_->GetKeysForWeightForSaving(initializer.name()); + if (blobs_keys_for_weight != nullptr && !blobs_keys_for_weight->empty()) { + // Add all the blob_keys to the set of keys to process + blob_keys_to_external_data.insert(blobs_keys_for_weight->begin(), blobs_keys_for_weight->end()); + } + + if (!blob_keys_to_external_data.empty()) { + auto& os = ExternalDataInfo::WritePrepackedToFileAndAddToProto( + *prepacked_weights_for_graph_, blob_keys_to_external_data, + model_saving_options.align_offset, model_saving_options.align_threshold, + model_saving_options.allocation_granularity, + external_stream, external_offset, *output_proto); + ORT_RETURN_IF_NOT(os.good(), "Failed to write pre-packed blobs to external file"); + } + + processed_weights.insert(initializer.name()); + } + #if !defined(DISABLE_SPARSE_TENSORS) } #endif } + // Check if there are any pre-packed weights this graph refers to, but they have + // not been processed. + if (process_prepacks) { + const auto& sorted_by_weights = prepacked_weights_for_graph_->GetWeightToPrepack(); + for (const auto& [weight_name, blob_keys] : sorted_by_weights) { + ORT_ENFORCE(processed_weights.find(weight_name) != processed_weights.end()); + } + } + + return Status::OK(); +} + +ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers( + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_file_path, + const ModelSavingOptions& model_saving_options) const { + GraphProto result; + ToGraphProtoInternal(result); + ORT_ENFORCE(external_file_path.is_relative()); + // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could + // be empty. Else, save external data file in same directory as the model. + const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path; + const auto& model_path = ModelPath(); + + // Create the external file. 
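The rewritten initializer loop above keeps tensors below the size threshold inline, and for large ones optionally pads the stream to a page/allocation-granularity boundary before appending the raw bytes and recording the external location on the proto. A compact sketch of that sequence with simplified record keeping (threshold handling and the record format are reduced):

```cpp
#include <algorithm>
#include <cstdint>
#include <ostream>
#include <string>
#include <vector>

struct ExtRecord { std::string name; int64_t offset; int64_t length; };

// Pads 'os' with zero bytes so that 'offset' becomes a multiple of the alignment.
bool AlignAndPad(std::ostream& os, int64_t granularity, int64_t& offset) {
  const int64_t align = std::max<int64_t>(4096, granularity);
  const int64_t aligned = ((offset + align - 1) / align) * align;
  for (int64_t i = offset; i < aligned; ++i) os.put('\0');
  offset = aligned;
  return static_cast<bool>(os);
}

// Writes one initializer; tensors under the threshold stay inside the model proto.
bool WriteInitializer(std::ostream& os, const std::string& name,
                      const std::vector<char>& bytes, int64_t threshold,
                      int64_t granularity, int64_t& offset,
                      std::vector<ExtRecord>& records) {
  const int64_t size = static_cast<int64_t>(bytes.size());
  if (size < threshold) return true;  // kept inline, nothing written externally
  if (!AlignAndPad(os, granularity, offset)) return false;
  if (!os.write(bytes.data(), size)) return false;
  records.push_back({name, offset, size});
  offset += size;
  return true;
}
```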
+ std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary); + ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path); + int64_t external_offset = 0; + + ORT_THROW_IF_ERROR(AddExternalInitializersToGraphProtoImpl(model_path, external_file_path, + modified_external_file_path, model_saving_options, + result, + external_stream, external_offset)); + + if (!external_stream.flush()) { + ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path); + } + return result; } diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index 1bae63b510563..be0531e6473fb 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -383,14 +383,12 @@ ModelProto Model::ToProto() const { ModelProto Model::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) const { + const ModelSavingOptions& model_saving_options) const { ModelProto result(model_proto_); const auto& graph = *graph_; *(result.mutable_graph()) = graph.ToGraphProtoWithExternalInitializers(external_file_name, file_path, - initializer_size_threshold, - align_info); + model_saving_options); return result; } @@ -607,16 +605,13 @@ template static Status SaveModelWithExternalInitializers(Model& model, const T& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& save_options) { int fd = 0; Status status = Env::Default().FileOpenWr(file_path, fd); ORT_RETURN_IF_ERROR(status); ORT_TRY { - status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, - initializer_size_threshold, - align_info); + status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, save_options); } ORT_CATCH(const std::exception& ex) { ORT_HANDLE_EXCEPTION([&]() { @@ -646,10 +641,8 @@ Status Model::Load(const PathString& file_path, std::shared_ptr& p_model, Status Model::SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { - return SaveModelWithExternalInitializers(model, file_path, external_file_name, initializer_size_threshold, - align_info); + const ModelSavingOptions& save_options) { + return SaveModelWithExternalInitializers(model, file_path, external_file_name, save_options); } Status Model::LoadFromBytes(int count, const void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { @@ -765,8 +758,7 @@ Status Model::SaveWithExternalInitializers(Model& model, int fd, const std::filesystem::path& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& model_saving_options) { if (fd < 0) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, " is less than 0."); } @@ -774,8 +766,7 @@ Status Model::SaveWithExternalInitializers(Model& model, ORT_RETURN_IF_ERROR(model.MainGraph().Resolve()); auto model_proto = model.ToGraphProtoWithExternalInitializers(external_file_name, file_path, - initializer_size_threshold, - align_info); + model_saving_options); google::protobuf::io::FileOutputStream output(fd); const bool 
result = model_proto.SerializeToZeroCopyStream(&output) && output.Flush(); if (result) { diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 9bcec6f78ca08..2d2086aef41fd 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -20,6 +20,8 @@ namespace onnxruntime { +class PrepackedShareableWeightsContainer; + namespace fbs { struct Model; } // namespace fbs @@ -190,15 +192,7 @@ class Model { // initializer offset could be page aligned and allocation granularity aligned for mmap support. ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) const; - - ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, - const std::filesystem::path& file_path, - size_t initializer_size_threshold) const { - Graph::OffsetAlignmentInfo default_align_info; - return ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold, default_align_info); - } + const ModelSavingOptions& model_saving_options) const; static common::Status Save(Model& model, const PathString& file_path); @@ -209,32 +203,13 @@ class Model { static common::Status SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path, const std::filesystem::path& external_file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info); - - static common::Status SaveWithExternalInitializers(Model& model, - const std::filesystem::path& file_path, - const std::filesystem::path& external_file_path, - size_t initializer_size_threshold) { - Graph::OffsetAlignmentInfo default_align_info; - return SaveWithExternalInitializers(model, file_path, external_file_path, initializer_size_threshold, default_align_info); - } - - static common::Status SaveWithExternalInitializers(Model& model, - int fd, - const std::filesystem::path& file_path, - const std::filesystem::path& external_file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info); + const ModelSavingOptions& save_options); static common::Status SaveWithExternalInitializers(Model& model, int fd, const std::filesystem::path& file_path, const std::filesystem::path& external_file_path, - size_t initializer_size_threshold) { - Graph::OffsetAlignmentInfo default_align_info; - return SaveWithExternalInitializers(model, fd, file_path, external_file_path, initializer_size_threshold, default_align_info); - } + const ModelSavingOptions& save_options); static common::Status Load(std::istream& model_istream, ONNX_NAMESPACE::ModelProto* p_model_proto); diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 76ccd361761a7..ede8d67e64381 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -42,6 +42,8 @@ using ProviderType = const std::string&; class RandomGenerator; class IOnnxRuntimeOpSchemaCollection; +struct ModelSavingOptions; + #ifdef ENABLE_TRAINING_TORCH_INTEROP namespace contrib { class PythonOpBase; @@ -964,7 +966,11 @@ struct ProviderHost { virtual void Model__operator_delete(Model* p) = 0; virtual Graph& Model__MainGraph(Model* p) = 0; virtual std::unique_ptr Model__ToProto(Model* p) = 0; - virtual 
std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) = 0; + virtual std::unique_ptr Model__ToGraphProtoWithExternalInitializers( + Model* p, + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, + const ModelSavingOptions&) = 0; virtual const ModelMetaData& Model__MetaData(const Model* p) const noexcept = 0; virtual Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index b7817e98377eb..6a74221428fc3 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -993,6 +993,8 @@ struct NodeUnit final { void operator=(const NodeUnit& v) = delete; }; +struct ModelSavingOptions; + struct Model final { static std::unique_ptr Create(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { @@ -1007,7 +1009,12 @@ struct Model final { Graph& MainGraph() { return g_host->Model__MainGraph(this); } std::unique_ptr ToProto() { return g_host->Model__ToProto(this); } - std::unique_ptr ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) { return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path, initializer_size_threshold); } + std::unique_ptr ToGraphProtoWithExternalInitializers( + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, const ModelSavingOptions& model_saving_options) { + return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path, + model_saving_options); + } const ModelMetaData& MetaData() const noexcept { return g_host->Model__MetaData(this); } Model() = delete; diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index 191d26f3ab269..e7b39546fda6a 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -9,6 +9,7 @@ #include #include +#include "core/graph/model_saving_options.h" #include "core/providers/shared_library/provider_api.h" #include "./vai_assert.h" @@ -111,7 +112,9 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri if (initializer_size_threshold == std::numeric_limits::max()) { model_proto = model->ToProto(); } else { - model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), initializer_size_threshold); + ModelSavingOptions model_saving_options{initializer_size_threshold}; + model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), + model_saving_options); } auto& metadata = model->MetaData(); if (!metadata.empty()) { diff --git a/onnxruntime/core/providers/webgpu/buffer_manager.cc b/onnxruntime/core/providers/webgpu/buffer_manager.cc index 45eb123943de9..233bb24083289 100644 --- a/onnxruntime/core/providers/webgpu/buffer_manager.cc +++ b/onnxruntime/core/providers/webgpu/buffer_manager.cc @@ -321,8 +321,8 @@ void 
BufferManager::Download(WGPUBuffer src, void* dst, size_t size) { // TODO: revise wait in whole project - ORT_ENFORCE(context_.Wait(staging_buffer.MapAsync(wgpu::MapMode::Read, 0, buffer_size, wgpu::CallbackMode::WaitAnyOnly, [](wgpu::MapAsyncStatus status, const char* message) { - ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", message); + ORT_ENFORCE(context_.Wait(staging_buffer.MapAsync(wgpu::MapMode::Read, 0, buffer_size, wgpu::CallbackMode::WaitAnyOnly, [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", std::string_view{message}); })) == Status::OK()); auto mapped_data = staging_buffer.GetConstMappedRange(); diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_elements.cc b/onnxruntime/core/providers/webgpu/tensor/gather_elements.cc new file mode 100644 index 0000000000000..00d8caf2624a9 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/gather_elements.cc @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/inlined_containers.h" +#include "core/providers/webgpu/tensor/gather_elements.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +namespace onnxruntime { +namespace webgpu { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + GatherElements, + kOnnxDomain, + 11, 12, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedFloatTypes()), + GatherElements); + +ONNX_OPERATOR_KERNEL_EX( + GatherElements, + kOnnxDomain, + 13, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedFloatTypes()), + GatherElements); + +Status GatherElementsProgram::GenerateShaderCode(ShaderHelper& shader) const { + const ShaderVariableHelper& input = shader.AddInput("input", ShaderUsage::UseUniform); + const ShaderVariableHelper& indices = shader.AddInput("indices", ShaderUsage::UseUniform); + const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform); + + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << "let output_indices = " << output.OffsetToIndices("global_idx") << ";\n" + << "var idx = " << indices.GetByOffset("global_idx") << ";\n" + << "if (idx < 0) {\n" + << " idx = idx + uniforms.axis_dim_limit;\n" + << "}\n" + << "var input_indices = output_indices;\n" + << input.IndicesSet("input_indices", "uniforms.axis", "u32(idx)") << ";\n" + << "let value = " << input.GetByIndices("input_indices") << ";\n" + << output.SetByOffset("global_idx", "value") << ";\n"; + + return Status::OK(); +} + +Status GatherElements::ComputeInternal(ComputeContext& context) const { + const auto* input_tensor = context.Input(0); + const TensorShape& input_shape = input_tensor->Shape(); + int64_t input_rank = input_shape.NumDimensions(); + + const auto* indices_tensor = context.Input(1); + const TensorShape& indices_shape = indices_tensor->Shape(); + + // Handle negative axis + int64_t axis = axis_; + if (axis < 0) { + axis += input_rank; + } + + auto axis_dim_limit = input_shape[axis]; + + auto output_dims = indices_shape.AsShapeVector(); + TensorShape output_shape(output_dims); + auto* output_tensor = context.Output(0, output_shape); + int64_t output_size = output_tensor->Shape().Size(); + + if (output_size == 0) { + 
return Status::OK(); + } + + GatherElementsProgram program{}; + program + .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddInputs({{indices_tensor, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddOutputs({output_tensor}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({{static_cast(output_size)}, + {static_cast(axis_dim_limit)}, + {static_cast(axis)}}); + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_elements.h b/onnxruntime/core/providers/webgpu/tensor/gather_elements.h new file mode 100644 index 0000000000000..f70bbda84c933 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/gather_elements.h @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/program.h" + +namespace onnxruntime { +namespace webgpu { + +class GatherElementsProgram final : public Program { + public: + GatherElementsProgram() : Program{"GatherElements"} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}, + {"axis_dim_limit", ProgramUniformVariableDataType::Int32}, + {"axis", ProgramUniformVariableDataType::Int32}); +}; + +class GatherElements final : public WebGpuKernel { + public: + GatherElements(const OpKernelInfo& info) : WebGpuKernel(info) { + axis_ = info.GetAttrOrDefault("axis", 0); + } + + Status ComputeInternal(ComputeContext& context) const override; + + private: + int64_t axis_; +}; + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index d66c2a79d28a8..b2f7748a54743 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -10,6 +10,8 @@ #endif #include "core/common/common.h" +#include "core/common/path_string.h" +#include "core/platform/env.h" #include "core/providers/webgpu/compute_context.h" #include "core/providers/webgpu/webgpu_context.h" @@ -23,37 +25,38 @@ namespace onnxruntime { namespace webgpu { -void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info, const void* dawn_proc_table) { - std::call_once(init_flag_, [this, &webgpu_ep_info, dawn_proc_table]() { - // Initialization.Step.1 - Create wgpu::Instance - if (instance_ == nullptr) { - const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); -#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) - ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); -#else -#if !defined(USE_EXTERNAL_DAWN) - if (dawn_procs == nullptr) { - dawn_procs = &dawn::native::GetProcs(); +void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_config, int backend_type) { + std::call_once(init_flag_, [this, &buffer_cache_config, backend_type]() { + // Create wgpu::Adapter + if (adapter_ == nullptr) { +#if !defined(__EMSCRIPTEN__) && defined(_MSC_VER) && defined(DAWN_ENABLE_D3D12) && !defined(USE_EXTERNAL_DAWN) + // If we are using the D3D12 backend on Windows and the build does not use external Dawn, dxil.dll and dxcompiler.dll are required. 
+ // + // Dawn will try to load them later, but if they are in the different directory to the executable, it may fail to find them. + // To avoid this issue, we try to load them from the same directory as current module (usually onnxruntime.dll). + auto runtime_path = Env::Default().GetRuntimePath(); + if (!runtime_path.empty()) { + Status status; + void* module_handle = nullptr; + + PathString dxil_path = runtime_path + ToPathString(L"dxil.dll"); + status = Env::Default().LoadDynamicLibrary(dxil_path, false, &module_handle); + if (status.IsOK() && module_handle != nullptr) { + modules_.Add(dxil_path, module_handle); + } + + PathString dxcompiler_path = runtime_path + ToPathString(L"dxcompiler.dll"); + status = Env::Default().LoadDynamicLibrary(dxcompiler_path, false, &module_handle); + if (status.IsOK() && module_handle != nullptr) { + modules_.Add(dxcompiler_path, module_handle); + } } -#else - ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); -#endif - dawnProcSetProcs(dawn_procs); #endif - wgpu::InstanceDescriptor instance_desc{}; - instance_desc.features.timedWaitAnyEnable = true; - instance_ = wgpu::CreateInstance(&instance_desc); - - ORT_ENFORCE(instance_ != nullptr, "Failed to create wgpu::Instance."); - } - - // Initialization.Step.2 - Create wgpu::Adapter - if (adapter_ == nullptr) { wgpu::RequestAdapterOptions req_adapter_options = {}; wgpu::DawnTogglesDescriptor adapter_toggles_desc = {}; req_adapter_options.nextInChain = &adapter_toggles_desc; - req_adapter_options.backendType = static_cast(webgpu_ep_info.backend_type); + req_adapter_options.backendType = static_cast(backend_type); req_adapter_options.powerPreference = wgpu::PowerPreference::HighPerformance; auto enabled_adapter_toggles = GetEnabledAdapterToggles(); @@ -72,7 +75,7 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info ORT_ENFORCE(adapter_ != nullptr, "Failed to get a WebGPU adapter."); } - // Initialization.Step.3 - Create wgpu::Device + // Create wgpu::Device if (device_ == nullptr) { wgpu::DeviceDescriptor device_desc = {}; wgpu::DawnTogglesDescriptor device_toggles_desc = {}; @@ -124,7 +127,10 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info device_limits_ = device_supported_limits.limits; // create buffer manager - buffer_mgr_ = BufferManagerFactory::Create(*this, webgpu_ep_info.storage_buffer_cache_mode, webgpu_ep_info.uniform_buffer_cache_mode, webgpu_ep_info.query_resolve_buffer_cache_mode); + buffer_mgr_ = BufferManagerFactory::Create(*this, + buffer_cache_config.storage.mode, + buffer_cache_config.uniform.mode, + buffer_cache_config.query_resolve.mode); // create program manager program_mgr_ = std::make_unique(Device(), DeviceLimits()); @@ -526,8 +532,8 @@ void WebGpuContext::CollectProfilingData(profiling::Events& events) { 0, query_read_buffer.GetSize(), wgpu::CallbackMode::WaitAnyOnly, - [](wgpu::MapAsyncStatus status, const char* message) { - ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", message); + [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", std::string_view{message}); })) == Status::OK()); auto mapped_data = static_cast(query_read_buffer.GetConstMappedRange()); @@ -635,18 +641,46 @@ void WebGpuContext::Flush() { num_pending_dispatches_ = 0; } -std::unordered_map> WebGpuContextFactory::contexts_; +std::unordered_map WebGpuContextFactory::contexts_; 
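// Illustrative sketch, not part of this change: the GatherElements kernel added earlier in this
// patch emits WGSL that, for every output element, replaces the `axis` coordinate of the output
// index with the gathered index, normalizing negative indices by adding the axis dimension
// (`idx = idx + uniforms.axis_dim_limit`). A plain C++ reference of those semantics, useful for
// checking the shader against the ONNX definition (names here are hypothetical, not ORT code):
#include <cstdint>
#include <vector>

std::vector<float> GatherElementsReference(const std::vector<float>& input,
                                           const std::vector<int64_t>& input_shape,
                                           const std::vector<int64_t>& indices,
                                           const std::vector<int64_t>& indices_shape,
                                           int64_t axis) {
  const int64_t rank = static_cast<int64_t>(input_shape.size());
  if (axis < 0) axis += rank;  // negative axis handled the same way as in ComputeInternal()

  // Row-major strides; the output shape equals the indices shape.
  std::vector<int64_t> in_strides(rank, 1), out_strides(rank, 1);
  for (int64_t d = rank - 2; d >= 0; --d) {
    in_strides[d] = in_strides[d + 1] * input_shape[d + 1];
    out_strides[d] = out_strides[d + 1] * indices_shape[d + 1];
  }

  std::vector<float> output(indices.size());
  for (size_t out_offset = 0; out_offset < indices.size(); ++out_offset) {
    int64_t idx = indices[out_offset];
    if (idx < 0) idx += input_shape[axis];  // same normalization as the shader's axis_dim_limit add

    // Decompose the flat output offset into coordinates, swap in `idx` on `axis`,
    // then re-flatten against the input strides.
    int64_t remaining = static_cast<int64_t>(out_offset);
    int64_t in_offset = 0;
    for (int64_t d = 0; d < rank; ++d) {
      const int64_t coord = remaining / out_strides[d];
      remaining %= out_strides[d];
      in_offset += (d == axis ? idx : coord) * in_strides[d];
    }
    output[out_offset] = input[in_offset];
  }
  return output;
}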
std::mutex WebGpuContextFactory::mutex_; +std::once_flag WebGpuContextFactory::init_default_flag_; +wgpu::Instance WebGpuContextFactory::default_instance_; + +WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& config) { + const int context_id = config.context_id; + WGPUInstance instance = config.instance; + WGPUAdapter adapter = config.adapter; + WGPUDevice device = config.device; -WebGpuContext& WebGpuContextFactory::CreateContext(int context_id, - WGPUInstance instance, - WGPUAdapter adapter, - WGPUDevice device, - ValidationMode validation_mode) { if (context_id == 0) { // context ID is preserved for the default context. User cannot use context ID 0 as a custom context. ORT_ENFORCE(instance == nullptr && adapter == nullptr && device == nullptr, "WebGPU EP default context (contextId=0) must not have custom WebGPU instance, adapter or device."); + + std::call_once(init_default_flag_, [dawn_proc_table = config.dawn_proc_table]() { + // Step.1 - setup dawn proc table + const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); +#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) + ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); +#else +#if !defined(USE_EXTERNAL_DAWN) + if (dawn_procs == nullptr) { + dawn_procs = &dawn::native::GetProcs(); + } +#else + ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); +#endif + dawnProcSetProcs(dawn_procs); +#endif + + // Step.2 - Create wgpu::Instance + wgpu::InstanceDescriptor instance_desc{}; + instance_desc.features.timedWaitAnyEnable = true; + default_instance_ = wgpu::CreateInstance(&instance_desc); + + ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance."); + }); + instance = default_instance_.Get(); } else { // for context ID > 0, user must provide custom WebGPU instance, adapter and device. 
ORT_ENFORCE(instance != nullptr && adapter != nullptr && device != nullptr, @@ -658,13 +692,16 @@ WebGpuContext& WebGpuContextFactory::CreateContext(int context_id, auto it = contexts_.find(context_id); if (it == contexts_.end()) { GSL_SUPPRESS(r.11) - auto context = std::unique_ptr(new WebGpuContext(instance, adapter, device, validation_mode)); - it = contexts_.emplace(context_id, std::move(context)).first; + auto context = std::unique_ptr(new WebGpuContext(instance, adapter, device, config.validation_mode)); + it = contexts_.emplace(context_id, WebGpuContextFactory::WebGpuContextInfo{std::move(context), 0}).first; } else if (context_id != 0) { - ORT_ENFORCE(it->second->instance_.Get() == instance && it->second->adapter_.Get() == adapter && it->second->device_.Get() == device, + ORT_ENFORCE(it->second.context->instance_.Get() == instance && + it->second.context->adapter_.Get() == adapter && + it->second.context->device_.Get() == device, "WebGPU EP context ID ", context_id, " is already created with different WebGPU instance, adapter or device."); } - return *it->second; + it->second.ref_count++; + return *it->second.context; } WebGpuContext& WebGpuContextFactory::GetContext(int context_id) { @@ -673,12 +710,24 @@ WebGpuContext& WebGpuContextFactory::GetContext(int context_id) { auto it = contexts_.find(context_id); ORT_ENFORCE(it != contexts_.end(), "WebGPU EP context ID ", context_id, " is not found."); - return *it->second; + return *it->second.context; +} + +void WebGpuContextFactory::ReleaseContext(int context_id) { + std::lock_guard lock(mutex_); + + auto it = contexts_.find(context_id); + ORT_ENFORCE(it != contexts_.end(), "WebGPU EP context ID ", context_id, " is not found."); + + if (--it->second.ref_count == 0) { + contexts_.erase(it); + } } void WebGpuContextFactory::Cleanup() { std::lock_guard lock(mutex_); contexts_.clear(); + default_instance_ = nullptr; } void CleanupWebGpuContexts() { diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h index be05b06523b9c..d1f43cdc4ddff 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.h +++ b/onnxruntime/core/providers/webgpu/webgpu_context.h @@ -13,6 +13,7 @@ #include #include "core/common/common.h" +#include "core/framework/library_handles.h" #include "core/providers/webgpu/webgpu_execution_provider.h" #include "core/providers/webgpu/buffer_manager.h" #include "core/providers/webgpu/program_manager.h" @@ -25,28 +26,53 @@ class WebGpuContext; class ComputeContext; class ProgramBase; +struct WebGpuContextConfig { + int context_id; + WGPUInstance instance; + WGPUAdapter adapter; + WGPUDevice device; + const void* dawn_proc_table; + ValidationMode validation_mode; +}; + +struct WebGpuBufferCacheConfig { + struct ConfigEntry { + BufferCacheMode mode; + std::string config_string; + }; + ConfigEntry storage; + ConfigEntry uniform; + ConfigEntry query_resolve; + ConfigEntry default_entry; +}; + class WebGpuContextFactory { public: - static WebGpuContext& CreateContext(int context_id, - WGPUInstance instance, - WGPUAdapter adapter, - WGPUDevice device, - ValidationMode validation_mode); + struct WebGpuContextInfo { + std::unique_ptr context; + int ref_count; + }; + + static WebGpuContext& CreateContext(const WebGpuContextConfig& config); static WebGpuContext& GetContext(int context_id); + static void ReleaseContext(int context_id); + static void Cleanup(); private: WebGpuContextFactory() {} - static std::unordered_map> contexts_; + static std::unordered_map contexts_; 
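// Illustrative sketch, not part of this change: with the ref-counted registry above, every
// CreateContext() call must eventually be paired with a ReleaseContext() for the same context
// ID; the context is destroyed once its count drops to zero (the WebGPU EP destructor further
// down in this patch does exactly that). Roughly the intended lifecycle, mirroring how the
// provider factory uses the default context (context_id 0):
#include "core/providers/webgpu/webgpu_context.h"

void WebGpuContextLifecycleSketch() {
  using namespace onnxruntime::webgpu;

  // Default context: no custom instance/adapter/device and no external DawnProcTable.
  WebGpuContextConfig config{/*context_id*/ 0, /*instance*/ nullptr, /*adapter*/ nullptr,
                             /*device*/ nullptr, /*dawn_proc_table*/ nullptr,
                             ValidationMode::Basic};

  WebGpuContext& context = WebGpuContextFactory::CreateContext(config);  // ref_count becomes 1
  // ... context.Initialize(...), create the execution provider, run sessions ...
  WebGpuContextFactory::ReleaseContext(0);  // ref_count drops to 0 and the context is destroyed
}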
static std::mutex mutex_; + static std::once_flag init_default_flag_; + static wgpu::Instance default_instance_; }; // Class WebGpuContext includes all necessary resources for the context. class WebGpuContext final { public: - void Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info, const void* dawn_proc_table); + void Initialize(const WebGpuBufferCacheConfig& buffer_cache_config, int backend_type); Status Wait(wgpu::Future f); @@ -153,6 +179,8 @@ class WebGpuContext final { std::once_flag init_flag_; + LibraryHandles modules_; + wgpu::Instance instance_; wgpu::Adapter adapter_; wgpu::Device device_; diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 66209adf6f1a9..76a55b7ce4f2e 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -649,8 +649,8 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, @@ -743,13 +743,13 @@ using namespace webgpu; WebGpuExecutionProvider::WebGpuExecutionProvider(int context_id, WebGpuContext& context, - WebGpuExecutionProviderInfo&& info) + WebGpuExecutionProviderConfig&& config) : IExecutionProvider{kWebGpuExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)}, context_id_{context_id}, context_{context}, - preferred_data_layout_{info.data_layout}, - force_cpu_node_names_{std::move(info.force_cpu_node_names)}, - enable_graph_capture_{info.enable_graph_capture} { + preferred_data_layout_{config.data_layout}, + force_cpu_node_names_{std::move(config.force_cpu_node_names)}, + enable_graph_capture_{config.enable_graph_capture} { } std::vector WebGpuExecutionProvider::CreatePreferredAllocators() { @@ -824,6 +824,7 @@ std::unique_ptr WebGpuExecutionProvider::GetDataTran } WebGpuExecutionProvider::~WebGpuExecutionProvider() { + WebGpuContextFactory::ReleaseContext(context_id_); } std::unique_ptr WebGpuExecutionProvider::GetProfiler() { diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h index f9c43c6bfd7d0..ad81924e06901 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h @@ -22,32 +22,22 @@ enum class BufferCacheMode; class WebGpuProfiler; } // namespace webgpu -struct WebGpuExecutionProviderInfo { - WebGpuExecutionProviderInfo(DataLayout data_layout, bool enable_graph_capture) +struct WebGpuExecutionProviderConfig { + WebGpuExecutionProviderConfig(DataLayout data_layout, bool enable_graph_capture) : data_layout{data_layout}, - enable_graph_capture{enable_graph_capture}, - backend_type{}, - storage_buffer_cache_mode{}, - uniform_buffer_cache_mode{}, - query_resolve_buffer_cache_mode{}, - default_buffer_cache_mode{} {} - WebGpuExecutionProviderInfo(WebGpuExecutionProviderInfo&&) = default; - WebGpuExecutionProviderInfo& operator=(WebGpuExecutionProviderInfo&&) = default; - ORT_DISALLOW_COPY_AND_ASSIGNMENT(WebGpuExecutionProviderInfo); + enable_graph_capture{enable_graph_capture} {} + WebGpuExecutionProviderConfig(WebGpuExecutionProviderConfig&&) = default; + WebGpuExecutionProviderConfig& operator=(WebGpuExecutionProviderConfig&&) = default; + 
ORT_DISALLOW_COPY_AND_ASSIGNMENT(WebGpuExecutionProviderConfig); DataLayout data_layout; bool enable_graph_capture; - int backend_type; - webgpu::BufferCacheMode storage_buffer_cache_mode; - webgpu::BufferCacheMode uniform_buffer_cache_mode; - webgpu::BufferCacheMode query_resolve_buffer_cache_mode; - webgpu::BufferCacheMode default_buffer_cache_mode; std::vector force_cpu_node_names; }; class WebGpuExecutionProvider : public IExecutionProvider { public: - WebGpuExecutionProvider(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderInfo&& info); + WebGpuExecutionProvider(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderConfig&& config); ~WebGpuExecutionProvider() override; std::vector> GetCapability( diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc index 6cfe9aac0b0e9..64eb80b26fbf9 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc @@ -17,25 +17,25 @@ using namespace onnxruntime::webgpu::options; namespace onnxruntime { struct WebGpuProviderFactory : IExecutionProviderFactory { - WebGpuProviderFactory(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderInfo&& webgpu_ep_info) - : context_id_{context_id}, context_{context}, info_{std::move(webgpu_ep_info)} { + WebGpuProviderFactory(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderConfig&& webgpu_ep_config) + : context_id_{context_id}, context_{context}, config_{std::move(webgpu_ep_config)} { } std::unique_ptr CreateProvider() override { - return std::make_unique(context_id_, context_, std::move(info_)); + return std::make_unique(context_id_, context_, std::move(config_)); } private: int context_id_; webgpu::WebGpuContext& context_; - WebGpuExecutionProviderInfo info_; + WebGpuExecutionProviderConfig config_; }; std::shared_ptr WebGpuProviderFactoryCreator::Create(const ConfigOptions& config_options) { // - // STEP.1 - prepare WebGpuExecutionProviderInfo + // STEP.1 - prepare WebGpuExecutionProviderConfig // - WebGpuExecutionProviderInfo webgpu_ep_info{ + WebGpuExecutionProviderConfig webgpu_ep_config{ // preferred layout is NHWC by default DataLayout::NHWC, // graph capture feature is disabled by default @@ -45,109 +45,33 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( std::string preferred_layout_str; if (config_options.TryGetConfigEntry(kPreferredLayout, preferred_layout_str)) { if (preferred_layout_str == kPreferredLayout_NHWC) { - webgpu_ep_info.data_layout = DataLayout::NHWC; + webgpu_ep_config.data_layout = DataLayout::NHWC; } else if (preferred_layout_str == kPreferredLayout_NCHW) { - webgpu_ep_info.data_layout = DataLayout::NCHW; + webgpu_ep_config.data_layout = DataLayout::NCHW; } else { ORT_THROW("Invalid preferred layout: ", preferred_layout_str); } } - LOGS_DEFAULT(VERBOSE) << "WebGPU EP preferred layout: " << int(webgpu_ep_info.data_layout) << " (parsed from \"" + LOGS_DEFAULT(VERBOSE) << "WebGPU EP preferred layout: " << int(webgpu_ep_config.data_layout) << " (parsed from \"" << preferred_layout_str << "\")"; std::string enable_graph_capture_str; if (config_options.TryGetConfigEntry(kEnableGraphCapture, enable_graph_capture_str)) { if (enable_graph_capture_str == kEnableGraphCapture_ON) { - webgpu_ep_info.enable_graph_capture = true; + webgpu_ep_config.enable_graph_capture = true; } else if (enable_graph_capture_str == kEnableGraphCapture_OFF) { - 
webgpu_ep_info.enable_graph_capture = false; + webgpu_ep_config.enable_graph_capture = false; } else { ORT_THROW("Invalid enable graph capture: ", enable_graph_capture_str); } } - LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_info.enable_graph_capture; - - std::string backend_type_str; - if (config_options.TryGetConfigEntry(kDawnBackendType, backend_type_str)) { -#ifdef _WIN32 - // Setup Windows default backend type based on the build configuration -#if defined(onnxruntime_ENABLE_DAWN_BACKEND_D3D12) - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12); -#elif defined(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan); -#endif -#endif - if (backend_type_str == kDawnBackendType_D3D12) { - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12); - } else if (backend_type_str == kDawnBackendType_Vulkan) { - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan); - } else { - ORT_THROW("Invalid Dawn backend type: ", backend_type_str); - } - } - LOGS_DEFAULT(VERBOSE) << "WebGPU EP Dawn backend type: " << webgpu_ep_info.backend_type; - - auto parse_buffer_cache_mode = [&config_options](const std::string& config_entry_str, - webgpu::BufferCacheMode default_value) -> webgpu::BufferCacheMode { - std::string buffer_cache_mode_str; - if (config_options.TryGetConfigEntry(config_entry_str, buffer_cache_mode_str)) { - if (buffer_cache_mode_str == kBufferCacheMode_Disabled) { - return webgpu::BufferCacheMode::Disabled; - } else if (buffer_cache_mode_str == kBufferCacheMode_LazyRelease) { - return webgpu::BufferCacheMode::LazyRelease; - } else if (buffer_cache_mode_str == kBufferCacheMode_Simple) { - return webgpu::BufferCacheMode::Simple; - } else if (buffer_cache_mode_str == kBufferCacheMode_Bucket) { - return webgpu::BufferCacheMode::Bucket; - } else { - ORT_THROW("Invalid buffer cache mode: ", config_entry_str); - } - } else { - return default_value; - } - }; - - webgpu_ep_info.storage_buffer_cache_mode = parse_buffer_cache_mode(kStorageBufferCacheMode, webgpu::BufferCacheMode::Bucket); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP storage buffer cache mode: " << webgpu_ep_info.storage_buffer_cache_mode; - - webgpu_ep_info.uniform_buffer_cache_mode = parse_buffer_cache_mode(kUniformBufferCacheMode, webgpu::BufferCacheMode::Simple); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP uniform buffer cache mode: " << webgpu_ep_info.uniform_buffer_cache_mode; - - webgpu_ep_info.query_resolve_buffer_cache_mode = parse_buffer_cache_mode(kQueryResolveBufferCacheMode, webgpu::BufferCacheMode::Disabled); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP query resolve buffer cache mode: " << webgpu_ep_info.query_resolve_buffer_cache_mode; - - webgpu_ep_info.default_buffer_cache_mode = parse_buffer_cache_mode(kDefaultBufferCacheMode, webgpu::BufferCacheMode::Disabled); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP default buffer cache mode: " << webgpu_ep_info.default_buffer_cache_mode; - - webgpu::ValidationMode validation_mode = -#ifndef NDEBUG - webgpu::ValidationMode::Full // for debug build, enable full validation by default -#else - webgpu::ValidationMode::Basic // for release build, enable basic validation by default -#endif // !NDEBUG - ; - std::string validation_mode_str; - if (config_options.TryGetConfigEntry(kValidationMode, validation_mode_str)) { - if (validation_mode_str == kValidationMode_Disabled) { - validation_mode = webgpu::ValidationMode::Disabled; - } else if (validation_mode_str == kValidationMode_wgpuOnly) { - 
validation_mode = webgpu::ValidationMode::WGPUOnly; - } else if (validation_mode_str == kValidationMode_basic) { - validation_mode = webgpu::ValidationMode::Basic; - } else if (validation_mode_str == kValidationMode_full) { - validation_mode = webgpu::ValidationMode::Full; - } else { - ORT_THROW("Invalid validation mode: ", validation_mode_str); - } - } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_config.enable_graph_capture; // parse force CPU node names // The force CPU node names are separated by EOL (\n or \r\n) in the config entry. // each line is a node name that will be forced to run on CPU. std::string force_cpu_node_names_str; if (config_options.TryGetConfigEntry(kForceCpuNodeNames, force_cpu_node_names_str)) { - std::vector force_cpu_node_names; - // split the string by EOL (\n or \r\n) std::istringstream ss(force_cpu_node_names_str); std::string line; @@ -157,14 +81,13 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( continue; } - force_cpu_node_names.push_back(line); + webgpu_ep_config.force_cpu_node_names.push_back(line); } - - webgpu_ep_info.force_cpu_node_names = std::move(force_cpu_node_names); } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP force CPU node count: " << webgpu_ep_config.force_cpu_node_names.size(); // - // STEP.2 - prepare WebGpuContext + // STEP.2 - prepare WebGpuContextConfig // int context_id = 0; std::string context_id_str; @@ -204,14 +127,110 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( std::from_chars(dawn_proc_table_str.data(), dawn_proc_table_str.data() + dawn_proc_table_str.size(), dawn_proc_table).ec); } - auto& context = webgpu::WebGpuContextFactory::CreateContext(context_id, - reinterpret_cast(webgpu_instance), - reinterpret_cast(webgpu_adapter), - reinterpret_cast(webgpu_device), - validation_mode); - context.Initialize(webgpu_ep_info, reinterpret_cast(dawn_proc_table)); + webgpu::ValidationMode validation_mode = +#ifndef NDEBUG + webgpu::ValidationMode::Full // for debug build, enable full validation by default +#else + webgpu::ValidationMode::Basic // for release build, enable basic validation by default +#endif // !NDEBUG + ; + std::string validation_mode_str; + if (config_options.TryGetConfigEntry(kValidationMode, validation_mode_str)) { + if (validation_mode_str == kValidationMode_Disabled) { + validation_mode = webgpu::ValidationMode::Disabled; + } else if (validation_mode_str == kValidationMode_wgpuOnly) { + validation_mode = webgpu::ValidationMode::WGPUOnly; + } else if (validation_mode_str == kValidationMode_basic) { + validation_mode = webgpu::ValidationMode::Basic; + } else if (validation_mode_str == kValidationMode_full) { + validation_mode = webgpu::ValidationMode::Full; + } else { + ORT_THROW("Invalid validation mode: ", validation_mode_str); + } + } + + webgpu::WebGpuContextConfig context_config{ + context_id, + reinterpret_cast(webgpu_instance), + reinterpret_cast(webgpu_adapter), + reinterpret_cast(webgpu_device), + reinterpret_cast(dawn_proc_table), + validation_mode, + }; + + // + // STEP.3 - prepare parameters for WebGPU context initialization. 
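// Illustrative sketch, not part of this change: condensed, the creation path assembled in the
// remaining steps below boils down to the sequence sketched here. The parameters stand in for
// the values parsed from ConfigOptions, and the helper name is hypothetical:
#include <memory>
#include "core/providers/webgpu/webgpu_context.h"
#include "core/providers/webgpu/webgpu_execution_provider.h"

std::shared_ptr<onnxruntime::IExecutionProviderFactory> CreateWebGpuFactorySketch(
    const onnxruntime::webgpu::WebGpuContextConfig& context_config,
    const onnxruntime::webgpu::WebGpuBufferCacheConfig& buffer_cache_config,
    int backend_type,
    onnxruntime::WebGpuExecutionProviderConfig&& ep_config) {
  // For the default context this sets up the Dawn proc table and wgpu::Instance (once),
  // registers the shared context in the factory, and bumps its reference count.
  auto& context = onnxruntime::webgpu::WebGpuContextFactory::CreateContext(context_config);

  // Creates the adapter/device and the buffer/program managers for the chosen backend.
  context.Initialize(buffer_cache_config, backend_type);

  // The EP created from this factory releases the context again in its destructor.
  return std::make_shared<onnxruntime::WebGpuProviderFactory>(context_config.context_id, context,
                                                              std::move(ep_config));
}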
+ // + + int backend_type = 0; +#ifdef _WIN32 + // Setup Windows default backend type based on the build configuration +#if defined(DAWN_ENABLE_D3D12) + backend_type = static_cast(WGPUBackendType_D3D12); +#elif defined(DAWN_ENABLE_VULKAN) + backend_type = static_cast(WGPUBackendType_Vulkan); +#endif +#endif + + std::string backend_type_str; + if (config_options.TryGetConfigEntry(kDawnBackendType, backend_type_str)) { + if (backend_type_str == kDawnBackendType_D3D12) { + backend_type = static_cast(WGPUBackendType_D3D12); + } else if (backend_type_str == kDawnBackendType_Vulkan) { + backend_type = static_cast(WGPUBackendType_Vulkan); + } else { + ORT_THROW("Invalid Dawn backend type: ", backend_type_str); + } + } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP Dawn backend type: " << backend_type; + + // buffer cache modes + auto parse_buffer_cache_mode = [&config_options](const std::string& config_entry_str, + webgpu::BufferCacheMode default_value) -> webgpu::BufferCacheMode { + std::string buffer_cache_mode_str; + if (config_options.TryGetConfigEntry(config_entry_str, buffer_cache_mode_str)) { + if (buffer_cache_mode_str == kBufferCacheMode_Disabled) { + return webgpu::BufferCacheMode::Disabled; + } else if (buffer_cache_mode_str == kBufferCacheMode_LazyRelease) { + return webgpu::BufferCacheMode::LazyRelease; + } else if (buffer_cache_mode_str == kBufferCacheMode_Simple) { + return webgpu::BufferCacheMode::Simple; + } else if (buffer_cache_mode_str == kBufferCacheMode_Bucket) { + return webgpu::BufferCacheMode::Bucket; + } else { + ORT_THROW("Invalid buffer cache mode: ", config_entry_str); + } + } else { + return default_value; + } + }; + + webgpu::WebGpuBufferCacheConfig buffer_cache_config; + + buffer_cache_config.storage.mode = parse_buffer_cache_mode(kStorageBufferCacheMode, webgpu::BufferCacheMode::Bucket); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP storage buffer cache mode: " << buffer_cache_config.storage.mode; + + buffer_cache_config.uniform.mode = parse_buffer_cache_mode(kUniformBufferCacheMode, webgpu::BufferCacheMode::Simple); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP uniform buffer cache mode: " << buffer_cache_config.uniform.mode; + + buffer_cache_config.query_resolve.mode = parse_buffer_cache_mode(kQueryResolveBufferCacheMode, webgpu::BufferCacheMode::Disabled); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP query resolve buffer cache mode: " << buffer_cache_config.query_resolve.mode; + + buffer_cache_config.default_entry.mode = parse_buffer_cache_mode(kDefaultBufferCacheMode, webgpu::BufferCacheMode::Disabled); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP default buffer cache mode: " << buffer_cache_config.default_entry.mode; + + // + // STEP.4 - start initialization. + // + + // Load the Dawn library and create the WebGPU instance and adapter. + auto& context = webgpu::WebGpuContextFactory::CreateContext(context_config); + + // Create WebGPU device and initialize the context. + context.Initialize(buffer_cache_config, backend_type); - return std::make_shared(context_id, context, std::move(webgpu_ep_info)); + // Create WebGPU EP factory. 
+ return std::make_shared(context_id, context, std::move(webgpu_ep_config)); } } // namespace onnxruntime diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index a60ee500a9898..223eed248800e 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -38,6 +38,7 @@ #include "core/framework/utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/optimizer/graph_transformer_utils.h" #include "core/optimizer/graph_transformer.h" #include "core/optimizer/layout_transformation/layout_transformation.h" @@ -2099,13 +2100,12 @@ common::Status InferenceSession::Initialize() { const size_t optimized_model_external_initializers_min_size_in_bytes = ParseStringWithClassicLocale(session_options_.config_options.GetConfigOrDefault( kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes, "1024")); - Graph::OffsetAlignmentInfo align_info; - align_info.align_offset = true; + ModelSavingOptions model_saving_options{optimized_model_external_initializers_min_size_in_bytes}; + model_saving_options.align_offset = true; ORT_RETURN_IF_ERROR_SESSIONID_(Model::SaveWithExternalInitializers(*model_, session_options_.optimized_model_filepath, optimized_model_external_initializers_file_name, - optimized_model_external_initializers_min_size_in_bytes, - align_info)); + model_saving_options)); } } } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 78c441efea856..53770df228f5a 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1200,7 +1200,14 @@ struct ProviderHostImpl : ProviderHost { void Model__operator_delete(Model* p) override { delete p; } Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); } std::unique_ptr Model__ToProto(Model* p) override { return std::make_unique(p->ToProto()); } - std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) override { return std::make_unique(p->ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold)); }; + std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, + const ModelSavingOptions& model_saving_options) override { + return std::make_unique(p->ToGraphProtoWithExternalInitializers(external_file_name, + file_path, + model_saving_options)); + }; const ModelMetaData& Model__MetaData(const Model* p) const noexcept override { return p->MetaData(); }; Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) override { return Model::Load(file_path, model_proto); } diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 3ebc33c02592d..541dc4978dad1 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -15,10 +15,10 @@ from typing import List, Optional TRT_DOCKER_FILES = { - "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", - "8.6.cuda_12_3_cudnn_9": 
"tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", - "10.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", - "10.5.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", + "8.6_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", + "8.6_cuda12.3_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", + "10.7_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", + "10.7_cuda12.5_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin", } diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h index d3e069237217e..6f3e460628566 100644 --- a/onnxruntime/test/common/cuda_op_test_utils.h +++ b/onnxruntime/test/common/cuda_op_test_utils.h @@ -5,11 +5,6 @@ #include "test/util/include/default_providers.h" -#define SKIP_CUDA_TEST_WITH_DML \ - if (DefaultCudaExecutionProvider() == nullptr) { \ - GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled"; \ - } - namespace onnxruntime { namespace test { @@ -18,10 +13,6 @@ namespace test { int GetCudaArchitecture(); inline bool HasCudaEnvironment(int min_cuda_architecture) { - if (DefaultCudaExecutionProvider() == nullptr) { - return false; - } - if (DefaultCudaExecutionProvider().get() == nullptr) { return false; } diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc index 8c69e2d9810b8..9f4ee071925b4 100644 --- a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -75,9 +75,6 @@ TEST(BeamSearchTest, GptBeamSearchFp32) { const char* const output_names[] = {"sequences"}; Ort::SessionOptions session_options; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; cuda_options.use_tf32 = false; @@ -171,9 +168,6 @@ TEST(BeamSearchTest, GptBeamSearchFp16) { bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; cuda_options.use_tf32 = false; diff --git a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc index 297629b015796..027d4b3fff1b0 100644 --- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc @@ -181,9 +181,6 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector& input_s t.SetCustomOutputVerifier(output_verifier); std::vector> t_eps; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } t_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM t_eps.emplace_back(DefaultRocmExecutionProvider()); diff --git a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc index 26b0e3a4dd7a9..7ca4e1004066c 100644 --- a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc @@ -61,9 +61,7 @@ void RunTestForInference(const std::vector& input_dims, bool has_ratio std::vector> test_eps; #ifdef USE_CUDA 
- if (DefaultCudaExecutionProvider() != nullptr) { - test_eps.emplace_back(DefaultCudaExecutionProvider()); - } + test_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM test_eps.emplace_back(DefaultRocmExecutionProvider()); #endif @@ -124,9 +122,6 @@ void RunTestForTraining(const std::vector& input_dims) { std::vector> dropout_eps; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } dropout_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM dropout_eps.emplace_back(DefaultRocmExecutionProvider()); diff --git a/onnxruntime/test/contrib_ops/layer_norm_test.cc b/onnxruntime/test/contrib_ops/layer_norm_test.cc index b414a98c4e756..46082e1b0cd31 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_test.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. #include "test/providers/compare_provider_test_utils.h" -#include "test/util/include/default_providers.h" namespace onnxruntime { namespace test { @@ -80,20 +79,14 @@ static void TestLayerNorm(const std::vector& x_dims, #endif #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - test.CompareWithCPU(kCudaExecutionProvider); - } + test.CompareWithCPU(kCudaExecutionProvider); #elif USE_ROCM test.CompareWithCPU(kRocmExecutionProvider); +#elif USE_DML + test.CompareWithCPU(kDmlExecutionProvider); #elif USE_WEBGPU test.CompareWithCPU(kWebGpuExecutionProvider); #endif - -#ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - test.CompareWithCPU(kDmlExecutionProvider); - } -#endif } TEST(CudaKernelTest, LayerNorm_NullInput) { diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 6dedce24e7e07..eebe9197573c6 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -490,17 +490,13 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura std::vector> execution_providers; if (use_float16) { #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultCudaExecutionProvider()); - } + execution_providers.push_back(DefaultCudaExecutionProvider()); #endif #ifdef USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); #endif #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultDmlExecutionProvider()); - } + execution_providers.push_back(DefaultDmlExecutionProvider()); #endif #ifdef USE_WEBGPU execution_providers.push_back(DefaultWebGpuExecutionProvider()); @@ -518,11 +514,8 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura } // namespace TEST(MatMulNBits, Float16Cuda) { -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - std::vector has_gidx_options = {true, false}; - if (DefaultDmlExecutionProvider() != nullptr) { - has_gidx_options.assign(1, false); - } +#if defined(USE_CUDA) || defined(USE_ROCM) + auto has_gidx_options = {true, false}; #else auto has_gidx_options = {false}; #endif @@ -533,9 +526,7 @@ TEST(MatMulNBits, Float16Cuda) { for (auto block_size : {16, 32, 64, 128}) { for (auto has_gidx : has_gidx_options) { #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); - } + RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); #else RunTest(M, N, K, block_size, 0, false, true, has_gidx); RunTest(M, N, K, 
block_size, 0, true, true, has_gidx, false); @@ -548,16 +539,12 @@ TEST(MatMulNBits, Float16Cuda) { } TEST(MatMulNBits, Float16Large) { -#if defined(USE_CUDA) || defined(USE_DML) +#ifdef USE_DML // For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances. - float abs_error = 0.05f; - if (DefaultDmlExecutionProvider() != nullptr) { - // it means the ep is dml in runtime, the abs_error is changed to 0.3f - abs_error = 0.3f; - } + float abs_error = 0.3f; #elif USE_WEBGPU // See Intel A770 to pass these tests with an absolute error of 0.08. float abs_error = 0.08f; @@ -573,6 +560,7 @@ TEST(MatMulNBits, Float16Large) { } } } + #endif // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index d88c3131a4ca5..8d7629b5fda1c 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) { } // DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output -#if defined(USE_DML) && !defined(USE_CUDA) +#if defined(USE_DML) TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) { RunMatMulIntegerToFloatTest(); diff --git a/onnxruntime/test/contrib_ops/tensor_op_test.cc b/onnxruntime/test/contrib_ops/tensor_op_test.cc index d5e2ddebfe67f..bc2ff5f4f724d 100644 --- a/onnxruntime/test/contrib_ops/tensor_op_test.cc +++ b/onnxruntime/test/contrib_ops/tensor_op_test.cc @@ -121,15 +121,7 @@ void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normaliz test.AddAttribute("normalize_variance", normalize_variance ? one : zero); test.AddInput("input", {N, C, H, W}, X); test.AddOutput("output", {N, C, H, W}, result); -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider}); - } else if (DefaultDmlExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider}); - } -#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator. -#endif } void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_variance) { @@ -196,15 +188,7 @@ void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_va test.AddAttribute("normalize_variance", normalize_variance ? 
one : zero); test.AddInput("input", {N, C, H, W}, X); test.AddOutput("output", {N, C, H, W}, result); -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider}); - } else if (DefaultDmlExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider}); - } -#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator. -#endif } TEST(MVNContribOpTest, MeanVarianceNormalizationCPUTest_Version1_TO_8) { @@ -246,9 +230,7 @@ TEST(UnfoldTensorOpTest, LastDim) { std::vector> execution_providers; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultCudaExecutionProvider()); - } + execution_providers.push_back(DefaultCudaExecutionProvider()); #endif execution_providers.push_back(DefaultCpuExecutionProvider()); tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index adab93908cdc4..eaebac177ca91 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -28,7 +28,6 @@ using json = nlohmann::json; #ifdef USE_CUDA #include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_provider_factory.h" -#include "test/common/cuda_op_test_utils.h" #endif // USE_CUDA #include "core/session/onnxruntime_session_options_config_keys.h" using namespace ONNX_NAMESPACE; @@ -897,9 +896,6 @@ TEST_F(PlannerTest, LocationPlanningForPassThroughExplicitAndImplicitSubgraphInp SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1042,9 +1038,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersOnlyUsedInANestedSubgraph) { SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1152,9 +1145,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersUsedOnDifferentDevicesInMainG SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1247,9 +1237,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1282,10 +1269,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM // Test MultiStream scenario for the graph: // node1(CPU ep)->node2(CPU ep)->node3(CUDA ep)->node4(CPU ep) TEST_F(PlannerTest, MultiStream) { -#if defined(USE_CUDA) && defined(USE_DML) - 
SKIP_CUDA_TEST_WITH_DML; -#endif - ONNX_NAMESPACE::TensorProto tensor; tensor.add_dims(1); tensor.add_float_data(1.0f); @@ -1304,7 +1287,6 @@ TEST_F(PlannerTest, MultiStream) { onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA(); auto epFactory = ep.CreateExecutionProviderFactory(epi); std::unique_ptr execution_provider = epFactory->CreateProvider(); - ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider))); CreatePlan({}, false); @@ -1332,9 +1314,6 @@ TEST_F(PlannerTest, MultiStream) { // node3 // All 3 nodes are CUDA EP, node1 is in stream0, node2 is in stream1, node3 is in stream2 TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::unique_ptr<::onnxruntime::KernelDef> cudaKernelAdd = KernelDefBuilder().SetName("Add").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); @@ -1376,9 +1355,6 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { // stream 1: node2 (CPU EP) // node1's output, which is consumed by both node2 and node3, is in CPU. TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json"); EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams"; EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps"; @@ -1400,11 +1376,6 @@ TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { // TODO(leca): there is a bug in the corresponding graph that node2 will be visited twice when traversing node1's output nodes // (see: for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) in BuildExecutionPlan()). 
We can just break the loop and don't need the extra variables once it is fixed TEST_F(PlannerTest, MultiStreamMultiOutput) { -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } -#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("RNN").Provider(kCudaExecutionProvider).SinceVersion(7).Build(); std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"); std::vector input1{Arg(Graph_input1), Arg(Graph_input2), Arg(Graph_input3)}, output1{Arg(Arg1), Arg(Arg2)}, input2{Arg(Arg1), Arg(Arg2)}, output2{Arg(Arg3)}; @@ -1442,9 +1413,6 @@ TEST_F(PlannerTest, MultiStreamMultiOutput) { // TODO(leca): the ideal case is there is only 1 wait step before launching node3, // as there is a specific order between node1 and node2 if they are in the same stream, thus node3 will only need to wait the latter one TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); std::vector input1{Arg(Graph_input1)}, input2{Arg(Graph_input2)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, input3{Arg(Arg1), Arg(Arg2)}, output3{Arg(Arg3)}; @@ -1482,9 +1450,6 @@ TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) #if !defined(__wasm__) && defined(ORT_ENABLE_STREAM) TEST_F(PlannerTest, ParaPlanCreation) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif TypeProto graph_in_type; graph_in_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); auto* graph_in_shape = graph_in_type.mutable_tensor_type()->mutable_shape(); @@ -1926,10 +1891,6 @@ TEST_F(PlannerTest, ParaPlanCreation) { } TEST_F(PlannerTest, TestMultiStreamConfig) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - const char* type = "DeviceBasedPartitioner"; constexpr size_t type_len = 22; @@ -2003,10 +1964,6 @@ TEST_F(PlannerTest, TestMultiStreamSaveConfig) { // Load with partition config where a node is missing, session load expected to fail. 
TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_missing_node.json"; SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; @@ -2027,9 +1984,6 @@ TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { // Load with partition config where streams and devices has mismatch TEST_F(PlannerTest, TestMultiStreamMismatchDevice) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_mismatch_device.json"; SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; @@ -2055,9 +2009,6 @@ TEST_F(PlannerTest, TestCpuIf) { sess_opt.graph_optimization_level = TransformerLevel::Default; InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/cpu_if.onnx")); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(sess.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(sess.Load()); ASSERT_STATUS_OK(sess.Initialize()); @@ -2118,17 +2069,10 @@ TEST_F(PlannerTest, TestCpuIf) { // onnx.save(model, 'issue_19480.onnx') // TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/issue_19480.onnx")); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); status = sess.Load(); status = sess.Initialize(); diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc index 3e5ef30e7ebef..e28327941dda4 100644 --- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc +++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc @@ -115,9 +115,6 @@ TEST(CUDAFenceTests, DISABLED_PartOnCPU) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_TRUE(session.Initialize().IsOK()); ASSERT_TRUE(1 == CountCopyNodes(graph)); @@ -167,9 +164,6 @@ TEST(CUDAFenceTests, TileWithInitializer) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(session.Initialize()); @@ -230,9 +224,6 @@ TEST(CUDAFenceTests, TileWithComputedInput) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_TRUE(session.Initialize().IsOK()); diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 
7f4616c964e33..740c566794f15 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -34,7 +34,6 @@ #ifdef USE_CUDA #include "core/providers/cuda/cuda_provider_factory.h" #include "core/providers/cuda/gpu_data_transfer.h" -#include "test/common/cuda_op_test_utils.h" #endif #ifdef USE_TENSORRT #include "core/providers/tensorrt/tensorrt_provider_options.h" @@ -636,9 +635,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { InferenceSession session_object(so, GetEnvironment()); #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #endif #ifdef USE_ROCM @@ -693,9 +689,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions2) { InferenceSession session_object(so, GetEnvironment()); #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #endif #ifdef USE_ROCM @@ -1049,9 +1042,6 @@ static void TestBindHelper(const std::string& log_str, if (bind_provider_type == kCudaExecutionProvider || bind_provider_type == kRocmExecutionProvider) { #ifdef USE_CUDA auto provider = DefaultCudaExecutionProvider(); - if (provider == nullptr) { - return; - } gpu_provider = provider.get(); ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(provider))); #endif @@ -1647,9 +1637,6 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { #if USE_TENSORRT ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider())); #elif USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #elif USE_ROCM ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider())); @@ -1802,9 +1789,6 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) { #if USE_TENSORRT ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider())); #elif USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #elif USE_ROCM ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider())); @@ -2160,9 +2144,6 @@ TEST(InferenceSessionTests, TestStrictShapeInference) { #ifdef USE_CUDA // disable it, since we are going to enable parallel execution with cuda ep TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; SessionOptions so; @@ -2186,10 +2167,6 @@ TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { } TEST(InferenceSessionTests, TestArenaShrinkageAfterRun) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - OrtArenaCfg arena_cfg; arena_cfg.arena_extend_strategy = 1; // kSameAsRequested diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc index 2313f00e4d123..6e86e5b58aead 100644 --- a/onnxruntime/test/framework/memcpy_transformer_test.cc +++ b/onnxruntime/test/framework/memcpy_transformer_test.cc @@ -9,9 +9,6 @@ #include "default_providers.h" #include "gtest/gtest.h" 
#include "test_utils.h" -#ifdef USE_CUDA -#include "test/common/cuda_op_test_utils.h" -#endif #include "test/test_environment.h" #include "asserts.h" @@ -77,9 +74,6 @@ void ExpectCopy(const onnxruntime::Node& source, const std::string copy_op, #ifdef USE_CUDA TEST(TransformerTest, MemcpyTransformerTest) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unordered_map domain_to_version; domain_to_version[kOnnxDomain] = 7; auto model = std::make_shared("test", false, ModelMetaData(), PathString(), @@ -112,9 +106,7 @@ TEST(TransformerTest, MemcpyTransformerTest) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -137,9 +129,6 @@ TEST(TransformerTest, MemcpyTransformerTest) { } TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unordered_map domain_to_version; domain_to_version[kOnnxDomain] = 7; auto model = std::make_shared("test", false, ModelMetaData(), PathString(), @@ -172,9 +161,7 @@ TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -294,11 +281,7 @@ TEST(TransformerTest, TestInitializerDuplicationInSubgraph) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -340,11 +323,7 @@ TEST(TransformerTest, MemcpyTransformerTestGraphInputConsumedOnMultipleDevices) KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -446,11 +425,7 @@ TEST(TransformerTest, MemcpyTransformerTestImplicitInputConsumedOnMultipleDevice KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; diff --git a/onnxruntime/test/framework/save_model_with_external_initializers.cc 
b/onnxruntime/test/framework/save_model_with_external_initializers.cc index d0bc088175755..98874874d50e9 100644 --- a/onnxruntime/test/framework/save_model_with_external_initializers.cc +++ b/onnxruntime/test/framework/save_model_with_external_initializers.cc @@ -6,6 +6,7 @@ #include "core/common/path_string.h" #include "core/framework/data_types.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/framework/tensorprotoutils.h" #include "test/test_environment.h" #include "test_utils.h" @@ -23,15 +24,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, const std::filesystem::path& input_external_init_file, const std::filesystem::path& output_onnx, const std::filesystem::path& output_external_init_file, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& model_saving_options) { auto logger = DefaultLoggingManager().CreateLogger("LoadSaveAndCompareModel"); std::shared_ptr model; ORT_RETURN_IF_ERROR(Model::Load(input_onnx, model, nullptr, *logger)); std::filesystem::remove(output_onnx); std::filesystem::remove(output_external_init_file); - ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, initializer_size_threshold, - align_info)); + ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, + model_saving_options)); std::shared_ptr model_from_external; ORT_RETURN_IF_ERROR(Model::Load(output_onnx.native(), model_from_external, nullptr, *logger)); @@ -67,7 +67,7 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(*from_external_tensor_proto, model_path, from_external_tensor_proto_data)); size_t from_external_tensor_proto_size = from_external_tensor_proto_data.size(); - if (from_external_tensor_proto_size < initializer_size_threshold) { + if (from_external_tensor_proto_size < model_saving_options.initializer_size_threshold) { // 'Small' tensors should be embedded in the onnx file. 
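The embed-versus-externalize decision above is now driven entirely by the single ModelSavingOptions argument rather than a separate size threshold plus Graph::OffsetAlignmentInfo. As a rough illustration of the fields involved, here is a minimal sketch; it is not ONNX Runtime's actual implementation, the field names simply mirror those exercised by the tests in this file, and keep_embedded/aligned_offset are hypothetical helpers:

    // Sketch of the options that steer external-initializer saving. Field names mirror
    // those used by the tests (initializer_size_threshold, align_offset, align_threshold,
    // allocation_granularity); the two helper functions are illustrative only.
    #include <cstddef>
    #include <cstdint>

    struct ModelSavingOptionsSketch {
      size_t initializer_size_threshold = 0;  // tensors smaller than this stay embedded in the .onnx file
      bool align_offset = false;              // align offsets of externalized tensors
      int64_t align_threshold = 0;            // only align tensors at least this large
      int64_t allocation_granularity = 4096;  // alignment unit, e.g. a memory-page size
    };

    // A tensor is kept inside the model file when its size is below the threshold.
    inline bool keep_embedded(size_t tensor_byte_size, const ModelSavingOptionsSketch& opts) {
      return tensor_byte_size < opts.initializer_size_threshold;
    }

    // Offsets written into the external-data file are rounded up to the granularity.
    inline int64_t aligned_offset(int64_t offset, const ModelSavingOptionsSketch& opts) {
      if (!opts.align_offset) return offset;
      const int64_t g = opts.allocation_granularity;
      return ((offset + g - 1) / g) * g;
    }

This is what the assertions that follow check: tensors under the threshold keep the DEFAULT data location, while externalized tensors carry an offset that is a multiple of allocation_granularity.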
ORT_RETURN_IF_NOT(from_external_tensor_proto->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_DEFAULT, "location mismatch"); } else { @@ -78,13 +78,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, ORT_RETURN_IF_NOT(tensor_proto_size == from_external_tensor_proto_size, "size mismatch"); ORT_RETURN_IF_NOT(memcmp(tensor_proto_data.data(), from_external_tensor_proto_data.data(), tensor_proto_size) == 0, "data mismatch"); - if (align_info.align_offset) { + if (model_saving_options.align_offset) { for (const StringStringEntryProto& entry : from_external_tensor_proto->external_data()) { if (entry.has_key() && entry.has_value() && entry.key() == "offset") { size_t tensor_offset; std::stringstream stream(entry.value()); stream >> tensor_offset; - ORT_RETURN_IF_NOT(tensor_offset % align_info.allocation_granularity == 0, "tensor offset not align"); + ORT_RETURN_IF_NOT(tensor_offset % model_saving_options.allocation_granularity == 0, + "tensor offset not align"); } } } @@ -97,22 +98,35 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, // Original model does not have external initializers TEST(SaveWithExternalInitializers, Mnist) { - Graph::OffsetAlignmentInfo align_info; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/mnist.onnx"), ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), ORT_TSTR("mnist_external_initializers.bin"), 100, align_info)); + ModelSavingOptions model_saving_options{100}; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/mnist.onnx"), + ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), + ORT_TSTR("mnist_external_initializers.bin"), + model_saving_options)); } // Original model has external initializers TEST(SaveWithExternalInitializers, ModelWithOriginalExternalData) { - Graph::OffsetAlignmentInfo align_info; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info)); + ModelSavingOptions model_saving_options{0}; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), + ORT_TSTR("model_with_orig_ext_data.onnx.data"), + ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), + ORT_TSTR("model_with_new_external_initializers.bin"), + model_saving_options)); } // Original model has external initializers, align offset TEST(SaveWithExternalInitializers, ModelWithOriginalExternalDataAlignOffset) { - Graph::OffsetAlignmentInfo align_info; - align_info.align_offset = true; - align_info.align_threshold = 0; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info)); + ModelSavingOptions model_saving_options{0}; + model_saving_options.align_offset = true; + model_saving_options.align_threshold = 0; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), + ORT_TSTR("model_with_orig_ext_data.onnx.data"), + ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), + ORT_TSTR("model_with_new_external_initializers.bin"), model_saving_options)); } } // namespace test diff --git 
a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index 3e694020f796b..e7f8b1aaa49d8 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -15,6 +15,7 @@ #include "core/graph/graph_utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/graph/op.h" #include "core/providers/cpu/cpu_execution_provider.h" #include "core/session/onnxruntime_session_options_config_keys.h" @@ -22,13 +23,101 @@ #include "gtest/gtest.h" #include "test/test_environment.h" #include "test/util/include/default_providers.h" +#include "test/util/include/file_util.h" #include "core/optimizer/layout_transformation/layout_transformation.h" using namespace ONNX_NAMESPACE; -using namespace std; namespace onnxruntime { - namespace test { + +#ifndef ENABLE_TRAINING_CORE +#ifndef __wasm__ +static void TestSavedPrepacks(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(1U, key_to_blob.size()); + const size_t expected_prepacks_for_writing = (graph.ParentGraph() == nullptr) ? 1U : 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + const size_t expected_blobs_for_writing = (graph.ParentGraph() == nullptr) ? 1U : 0U; + ASSERT_EQ(expected_blobs_for_writing, prepacked_for_graph.GetNumberOfKeyedBlobsForWriting()); + + if (graph.ParentGraph() == nullptr) { + const auto* blob_keys = prepacked_for_graph.GetKeysForWeightForSaving("if_shared"); + ASSERT_TRUE(blob_keys != nullptr); + ASSERT_EQ(blob_keys->size(), 1U); + const auto* prepacked_weights = prepacked_for_graph.GetPrepackedWeights(*blob_keys->cbegin()); + ASSERT_TRUE(prepacked_weights != nullptr); + ASSERT_EQ(prepacked_weights->buffer_sizes_.size(), 1U); + ASSERT_EQ(prepacked_weights->buffer_sizes_[0], sizeof(float) * 2); + } + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +static void TestLoadedSharedUserSupplied(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + constexpr size_t expected_prepacks_for_writing = 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + // We have not loaded anything since this initializer is user supplied + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(0U, key_to_blob.size()); + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +static void TestLoadedSharedNoUserSupplied(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph 
= graph.GetPrepacked(); + constexpr size_t expected_prepacks_for_writing = 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + // The pre-packed blob is expected to be loaded from the model file since this initializer is not user supplied + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(1U, key_to_blob.size()); + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +#endif // __wasm__ +#endif // ENABLE_TRAINING_CORE + class TestOpKernel : public OpKernel { public: TestOpKernel(const OpKernelInfo& p) : OpKernel(p) { @@ -378,7 +467,7 @@ class PrePackingTestOpKernel : public OpKernel { ORT_UNUSED_PARAMETER(tensor); ORT_UNUSED_PARAMETER(input_idx); - size_t weight_packed_len = 8; + constexpr const size_t weight_packed_len = sizeof(float) * 2; weight_packed_ = IAllocator::MakeUniquePtr(alloc, weight_packed_len, true); float* data_weights_packed = reinterpret_cast(weight_packed_.get()); data_weights_packed[0] = 1.2345f; @@ -647,7 +736,8 @@ class SessionStateTestSharedInitalizersWithPrePacking : public ::testing::Test { } }; -// Pre-packing enabled + no shared initializers = no pre-packed weights caching +// Pre-packing enabled + no shared initializers; however, we put all the pre-packs +// in a session_state container for ownership. TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { SessionOptions sess_options; sess_options.enable_mem_pattern = true; @@ -679,10 +769,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { const auto* kernel = reinterpret_cast(session_state_1.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made. However, the sharing call is still made from the serialized container. ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); // Second session/model Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), @@ -706,10 +797,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { kernel = reinterpret_cast(session_state_2.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made. The weights are still shared from the serialized container + // either because they are loaded from disk or because the container takes ownership of them. 
ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); } // Pre-packing enabled + shared initializers + no pre-packed weights container = no pre-packed weights caching @@ -754,10 +846,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) { const auto* kernel = reinterpret_cast(session_state_1.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made, but sharing still takes place from the serialized container ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); // Second session/model Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), @@ -781,10 +873,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) { kernel = reinterpret_cast(session_state_2.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made, but sharing still takes place from the serialized container ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); } // Pre-packing enabled + shared initializers + pre-packed weights container = pre-packed weights caching enabled @@ -999,6 +1091,196 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test4) { ASSERT_EQ(if_node_branches_shared_prepack_counter_2, static_cast(2)); } +#ifndef __wasm__ +// sharing is on +TEST_F(SessionStateTestSharedInitalizersWithPrePacking, TestPrepackedSerialization) { + const std::filesystem::path model_with_external_initializers = + "testdata/test_prepacked_serialization_optimized_model.onnx"; + + const std::filesystem::path external_initializers_file = + "test_prepacked_serialization_optimized_model.bin"; + + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + sess_options.optimized_model_filepath = model_with_external_initializers; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + // Enable saving model with pre-packed weights + sess_options.config_options.configurations[kOrtSessionOptionsSavePrePackedConstantInitializers] = "1"; + + // Enable shared initializer + OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator); + std::vector float_data(1, 1); + auto value = std::make_unique(); + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(std::vector{1}), + reinterpret_cast(float_data.data()), mem_info, *value); + + ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get())); + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + 
DefaultLoggingManager().DefaultLogger()); + + CreateGraphWithSubgraph(model_1.MainGraph()); + PlaceAllNodesToCPUEP(model_1.MainGraph()); + SessionState session_state_1(model_1.MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + constexpr const bool saving_model_true = true; + + ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string(), + kernel_registry_manager, + !saving_model_true)); + + TestSavedPrepacks(model_1); + + ModelSavingOptions model_saving_options{4}; + model_saving_options.align_offset = true; + + ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model_1, model_with_external_initializers, + external_initializers_file, + model_saving_options)); + } + ScopedFileDeleter test_model_deleter(model_with_external_initializers); + ScopedFileDeleter binary_file_deleter(external_initializers_file); + + // Now let's load the model along with the initializers + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + // We are expecting this weight to be loaded from disk along + // with its pre-packed version + // Enable shared initializer + OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator); + std::vector float_data(1, 1); + auto value = std::make_unique(); + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(std::vector{1}), + reinterpret_cast(float_data.data()), mem_info, *value); + + ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get())); + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(std::basic_string(), + kernel_registry_manager, + false)); + + TestLoadedSharedUserSupplied(*model); + } + + // Load again, this time sharing is enabled, but no shared initializer in the map + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + 
&prepacked_weights_container); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers, + kernel_registry_manager, + false)); + + TestLoadedSharedNoUserSupplied(*model); + } + // Load again, sharing is disabled + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + nullptr); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers, + kernel_registry_manager, + false)); + + const auto& prepacked_for_main_graph = model->MainGraph().GetPrepacked(); + ASSERT_FALSE(prepacked_for_main_graph.IsSaveModeOn()); + ASSERT_EQ(1U, prepacked_for_main_graph.GetKeyToBlob().size()); + } +} +#endif // __wasm__ + INSTANTIATE_TEST_SUITE_P(SessionStateTests, SessionStatePrepackingTest, testing::Values(PrepackingTestParam{false, false}, diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index db9592c293fd0..7bd6b47f52b7d 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -1457,9 +1457,6 @@ TEST(SparseTensorConversionTests, CsrConversion) { #ifdef USE_CUDA auto cuda_provider = DefaultCudaExecutionProvider(); - if (cuda_provider == nullptr) { - return; - } auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0]; { auto cuda_transfer = cuda_provider->GetDataTransfer(); @@ -1687,9 +1684,6 @@ TEST(SparseTensorConversionTests, CooConversion) { #ifdef USE_CUDA auto cuda_provider = DefaultCudaExecutionProvider(); - if (cuda_provider == nullptr) { - return; - } auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0]; { auto cuda_transfer = cuda_provider->GetDataTransfer(); diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 6821f582ce2de..229f4f95b8394 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -1,6 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "core/common/inlined_containers.h" +#include "core/framework/prepacked_weights.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/onnx_protobuf.h" #include "test/util/include/asserts.h" @@ -19,6 +22,76 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { namespace test { +// Test ExternalData functionality +TEST(TensorProtoUtilsTest, SetExternalDataInformation) { + ONNX_NAMESPACE::TensorProto tensor_proto; + const std::filesystem::path kExternalDataPath("test.bin"); + constexpr const int64_t init_offset = 100; + constexpr const size_t init_length = 200; + + ExternalDataInfo::SetExternalLocationToProto(kExternalDataPath, init_offset, init_length, tensor_proto); + + ASSERT_EQ(tensor_proto.data_location(), ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + ASSERT_EQ(tensor_proto.external_data_size(), 3); + ASSERT_EQ(tensor_proto.external_data(0).key(), "location"); + ASSERT_EQ(tensor_proto.external_data(0).value(), ToUTF8String(kExternalDataPath.native())); + ASSERT_EQ(tensor_proto.external_data(1).key(), "offset"); + ASSERT_EQ(tensor_proto.external_data(1).value(), std::to_string(init_offset)); + ASSERT_EQ(tensor_proto.external_data(2).key(), "length"); + ASSERT_EQ(tensor_proto.external_data(2).value(), std::to_string(init_length)); + + PrepackedKeyToBlobMap key_to_blob; + constexpr bool save_mode_on = true; + PrepackedWeightsForGraph prepacked_for_graph(key_to_blob, save_mode_on); + PrePackedWeights prepacked_weights; + const std::string init_name = "test_initializer"; + const std::string blob_key = "test_key"; + + std::array kData = {1.2345f, 2.4690f}; + const size_t buffer_size = kData.size() * sizeof(float); + + prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr))); + prepacked_weights.buffer_sizes_.push_back(buffer_size); + // Write a second entry like this + prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr))); + prepacked_weights.buffer_sizes_.push_back(buffer_size); + + prepacked_for_graph.WritePackedMaybeForSave(init_name, blob_key, std::move(prepacked_weights)); + + constexpr const int64_t starting_offset = 300; + int64_t external_offset = starting_offset; + std::stringstream ss; + const auto* blobs_for_weight = prepacked_for_graph.GetKeysForWeightForSaving(init_name); + ASSERT_TRUE(blobs_for_weight != nullptr); + InlinedHashSet blob_keys{blobs_for_weight->begin(), blobs_for_weight->end()}; + ASSERT_TRUE(ExternalDataInfo::WritePrepackedToFileAndAddToProto(prepacked_for_graph, + blob_keys, + true, 1024 * 1024, 0, + ss, external_offset, + tensor_proto)); + + auto external_data_info = std::make_unique(); + ASSERT_STATUS_OK(ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info)); + + // This should have prepacked_data entry with two blobs for a single key. 
+ ASSERT_TRUE(external_data_info->HasPrepackedInfo()); + auto prepacked_infos = external_data_info->TakePrepackedInfos(); + ASSERT_EQ(prepacked_infos.size(), 1U); + ASSERT_TRUE(prepacked_infos.count(blob_key) > 0); + + int64_t final_offset = starting_offset; + for (const auto& blob_info : prepacked_infos[blob_key]) { + int64_t offset = std::get<0>(blob_info); + ASSERT_EQ(offset, final_offset); + size_t length = std::get<1>(blob_info); + std::string checksum = std::get<2>(blob_info); // currently "0" + final_offset = offset + length; + ASSERT_EQ(length, buffer_size); + ASSERT_EQ(checksum, "0"); + } + ASSERT_EQ(final_offset, external_offset); +} + // T must be float for double, and it must match with the 'type' argument template void TestUnpackFloatTensor(TensorProto_DataType type, const std::filesystem::path& model_path) { diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc index 9d8febb453739..e8291a36447ca 100644 --- a/onnxruntime/test/lora/lora_test.cc +++ b/onnxruntime/test/lora/lora_test.cc @@ -201,16 +201,6 @@ TEST(LoraAdapterTest, Load) { #ifdef USE_CUDA TEST(LoraAdapterTest, VerifyDeviceCopy) { - // These checks for CUDA/DML combined Package, Be careful when you want to remove it! - if (DefaultCudaExecutionProvider() == nullptr) { - GTEST_SKIP() << "Skip This Test Due to this EP is null"; - } -#ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - GTEST_FAIL() << "It should not run with DML EP"; - } -#endif - auto cpu_ep = DefaultCpuExecutionProvider(); auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0]; auto cuda_ep = DefaultCudaExecutionProvider(); diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index b0958e05dc373..aa68f68f3e735 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -532,17 +532,6 @@ void BaseTester::Run(ExpectResult expect_result, const std::string& expected_fai so.use_deterministic_compute = use_determinism_; so.graph_optimization_level = TransformerLevel::Default; // 'Default' == off - // remove nullptr in execution_providers. - // it's a little ugly but we need to do this because DefaultXXXExecutionProvider() can return nullptr in Runtime. - // And there're many places adding DefaultXXXExecutionProvider() to execution_providers directly. 
- if (execution_providers != nullptr) { - execution_providers->erase(std::remove(execution_providers->begin(), execution_providers->end(), nullptr), execution_providers->end()); - if (execution_providers->size() == 0) { - // In fact, no ep is needed to run - return; - } - } - Run(so, expect_result, expected_failure_string, excluded_provider_types, run_options, execution_providers, options); } diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc index 9acb37c24ddd0..386a5656d8a01 100644 --- a/onnxruntime/test/providers/compare_provider_test_utils.cc +++ b/onnxruntime/test/providers/compare_provider_test_utils.cc @@ -53,11 +53,6 @@ void CompareOpTester::CompareWithCPU(const std::string& target_provider_type, SetTestFunctionCalled(); std::unique_ptr target_execution_provider = GetExecutionProvider(target_provider_type); -#if defined(USE_CUDA) && defined(USE_DML) - if (target_execution_provider == nullptr) { - return; - } -#endif ASSERT_TRUE(target_execution_provider != nullptr) << "provider_type " << target_provider_type << " is not supported."; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index b46c253fb8ed9..e3c86a137484f 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -491,18 +491,6 @@ ::std::vector<::std::basic_string> GetParameterStrings() { // the number of times these are run to reduce the CI time. provider_names.erase(provider_name_cpu); #endif - -#if defined(USE_CUDA) && defined(USE_DML) - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test == "1") { - provider_names.erase(provider_name_cuda); - } - const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); - if (no_dml_ep_test == "1") { - provider_names.erase(provider_name_dml); - } -#endif - std::vector> v; // Permanently exclude following tests because ORT support only opset starting from 7, // Please make no more changes to the list diff --git a/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc index 5b2d00bb956bf..81e51375b9992 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc @@ -389,9 +389,10 @@ TEST(GatherElementsOpTest, IndicesOutOfBounds) { // skip openvino which will not throw error message but will ensure no out-of-bound access // skip TensorRT because it doesn't support out of bounds indices // skip QNN because it doesn't support out of bounds indices + // skip WebGPU because it doesn't support out of bounds indices test.Run(OpTester::ExpectResult::kExpectFailure, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kOpenVINOExecutionProvider, - kTensorrtExecutionProvider, kDmlExecutionProvider, kQnnExecutionProvider}); + kTensorrtExecutionProvider, kDmlExecutionProvider, kQnnExecutionProvider, kWebGpuExecutionProvider}); } TEST(GatherElementsOpTest, BigIndices) { diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc index 0f23e4c39d7e2..be79a6d29d539 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc @@ -3,9 +3,6 @@ #include "core/session/onnxruntime_session_options_config_keys.h" 
#include "gtest/gtest.h" -#if USE_CUDA -#include "test/common/cuda_op_test_utils.h" -#endif #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" @@ -125,9 +122,6 @@ TEST(GatherOpTest, Gather_invalid_index_gpu) { 4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.0f, 0.0f, 0.0f}); -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif // On GPU, just set the value to 0 instead of report error. exclude all other providers test #if defined(USE_CUDA) diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index 7e1a2384d7fc6..05cfb5c13d689 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -15,13 +15,11 @@ std::vector> GetExecutionProviders(int opset execution_providers.emplace_back(DefaultCpuExecutionProvider()); #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - if (opset_version < 20) { - execution_providers.emplace_back(DefaultCudaExecutionProvider()); + if (opset_version < 20) { + execution_providers.emplace_back(DefaultCudaExecutionProvider()); #ifdef ENABLE_CUDA_NHWC_OPS - execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); + execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); #endif - } } #endif diff --git a/onnxruntime/test/providers/cuda/cuda_provider_test.cc b/onnxruntime/test/providers/cuda/cuda_provider_test.cc index e745e1bcb8171..e57cdd2350fab 100644 --- a/onnxruntime/test/providers/cuda/cuda_provider_test.cc +++ b/onnxruntime/test/providers/cuda/cuda_provider_test.cc @@ -11,7 +11,7 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA_Test(); namespace test { namespace cuda { -TEST(CudaEpUnittest, All) { +TEST(CUDA_EP_Unittest, All) { onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA_Test(); ep.TestAll(); } diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc index ec7c6ec4e1605..b413d04fe81e8 100644 --- a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc @@ -11,7 +11,7 @@ namespace onnxruntime { namespace test { -TEST(CudaEpAllocatorTest, CUDAAllocatorTest) { +TEST(AllocatorTest, CUDAAllocatorTest) { OrtDevice::DeviceId cuda_device_id = 0; // ensure CUDA device is available. 
@@ -77,7 +77,7 @@ TEST(CudaEpAllocatorTest, CUDAAllocatorTest) { } // test that we fallback to smaller allocations if the growth of the arena exceeds the available memory -TEST(CudaEpAllocatorTest, CUDAAllocatorFallbackTest) { +TEST(AllocatorTest, CUDAAllocatorFallbackTest) { OrtDevice::DeviceId cuda_device_id = 0; size_t free = 0; diff --git a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc index ccdc56de5937d..b2e986f680763 100644 --- a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc @@ -17,7 +17,7 @@ using onnxruntime::contrib::attention::AttentionBackend; namespace onnxruntime { namespace test { -TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) { +TEST(AttentionKernelOptionsTest, NonZeroValue) { { AttentionKernelOptions options; int value = static_cast(AttentionBackend::FLASH_ATTENTION) | static_cast(AttentionBackend::EFFICIENT_ATTENTION); @@ -156,7 +156,7 @@ TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) { } // Test all environment variables take effect when option value is 0. -TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) { +TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) { constexpr int value = 0; ScopedEnvironmentVariables scoped_env_vars{ EnvVarMap{ @@ -186,7 +186,7 @@ TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) { } // Test default min sequence lengths when environment variables are not set. -TEST(CudaEpAttentionKernelOptionsTest, DefaultMinSeqLens) { +TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) { constexpr int value = 0; ScopedEnvironmentVariables scoped_env_vars{ EnvVarMap{ diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc index 97d50398a5550..a0d115c41c14b 100644 --- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc +++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc @@ -68,7 +68,7 @@ void ComputeTopKReference(const std::vector& values, } } -TEST(CudaEpTestBeamSearch, TopK) { +TEST(TestBeamSearch, TopK) { int32_t batch_size = 4; int32_t beam_size = 4; int32_t vocab_size = 50257; diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index d8fb3c8256012..3fcb9045ee7e6 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -230,7 +230,7 @@ void testPrepack(int rows, int columns) { } // TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80 -TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) { +TEST(BlkQ4_GEMM, PrepackSm80Test) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -263,7 +263,7 @@ TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) { testPrepack(256, 256); } -TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) { +TEST(BlkQ4_GEMM, Sm80RowBlockingTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -292,7 +292,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) { onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576); } -TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) { +TEST(BlkQ4_GEMM, 
Sm80ColBlockingTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -305,7 +305,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) { onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576); } -TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) { +TEST(BlkQ4_GEMM, Sm80SmallMTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -326,7 +326,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) { onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576); } -TEST(CudaEpBlkQ4_GEMM, Sm80SmallTileKernelTest) { +TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index f3222c6f683b5..72357ec7e02d2 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -19,7 +19,7 @@ namespace cuda { namespace test { // TODO: Since the "DeferredRelease" has been migrated to CudaStream class, // we should migrate this test from CudaEP unit test to CudaStream unit test. -TEST(CudaEpTestDeferredRelease, WithArena) { +TEST(TestDeferredRelease, WithArena) { // Create CUDA EP. CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); @@ -52,7 +52,7 @@ TEST(CudaEpTestDeferredRelease, WithArena) { ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts)); } -TEST(CudaEpTestDeferredRelease, WithoutArena) { +TEST(TestDeferredRelease, WithoutArena) { // Create CUDA EP. 
CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc index 3538c7add94d0..7468a5718425e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc @@ -40,7 +40,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) { } } // namespace -TEST(CudaEpUnittest, FillCorrectness) { +TEST(CudaUtilsTest, FillCorrectness) { TestFillCorrectness(1 << 20, 1); TestFillCorrectness(1 << 20, 2); TestFillCorrectness(1 << 20, 3); diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc index 518fde5804b23..6636e15040393 100644 --- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc @@ -10,7 +10,7 @@ namespace onnxruntime { namespace cuda { namespace test { -TEST(CudaEpGemmOptions, TestDefaultOptions) { +TEST(CudaGemmOptions, TestDefaultOptions) { HalfGemmOptions gemm_options; ASSERT_FALSE(gemm_options.IsCompute16F()); #if defined(USE_CUDA) @@ -22,7 +22,7 @@ TEST(CudaEpGemmOptions, TestDefaultOptions) { #endif } -TEST(CudaEpGemmOptions, TestCompute16F) { +TEST(CudaGemmOptions, TestCompute16F) { HalfGemmOptions gemm_options; gemm_options.Initialize(1); ASSERT_TRUE(gemm_options.IsCompute16F()); @@ -35,7 +35,7 @@ TEST(CudaEpGemmOptions, TestCompute16F) { #endif } -TEST(CudaEpGemmOptions, NoReducedPrecision) { +TEST(CudaGemmOptions, NoReducedPrecision) { HalfGemmOptions gemm_options; gemm_options.Initialize(2); ASSERT_FALSE(gemm_options.IsCompute16F()); @@ -48,7 +48,7 @@ TEST(CudaEpGemmOptions, NoReducedPrecision) { #endif } -TEST(CudaEpGemmOptions, Pedantic) { +TEST(CudaGemmOptions, Pedantic) { HalfGemmOptions gemm_options; gemm_options.Initialize(4); ASSERT_FALSE(gemm_options.IsCompute16F()); @@ -61,7 +61,7 @@ TEST(CudaEpGemmOptions, Pedantic) { #endif } -TEST(CudaEpGemmOptions, Compute16F_Pedantic) { +TEST(CudaGemmOptions, Compute16F_Pedantic) { HalfGemmOptions gemm_options; gemm_options.Initialize(5); ASSERT_TRUE(gemm_options.IsCompute16F()); @@ -74,7 +74,7 @@ TEST(CudaEpGemmOptions, Compute16F_Pedantic) { #endif } -TEST(CudaEpGemmOptions, Compute16F_NoReducedPrecision) { +TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) { HalfGemmOptions gemm_options; gemm_options.Initialize(3); ASSERT_TRUE(gemm_options.IsCompute16F()); diff --git a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc index ba24cf858e80f..6b8cd68de0fca 100644 --- a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc +++ b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc @@ -41,7 +41,7 @@ void ComputeTop1Reference(const std::vector& values, } } -TEST(CudaEpTestGreedySearch, TopOne) { +TEST(TestGreedySearch, TopOne) { int32_t batch_size = 4; int32_t vocab_size = 50257; int32_t batch_x_vocab = batch_size * vocab_size; diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc index 09c9c1e5f8f6a..ec7e98528504e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc @@ -179,7 +179,7 @@ void 
TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e } } // namespace -TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) { +TEST(ReductionFunctionsTest, ReduceRowToScalar) { TestReduceRowToScalarApis(3); TestReduceRowToScalarApis(19); TestReduceRowToScalarApis(123); @@ -188,7 +188,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) { TestReduceRowToScalarApis(941736, 2e-4f); } -TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) { +TEST(ReductionFunctionsTest, ReduceRowsToRow) { for (int m : {3, 193, 2945}) { for (int n : {3, 193, 2945}) { TestReduceRowsToRow(m, n, true); @@ -197,7 +197,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) { } } -TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) { +TEST(ReductionFunctionsTest, ReduceColumnsToColumn) { for (int m : {3, 193, 2945}) { for (int n : {3, 193, 2945}) { TestReduceColumnsToColumn(m, n); @@ -205,7 +205,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) { } } -TEST(CudaEpReductionFunctionsTest, BufferOffsets) { +TEST(ReductionFunctionsTest, BufferOffsets) { const int m = 2048; const int n = 1024; const TensorShape shape{m, n}; @@ -240,7 +240,7 @@ TEST(CudaEpReductionFunctionsTest, BufferOffsets) { } } -TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) { +TEST(ReductionFunctionsTest, InvalidBufferSize) { const int m = 2048; const int n = 1024; const TensorShape shape{m, n}; @@ -262,7 +262,7 @@ TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) { ASSERT_FALSE(status.IsOK()); } -TEST(CudaEpReductionFunctionsTest, GetApplicableMatrixReduction) { +TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) { auto test_get_applicable_matrix_reduction = [](cudnnReduceTensorOp_t cudnn_op, const std::vector& dims, const std::vector& axes, diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index a274b90dc042f..8fc76da3495a8 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -105,7 +105,7 @@ def load_jsonc(basename: str): return json.loads("\n".join(lines)) -def create_backend_test(devices: list[str], test_name=None): +def create_backend_test(test_name=None): """Creates an OrtBackendTest and adds its TestCase's to global scope so unittest will find them.""" overrides = load_jsonc("onnx_backend_test_series_overrides.jsonc") @@ -126,29 +126,30 @@ def create_backend_test(devices: list[str], test_name=None): else: filters = load_jsonc("onnx_backend_test_series_filters.jsonc") current_failing_tests = apply_filters(filters, "current_failing_tests") + if platform.architecture()[0] == "32bit": current_failing_tests += apply_filters(filters, "current_failing_tests_x86") - if backend.supports_device("DNNL") or "DNNL" in devices: + if backend.supports_device("DNNL"): current_failing_tests += apply_filters(filters, "current_failing_tests_DNNL") - if backend.supports_device("NNAPI") or "NNAPI" in devices: + if backend.supports_device("NNAPI"): current_failing_tests += apply_filters(filters, "current_failing_tests_NNAPI") - if backend.supports_device("OPENVINO_GPU") or "OPENVINO_GPU" in devices: + if backend.supports_device("OPENVINO_GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_GPU") - if backend.supports_device("OPENVINO_CPU") or "OPENVINO_CPU" in devices: + if backend.supports_device("OPENVINO_CPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP32") current_failing_tests 
+= apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP16") - if backend.supports_device("OPENVINO_NPU") or "OPENVINO_NPU" in devices: + if backend.supports_device("OPENVINO_NPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_NPU") - if backend.supports_device("OPENVINO") or "OPENVINO" in devices: + if backend.supports_device("OPENVINO"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_opset18") - if backend.supports_device("MIGRAPHX") or "MIGRAPHX" in devices: + if backend.supports_device("MIGRAPHX"): current_failing_tests += apply_filters(filters, "current_failing_tests_MIGRAPHX") if backend.supports_device("WEBGPU"): @@ -157,16 +158,8 @@ def create_backend_test(devices: list[str], test_name=None): # Skip these tests for a "pure" DML onnxruntime python wheel. We keep these tests enabled for instances where both DML and CUDA # EPs are available (Windows GPU CI pipeline has this config) - these test will pass because CUDA has higher precedence than DML # and the nodes are assigned to only the CUDA EP (which supports these tests) - if (backend.supports_device("DML") and not backend.supports_device("GPU")) or "DML" in devices: + if backend.supports_device("DML") and not backend.supports_device("GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_pure_DML") - # exclude CUDA EP when DML test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,CUDAExecutionProvider" - elif backend.supports_device("DML") and "DML" not in devices: - # exclude DML EP when CUDA test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,DmlExecutionProvider" - else: - # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" filters = ( current_failing_tests @@ -179,6 +172,9 @@ def create_backend_test(devices: list[str], test_name=None): backend_test.exclude("(" + "|".join(filters) + ")") print("excluded tests:", filters) + # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior + os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" + # import all test cases at global scope to make # them visible to python.unittest. globals().update(backend_test.enable_report().test_cases) @@ -203,15 +199,6 @@ def parse_args(): help="Only run tests that match this value. Matching is regex based, and '.*' is automatically appended", ) - parser.add_argument( - "--devices", - type=str, - choices=["CPU", "CUDA", "MIGRAPHX", "DNNL", "DML", "OPENVINO_GPU", "OPENVINO_CPU", "OPENVINO_NPU", "OPENVINO"], - nargs="+", # allows multiple values - default=["CPU"], # default to ["CPU"] if no input is given - help="Select one or more devices CPU, CUDA, MIGRAPHX, DNNL, DML, OPENVINO_GPU, OPENVINO_CPU, OPENVINO_NPU, OPENVINO", - ) - # parse just our args. 
python unittest has its own args and arg parsing, and that runs inside unittest.main() parsed, unknown = parser.parse_known_args() sys.argv = sys.argv[:1] + unknown @@ -222,5 +209,5 @@ def parse_args(): if __name__ == "__main__": args = parse_args() - create_backend_test(args.devices, args.test_name) + create_backend_test(args.test_name) unittest.main() diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 7ecaab6fedb02..f083ab14ad133 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -750,13 +750,6 @@ "^test_reduce_log_sum_empty_set_cpu", "^test_reduce_log_sum_exp_empty_set_cpu", "^test_reduce_prod_empty_set_cpu", - // Bug: DML EP some how executes these CUDA tests and failed - // TODO: Remove these tests when DML EP is fixed - "^test_convtranspose_autopad_same_cuda", - "^test_asin_example_cuda", - "^test_dynamicquantizelinear_cuda", - "^test_dynamicquantizelinear_expanded_cuda", - "^test_reduce_min_empty_set_cuda", //Bug: DML EP does not execute operators with an empty input tensor //TODO: Resolve as a graph implementation that returns a constant inf tensor with appropriate strides "^test_reduce_min_empty_set_cpu" diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 59926bbcd1c6f..c1564997c42b8 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -122,12 +122,6 @@ std::unique_ptr<IExecutionProvider> DefaultOpenVINOExecutionProvider() { std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() { #ifdef USE_CUDA -#ifdef USE_DML - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test == "1") { - return nullptr; - } -#endif OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; provider_options.use_tf32 = false; @@ -140,12 +134,6 @@ std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() { #ifdef ENABLE_CUDA_NHWC_OPS std::unique_ptr<IExecutionProvider> DefaultCudaNHWCExecutionProvider() { #if defined(USE_CUDA) -#ifdef USE_DML - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test == "1") { - return nullptr; - } -#endif OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; provider_options.use_tf32 = false; @@ -332,12 +320,6 @@ std::unique_ptr<IExecutionProvider> DefaultCannExecutionProvider() { std::unique_ptr<IExecutionProvider> DefaultDmlExecutionProvider() { #ifdef USE_DML -#ifdef USE_CUDA - const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); - if (no_dml_ep_test == "1") { - return nullptr; - } -#endif ConfigOptions config_options{}; if (auto factory = DMLProviderFactoryCreator::CreateFromDeviceOptions(config_options, nullptr, false, false)) { return factory->CreateProvider(); diff --git a/onnxruntime/test/webgpu/delay_load/main.cc b/onnxruntime/test/webgpu/delay_load/main.cc new file mode 100644 index 0000000000000..14300f3b3751b --- /dev/null +++ b/onnxruntime/test/webgpu/delay_load/main.cc @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include <windows.h> +#include <filesystem> +#include <iostream> +#include <string> +#define ORT_API_MANUAL_INIT +#include "core/session/onnxruntime_cxx_api.h" + +// This program is to test the delay loading of onnxruntime.dll.
+// +// To verify the delay loading actually works, we need to do the test in 2 steps: +// +// 1. Prepare a folder structure like below: +// +// ├── webgpu_delay_load_test_root (newly created folder) +// │ ├── dlls +// │ │ ├── onnxruntime.dll +// │ │ ├── webgpu_dawn.dll +// │ │ ├── dxil.dll +// │ │ └── dxcompiler.dll +// │ └── test.exe +// └── onnxruntime_webgpu_delay_load_test.exe (this binary) +// +// This folder structure ensures no DLLs are in the same folder as the executable (test.exe). +// +// 2. Launch the test binary from the root folder of the above structure. +// +// So, there are 2 modes of this program: +// 1. "Prepare" mode: Do the step 1 above. (default) +// 2. "Test" mode: Do the step 2 above. (specified by --test argument) + +int prepare_main(); +int test_main(); + +int wmain(int argc, wchar_t* argv[]) { + if (argc == 2 && wcscmp(argv[1], L"--test") == 0) { + return test_main(); + } else { + return prepare_main(); + } +} + +int prepare_main() { + std::wstring path_str(32768, L'\0'); + GetModuleFileNameW(NULL, path_str.data(), static_cast<DWORD>(path_str.size())); + + namespace fs = std::filesystem; + fs::path exe_full_path{path_str}; // /onnxruntime_webgpu_delay_load_test.exe + fs::path test_dir = exe_full_path.parent_path(); // / + fs::path exe_name = exe_full_path.filename(); // onnxruntime_webgpu_delay_load_test.exe + fs::path root_folder = test_dir / L"webgpu_delay_load_test_root\\"; // /webgpu_delay_load_test_root/ + fs::path dlls_folder = root_folder / L"dlls\\"; // /webgpu_delay_load_test_root/dlls/ + + // ensure the test folder exists and is empty + if (fs::exists(root_folder)) { + fs::remove_all(root_folder); + } + fs::create_directories(dlls_folder); + + fs::current_path(test_dir); + + // copy the required DLLs to the dlls folder + fs::copy_file(L"onnxruntime.dll", dlls_folder / L"onnxruntime.dll"); + fs::copy_file(L"dxil.dll", dlls_folder / L"dxil.dll"); + fs::copy_file(L"dxcompiler.dll", dlls_folder / L"dxcompiler.dll"); + if (fs::exists(L"webgpu_dawn.dll")) { + fs::copy_file(L"webgpu_dawn.dll", dlls_folder / L"webgpu_dawn.dll"); + } + + // copy the test binary to the root folder + fs::copy_file(exe_full_path, root_folder / L"test.exe"); + + // run "test.exe --test" from the test root folder + fs::current_path(root_folder); + return _wsystem(L"test.exe --test"); +} + +int run() { + Ort::Env env{nullptr}; + int retval = 0; + try { + env = Ort::Env{ORT_LOGGING_LEVEL_WARNING, "Default"}; + + // model is https://github.com/onnx/onnx/blob/v1.15.0/onnx/backend/test/data/node/test_abs/model.onnx + constexpr uint8_t MODEL_DATA[] = {8, 7, 18, 12, 98, 97, 99, 107, 101, 110, + 100, 45, 116, 101, 115, 116, 58, 73, 10, 11, + 10, 1, 120, 18, 1, 121, 34, 3, 65, 98, + 115, 18, 8, 116, 101, 115, 116, 95, 97, 98, + 115, 90, 23, 10, 1, 120, 18, 18, 10, 16, + 8, 1, 18, 12, 10, 2, 8, 3, 10, 2, + 8, 4, 10, 2, 8, 5, 98, 23, 10, 1, + 121, 18, 18, 10, 16, 8, 1, 18, 12, 10, + 2, 8, 3, 10, 2, 8, 4, 10, 2, 8, + 5, 66, 4, 10, 0, 16, 13}; + + Ort::SessionOptions session_options; + session_options.DisableMemPattern(); + std::unordered_map<std::string, std::string> provider_options; + session_options.AppendExecutionProvider("WebGPU", provider_options); + Ort::Session session{env, MODEL_DATA, sizeof(MODEL_DATA), session_options}; + + // successfully initialized + std::cout << "Successfully initialized WebGPU EP." << std::endl; + retval = 0; + } catch (const std::exception& ex) { + std::cerr << ex.what() << std::endl; + + std::cerr << "Unexpected exception."
<< std::endl; + retval = -1; + } + + return retval; +} + +int test_main() { + HMODULE hModule = LoadLibraryA("dlls\\onnxruntime.dll"); + if (hModule == NULL) { + std::cout << "Failed to load dlls\\onnxruntime.dll" << std::endl; + std::cout << "Error code: " << GetLastError() << std::endl; + return 1; + } + + int retval = 0; + + using OrtGetApiBaseFunction = decltype(&OrtGetApiBase); + auto fnOrtGetApiBase = (OrtGetApiBaseFunction)GetProcAddress(hModule, "OrtGetApiBase"); + if (fnOrtGetApiBase == NULL) { + std::cout << "Failed to get OrtGetApiBase" << std::endl; + retval = 1; + goto cleanup; + } + Ort::InitApi(fnOrtGetApiBase()->GetApi(ORT_API_VERSION)); + + retval = run(); + +cleanup: + if (hModule != NULL) { + FreeLibrary(hModule); + } + return retval; +} diff --git a/onnxruntime/test/webgpu/external_dawn/main.cc b/onnxruntime/test/webgpu/external_dawn/main.cc index ed8d2eab94ce9..1cb22b131d76b 100644 --- a/onnxruntime/test/webgpu/external_dawn/main.cc +++ b/onnxruntime/test/webgpu/external_dawn/main.cc @@ -1,5 +1,4 @@ // Copyright (c) Microsoft Corporation. All rights reserved. -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates // Licensed under the MIT License. #include diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc index f1545e96481fa..b03f1b1eadb3b 100644 --- a/orttraining/orttraining/core/session/training_session.cc +++ b/orttraining/orttraining/core/session/training_session.cc @@ -5,6 +5,7 @@ #include "core/framework/data_transfer_utils.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/session/IOBinding.h" #include "core/optimizer/rule_based_graph_transformer.h" #include "core/providers/cpu/controlflow/utils.h" @@ -1003,7 +1004,8 @@ Status TrainingSession::SaveWithExternalInitializers(const PathString& model_uri std::remove(ToUTF8String(model_uri).c_str()); std::remove(external_file_name.c_str()); - return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, initializer_size_threshold); + ModelSavingOptions model_saving_options{initializer_size_threshold}; + return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, model_saving_options); } Status TrainingSession::Save(const PathString& model_uri, TrainingSession::SaveOption opt) { diff --git a/orttraining/orttraining/training_api/module.cc b/orttraining/orttraining/training_api/module.cc index 939e1de334e52..60708b05626c5 100644 --- a/orttraining/orttraining/training_api/module.cc +++ b/orttraining/orttraining/training_api/module.cc @@ -11,6 +11,7 @@ #include "core/session/inference_session.h" #include "core/session/environment.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/graph/model_saving_options.h" #include "core/graph/graph_utils.h" #include "orttraining/training_api/checkpoint.h" @@ -689,8 +690,10 @@ Status Module::ExportModelForInferencing(const std::string& inference_model_path std::string external_data_name = ORT_TSTR_CONVERT_TO_PRINTABLE_STRING(ExternalCheckpointDataPath(ToPathString(inference_model_path))); PathString inference_model_pathstring = ToPathString(inference_model_path); + ModelSavingOptions model_saving_options{64}; ORT_THROW_IF_ERROR( - Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name, 64)); + Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name, + model_saving_options)); } 
else { ORT_THROW_IF_ERROR(Model::Save(*inference_model, ToPathString(inference_model_path))); } diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 3527a89ca7a7b..53dcdc6e0c6fa 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -260,6 +260,7 @@ def convert_arg_line_to_args(self, arg_line): ) parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable CUDA NHWC ops in build.") + parser.add_argument("--enable_cuda_minimal_build", action="store_true", help="Enable CUDA minimal build.") # Python bindings parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.") @@ -1093,6 +1094,7 @@ def generate_build_tree( "-Donnxruntime_DISABLE_FLOAT8_TYPES=" + ("ON" if disable_float8_types else "OFF"), "-Donnxruntime_DISABLE_SPARSE_TENSORS=" + ("ON" if disable_sparse_tensors else "OFF"), "-Donnxruntime_DISABLE_OPTIONAL_TYPE=" + ("ON" if disable_optional_type else "OFF"), + "-Donnxruntime_CUDA_MINIMAL=" + ("ON" if args.enable_cuda_minimal_build else "OFF"), ] if args.rv64: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml new file mode 100644 index 0000000000000..2a32dd1a62408 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -0,0 +1,108 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### +parameters: + - name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - template: templates/common-variables.yml + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: ${{ variables.linux_trt_version_cuda11 }} + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: ${{ variables.linux_trt_version_cuda12 }} + +jobs: +- job: Linux_Build + timeoutInMinutes: 180 + variables: + skipComponentGovernanceDetection: true + ALLOW_RELEASED_ONNX_OPSET_ONLY: '1' + ORT_CACHE_DIR: '$(Agent.TempDirectory)/ort/ccache' + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + workspace: + clean: all + pool: onnxruntime-tensorrt-linuxbuild-T4 + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=${{ variables.docker_base_image }} + --build-arg TRT_VERSION=${{ 
variables.linux_trt_version }} + --build-arg BUILD_UID=$( id -u ) + " + Repository: onnxruntimetensorrtcudaminimalbuild + + - template: templates/linux-build-step-with-cache.yml + parameters: + WithCache: true + Today: $(TODAY) + AdditionalKey: gpu_tensorrt_cuda_minimal + CacheDir: '$(ORT_CACHE_DIR)' + BuildStep: + - task: CmdLine@2 + inputs: + script: | + docker run --gpus all --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + --volume $(ORT_CACHE_DIR):/cache \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + -e CCACHE_DIR=/cache -w /onnxruntime_src \ + onnxruntimetensorrtcudaminimalbuild tools/ci_build/github/linux/build_tensorrt_ci.sh --cuda_minimal=ON + workingDirectory: $(Build.SourcesDirectory) + + - template: templates/explicitly-defined-final-tasks.yml diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 83cf26614a285..9286b5a54ac27 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -8,12 +8,12 @@ parameters: - name: TrtVersion displayName: TensorRT Version type: string - default: 10.5.cuda_12_5_cudnn_9 + default: 10.7_cuda12.5_cudnn9 values: - - 8.6.cuda_11_8_cudnn_8 - - 8.6.cuda_12_3_cudnn_9 - - 10.5.cuda_11_8_cudnn_8 - - 10.5.cuda_12_5_cudnn_9 + - 8.6_cuda11.8_cudnn8 + - 8.6_cuda12.3_cudnn9 + - 10.7_cuda11.8_cudnn8 + - 10.7_cuda12.5_cudnn9 - BIN - name: UseTensorrtOssParser @@ -198,4 +198,4 @@ jobs: parameters : condition : 'succeeded' - - template: templates/clean-agent-build-directory-step.yml + - template: templates/clean-agent-build-directory-step.yml \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml index 9296928ad97e0..cf434e4eadf0d 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml @@ -19,6 +19,6 @@ stages: python_wheel_suffix: '_gpu' timeout: 480 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 - trt_version: '10.6.0.26-1.cuda11.8' + trt_version: '10.7.0.23-1.cuda11.8' cuda_version: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml deleted file mode 100644 index 9a721c65de332..0000000000000 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml +++ /dev/null @@ -1,21 +0,0 @@ -parameters: -- name: EP_NAME - type: string - default: CPU - -- name: PYTHON_VERSION - type: string - -steps: -- powershell: | - python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq - Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} - mkdir -p $(Agent.TempDirectory)\ort_test_data - Copy-Item -Path 
$(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data - Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data - cd $(Agent.TempDirectory)\ort_test_data - python onnx_backend_test_series.py --devices ${{ parameters.EP_NAME }} -v - cd $(Agent.TempDirectory) - Remove-Item -Path $(Agent.TempDirectory)\ort_test_data -Recurse -Force - workingDirectory: '$(Build.sourcesDirectory)' - displayName: 'Run Python Tests with ${{ parameters.EP_NAME }} EP' \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml index 0b3eac0110abc..9c7fbc24ab1b6 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml @@ -50,8 +50,6 @@ stages: win_trt_home: ${{ parameters.win_trt_home }} win_cuda_home: ${{ parameters.win_cuda_home }} buildJava: ${{ parameters.buildJava }} - SpecificArtifact: ${{ parameters.SpecificArtifact }} - BuildId: ${{ parameters.BuildId }} - template: nuget-cuda-packaging-stage.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index d6b25c98936f0..445066f08995a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -34,7 +34,7 @@ parameters: displayName: Specific Artifact's BuildId type: string default: '0' - + - name: buildJava type: boolean @@ -50,14 +50,13 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} - ComboTests: true # Windows CUDA with TensorRT Packaging - template: ../templates/win-ci.yml parameters: @@ -69,7 +68,7 @@ stages: msbuildPlatform: x64 CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --parallel + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index 
f7235e3ad2076..947e4f99b984f 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -56,7 +56,7 @@ stages: PYTHON_VERSION: ${{ python_version }} EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} - EP_BUILD_FLAGS: --use_dml --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" use_tensorrt: True - ${{ if eq(parameters.enable_linux_cuda, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index dd0539f751c89..aa7f2845fc0fa 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -33,7 +33,7 @@ parameters: - Release - RelWithDebInfo - MinSizeRel - + - name: use_tensorrt type: boolean default: false @@ -134,7 +134,7 @@ stages: --cmake_generator "$(VSGenerator)" --enable_pybind --enable_onnx_tests - --parallel 4 --use_binskim_compliant_compile_flags --update --build + --parallel --use_binskim_compliant_compile_flags --update --build $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }} workingDirectory: '$(Build.BinariesDirectory)' @@ -206,20 +206,19 @@ stages: DownloadTRT: ${{ parameters.use_tensorrt }} - task: PowerShell@2 - displayName: 'Install Third Party Dependencies' + displayName: 'Install ONNX' inputs: filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/install_third_party_deps.ps1' workingDirectory: '$(Build.BinariesDirectory)' arguments: -cpu_arch x64 -install_prefix $(Build.BinariesDirectory)\${{ parameters.cmake_build_type }}\installed -build_config ${{ parameters.cmake_build_type }} - - template: jobs/steps/py_packaging_test_step.yml - parameters: - EP_NAME: DML - PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }} - - - template: jobs/steps/py_packaging_test_step.yml - parameters: - EP_NAME: CUDA - PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }} - - + - powershell: | + python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq + Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + mkdir -p $(Agent.TempDirectory)\ort_test_data + Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data + Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data + cd $(Agent.TempDirectory)\ort_test_data + python onnx_backend_test_series.py + workingDirectory: '$(Build.sourcesDirectory)' + displayName: 'Run Python Tests' diff --git a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml index d35bed69ee409..3d4e5326ae7c6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml +++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml @@ -1,5 +1,5 @@ variables: - common_trt_version: '10.6.0.26' + common_trt_version: '10.7.0.23' # As for 
Debian installation, replace '-1.' by '-1+' when assigning trt version below linux_trt_version_cuda11: ${{ variables.common_trt_version }}-1.cuda11.8 linux_trt_version_cuda12: ${{ variables.common_trt_version }}-1.cuda12.6 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 949479fb8b5e4..8409edb4d0429 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.201 + version: 1.0.202 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.201 + version: 1.0.202 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index ae54b3849a862..14b9c378bec14 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,10 +13,10 @@ parameters: - 12.2 - name: TrtVersion type: string - default: '10.6.0.26' + default: '10.7.0.23' values: - 8.6.1.6 - - 10.6.0.26 + - 10.7.0.23 steps: - ${{ if eq(parameters.DownloadCUDA, true) }}: @@ -42,7 +42,7 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.6.0.26')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.7.0.23')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.6" displayName: Set trtCudaVersion diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index dfaf237a711fe..45572416350c3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -15,10 +15,10 @@ parameters: default: '11.8' - name: win_trt_folder_cuda11 type: string - default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8' + default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8' - name: win_trt_folder_cuda12 type: string - default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6' + default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6' steps: - ${{ if eq(parameters.DownloadCUDA, 'true') }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index 7bdd069de711b..e8f391a73fa7b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -218,32 +218,16 @@ jobs: - powershell: | python3 -m pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml -qq Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname} + workingDirectory: 
'$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' displayName: 'Install onnxruntime wheel' - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}: - - ${{ if and(contains(parameters.additionalBuildFlags, 'use_cuda'), contains(parameters.additionalBuildFlags, 'use_dml')) }}: - - powershell: | - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }} - workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' - displayName: 'Run tests excluding CUDA tests' - env: - NO_CUDA_TEST: '1' - GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*' # Exclude CUDA EP tests under providers/cuda/ and cpu models test - PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' # For onnxruntime4j_test to find dependent dlls - - powershell: | - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }} - workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' - displayName: 'Run tests excluding DML tests' - env: - NO_DML_TEST: '1' - GTEST_FILTER: '-*cpu_*models*' - PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' - - ${{ else }}: - - powershell: | - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }} - workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' - displayName: 'Run tests' + - powershell: | + python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} + + workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' + displayName: 'Run tests' - ${{ if eq(parameters.GenerateDocumentation, true) }}: - task: PythonScript@0 diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index e046997b4f49a..59950433b3d40 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -25,7 +25,7 @@ parameters: - name: runTests type: boolean - default: false + default: true - name: buildJava type: boolean @@ -71,10 +71,6 @@ parameters: - 11.8 - 12.2 -- name: ComboTests - type: boolean - default: false - - name: SpecificArtifact displayName: Use Specific Artifact type: boolean @@ -226,7 +222,7 @@ stages: condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags 
--enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --test --skip_submodule_sync --build_shared_lib --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' workingDirectory: '$(Build.BinariesDirectory)' - ${{ else }}: - powershell: | @@ -338,10 +334,6 @@ stages: displayName: 'Clean Agent Directories' condition: always() - - script: - echo ${{ parameters.SpecificArtifact }} - displayName: 'Print Specific Artifact' - - checkout: self clean: true submodules: none @@ -407,35 +399,13 @@ stages: displayName: 'Append dotnet x86 Directory to PATH' condition: and(succeeded(), eq('${{ parameters.buildArch}}', 'x86')) - - ${{ if eq(parameters.ComboTests, 'true') }}: - - task: PythonScript@0 - displayName: 'test excludes CUDA' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: '$(Build.BinariesDirectory)' - env: - NO_CUDA_TEST: '1' - GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*' # Exclude CUDA EP tests under providers/cuda/ - - task: PythonScript@0 - displayName: 'test excludes DML' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: '$(Build.BinariesDirectory)' - env: - NO_DML_TEST: '1' - - ${{ else }}: - - task: PythonScript@0 - displayName: 'test' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: '$(Build.BinariesDirectory)' - + - task: PythonScript@0 + displayName: 'test' + condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' + workingDirectory: '$(Build.BinariesDirectory)' # Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine - ${{ if eq(parameters.buildJava, 'true') }}: - template: make_java_win_binaries.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml index 67fd47c3150af..47ece37e66e09 100644 --- 
a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml @@ -62,28 +62,4 @@ stages: RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} ORT_EP_NAME: CUDA WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 - -- stage: cuda_dml - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda.bat - buildArch: x64 - additionalBuildFlags: >- - --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" - --enable_cuda_profiling --enable_transformers_tool_test - --use_dml - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON - --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: CUDA - EnablePython: false - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 + MachinePool: onnxruntime-Win2022-GPU-A10 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml index 911d99cd2adf3..94b0aa680d54d 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml @@ -43,11 +43,11 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml + additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} ORT_EP_NAME: DML WITH_CACHE: false - MachinePool: onnxruntime-Win2022-GPU-dml-A10 + MachinePool: onnxruntime-Win2022-GPU-dml-A10 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml new file mode 100644 index 0000000000000..c68ba01485db2 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -0,0 +1,86 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - template: templates/common-variables.yml + - name: win_trt_folder + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: ${{ variables.win_trt_folder_cuda11 }} + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: ${{ variables.win_trt_folder_cuda12 }} + +jobs: +- job: 'build' + pool: 'onnxruntime-Win2022-GPU-A10' + variables: + MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary' + EnvSetupScript: setup_env_trt.bat + 
skipComponentGovernanceDetection: true + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + timeoutInMinutes: 150 + workspace: + clean: all + steps: + - template: templates/jobs/win-ci-prebuild-steps.yml + parameters: + EnvSetupScript: $(EnvSetupScript) + DownloadCUDA: true + DownloadTRT: true + BuildArch: 'x64' + BuildConfig: RelWithDebInfo + MachinePool: 'onnxruntime-Win2022-GPU-A10' + WithCache: true + Today: $(Today) + + - template: templates/jobs/win-ci-build-steps.yml + parameters: + WithCache: True + Today: $(TODAY) + AdditionalKey: "gpu_tensorrt_cuda_minimal | RelWithDebInfo" + BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --enable_cuda_minimal_build' + MsbuildArguments: $(MsbuildArguments) + BuildArch: 'x64' + Platform: 'x64' + BuildConfig: RelWithDebInfo + + - task: PythonScript@0 + displayName: 'Build wheel' + inputs: + scriptPath: '$(Build.SourcesDirectory)\setup.py' + arguments: 'bdist_wheel' + workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml index 06f374afca57a..8460df2ec3799 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml @@ -48,7 +48,7 @@ stages: --enable_pybind --build_nodejs --use_webgpu - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=ON msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo diff --git a/tools/ci_build/github/linux/build_tensorrt_ci.sh b/tools/ci_build/github/linux/build_tensorrt_ci.sh index 5b206bc0a92d9..ccf7a6f4ea630 100755 --- a/tools/ci_build/github/linux/build_tensorrt_ci.sh +++ b/tools/ci_build/github/linux/build_tensorrt_ci.sh @@ -21,6 +21,19 @@ BUILD_ARGS=('--config' 'Release' "CMAKE_CUDA_ARCHITECTURES=75" "onnxruntime_BUILD_UNIT_TESTS=ON" "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON") + +# Parse external args +for arg in "$@"; do + case $arg in + --cuda_minimal=ON) + # Replace onnxruntime_BUILD_UNIT_TESTS=ON with OFF + BUILD_ARGS=("${BUILD_ARGS[@]/onnxruntime_BUILD_UNIT_TESTS=ON/onnxruntime_BUILD_UNIT_TESTS=OFF}") + BUILD_ARGS+=("--enable_cuda_minimal_build") + BUILD_ARGS+=("--skip_tests") + ;; + esac +done + if [ -x "$(command -v ninja)" ]; then BUILD_ARGS+=('--cmake_generator' 'Ninja') fi diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index c2bae5fd7ee59..df5112dc38af4 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 -ARG TRT_VERSION=10.6.0.26-1.cuda12.6 +ARG TRT_VERSION=10.7.0.23-1.cuda12.6 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV 
PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch index 2ecc6d1918b1a..fef95b8574520 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.6.0.26-1.cuda11.8 +ARG TRT_VERSION=10.7.0.23-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 81aeada6a4a46..e91f14ff955b9 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg index 4298dd53e4c66..0b08d4b3024b8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv index 1312475ceca3a..3a7e064686ae5 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 index 22d5e3b0248a8..01f08ff41e2cc 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install psutil setuptools>=68.2.2 # Install TensorRT -RUN TRT_VERSION="10.6.0.26-1+cuda11.8" &&\ +RUN TRT_VERSION="10.7.0.23-1+cuda11.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get 
install -y \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 index 819d9bab7be75..781f0647a084b 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 psutil # Install TensorRT -RUN TRT_VERSION="10.6.0.26-1+cuda12.6" &&\ +RUN TRT_VERSION="10.7.0.23-1+cuda12.6" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile index a69b98f86ba1b..5f10607b11626 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 FROM $BASEIMAGE -ARG TRT_VERSION=10.6.0.26-1.cuda11.8 +ARG TRT_VERSION=10.7.0.23-1.cuda11.8 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "${TRT_VERSION}" ]; then \ diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index 34ddd75da16fc..4e2bd8f8386e2 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH% @REM The default version is still cuda v12.2, because set cuda v11.8 after it -set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8\lib +set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 03734293be5c4..6a602e46661e7 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false set CUDA_MODULE_LOADING=LAZY