
Commit 6347c5f

Merge branch 'main' into adrianl/qnn-ep-dynamic-lib
adrianlizarraga committed Dec 20, 2024
2 parents 5ed035f + 6806174 commit 6347c5f
Showing 127 changed files with 2,535 additions and 1,189 deletions.
2 changes: 1 addition & 1 deletion cgmanifests/generated/cgmanifest.json
@@ -196,7 +196,7 @@
"component": {
"type": "git",
"git": {
"commitHash": "bc0d2e35909b8456abe32f3b30a49bb0c125e8b7",
"commitHash": "9c69a24bc2e20c8a511a4e6b06fd49639ec5300a",
"repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git"
},
"comments": "onnx_tensorrt"
3 changes: 1 addition & 2 deletions cmake/CMakeLists.txt
@@ -130,8 +130,7 @@ option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node
cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF)

# When loading a delay loaded DLL, Windows searches the main EXE's folder first.
# In a Python process, it searches where python.exe lives, but it doesn't search the python package's installation folder. Therefore we cannot enable this flag when Python is enabled.
cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM;NOT onnxruntime_ENABLE_PYTHON" OFF)
cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM" OFF)
option(onnxruntime_USE_DML "Build with DirectML support" OFF)
option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF)
option(onnxruntime_USE_WINML "Build with WinML support" OFF)
4 changes: 2 additions & 2 deletions cmake/deps.txt
@@ -36,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c
# Use the latest commit of 10.6-GA-ORT-DDS
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bc0d2e35909b8456abe32f3b30a49bb0c125e8b7.zip;f233ae871ad82c023da62e5dd620639f00bc2d15
# Use the latest commit of 10.7-GA
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/9c69a24bc2e20c8a511a4e6b06fd49639ec5300a.zip;ff1fe9af78eb129b4a4cdcb7450b7390b4436dd3
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874
1 change: 1 addition & 0 deletions cmake/onnxruntime.cmake
@@ -77,6 +77,7 @@ if(WIN32)
onnxruntime_add_shared_library(onnxruntime
${SYMBOL_FILE}
"${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc"
"${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc"
"${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc"
)
elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
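
The two build changes above go together: CMakeLists.txt no longer disables onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS when Python is enabled, and onnxruntime.cmake compiles a new core/dll/delay_load_hook.cc into onnxruntime.dll. The usual way to make delay loading work when dependent DLLs sit next to onnxruntime.dll (rather than next to the host EXE, e.g. python.exe) is a delayimp notification hook. The sketch below shows that general mechanism only; it is not the contents of delay_load_hook.cc from this commit, and the DLL-name handling and error handling are assumptions.

```cpp
// Minimal sketch of a Windows delay-load notification hook (assumed shape;
// the real core/dll/delay_load_hook.cc in this commit may differ).
#include <Windows.h>
#include <delayimp.h>
#include <cstring>
#include <string>

namespace {

FARPROC WINAPI DelayLoadHook(unsigned dliNotify, PDelayLoadInfo pdli) {
  if (dliNotify != dliNotePreLoadLibrary) {
    return nullptr;  // let the default delay-load search handle other notifications
  }

  // Find the directory containing the module this hook is linked into
  // (i.e. onnxruntime.dll), not the process executable's directory.
  HMODULE this_module = nullptr;
  GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
                         GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
                     reinterpret_cast<LPCWSTR>(&DelayLoadHook), &this_module);
  wchar_t path[MAX_PATH];
  if (this_module == nullptr || GetModuleFileNameW(this_module, path, MAX_PATH) == 0) {
    return nullptr;
  }
  std::wstring dir(path);
  dir.erase(dir.find_last_of(L"\\/") + 1);

  // pdli->szDll is the ANSI name of the DLL being delay-loaded; try to load it
  // from the same directory as onnxruntime.dll.
  std::wstring dll_name(pdli->szDll, pdli->szDll + std::strlen(pdli->szDll));
  HMODULE h = LoadLibraryW((dir + dll_name).c_str());
  // A non-null return tells the delay-load helper to use this module handle.
  return reinterpret_cast<FARPROC>(h);
}

}  // namespace

// Well-known hook pointer consumed by the MSVC delay-load helper.
extern "C" const PfnDliHook __pfnDliNotifyHook2 = DelayLoadHook;
```
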
20 changes: 17 additions & 3 deletions cmake/onnxruntime_nodejs.cmake
@@ -60,15 +60,26 @@ else()
endif()
endif()

# a list of DLLs that the Node.js binding depends on
set(NODEJS_DLL_DEPS)

# setup providers
if (onnxruntime_USE_CUDA)
set(NODEJS_BINDING_USE_CUDA "--use_cuda")
endif()
if (onnxruntime_USE_DML)
set(NODEJS_BINDING_USE_DML "--use_dml")
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/DirectML.dll")
endif()
if (onnxruntime_USE_WEBGPU)
set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu")
if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:dxcompiler>/dxil.dll")
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:dxcompiler>/dxcompiler.dll")
endif()
if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE:dawn::webgpu_dawn>")
endif()
endif()
if (onnxruntime_USE_TENSORRT)
set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt")
@@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL

add_custom_target(nodejs_binding_wrapper ALL
COMMAND ${NPM_CLI} ci
COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR}
--arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT}
${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}"
--config=${CMAKE_BUILD_TYPE}
"--onnxruntime-generator=${CMAKE_GENERATOR}"
"--dll_deps=${NODEJS_DLL_DEPS}"
--arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU}
${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
WORKING_DIRECTORY ${JS_NODE_ROOT}
COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")

36 changes: 27 additions & 9 deletions cmake/onnxruntime_providers_webgpu.cmake
@@ -23,24 +23,42 @@
onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)

set(onnxruntime_providers_webgpu_dll_deps)

if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)

if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
endif()
if (WIN32)
if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
endif()

# Copy webgpu_dawn.dll to the output directory
add_custom_command(
TARGET onnxruntime_providers_webgpu
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:dawn::webgpu_dawn>" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
VERBATIM )
list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE:dawn::webgpu_dawn>")
endif()
else()
if (NOT onnxruntime_USE_EXTERNAL_DAWN)
target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
endif()
target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
endif()

if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
# Ensure dxil.dll and dxcompiler.dll exist in the output directory $<TARGET_FILE_DIR:dxcompiler>
add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll)
add_dependencies(onnxruntime_providers_webgpu dxcompiler)

list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:dxcompiler>/dxil.dll")
list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:dxcompiler>/dxcompiler.dll")
endif()

if (onnxruntime_providers_webgpu_dll_deps)
# Copy dependency DLLs to the output directory
add_custom_command(
TARGET onnxruntime_providers_webgpu
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different "${onnxruntime_providers_webgpu_dll_deps}" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
COMMAND_EXPAND_LISTS
VERBATIM )
endif()

set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")
12 changes: 12 additions & 0 deletions cmake/onnxruntime_unittests.cmake
@@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC
set (onnxruntime_webgpu_external_dawn_test_SRC
${TEST_SRC_DIR}/webgpu/external_dawn/main.cc)

set (onnxruntime_webgpu_delay_load_test_SRC
${TEST_SRC_DIR}/webgpu/delay_load/main.cc)

# tests from lowest level library up.
# the order of libraries should be maintained, with higher libraries being added first in the list

@@ -1863,4 +1866,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN)
onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers)
endif()

if (onnxruntime_USE_WEBGPU AND WIN32 AND onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD)
AddTest(DYN
TARGET onnxruntime_webgpu_delay_load_test
SOURCES ${onnxruntime_webgpu_delay_load_test_SRC}
LIBS ${SYS_PATH_LIB}
DEPENDS ${all_dependencies}
)
endif()

include(onnxruntime_fuzz_test.cmake)
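
The new onnxruntime_webgpu_delay_load_test target (built only for Windows shared-library, non-minimal builds) compiles test/webgpu/delay_load/main.cc. A test of this kind typically checks that onnxruntime.dll can be loaded even though its delay-loaded WebGPU dependencies (webgpu_dawn.dll, dxil.dll, dxcompiler.dll) are not beside the test executable. The sketch below is only an illustration of that idea using plain Win32 calls; the actual main.cc in this commit may differ, and the "test_dir" path is a placeholder.

```cpp
// Illustrative sketch (assumed test shape, not the real main.cc): verify that
// onnxruntime.dll loads and exposes its C API entry point even when its
// delay-loaded dependencies are not located next to the test executable.
#include <Windows.h>
#include <cstdio>

int main() {
  // Placeholder path: the test setup is assumed to have copied onnxruntime.dll
  // (and its dependencies) into this subdirectory beforehand.
  HMODULE ort = LoadLibraryW(L"test_dir\\onnxruntime.dll");
  if (ort == nullptr) {
    std::fprintf(stderr, "LoadLibraryW failed: %lu\n", GetLastError());
    return 1;
  }

  // OrtGetApiBase is the exported entry point of the ONNX Runtime C API.
  if (GetProcAddress(ort, "OrtGetApiBase") == nullptr) {
    std::fprintf(stderr, "OrtGetApiBase not found\n");
    return 1;
  }

  // If delay loading is wired up correctly, simply loading the DLL did not
  // require the WebGPU-related DLLs to be present in the EXE's directory.
  std::printf("onnxruntime.dll loaded successfully\n");
  FreeLibrary(ort);
  return 0;
}
```
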
2 changes: 1 addition & 1 deletion cmake/winml.cmake
@@ -782,7 +782,7 @@ add_dependencies(winml_dll winml_api_native)
add_dependencies(winml_dll winml_api_native_internal)

# Link libraries
target_link_libraries(winml_dll PRIVATE re2)
target_link_libraries(winml_dll PRIVATE re2::re2)
target_link_libraries(winml_dll PRIVATE ${WIL_TARGET})
target_link_libraries(winml_dll PRIVATE winml_lib_api)
if (NOT winml_is_inbox)
2 changes: 1 addition & 1 deletion docs/ContribOperators.md
@@ -1625,7 +1625,7 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Type Constraints

<dl>
<dt><tt>T</tt> : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double)</dt>
<dt><tt>T</tt> : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)</dt>
<dd>Constrain input and output types.</dd>
</dl>

1 change: 1 addition & 0 deletions include/onnxruntime/core/framework/op_kernel.h
@@ -7,6 +7,7 @@

// It is safe to include the below header even if SHARED_PROVIDER macro is enabled
// as it doesn't include any pb headers.
#include "core/framework/buffer_deleter.h"
#include "core/framework/prepacked_weights_container.h"

#ifndef SHARED_PROVIDER
92 changes: 58 additions & 34 deletions include/onnxruntime/core/graph/graph.h
@@ -3,14 +3,15 @@

#pragma once

#include <filesystem>
#include <functional>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <filesystem>

#include "core/common/flatbuffers.h"

@@ -19,13 +20,14 @@
#include "core/common/common.h"
#include "core/common/path_string.h"
#include "core/common/const_pointer_container.h"
#include "core/common/inlined_containers_fwd.h"
#if !defined(ORT_MINIMAL_BUILD)
#include "core/common/inlined_containers.h"
#endif
#include "core/common/inlined_containers_fwd.h"
#include "core/common/span_utils.h"
#include "core/common/status.h"
#include "core/common/logging/logging.h"
#include "core/framework/prepacked_weights_container.h"
#include "core/graph/onnx_protobuf.h"
#include "core/graph/basic_types.h"
#include "core/graph/constants.h"
Expand All @@ -41,6 +43,7 @@ namespace onnxruntime {
class Graph;
struct IndexedSubGraph;
class Model;
struct ModelSavingOptions;
class OpSignature;

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
Expand Down Expand Up @@ -1153,29 +1156,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
ONNX_NAMESPACE::GraphProto ToGraphProto() const;

// Options to align external initializer offset.
// For models running on CPU, ORT will try to use mmap to load external initializers.
// To use mmap, external initializer need to be offset aligned.
// ORT saves external initializers into signle data file, each initializer is accessed with
// offset(start position of initializer) and length(byte length of initializer) of the data file.
// To use mmap, each offset need to be aligned which means offset need to divisible by
// allocation granularity(64KB for windows and 4K for other OSes).
// With align_offset to true, ORT will align offset for large initializer when
// save ONNX model with external data file.
struct OffsetAlignmentInfo {
// Offset will always be page aligned and allocation granularity aligned for mmap support.
// This is done by padding previous tensor data with zeros keeping same length.
bool align_offset = false;
// Alignment threshold for size of data.
// Having a low threshold will waste file space for small initializers.
// Only when tensor's data size is > the page_align_threshold it will be force aligned.
// Default to 1MB.
int64_t align_threshold = 1048576;
// The allocation Granularity for mmap() support.
// Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
int64_t allocation_granularity = 65536;
};

/** Gets the GraphProto representation of this Graph
@param external_file_path File path of the binary file to use for initializers.
@param model_file_path path of the model file.
@@ -1186,15 +1166,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
*/
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold,
const OffsetAlignmentInfo& align_info) const;

ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold) const {
OffsetAlignmentInfo default_options;
return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
}
const ModelSavingOptions& model_saving_options) const;

/** Gets the ISchemaRegistry instances being used with this Graph. */
IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;
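
With this change the convenience overload that took only initializer_size_threshold is gone; callers pass a ModelSavingOptions (declared in the new include/onnxruntime/core/graph/model_saving_options.h further down in this diff). A minimal usage sketch, assuming a Graph reference obtained elsewhere (for example from a loaded Model) and placeholder file paths:

```cpp
// Sketch of calling the updated API; `graph` is assumed to come from a loaded
// Model, and the file paths are placeholders.
#include "core/graph/graph.h"
#include "core/graph/model_saving_options.h"

ONNX_NAMESPACE::GraphProto SaveWithExternalData(const onnxruntime::Graph& graph) {
  // Externalize every initializer larger than 1024 bytes.
  onnxruntime::ModelSavingOptions options(/*size_threshold=*/1024);
  options.align_offset = true;        // pad the file so large initializers are mmap-friendly
  options.align_threshold = 1048576;  // only force-align tensors larger than 1MB

  return graph.ToGraphProtoWithExternalInitializers(
      /*external_file_path=*/"model.onnx.data",
      /*model_file_path=*/"model.onnx",
      options);
}
```
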
@@ -1400,6 +1372,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi

#endif // !defined(ORT_MINIMAL_BUILD)

// This function constructs PrepackedSharedContainer in the root graph only
// and initializes a reference to it in all (sub)graphs
void ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on);

const PrepackedWeightsForGraph& GetPrepacked() const noexcept {
return *prepacked_weights_for_graph_;
}

PrepackedWeightsForGraph& GetPrepacked() noexcept {
return *prepacked_weights_for_graph_;
}

/** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */
const Node* ParentNode() const { return parent_node_; }
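
The accessors above expose the new pre-packed weights container from the root graph. A hedged sketch of how they might be driven, assuming a root-level onnxruntime::Graph named root_graph; what callers subsequently do with PrepackedWeightsForGraph is defined elsewhere and not shown here:

```cpp
// Sketch only: enable pre-pack "saving mode" on the root graph and obtain the
// per-graph view of the shared pre-packed blobs container.
#include "core/graph/graph.h"

void EnablePrepackSavingMode(onnxruntime::Graph& root_graph) {
  // Constructs the shared container in the root graph and sets a reference to
  // it in all subgraphs, per the declaration above.
  root_graph.ConstructPrepackedSharedContainerAndSetMode(/*saving_mode_on=*/true);

  onnxruntime::PrepackedWeightsForGraph& prepacked = root_graph.GetPrepacked();
  (void)prepacked;  // consumed by the saving path (see AddExternalInitializersToGraphProtoImpl below)
}
```
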

@@ -1519,6 +1503,31 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
std::optional<std::string_view> new_name);

/// <summary>
/// This function traverses the graph bottom up and externalizes
/// constant initializers along with their pre-packed blobs from different
/// kernels. Writes constant initializers to the external file with any pre-packed
/// blobs (if enabled and produced for this initializer) and then modifies TensorProto
/// entry with external data references.
/// </summary>
/// <param name="model_path">model file path from Model</param>
/// <param name="external_file_path">a binary file path for relative to the model file path
/// where the initializers data is written</param>
/// <param name="model_external_file_path">model file folder path with external file path appended</param>
/// <param name="model_saving_options">model saving options including alignment and pre-packs</param>
/// <param name="output_graph_proto">The graph proto to be modified</param>
/// <param name="external_stream">external file stream</param>
/// <param name="external_offset">current external file offset updated with each write</param>
/// <returns>Status instance</returns>
Status AddExternalInitializersToGraphProtoImpl(
const std::filesystem::path& model_path,
const std::filesystem::path& external_file_path,
const std::filesystem::path& model_external_file_path,
const ModelSavingOptions& model_saving_options,
ONNX_NAMESPACE::GraphProto& output_graph_proto,
std::ostream& external_stream,
int64_t& external_offset) const;

#endif

Version IrVersion() const noexcept {
@@ -1703,6 +1712,21 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
std::hash<std::string>, std::equal_to<std::string>>
sparse_tensor_names_;

// Prepacked blobs container that stores pre-packed initializer
// data that is:
// - mem-mapped from disk
// - shared within the session
// - shared across sessions by transferring the ownership of loaded data entries to
// SessionState::PrepackedWeightsContainer* if one is present.
// This container is optional because it is present only in the root graph.
std::optional<PrepackedKeyToBlobMap> prepacked_key_to_blobs_;

// This container contains a reference to the root prepacked_key_to_blobs_
// and also (in the save mode) records association between the initializer
// names and their pre-packed blobs (via keys).
// This is optional due to delayed construction.
std::optional<PrepackedWeightsForGraph> prepacked_weights_for_graph_;

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
// Runtime optimization storage.
// Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized
44 changes: 44 additions & 0 deletions include/onnxruntime/core/graph/model_saving_options.h
@@ -0,0 +1,44 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

namespace onnxruntime {

class PrepackedWeightsForGraph;

// These options affect how the model initializers are written to the external file.
// This includes options to align external initializer offsets.
// For models running on CPU, ORT will try to use mmap to load external
// initializers. To use mmap, external initializers need to be offset aligned.
// ORT saves external initializers into a single data file; each initializer is
// accessed with an offset (start position of the initializer) and a length (byte
// length of the initializer) within the data file. To use mmap, each offset needs
// to be aligned, which means the offset must be divisible by the allocation
// granularity (64KB for Windows and 4KB for other OSes). With align_offset set to
// true, ORT will align the offsets of large initializers when saving an ONNX model
// with an external data file.
struct ModelSavingOptions {
explicit ModelSavingOptions(size_t size_threshold)
: initializer_size_threshold(size_threshold) {}

// Minimal initializer size in bytes to be externalized on disk
size_t initializer_size_threshold;
// Offset will always be page aligned and allocation granularity aligned for
// mmap support. This is done by padding the previous tensor's data with zeros,
// keeping its length the same.
bool align_offset = false;
// Alignment threshold for the size of the data.
// Having a low threshold will waste file space for small initializers.
// Only when a tensor's data size exceeds align_threshold will it be force
// aligned. Defaults to 1MB.
int64_t align_threshold = 1048576;
// The allocation granularity for mmap() support.
// Typically 64KB for Windows & 4KB for other OSes; the default is set per-OS below.
#ifdef _WIN32
int64_t allocation_granularity = 65536;
#else
int64_t allocation_granularity = 4096;
#endif
};

} // namespace onnxruntime
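
The comment block above describes the alignment rule: before writing a tensor larger than align_threshold, pad the external data file with zeros so the write starts on an allocation_granularity boundary. A small sketch of just that rule, assuming the same stream/offset bookkeeping as AddExternalInitializersToGraphProtoImpl; this is not the actual serialization code from this commit:

```cpp
// Sketch of the documented offset-alignment rule, not the real writer code.
#include <cstdint>
#include <ostream>

#include "core/graph/model_saving_options.h"

void AlignOffsetForMmap(std::ostream& external_stream, int64_t& external_offset,
                        int64_t tensor_size_bytes,
                        const onnxruntime::ModelSavingOptions& opts) {
  if (!opts.align_offset || tensor_size_bytes <= opts.align_threshold) {
    return;  // small tensors are written back-to-back with no padding
  }
  const int64_t granularity = opts.allocation_granularity;  // 64KB on Windows, 4KB elsewhere
  const int64_t aligned = (external_offset + granularity - 1) / granularity * granularity;
  // Pad with zeros so the previous tensor keeps its length and the next tensor
  // starts on an allocation-granularity boundary.
  for (int64_t i = external_offset; i < aligned; ++i) {
    external_stream.put('\0');
  }
  external_offset = aligned;
}
```
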
(The remaining changed files in this commit are not shown.)
