
Commit 6347c5f

Merge branch 'main' into adrianl/qnn-ep-dynamic-lib
adrianlizarraga committed Dec 20, 2024
2 parents 5ed035f + 6806174 commit 6347c5f
Showing 127 changed files with 2,535 additions and 1,189 deletions.
2 changes: 1 addition & 1 deletion cgmanifests/generated/cgmanifest.json
@@ -196,7 +196,7 @@
"component": {
"type": "git",
"git": {
"commitHash": "bc0d2e35909b8456abe32f3b30a49bb0c125e8b7",
"commitHash": "9c69a24bc2e20c8a511a4e6b06fd49639ec5300a",
"repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git"
},
"comments": "onnx_tensorrt"
3 changes: 1 addition & 2 deletions cmake/CMakeLists.txt
@@ -130,8 +130,7 @@ option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node
cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF)

# When loading a delay loaded DLL, Windows searches the main EXE's folder first.
# In a Python process, it searches where python.exe lives, but it doesn't search the python package's installation folder. Therefore we cannot enable this flag when Python is enabled.
cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM;NOT onnxruntime_ENABLE_PYTHON" OFF)
cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM" OFF)
option(onnxruntime_USE_DML "Build with DirectML support" OFF)
option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF)
option(onnxruntime_USE_WINML "Build with WinML support" OFF)
4 changes: 2 additions & 2 deletions cmake/deps.txt
@@ -36,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z
mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c
# Use the latest commit of 10.6-GA-ORT-DDS
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bc0d2e35909b8456abe32f3b30a49bb0c125e8b7.zip;f233ae871ad82c023da62e5dd620639f00bc2d15
# Use the latest commit of 10.7-GA
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/9c69a24bc2e20c8a511a4e6b06fd49639ec5300a.zip;ff1fe9af78eb129b4a4cdcb7450b7390b4436dd3
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874
1 change: 1 addition & 0 deletions cmake/onnxruntime.cmake
@@ -77,6 +77,7 @@ if(WIN32)
onnxruntime_add_shared_library(onnxruntime
${SYMBOL_FILE}
"${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc"
"${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc"
"${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc"
)
elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
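
The two build changes above go together: CMakeLists.txt no longer disables onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS when Python is enabled, and onnxruntime.cmake compiles a new core/dll/delay_load_hook.cc into onnxruntime.dll. The usual way to make delay loading work when dependent DLLs sit next to onnxruntime.dll (rather than next to the host EXE, e.g. python.exe) is a delayimp notification hook. The sketch below shows that general mechanism only; it is not the contents of delay_load_hook.cc from this commit, and the DLL-name handling and error handling are assumptions.

```cpp
// Minimal sketch of a Windows delay-load notification hook (assumed shape;
// the real core/dll/delay_load_hook.cc in this commit may differ).
#include <Windows.h>
#include <delayimp.h>
#include <cstring>
#include <string>

namespace {

FARPROC WINAPI DelayLoadHook(unsigned dliNotify, PDelayLoadInfo pdli) {
  if (dliNotify != dliNotePreLoadLibrary) {
    return nullptr;  // let the default delay-load search handle other notifications
  }

  // Find the directory containing the module this hook is linked into
  // (i.e. onnxruntime.dll), not the process executable's directory.
  HMODULE this_module = nullptr;
  GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
                         GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
                     reinterpret_cast<LPCWSTR>(&DelayLoadHook), &this_module);
  wchar_t path[MAX_PATH];
  if (this_module == nullptr || GetModuleFileNameW(this_module, path, MAX_PATH) == 0) {
    return nullptr;
  }
  std::wstring dir(path);
  dir.erase(dir.find_last_of(L"\\/") + 1);

  // pdli->szDll is the ANSI name of the DLL being delay-loaded; try to load it
  // from the same directory as onnxruntime.dll.
  std::wstring dll_name(pdli->szDll, pdli->szDll + std::strlen(pdli->szDll));
  HMODULE h = LoadLibraryW((dir + dll_name).c_str());
  // A non-null return tells the delay-load helper to use this module handle.
  return reinterpret_cast<FARPROC>(h);
}

}  // namespace

// Well-known hook pointer consumed by the MSVC delay-load helper.
extern "C" const PfnDliHook __pfnDliNotifyHook2 = DelayLoadHook;
```
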
20 changes: 17 additions & 3 deletions cmake/onnxruntime_nodejs.cmake
@@ -60,15 +60,26 @@ else()
endif()
endif()

# a list of DLLs that the Node.js binding depends on
set(NODEJS_DLL_DEPS)

# setup providers
if (onnxruntime_USE_CUDA)
set(NODEJS_BINDING_USE_CUDA "--use_cuda")
endif()
if (onnxruntime_USE_DML)
set(NODEJS_BINDING_USE_DML "--use_dml")
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/DirectML.dll")
endif()
if (onnxruntime_USE_WEBGPU)
set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu")
if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:dxcompiler>/dxil.dll")
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:dxcompiler>/dxcompiler.dll")
endif()
if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE:dawn::webgpu_dawn>")
endif()
endif()
if (onnxruntime_USE_TENSORRT)
set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt")
@@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL

add_custom_target(nodejs_binding_wrapper ALL
COMMAND ${NPM_CLI} ci
COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR}
--arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT}
${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}"
--config=${CMAKE_BUILD_TYPE}
"--onnxruntime-generator=${CMAKE_GENERATOR}"
"--dll_deps=${NODEJS_DLL_DEPS}"
--arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU}
${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
WORKING_DIRECTORY ${JS_NODE_ROOT}
COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")

36 changes: 27 additions & 9 deletions cmake/onnxruntime_providers_webgpu.cmake
@@ -23,24 +23,42 @@
onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)

set(onnxruntime_providers_webgpu_dll_deps)

if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)

if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
endif()
if (WIN32)
if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
endif()

# Copy webgpu_dawn.dll to the output directory
add_custom_command(
TARGET onnxruntime_providers_webgpu
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:dawn::webgpu_dawn>" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
VERBATIM )
list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE:dawn::webgpu_dawn>")
endif()
else()
if (NOT onnxruntime_USE_EXTERNAL_DAWN)
target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
endif()
target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
endif()

if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
# Ensure dxil.dll and dxcompiler.dll exist in the output directory $<TARGET_FILE_DIR:dxcompiler>
add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll)
add_dependencies(onnxruntime_providers_webgpu dxcompiler)

list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:dxcompiler>/dxil.dll")
list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:dxcompiler>/dxcompiler.dll")
endif()

if (onnxruntime_providers_webgpu_dll_deps)
# Copy dependency DLLs to the output directory
add_custom_command(
TARGET onnxruntime_providers_webgpu
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different "${onnxruntime_providers_webgpu_dll_deps}" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
COMMAND_EXPAND_LISTS
VERBATIM )
endif()

set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")
12 changes: 12 additions & 0 deletions cmake/onnxruntime_unittests.cmake
@@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC
set (onnxruntime_webgpu_external_dawn_test_SRC
${TEST_SRC_DIR}/webgpu/external_dawn/main.cc)

set (onnxruntime_webgpu_delay_load_test_SRC
${TEST_SRC_DIR}/webgpu/delay_load/main.cc)

# tests from lowest level library up.
# the order of libraries should be maintained, with higher libraries being added first in the list

@@ -1863,4 +1866,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN)
onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers)
endif()

if (onnxruntime_USE_WEBGPU AND WIN32 AND onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD)
AddTest(DYN
TARGET onnxruntime_webgpu_delay_load_test
SOURCES ${onnxruntime_webgpu_delay_load_test_SRC}
LIBS ${SYS_PATH_LIB}
DEPENDS ${all_dependencies}
)
endif()

include(onnxruntime_fuzz_test.cmake)
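
The new onnxruntime_webgpu_delay_load_test target (built only for Windows shared-library, non-minimal builds) compiles test/webgpu/delay_load/main.cc. A test of this kind typically checks that onnxruntime.dll can be loaded even though its delay-loaded WebGPU dependencies (webgpu_dawn.dll, dxil.dll, dxcompiler.dll) are not beside the test executable. The sketch below is only an illustration of that idea using plain Win32 calls; the actual main.cc in this commit may differ, and the "test_dir" path is a placeholder.

```cpp
// Illustrative sketch (assumed test shape, not the real main.cc): verify that
// onnxruntime.dll loads and exposes its C API entry point even when its
// delay-loaded dependencies are not located next to the test executable.
#include <Windows.h>
#include <cstdio>

int main() {
  // Placeholder path: the test setup is assumed to have copied onnxruntime.dll
  // (and its dependencies) into this subdirectory beforehand.
  HMODULE ort = LoadLibraryW(L"test_dir\\onnxruntime.dll");
  if (ort == nullptr) {
    std::fprintf(stderr, "LoadLibraryW failed: %lu\n", GetLastError());
    return 1;
  }

  // OrtGetApiBase is the exported entry point of the ONNX Runtime C API.
  if (GetProcAddress(ort, "OrtGetApiBase") == nullptr) {
    std::fprintf(stderr, "OrtGetApiBase not found\n");
    return 1;
  }

  // If delay loading is wired up correctly, simply loading the DLL did not
  // require the WebGPU-related DLLs to be present in the EXE's directory.
  std::printf("onnxruntime.dll loaded successfully\n");
  FreeLibrary(ort);
  return 0;
}
```
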
2 changes: 1 addition & 1 deletion cmake/winml.cmake
@@ -782,7 +782,7 @@ add_dependencies(winml_dll winml_api_native)
add_dependencies(winml_dll winml_api_native_internal)

# Link libraries
target_link_libraries(winml_dll PRIVATE re2)
target_link_libraries(winml_dll PRIVATE re2::re2)
target_link_libraries(winml_dll PRIVATE ${WIL_TARGET})
target_link_libraries(winml_dll PRIVATE winml_lib_api)
if (NOT winml_is_inbox)
2 changes: 1 addition & 1 deletion docs/ContribOperators.md
@@ -1625,7 +1625,7 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Type Constraints

<dl>
<dt><tt>T</tt> : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double)</dt>
<dt><tt>T</tt> : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)</dt>
<dd>Constrain input and output types.</dd>
</dl>

1 change: 1 addition & 0 deletions include/onnxruntime/core/framework/op_kernel.h
@@ -7,6 +7,7 @@

// It is safe to include the below header even if SHARED_PROVIDER macro is enabled
// as it doesn't include any pb headers.
#include "core/framework/buffer_deleter.h"
#include "core/framework/prepacked_weights_container.h"

#ifndef SHARED_PROVIDER
92 changes: 58 additions & 34 deletions include/onnxruntime/core/graph/graph.h
@@ -3,14 +3,15 @@

#pragma once

#include <filesystem>
#include <functional>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <filesystem>

#include "core/common/flatbuffers.h"

@@ -19,13 +20,14 @@
#include "core/common/common.h"
#include "core/common/path_string.h"
#include "core/common/const_pointer_container.h"
#include "core/common/inlined_containers_fwd.h"
#if !defined(ORT_MINIMAL_BUILD)
#include "core/common/inlined_containers.h"
#endif
#include "core/common/inlined_containers_fwd.h"
#include "core/common/span_utils.h"
#include "core/common/status.h"
#include "core/common/logging/logging.h"
#include "core/framework/prepacked_weights_container.h"
#include "core/graph/onnx_protobuf.h"
#include "core/graph/basic_types.h"
#include "core/graph/constants.h"
Expand All @@ -41,6 +43,7 @@ namespace onnxruntime {
class Graph;
struct IndexedSubGraph;
class Model;
struct ModelSavingOptions;
class OpSignature;

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
Expand Down Expand Up @@ -1153,29 +1156,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
const ONNX_NAMESPACE::GraphProto& ToGraphProto();
ONNX_NAMESPACE::GraphProto ToGraphProto() const;

// Options to align external initializer offset.
// For models running on CPU, ORT will try to use mmap to load external initializers.
// To use mmap, external initializer need to be offset aligned.
// ORT saves external initializers into signle data file, each initializer is accessed with
// offset(start position of initializer) and length(byte length of initializer) of the data file.
// To use mmap, each offset need to be aligned which means offset need to divisible by
// allocation granularity(64KB for windows and 4K for other OSes).
// With align_offset to true, ORT will align offset for large initializer when
// save ONNX model with external data file.
struct OffsetAlignmentInfo {
// Offset will always be page aligned and allocation granularity aligned for mmap support.
// This is done by padding previous tensor data with zeros keeping same length.
bool align_offset = false;
// Alignment threshold for size of data.
// Having a low threshold will waste file space for small initializers.
// Only when tensor's data size is > the page_align_threshold it will be force aligned.
// Default to 1MB.
int64_t align_threshold = 1048576;
// The allocation Granularity for mmap() support.
// Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
int64_t allocation_granularity = 65536;
};

/** Gets the GraphProto representation of this Graph
@param external_file_path File path of the binary file to use for initializers.
@param model_file_path path of the model file.
@@ -1186,15 +1166,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
*/
ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold,
const OffsetAlignmentInfo& align_info) const;

ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
const std::filesystem::path& model_file_path,
size_t initializer_size_threshold) const {
OffsetAlignmentInfo default_options;
return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
}
const ModelSavingOptions& model_saving_options) const;

/** Gets the ISchemaRegistry instances being used with this Graph. */
IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;
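
With this change the convenience overload that took only initializer_size_threshold is gone; callers pass a ModelSavingOptions (declared in the new include/onnxruntime/core/graph/model_saving_options.h further down in this diff). A minimal usage sketch, assuming a Graph reference obtained elsewhere (for example from a loaded Model) and placeholder file paths:

```cpp
// Sketch of calling the updated API; `graph` is assumed to come from a loaded
// Model, and the file paths are placeholders.
#include "core/graph/graph.h"
#include "core/graph/model_saving_options.h"

ONNX_NAMESPACE::GraphProto SaveWithExternalData(const onnxruntime::Graph& graph) {
  // Externalize every initializer larger than 1024 bytes.
  onnxruntime::ModelSavingOptions options(/*size_threshold=*/1024);
  options.align_offset = true;        // pad the file so large initializers are mmap-friendly
  options.align_threshold = 1048576;  // only force-align tensors larger than 1MB

  return graph.ToGraphProtoWithExternalInitializers(
      /*external_file_path=*/"model.onnx.data",
      /*model_file_path=*/"model.onnx",
      options);
}
```
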
@@ -1400,6 +1372,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi

#endif // !defined(ORT_MINIMAL_BUILD)

// This function constructs PrepackedSharedContainer in the root graph only
// and initializes a reference to it in all (sub)graphs
void ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on);

const PrepackedWeightsForGraph& GetPrepacked() const noexcept {
return *prepacked_weights_for_graph_;
}

PrepackedWeightsForGraph& GetPrepacked() noexcept {
return *prepacked_weights_for_graph_;
}

/** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */
const Node* ParentNode() const { return parent_node_; }
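
The accessors above expose the new pre-packed weights container from the root graph. A hedged sketch of how they might be driven, assuming a root-level onnxruntime::Graph named root_graph; what callers subsequently do with PrepackedWeightsForGraph is defined elsewhere and not shown here:

```cpp
// Sketch only: enable pre-pack "saving mode" on the root graph and obtain the
// per-graph view of the shared pre-packed blobs container.
#include "core/graph/graph.h"

void EnablePrepackSavingMode(onnxruntime::Graph& root_graph) {
  // Constructs the shared container in the root graph and sets a reference to
  // it in all subgraphs, per the declaration above.
  root_graph.ConstructPrepackedSharedContainerAndSetMode(/*saving_mode_on=*/true);

  onnxruntime::PrepackedWeightsForGraph& prepacked = root_graph.GetPrepacked();
  (void)prepacked;  // consumed by the saving path (see AddExternalInitializersToGraphProtoImpl below)
}
```
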

@@ -1519,6 +1503,31 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
std::optional<std::string_view> new_name);

/// <summary>
/// This function traverses the graph bottom up and externalizes
/// constant initializers along with their pre-packed blobs from different
/// kernels. Writes constant initializers to the external file with any pre-packed
/// blobs (if enabled and produced for this initializer) and then modifies TensorProto
/// entry with external data references.
/// </summary>
/// <param name="model_path">model file path from Model</param>
/// <param name="external_file_path">a binary file path for relative to the model file path
/// where the initializers data is written</param>
/// <param name="model_external_file_path">model file folder path with external file path appended</param>
/// <param name="model_saving_options">model saving options including alignment and pre-packs</param>
/// <param name="output_graph_proto">The graph proto to be modified</param>
/// <param name="external_stream">external file stream</param>
/// <param name="external_offset">current external file offset updated with each write</param>
/// <returns>Status instance</returns>
Status AddExternalInitializersToGraphProtoImpl(
const std::filesystem::path& model_path,
const std::filesystem::path& external_file_path,
const std::filesystem::path& model_external_file_path,
const ModelSavingOptions& model_saving_options,
ONNX_NAMESPACE::GraphProto& output_graph_proto,
std::ostream& external_stream,
int64_t& external_offset) const;

#endif

Version IrVersion() const noexcept {
@@ -1703,6 +1712,21 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
std::hash<std::string>, std::equal_to<std::string>>
sparse_tensor_names_;

// Prepacked blobs container that stores pre-packed initializer
// data that is:
// - mem-mapped from disk
// - shared within the session
// - shared across sessions by transferring the ownership of loaded data entries to
// SessionState::PrepackedWeightsContainer* if one is present.
// This container is optional because it is present only in the root graph.
std::optional<PrepackedKeyToBlobMap> prepacked_key_to_blobs_;

// This container contains a reference to the root prepacked_key_to_blobs_
// and also (in the save mode) records association between the initializer
// names and their pre-packed blobs (via keys).
// This is optional due to delayed construction.
std::optional<PrepackedWeightsForGraph> prepacked_weights_for_graph_;

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
// Runtime optimization storage.
// Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized
44 changes: 44 additions & 0 deletions include/onnxruntime/core/graph/model_saving_options.h
@@ -0,0 +1,44 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

namespace onnxruntime {

class PrepackedWeightsForGraph;

// These options affect how the model initializers are written to the external file.
// This includes options to align external initializer offsets.
// For models running on CPU, ORT will try to use mmap to load external
// initializers. To use mmap, external initializers need to be offset aligned.
// ORT saves external initializers into a single data file; each initializer is
// accessed with an offset (start position of the initializer) and a length (byte
// length of the initializer) within the data file. To use mmap, each offset needs
// to be aligned, which means the offset must be divisible by the allocation
// granularity (64KB for Windows and 4KB for other OSes). With align_offset set to
// true, ORT will align the offsets of large initializers when saving an ONNX model
// with an external data file.
struct ModelSavingOptions {
explicit ModelSavingOptions(size_t size_threshold)
: initializer_size_threshold(size_threshold) {}

// Minimal initializer size in bytes to be externalized on disk
size_t initializer_size_threshold;
// Offset will always be page aligned and allocation granularity aligned for
// mmap support. This is done by padding the previous tensor's data with zeros,
// keeping its length the same.
bool align_offset = false;
// Alignment threshold for the size of the data.
// Having a low threshold will waste file space for small initializers.
// Only when a tensor's data size exceeds align_threshold will it be force
// aligned. Defaults to 1MB.
int64_t align_threshold = 1048576;
// The allocation granularity for mmap() support.
// Typically 64KB for Windows & 4KB for other OSes; the default is set per-OS below.
#ifdef _WIN32
int64_t allocation_granularity = 65536;
#else
int64_t allocation_granularity = 4096;
#endif
};

} // namespace onnxruntime
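
The comment block above describes the alignment rule: before writing a tensor larger than align_threshold, pad the external data file with zeros so the write starts on an allocation_granularity boundary. A small sketch of just that rule, assuming the same stream/offset bookkeeping as AddExternalInitializersToGraphProtoImpl; this is not the actual serialization code from this commit:

```cpp
// Sketch of the documented offset-alignment rule, not the real writer code.
#include <cstdint>
#include <ostream>

#include "core/graph/model_saving_options.h"

void AlignOffsetForMmap(std::ostream& external_stream, int64_t& external_offset,
                        int64_t tensor_size_bytes,
                        const onnxruntime::ModelSavingOptions& opts) {
  if (!opts.align_offset || tensor_size_bytes <= opts.align_threshold) {
    return;  // small tensors are written back-to-back with no padding
  }
  const int64_t granularity = opts.allocation_granularity;  // 64KB on Windows, 4KB elsewhere
  const int64_t aligned = (external_offset + granularity - 1) / granularity * granularity;
  // Pad with zeros so the previous tensor keeps its length and the next tensor
  // starts on an allocation-granularity boundary.
  for (int64_t i = external_offset; i < aligned; ++i) {
    external_stream.put('\0');
  }
  external_offset = aligned;
}
```
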
(The remaining changed files in this commit are not shown.)
