diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 07dff50f9a3bd..ad4195f31aa7c 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -196,7 +196,7 @@ "component": { "type": "git", "git": { - "commitHash": "bc0d2e35909b8456abe32f3b30a49bb0c125e8b7", + "commitHash": "9c69a24bc2e20c8a511a4e6b06fd49639ec5300a", "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index d2fe7e7457983..febefff6756e7 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -130,8 +130,7 @@ option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node cmake_dependent_option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB "Build dump debug information about node inputs and outputs with support for sql database." OFF "onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS" OFF) # When loading a delay loaded DLL, Windows searches the main EXE's folder first. -# In a Python process, it searches where python.exe lives, but it doesn't search the python package's installation folder. Therefore we cannot enable this flag when Python is enabled. -cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM;NOT onnxruntime_ENABLE_PYTHON" OFF) +cmake_dependent_option(onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS "Delay load some of the dependent DLls that are part of the OS" ON "WIN32;NOT GDK_PLATFORM" OFF) option(onnxruntime_USE_DML "Build with DirectML support" OFF) option(onnxruntime_USE_MIGRAPHX "Build with AMDMIGraphX support" OFF) option(onnxruntime_USE_WINML "Build with WinML support" OFF) diff --git a/cmake/deps.txt b/cmake/deps.txt index 21f9ee1701c46..04a306e0ee657 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -36,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c -# Use the latest commit of 10.6-GA-ORT-DDS -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bc0d2e35909b8456abe32f3b30a49bb0c125e8b7.zip;f233ae871ad82c023da62e5dd620639f00bc2d15 +# Use the latest commit of 10.7-GA +onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/9c69a24bc2e20c8a511a4e6b06fd49639ec5300a.zip;ff1fe9af78eb129b4a4cdcb7450b7390b4436dd3 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 3b76aff829be2..5adacdc393da8 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -77,6 +77,7 @@ if(WIN32) onnxruntime_add_shared_library(onnxruntime ${SYMBOL_FILE} "${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc" + 
"${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc" "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc" ) elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake index 376d895be34a9..355575be3bcf7 100644 --- a/cmake/onnxruntime_nodejs.cmake +++ b/cmake/onnxruntime_nodejs.cmake @@ -60,15 +60,26 @@ else() endif() endif() +# a list of DLLs that the Node.js binding depends on +set(NODEJS_DLL_DEPS) + # setup providers if (onnxruntime_USE_CUDA) set(NODEJS_BINDING_USE_CUDA "--use_cuda") endif() if (onnxruntime_USE_DML) set(NODEJS_BINDING_USE_DML "--use_dml") + list(APPEND NODEJS_DLL_DEPS "$/DirectML.dll") endif() if (onnxruntime_USE_WEBGPU) set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu") + if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + list(APPEND NODEJS_DLL_DEPS "$/dxil.dll") + list(APPEND NODEJS_DLL_DEPS "$/dxcompiler.dll") + endif() + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + list(APPEND NODEJS_DLL_DEPS "$") + endif() endif() if (onnxruntime_USE_TENSORRT) set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt") @@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL add_custom_target(nodejs_binding_wrapper ALL COMMAND ${NPM_CLI} ci - COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR} - --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT} - ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN} + COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}" + --config=${CMAKE_BUILD_TYPE} + "--onnxruntime-generator=${CMAKE_GENERATOR}" + "--dll_deps=${NODEJS_DLL_DEPS}" + --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} + ${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN} WORKING_DIRECTORY ${JS_NODE_ROOT} COMMENT "Using cmake-js to build OnnxRuntime Node.js binding") diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake index fea5964f0dda9..e527d538d8757 100644 --- a/cmake/onnxruntime_providers_webgpu.cmake +++ b/cmake/onnxruntime_providers_webgpu.cmake @@ -23,19 +23,18 @@ onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface) + set(onnxruntime_providers_webgpu_dll_deps) + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn) - if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS) - list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll") - endif() + if (WIN32) + if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS) + list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll") + endif() - # Copy webgpu_dawn.dll to the output directory - add_custom_command( - TARGET onnxruntime_providers_webgpu - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "$" - VERBATIM ) + list(APPEND onnxruntime_providers_webgpu_dll_deps "$") + endif() else() if (NOT onnxruntime_USE_EXTERNAL_DAWN) target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native) @@ -43,4 +42,23 @@ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc) endif() + if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + # Ensure dxil.dll and dxcompiler.dll exist in the output directory $ + 
add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll) + add_dependencies(onnxruntime_providers_webgpu dxcompiler) + + list(APPEND onnxruntime_providers_webgpu_dll_deps "$/dxil.dll") + list(APPEND onnxruntime_providers_webgpu_dll_deps "$/dxcompiler.dll") + endif() + + if (onnxruntime_providers_webgpu_dll_deps) + # Copy dependency DLLs to the output directory + add_custom_command( + TARGET onnxruntime_providers_webgpu + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${onnxruntime_providers_webgpu_dll_deps}" "$" + COMMAND_EXPAND_LISTS + VERBATIM ) + endif() + set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime") diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index c19a18ef15089..17ee0e9c4e15a 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC set (onnxruntime_webgpu_external_dawn_test_SRC ${TEST_SRC_DIR}/webgpu/external_dawn/main.cc) +set (onnxruntime_webgpu_delay_load_test_SRC + ${TEST_SRC_DIR}/webgpu/delay_load/main.cc) + # tests from lowest level library up. # the order of libraries should be maintained, with higher libraries being added first in the list @@ -1863,4 +1866,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN) onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers) endif() +if (onnxruntime_USE_WEBGPU AND WIN32 AND onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD) + AddTest(DYN + TARGET onnxruntime_webgpu_delay_load_test + SOURCES ${onnxruntime_webgpu_delay_load_test_SRC} + LIBS ${SYS_PATH_LIB} + DEPENDS ${all_dependencies} + ) +endif() + include(onnxruntime_fuzz_test.cmake) diff --git a/cmake/winml.cmake b/cmake/winml.cmake index ff6b71217ad87..63f356fcf831d 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -782,7 +782,7 @@ add_dependencies(winml_dll winml_api_native) add_dependencies(winml_dll winml_api_native_internal) # Link libraries -target_link_libraries(winml_dll PRIVATE re2) +target_link_libraries(winml_dll PRIVATE re2::re2) target_link_libraries(winml_dll PRIVATE ${WIL_TARGET}) target_link_libraries(winml_dll PRIVATE winml_lib_api) if (NOT winml_is_inbox) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 6ea3f93cdea12..2290030073e5c 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -1625,7 +1625,7 @@ This version of the operator has been available since version 1 of the 'com.micr #### Type Constraints
-T : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double)
+T : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
 Constrain input and output types.
diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index 07625c38d8474..375f0a4dc8dd2 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -7,6 +7,7 @@ // It is safe to include the below header even if SHARED_PROVIDER macro is enabled // as it doesn't include any pb headers. +#include "core/framework/buffer_deleter.h" #include "core/framework/prepacked_weights_container.h" #ifndef SHARED_PROVIDER diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index eb9581e8018d1..7798394b045dc 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -3,14 +3,15 @@ #pragma once +#include #include #include #include +#include #include #include #include #include -#include #include "core/common/flatbuffers.h" @@ -19,13 +20,14 @@ #include "core/common/common.h" #include "core/common/path_string.h" #include "core/common/const_pointer_container.h" +#include "core/common/inlined_containers_fwd.h" #if !defined(ORT_MINIMAL_BUILD) #include "core/common/inlined_containers.h" #endif -#include "core/common/inlined_containers_fwd.h" #include "core/common/span_utils.h" #include "core/common/status.h" #include "core/common/logging/logging.h" +#include "core/framework/prepacked_weights_container.h" #include "core/graph/onnx_protobuf.h" #include "core/graph/basic_types.h" #include "core/graph/constants.h" @@ -41,6 +43,7 @@ namespace onnxruntime { class Graph; struct IndexedSubGraph; class Model; +struct ModelSavingOptions; class OpSignature; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1153,29 +1156,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi const ONNX_NAMESPACE::GraphProto& ToGraphProto(); ONNX_NAMESPACE::GraphProto ToGraphProto() const; - // Options to align external initializer offset. - // For models running on CPU, ORT will try to use mmap to load external initializers. - // To use mmap, external initializer need to be offset aligned. - // ORT saves external initializers into signle data file, each initializer is accessed with - // offset(start position of initializer) and length(byte length of initializer) of the data file. - // To use mmap, each offset need to be aligned which means offset need to divisible by - // allocation granularity(64KB for windows and 4K for other OSes). - // With align_offset to true, ORT will align offset for large initializer when - // save ONNX model with external data file. - struct OffsetAlignmentInfo { - // Offset will always be page aligned and allocation granularity aligned for mmap support. - // This is done by padding previous tensor data with zeros keeping same length. - bool align_offset = false; - // Alignment threshold for size of data. - // Having a low threshold will waste file space for small initializers. - // Only when tensor's data size is > the page_align_threshold it will be force aligned. - // Default to 1MB. - int64_t align_threshold = 1048576; - // The allocation Granularity for mmap() support. - // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB. - int64_t allocation_granularity = 65536; - }; - /** Gets the GraphProto representation of this Graph @param external_file_path File path of the binary file to use for initializers. @param model_file_path path of the model file. 
@@ -1186,15 +1166,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi */ ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, const std::filesystem::path& model_file_path, - size_t initializer_size_threshold, - const OffsetAlignmentInfo& align_info) const; - - ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, - const std::filesystem::path& model_file_path, - size_t initializer_size_threshold) const { - OffsetAlignmentInfo default_options; - return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options); - } + const ModelSavingOptions& model_saving_options) const; /** Gets the ISchemaRegistry instances being used with this Graph. */ IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const; @@ -1400,6 +1372,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi #endif // !defined(ORT_MINIMAL_BUILD) + // This function constructs PrepackedSharedContainer in the root graph only + // and initializes a reference to it in all (sub)graphs + void ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on); + + const PrepackedWeightsForGraph& GetPrepacked() const noexcept { + return *prepacked_weights_for_graph_; + } + + PrepackedWeightsForGraph& GetPrepacked() noexcept { + return *prepacked_weights_for_graph_; + } + /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */ const Node* ParentNode() const { return parent_node_; } @@ -1519,6 +1503,31 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto, std::optional new_name); + /// + /// This function traverses the graph bottom up and externalizes + /// constant initializers along with their pre-packed blobs from different + /// kernels. Writes constant initializers to the external file with any pre-packed + /// blobs (if enabled and produced for this initializer) and then modifies TensorProto + /// entry with external data references. + /// + /// model file path from Model + /// a binary file path for relative to the model file path + /// where the initializers data is written + /// model file folder path with external file path appended + /// model saving options including alignment and pre-packs + /// The graph proto to be modified + /// external file stream + /// current external file offset updated with each write + /// Status instance + Status AddExternalInitializersToGraphProtoImpl( + const std::filesystem::path& model_path, + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_external_file_path, + const ModelSavingOptions& model_saving_options, + ONNX_NAMESPACE::GraphProto& output_graph_proto, + std::ostream& external_stream, + int64_t& external_offset) const; + #endif Version IrVersion() const noexcept { @@ -1703,6 +1712,21 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi std::hash, std::equal_to> sparse_tensor_names_; + // Prepacked blobs container that stored pre-packed initializers + // data that is: + // - mem-mapped from disk + // - shared within the session + // - shared across sessions by transferring the ownership of loaded data entries to + // SessionState::PrepackedWeightsContainer* if one is present. 
+ // This container is optional because it is present only in the root graph. + std::optional prepacked_key_to_blobs_; + + // This container contains a reference to the root prepacked_key_to_blobs_ + // and also (in the save mode) records association between the initializer + // names and their pre-packed blobs (via keys). + // This is optional due to delayed construction. + std::optional prepacked_weights_for_graph_; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // Runtime optimization storage. // Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h new file mode 100644 index 0000000000000..924799f15b247 --- /dev/null +++ b/include/onnxruntime/core/graph/model_saving_options.h @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +namespace onnxruntime { + +class PrepackedWeightsForGraph; + +// These options affect how the model initializers are written to the external file. +// This includes options to align external initializer offset. +// For models running on CPU, ORT will try to use mmap to load external +// initializers. To use mmap, external initializer need to be offset aligned. +// ORT saves external initializers into single data file, each initializer is +// accessed with offset(start position of initializer) and length(byte length of +// initializer) of the data file. To use mmap, each offset need to be aligned +// which means offset need to divisible by allocation granularity(64KB for +// windows and 4K for other OSes). With align_offset to true, ORT will align +// offset for large initializer when save ONNX model with external data file. +struct ModelSavingOptions { + explicit ModelSavingOptions(size_t size_threshold) + : initializer_size_threshold(size_threshold) {} + + // Mimimal initializer size in bytes to be externalized on disk + size_t initializer_size_threshold; + // Offset will always be page aligned and allocation granularity aligned for + // mmap support. This is done by padding previous tensor data with zeros + // keeping same length. + bool align_offset = false; + // Alignment threshold for size of data. + // Having a low threshold will waste file space for small initializers. + // Only when tensor's data size is > the page_align_threshold it will be force + // aligned. Default to 1MB. + int64_t align_threshold = 1048576; + // The allocation Granularity for mmap() support. + // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB. +#ifdef _WIN32 + int64_t allocation_granularity = 65536; +#else + int64_t allocation_granularity = 4096; +#endif +}; + +} // namespace onnxruntime diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 8f1bc98ce7b49..64a4dd19c12b0 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -250,6 +250,17 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes = "session.optimized_model_external_initializers_min_size_in_bytes"; +// Use this config when saving pre-packed constant initializers to an external data file. 
+// This allows you to memory map pre-packed initializers on model load and leave it to +// to the OS the amount of memory consumed by the pre-packed initializers. Otherwise, +// pre-packed data resides on the heap. +// +// - "0": Default is not save pre-packed initializers to a data file. +// - "1": Save pre-packed constant initializers to an external data file. +// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1") +static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers = + "session.save_external_prepacked_constant_initializers"; + // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file. // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead. // "0": disable. (default) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index 15d89b536b39a..e11537492d3a7 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -737,7 +737,6 @@ public void testCoreML() throws OrtException { runProvider(OrtProvider.CORE_ML); } - @Disabled("DirectML Java API hasn't been supported yet") @Test @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java index fa0b6fd0ef9d9..57c4eb3577fd0 100644 --- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java +++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java @@ -27,7 +27,6 @@ import java.util.HashMap; import java.util.Map; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.condition.DisabledIfSystemProperty; import org.junit.jupiter.api.condition.EnabledIfSystemProperty; public class ProviderOptionsTest { @@ -35,7 +34,6 @@ public class ProviderOptionsTest { @Test @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1") - @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1") public void testCUDAOptions() throws OrtException { // Test standard options OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0); @@ -63,7 +61,6 @@ public void testCUDAOptions() throws OrtException { @Test @EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1") - @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1") public void testTensorRT() throws OrtException { // Test standard options OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0); diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt index d79a82c572dc2..c78b40a3e7429 100644 --- a/js/node/CMakeLists.txt +++ b/js/node/CMakeLists.txt @@ -113,10 +113,12 @@ endif() if (WIN32) file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll DESTINATION ${dist_folder}) - if (USE_DML) - file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll - DESTINATION ${dist_folder}) - endif () + if (ORT_NODEJS_DLL_DEPS) + foreach(dll ${ORT_NODEJS_DLL_DEPS}) + file(COPY ${dll} DESTINATION ${dist_folder}) + endforeach() + endif() + elseif (APPLE) file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN) diff --git a/js/node/script/build.ts b/js/node/script/build.ts index dcdcb93377b4c..b557368ed58c6 100644 --- a/js/node/script/build.ts +++ b/js/node/script/build.ts @@ -39,6 
+39,8 @@ const USE_TENSORRT = !!buildArgs.use_tensorrt; const USE_COREML = !!buildArgs.use_coreml; // --use_qnn const USE_QNN = !!buildArgs.use_qnn; +// --dll_deps= +const DLL_DEPS = buildArgs.dll_deps; // build path const ROOT_FOLDER = path.join(__dirname, '..'); @@ -82,6 +84,9 @@ if (USE_COREML) { if (USE_QNN) { args.push('--CDUSE_QNN=ON'); } +if (DLL_DEPS) { + args.push(`--CDORT_NODEJS_DLL_DEPS=${DLL_DEPS}`); +} // set CMAKE_OSX_ARCHITECTURES for macOS build if (os.platform() === 'darwin') { diff --git a/js/node/src/directml_load_helper.cc b/js/node/src/directml_load_helper.cc deleted file mode 100644 index 6aafe4d5fa788..0000000000000 --- a/js/node/src/directml_load_helper.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifdef _WIN32 -#include "common.h" -#include "windows.h" - -void LoadDirectMLDll(Napi::Env env) { - DWORD pathLen = MAX_PATH; - std::wstring path(pathLen, L'\0'); - HMODULE moduleHandle = nullptr; - - GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - reinterpret_cast(&LoadDirectMLDll), &moduleHandle); - - DWORD getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast(path.c_str()), pathLen); - while (getModuleFileNameResult == 0 || getModuleFileNameResult == pathLen) { - int ret = GetLastError(); - if (ret == ERROR_INSUFFICIENT_BUFFER && pathLen < 32768) { - pathLen *= 2; - path.resize(pathLen); - getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast(path.c_str()), pathLen); - } else { - ORT_NAPI_THROW_ERROR(env, "Failed getting path to load DirectML.dll, error code: ", ret); - } - } - - path.resize(path.rfind(L'\\') + 1); - path.append(L"DirectML.dll"); - HMODULE libraryLoadResult = LoadLibraryW(path.c_str()); - - if (!libraryLoadResult) { - int ret = GetLastError(); - ORT_NAPI_THROW_ERROR(env, "Failed loading bundled DirectML.dll, error code: ", ret); - } -} -#endif diff --git a/js/node/src/directml_load_helper.h b/js/node/src/directml_load_helper.h deleted file mode 100644 index 074a4f95ed476..0000000000000 --- a/js/node/src/directml_load_helper.h +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#if defined(USE_DML) && defined(_WIN32) -void LoadDirectMLDll(Napi::Env env); -#endif diff --git a/js/node/src/inference_session_wrap.cc b/js/node/src/inference_session_wrap.cc index 23d859351f426..04ab71dc48ec2 100644 --- a/js/node/src/inference_session_wrap.cc +++ b/js/node/src/inference_session_wrap.cc @@ -4,7 +4,6 @@ #include "onnxruntime_cxx_api.h" #include "common.h" -#include "directml_load_helper.h" #include "inference_session_wrap.h" #include "run_options_helper.h" #include "session_options_helper.h" @@ -19,9 +18,6 @@ Napi::FunctionReference& InferenceSessionWrap::GetTensorConstructor() { } Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) { -#if defined(USE_DML) && defined(_WIN32) - LoadDirectMLDll(env); -#endif // create ONNX runtime env Ort::InitApi(); ORT_NAPI_THROW_ERROR_IF( diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc index 27eb9b65c62d3..12b1a79793ff3 100644 --- a/js/node/src/tensor_helper.cc +++ b/js/node/src/tensor_helper.cc @@ -53,24 +53,24 @@ constexpr size_t DATA_TYPE_ELEMENT_SIZE_MAP[] = { static_assert(sizeof(DATA_TYPE_ELEMENT_SIZE_MAP) == sizeof(size_t) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT, "definition not matching"); -constexpr napi_typedarray_type DATA_TYPE_TYPEDARRAY_MAP[] = { - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported - napi_float32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT - napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8 - napi_int8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 - napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 - napi_int16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 - napi_int32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 - napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported - napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL - napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array - napi_float64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE - napi_uint32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 - napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported - (napi_typedarray_type)(-1) // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported +constexpr std::underlying_type_t DATA_TYPE_TYPEDARRAY_MAP[] = { + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported + napi_float32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT + napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8 + napi_int8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 + napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 + napi_int16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 + napi_int32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 + napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported + napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL + napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array + napi_float64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE + napi_uint32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 + napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 + std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported + 
std::underlying_type_t(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported + std::underlying_type_t(-1) // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported }; static_assert(sizeof(DATA_TYPE_TYPEDARRAY_MAP) == sizeof(napi_typedarray_type) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT, "definition not matching"); @@ -98,7 +98,20 @@ static_assert(sizeof(DATA_TYPE_ID_TO_NAME_MAP) == sizeof(const char*) * ONNX_TEN "definition not matching"); const std::unordered_map DATA_TYPE_NAME_TO_ID_MAP = { - {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT}, {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8}, {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8}, {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16}, {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16}, {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32}, {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64}, {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING}, {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL}, {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16}, {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE}, {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32}, {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}}; + {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT}, + {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8}, + {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8}, + {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16}, + {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16}, + {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32}, + {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64}, + {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING}, + {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL}, + {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16}, + {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE}, + {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32}, + {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}, +}; // currently only support tensor Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo* cpu_memory_info, OrtMemoryInfo* webgpu_memory_info) { @@ -181,7 +194,7 @@ Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo* "Tensor.data must be a typed array for numeric tensor."); auto tensorDataTypedArray = tensorDataValue.As(); - auto typedArrayType = tensorDataValue.As().TypedArrayType(); + std::underlying_type_t typedArrayType = tensorDataValue.As().TypedArrayType(); ORT_NAPI_THROW_TYPEERROR_IF(DATA_TYPE_TYPEDARRAY_MAP[elemType] != typedArrayType, env, "Tensor.data must be a typed array (", DATA_TYPE_TYPEDARRAY_MAP[elemType], ") for ", tensorTypeString, " tensors, but got typed array (", typedArrayType, ")."); @@ -294,7 +307,7 @@ Napi::Value OrtValueToNapiValue(Napi::Env env, Ort::Value&& value) { } napi_value typedArrayData; napi_status status = - napi_create_typedarray(env, DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData); + napi_create_typedarray(env, (napi_typedarray_type)DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData); NAPI_THROW_IF_FAILED(env, status, Napi::Value); // new Tensor(type, typedArrayData, dims) diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index f514ae5fa75e6..262503214a50a 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -391,48 +391,48 @@ } ] }, - // { - // "name": "conv - vectorize group - B", - // "operator": "Conv", - // "inputShapeDefinitions": "rankOnly", - // "opset": { "domain": "", "version": 17 }, - // "attributes": [ - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, 
"type": "int" } - // ], - // "cases": [ - // { - // "name": "T[0]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - // 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 - // ], - // "dims": [1, 3, 3, 3], - // "type": "float32" - // }, - // { - // "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // }, - // { - // "data": [0.1, 0.2, 0.3], - // "dims": [3], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], - // "dims": [1, 3, 2, 2], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "conv - vectorize group - B", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], + "dims": [1, 3, 2, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - vectorize group - C", "operator": "Conv", @@ -470,44 +470,44 @@ } ] }, - // { - // "name": "conv - vectorize group - D", - // "operator": "Conv", - // "inputShapeDefinitions": "rankOnly", - // "opset": { "domain": "", "version": 17 }, - // "attributes": [ - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, "type": "int" }, - // { "name": "strides", "data": [2, 2], "type": "ints" } - // ], - // "cases": [ - // { - // "name": "T[0] strides = [2, 2]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, - // 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 - // ], - // "dims": [1, 3, 3, 4], - // "type": "float32" - // }, - // { - // "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [34, 54, 386, 438, 1122, 1206], - // "dims": [1, 3, 1, 2], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "conv - vectorize group - D", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "T[0] strides = [2, 2]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 
4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 54, 386, 438, 1122, 1206], + "dims": [1, 3, 1, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - pointwise", "operator": "Conv", diff --git a/js/web/test/data/ops/fused-conv.jsonc b/js/web/test/data/ops/fused-conv.jsonc index ebb0b5d3e1f58..d88c91ebc9de7 100644 --- a/js/web/test/data/ops/fused-conv.jsonc +++ b/js/web/test/data/ops/fused-conv.jsonc @@ -249,44 +249,44 @@ } ] }, - // { - // "name": "NHWC group-conv with HardSigmoid", - // "operator": "Conv", - // "attributes": [ - // { "name": "activation", "data": "HardSigmoid", "type": "string" }, - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, "type": "int" }, - // { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } - // ], - // "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, - // "cases": [ - // { - // "name": "T[0]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, - // 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 - // ], - // "dims": [1, 3, 3, 3], - // "type": "float32" - // }, - // { - // "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], - // "dims": [1, 2, 2, 3], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "NHWC group-conv with HardSigmoid", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "HardSigmoid", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "activation_params", "data": [2.0, 5.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "dims": [1, 2, 2, 3], + "type": "float32" + } + ] + } + ] + }, { "name": "fused group-conv with LeakyRelu", "operator": "FusedConv", @@ -325,44 +325,44 @@ } ] }, - // { - // "name": "NHWC group-conv with LeakyRelu", - // "operator": "Conv", - // "attributes": [ - // { "name": "activation", "data": "LeakyRelu", "type": "string" }, - // { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, - // { "name": "group", "data": 3, "type": "int" }, - // { "name": "activation_params", "data": [2.0], "type": "floats" } - // ], - // "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, - // "cases": [ - // { - // "name": "T[0]", - // "inputs": [ - // { - // "data": [ - // 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, - // 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 - // ], - // "dims": [1, 3, 3, 3], - // "type": "float32" - // }, - // { - // "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - // "dims": [3, 1, 2, 2], - // "type": "float32" - // } - // ], - // "outputs": [ - // { - // "data": [-162, 63, -158, 33, 281, 85, 105, 337, 
455, 177, 515, 609], - // "dims": [1, 2, 2, 3], - // "type": "float32" - // } - // ] - // } - // ] - // }, + { + "name": "NHWC group-conv with LeakyRelu", + "operator": "Conv", + "attributes": [ + { "name": "activation", "data": "LeakyRelu", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "activation_params", "data": [2.0], "type": "floats" } + ], + "opset": { "domain": "com.ms.internal.nhwc", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, -3.0, 4.0, -5.0, 6.0, 7.0, 8.0, -9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0, 16.0, 17.0, + 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [-162, 63, -158, 33, 281, 85, 105, 337, 455, 177, 515, 609], + "dims": [1, 2, 2, 3], + "type": "float32" + } + ] + } + ] + }, { "name": "fused conv with LeakyRelu", "operator": "FusedConv", diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc index 9a49adf347a29..8abcd78bfff4c 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc @@ -60,7 +60,7 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& scales = shader.AddInput("scales", ShaderUsage::UseUniform); const auto& y = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias | ShaderUsage::UseIndicesTypeAlias); - if ((is_intel_ || tile_m_ > 1) && block_size_ == 32) { + if (block_size_ == 32) { const uint32_t workgroup_size = WorkgroupSizeX() * WorkgroupSizeY(); const uint32_t tile_size = WorkgroupSizeX() * components_b_ * 8; // each uint32 has 8 data. const uint32_t a_length_per_tile = tile_size / a.NumComponents(); @@ -408,14 +408,12 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context const uint32_t components_b = GetMaxComponents(blob_size_in_words); uint32_t components = GetMaxComponents(N); - const bool is_intel = context.AdapterInfo().vendor == std::string_view{"intel"} && - context.AdapterInfo().architecture == std::string_view{"gen-12lp"}; const bool has_zero_points = zero_points != nullptr; // TODO: Support output_number > 1. Some cases are failed when output_number > 1. constexpr uint32_t output_number = 1; const uint32_t tile_m = M > kMinMForTileOptimization ? 4 : 1; - MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points, is_intel}; + MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow(components_b), has_zero_points}; if (M > kMinMForTileOptimization && block_size == 32) { components = 1; constexpr uint32_t workgroup_size = 64; @@ -426,7 +424,7 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context (M + tile_m - 1) / tile_m, batch_count); program.CacheHint("T_M" + std::to_string(tile_m)); - } else if (is_intel && block_size == 32) { + } else if (block_size == 32) { components = 1; constexpr uint32_t workgroup_size = 128; const uint32_t workgroup_y = N % 8 == 0 ? 8 : N % 4 == 0 ? 
4 diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h index 8a4626083419c..57615d3ddabcf 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.h @@ -14,13 +14,12 @@ using namespace onnxruntime::webgpu; class MatMulNBitsProgram final : public Program { public: - MatMulNBitsProgram(uint32_t output_number, uint32_t block_size, uint32_t tile_m, int components_b, bool has_zero_points, bool is_intel) : Program{"MatMulNBits"}, - output_number_{output_number}, - block_size_{block_size}, - tile_m_{tile_m}, - components_b_{components_b}, - has_zero_points_{has_zero_points}, - is_intel_{is_intel} { + MatMulNBitsProgram(uint32_t output_number, uint32_t block_size, uint32_t tile_m, int components_b, bool has_zero_points) : Program{"MatMulNBits"}, + output_number_{output_number}, + block_size_{block_size}, + tile_m_{tile_m}, + components_b_{components_b}, + has_zero_points_{has_zero_points} { } Status GenerateShaderCode(ShaderHelper& sh) const override; @@ -32,7 +31,6 @@ class MatMulNBitsProgram final : public Program { uint32_t tile_m_; int components_b_; bool has_zero_points_; - bool is_intel_; }; class MatMulNBits final : public WebGpuKernel { diff --git a/onnxruntime/core/dll/delay_load_hook.cc b/onnxruntime/core/dll/delay_load_hook.cc new file mode 100644 index 0000000000000..bc5e1aa662721 --- /dev/null +++ b/onnxruntime/core/dll/delay_load_hook.cc @@ -0,0 +1,91 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// == workaround for delay loading of dependencies of onnxruntime.dll == +// +// Problem: +// +// When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using LoadLibraryEx, +// which search the directory of process (.exe) instead of this library (onnxruntime.dll). This is a problem for +// usages of Node.js binding and python binding, because Windows will try to find the dependencies in the directory +// of node.exe or python.exe, which is not the directory of onnxruntime.dll. +// +// Solution: +// +// By using the delay load hook `__pfnDliNotifyHook2`, we can intervene the loading procedure by loading from an +// absolute path. The absolute path is constructed by appending the name of the DLL to load to the directory of +// onnxruntime.dll. This way, we can ensure that the dependencies are loaded from the same directory as onnxruntime.dll. 
+// +// See also: +// - https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions +// - https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#alternate-search-order-for-unpackaged-apps +// +// The DLL DelayLoad hook is only enabled when the compiler is MSVC and at least one of the following is True: +// - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined +// - USE_DML is defined +// +#if defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY) +#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 1 +#else +#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 0 +#endif +#if defined(USE_DML) +#define ORT_DELAY_LOAD_DIRECTML_DLL 1 +#else +#define ORT_DELAY_LOAD_DIRECTML_DLL 0 +#endif +#if defined(_MSC_VER) && (ORT_DELAY_LOAD_WEBGPU_DAWN_DLL || ORT_DELAY_LOAD_DIRECTML_DLL) + +#include +#include +#include +#include + +#include "core/platform/env.h" + +namespace { + +#define DEFINE_KNOWN_DLL(name) {#name ".dll", L#name L".dll"} + +constexpr struct { + const char* str; + const wchar_t* wstr; +} known_dlls[] = { +#if ORT_DELAY_LOAD_WEBGPU_DAWN_DLL + DEFINE_KNOWN_DLL(webgpu_dawn), +#endif +#if ORT_DELAY_LOAD_DIRECTML_DLL + DEFINE_KNOWN_DLL(DirectML), +#endif +}; +} // namespace + +FARPROC WINAPI delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) { + if (dliNotify == dliNotePreLoadLibrary) { + for (size_t i = 0; i < _countof(known_dlls); ++i) { + if (_stricmp(pdli->szDll, known_dlls[i].str) == 0) { + // Try to load the DLL from the same directory as onnxruntime.dll + + // First, get the path to onnxruntime.dll + auto path = onnxruntime::Env::Default().GetRuntimePath(); + if (path.empty()) { + // Failed to get the path to onnxruntime.dll. In this case, we will just return NULL and let the system + // search for the DLL in the default search order. + return NULL; + } + + // Append the name of the DLL. Now `path` is the absolute path to the DLL to load. + path.append(known_dlls[i].wstr); + + // Load the DLL + return FARPROC(LoadLibraryExW(path.c_str(), NULL, + LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR)); + } + } + } + return NULL; +} + +extern "C" const PfnDliHook __pfnDliNotifyHook2 = delay_load_hook; + +#endif diff --git a/onnxruntime/core/dll/dllmain.cc b/onnxruntime/core/dll/dllmain.cc index 2e7bdafd0599f..ac5dcd9c96084 100644 --- a/onnxruntime/core/dll/dllmain.cc +++ b/onnxruntime/core/dll/dllmain.cc @@ -13,7 +13,7 @@ #pragma GCC diagnostic pop #endif -// dllmain.cpp : Defines the entry point for the DLL application. +// dllmain.cc : Defines the entry point for the DLL application. 
BOOL APIENTRY DllMain(HMODULE /*hModule*/, DWORD ul_reason_for_call, LPVOID /*lpReserved*/ diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 406fc1b15effc..b97cf03e3bf59 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -681,7 +681,7 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers context_cache_path, "' exist already."); } - Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList{graph.GetSchemaRegistry()}, graph.DomainToVersionMap(), {}, logger); auto& ep_graph = ep_context_model.MainGraph(); ep_graph.SetDescription(graph.Description()); diff --git a/onnxruntime/core/framework/prepacked_weights.h b/onnxruntime/core/framework/prepacked_weights.h index fbf99b81937ee..9695be1e0554c 100644 --- a/onnxruntime/core/framework/prepacked_weights.h +++ b/onnxruntime/core/framework/prepacked_weights.h @@ -6,7 +6,8 @@ #include #include "core/common/basic_types.h" -#include "core/framework/buffer_deleter.h" +#include "core/common/inlined_containers_fwd.h" +#include "core/framework/allocator.h" #include "core/framework/tensor_shape.h" namespace onnxruntime { @@ -16,11 +17,14 @@ struct PrePackedWeights final { // Hence we hold them in container. It is upto the developer implementing each PrePack() // method to define what gets stored in which position of the container. - std::vector> buffers_; // cache pre-packed buffers associated with the kernel - std::vector buffer_sizes_; // cache sizes of pre-packed buffers (in bytes) + InlinedVector> buffers_; // cache pre-packed buffers associated with the kernel + InlinedVector buffer_sizes_; // cache sizes of pre-packed buffers (in bytes) // Produces a hash of the buffers stored in the given instance of this class HashValue GetHash() const; + + // The function creates a copy with non-owning BufferUniquePtrs. 
+ PrePackedWeights CreateReferringCopy() const; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc index b6d44dd248bdd..7c832a0ac2691 100644 --- a/onnxruntime/core/framework/prepacked_weights_container.cc +++ b/onnxruntime/core/framework/prepacked_weights_container.cc @@ -3,9 +3,21 @@ #include "core/framework/prepacked_weights_container.h" #include "core/framework/allocator_utils.h" +#include "core/graph/graph.h" namespace onnxruntime { +PrePackedWeights PrePackedWeights::CreateReferringCopy() const { + PrePackedWeights copy; + for (const auto& prepacked_buffer : buffers_) { + // No deleter is needed as the buffer is not owned by the unique_ptr + copy.buffers_.emplace_back(prepacked_buffer.get(), [](void*) {}); + } + + copy.buffer_sizes_ = buffer_sizes_; + return copy; +} + AllocatorPtr PrepackedWeightsContainer::GetOrCreateAllocator(const std::string& device_name) { auto iter = allocators_.find(device_name); @@ -49,4 +61,50 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const { return prepacked_weights_map_.size(); } +void PrepackedWeightsForGraph::InsertPrepackedWeights(const std::string& key, PrePackedWeights&& packed_weight) { + // We may have duplicate entries mapped from disk if the same weight is pre-packed from subgraphs and + // up the tree by the same kernel with the same result. The map prevents this from happening. + key_to_blobs_.emplace(key, std::move(packed_weight)); +} + +void PrepackedWeightsForGraph::WritePackedMaybeForSave(const std::string& weight_name, const std::string& key, + PrePackedWeights&& packed_weight) { + key_to_blobs_.insert_or_assign(key, std::move(packed_weight)); + + if (save_mode_on_) { + weight_prepacks_for_saving_[weight_name].insert(key); + } +} + +const PrePackedWeights* PrepackedWeightsForGraph::GetPrepackedWeights(const std::string& key) const { + auto it = key_to_blobs_.find(key); + if (it == key_to_blobs_.end()) { + return nullptr; + } + return &it->second; +} + +std::optional PrepackedWeightsForGraph::ReplaceWithReferenceIfSaving( + const std::string& weight_name, + const std::string& key, + const PrePackedWeights& refer_to_if_absent) { + auto it = key_to_blobs_.find(key); + if (it == key_to_blobs_.end()) { + if (save_mode_on_) { + key_to_blobs_.emplace(key, refer_to_if_absent.CreateReferringCopy()); + weight_prepacks_for_saving_[weight_name].insert(key); + } + return std::nullopt; + } + + PrePackedWeights result = std::move(it->second); + if (save_mode_on_) { + it->second = result.CreateReferringCopy(); + weight_prepacks_for_saving_[weight_name].insert(key); + } else { + key_to_blobs_.erase(it); + } + return result; +} + } // namespace onnxruntime diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h index 37fc01c05f2ae..f48c790eb4126 100644 --- a/onnxruntime/core/framework/prepacked_weights_container.h +++ b/onnxruntime/core/framework/prepacked_weights_container.h @@ -3,19 +3,26 @@ #pragma once -#include -#include -#include -#include - -#include "core/framework/buffer_deleter.h" - +#include "core/common/common.h" #include "core/framework/allocator.h" -#include #include "prepacked_weights.h" +#include +#include +#include +#include +#include +#include +#include + namespace onnxruntime { +#ifndef SHARED_PROVIDER +class Graph; +#else +struct Graph; +#endif + class PrepackedWeightsContainer final { public: PrepackedWeightsContainer() { @@ -66,4 +73,98 @@ 
class PrepackedWeightsContainer final { std::unordered_map prepacked_weights_map_; }; +// Maps a pre-packed weight blob key to PrepackedWeights instance +using PrepackedKeyToBlobMap = std::unordered_map; + +/// +/// This class has a dual purpose. +/// If saving is OFF (IsSaveModeOn() false), it is used to contain the weights memory mapped from disk. +/// Those weights are then moved to the shared container if weight sharing is enabled. +/// If cross-session weight sharing is not enabled, the weights are stored in this container, +/// and shared with the interested kernels. +/// +/// When saving to disk is ON (IsSaveModeOn() true) +/// It records the pre-packed weights blobs and associates them with the weight name. +/// When saving the model with external initializers, the weights are written to disk along +/// with the pre-packed blobs. +/// +/// +class PrepackedWeightsForGraph { + public: + PrepackedWeightsForGraph(PrepackedKeyToBlobMap& key_blobs, bool save_mode_on_) + : key_to_blobs_(key_blobs), save_mode_on_(save_mode_on_) { + } + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PrepackedWeightsForGraph); + + // WeightToPrePacksMap maps weight name to a set of pre-packed + // keys contained in the KeyToBlobMap + using KeysPerWeight = std::unordered_set; // blob keys + using WeightToPrePacksMap = std::unordered_map; + + void InsertPrepackedWeights(const std::string& key, PrePackedWeights&& packed_weight); + + // Overwrites the existing weights and associates key with weight_name + void WritePackedMaybeForSave(const std::string& weight_name, const std::string& key, + PrePackedWeights&& packed_weight); + + const PrePackedWeights* GetPrepackedWeights(const std::string& key) const; + + // The function would add or replace existing entry with references to it. + // If the entry is present, it would replace it with references to the existing entry. + // If the entry is not present, it would add reference to refer_if_absent + // If the entry is present it would return the existing entry otherwise std::nullopt + // Reference in this context means a non-owning smart pointer. Essentially, this function + // replaces the existing entry with the same entry, but transfers the ownership outside + // the container. 
+ std::optional ReplaceWithReferenceIfSaving(const std::string& weight_name, + const std::string& key, + const PrePackedWeights& refer_to_if_absent); + + bool IsSaveModeOn() const noexcept { + return save_mode_on_; + } + + void SetSaveMode(bool value) noexcept { + save_mode_on_ = value; + } + + const KeysPerWeight* GetKeysForWeightForSaving(const std::string& weight_name) const { + auto hit = weight_prepacks_for_saving_.find(weight_name); + if (hit != weight_prepacks_for_saving_.end()) { + return &hit->second; + } + return nullptr; + } + + size_t GetNumberOfWeightsForWriting() const noexcept { + return weight_prepacks_for_saving_.size(); + } + + size_t GetNumberOfKeyedBlobsForWriting() const noexcept { + size_t result = 0; + for (const auto& [_, keys] : weight_prepacks_for_saving_) { + result += keys.size(); + } + return result; + } + + const WeightToPrePacksMap& GetWeightToPrepack() const noexcept { + return weight_prepacks_for_saving_; + } + + PrepackedKeyToBlobMap& GetKeyToBlob() noexcept { + return key_to_blobs_; + } + + const PrepackedKeyToBlobMap& GetKeyToBlob() const noexcept { + return key_to_blobs_; + } + + private: + PrepackedKeyToBlobMap& key_to_blobs_; + bool save_mode_on_; + WeightToPrePacksMap weight_prepacks_for_saving_; +}; + } // namespace onnxruntime diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 0ac2271ba09f1..d7059bf848e83 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -13,6 +13,7 @@ #include "core/framework/node_index_info.h" #include "core/framework/op_kernel.h" #include "core/framework/ort_value_pattern_planner.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/session_state_utils.h" #include "core/framework/utils.h" #include "core/providers/cpu/controlflow/utils.h" @@ -122,7 +123,9 @@ void SessionState::UpdateAllocatorsWithEnvAllocators(const std::vector& SessionState::GetConstantInitializedTen return constant_initialized_tensors_; } +const PrepackedWeightsForGraph& onnxruntime::SessionState::GetPrepackedIniitializersForGraph() const { + return graph_.GetPrepacked(); +} + #if !defined(DISABLE_SPARSE_TENSORS) bool SessionState::IsSparseInitializer(int ort_value_index) const { return sparse_initialized_tensors_.count(ort_value_index) > 0; @@ -396,8 +403,9 @@ static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type, return ss_1.str(); } -Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap& constant_initializers_use_count, - const std::unordered_map& initializers_to_share_map) { +Status SessionState::PrepackConstantInitializedTensors( + InlinedHashMap& constant_initializers_use_count, + const std::unordered_map& initializers_to_share_map) { auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map]( bool should_cache_prepacked_weights_for_shared_initializers) -> Status { for (auto& node : GetGraphViewer().Nodes()) { @@ -407,6 +415,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapExists()) { const std::string& input_name = input_def->Name(); SessionState* st = this; + auto* prepacked_for_graph = &graph_.GetPrepacked(); // subgraph can use the value from outer scope, // so it needs to check if current node uses constant initialized tensor from current and outer graphs do { @@ -423,7 +432,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapGetOrCreateAllocator(CPU); 
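For orientation, the per-graph container introduced above keeps two pieces of state: a key-to-blob map shared with subgraphs, and (only in save mode) a weight-name-to-keys map consulted later when writing blobs next to their initializer. A simplified sketch of that bookkeeping, with `std::string` standing in for a blob and all names hypothetical:

```cpp
#include <cstddef>
#include <string>
#include <unordered_map>
#include <unordered_set>

// Illustrative only: std::string stands in for a pre-packed blob.
class PrepackBook {
 public:
  PrepackBook(std::unordered_map<std::string, std::string>& key_to_blob, bool save_mode)
      : key_to_blob_(key_to_blob), save_mode_(save_mode) {}

  // Insert a blob loaded from disk; duplicates coming from subgraphs are ignored.
  void Insert(const std::string& key, std::string blob) {
    key_to_blob_.emplace(key, std::move(blob));
  }

  // Overwrite and, in save mode, remember which weight the key belongs to.
  void WriteMaybeForSave(const std::string& weight, const std::string& key, std::string blob) {
    key_to_blob_.insert_or_assign(key, std::move(blob));
    if (save_mode_) weight_to_keys_[weight].insert(key);
  }

  const std::unordered_set<std::string>* KeysForWeight(const std::string& weight) const {
    auto it = weight_to_keys_.find(weight);
    return it == weight_to_keys_.end() ? nullptr : &it->second;
  }

  size_t NumKeyedBlobsForWriting() const {
    size_t n = 0;
    for (const auto& kv : weight_to_keys_) n += kv.second.size();
    return n;
  }

 private:
  std::unordered_map<std::string, std::string>& key_to_blob_;  // shared across subgraphs
  bool save_mode_;
  std::unordered_map<std::string, std::unordered_set<std::string>> weight_to_keys_;
};
```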
ORT_ENFORCE(allocator_for_caching.get() != nullptr); @@ -431,16 +441,19 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapPrePack(const_initialized_tensor, input_idx, allocator_for_caching, is_packed, &weights_to_be_filled_in)); if (is_packed) { - // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight to be cached if the weight was pre-packed - ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0, "The kernel corresponding to the node ", node.Name(), + // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight + // to be cached if the weight was pre-packed + ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0, + "The kernel corresponding to the node ", node.Name(), " doesn't have an implementation that can cache computed pre-packed weights"); const auto& op_type = node.OpType(); @@ -452,40 +465,117 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapHasWeight(prepacked_weights_container_key); + bool container_contains_packed_weight = prepacked_weights_container_->HasWeight( + prepacked_weights_container_key); if (container_contains_packed_weight) { - LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: " << input_name - << " used in the node: " << node.Name() << " which is of op type: " << node.OpType(); + LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: " + << input_name + << " used in the node: " << node.Name() << " which is of op type: " + << node.OpType(); + const auto& prepacked_shared = prepacked_weights_container_->GetWeight( + prepacked_weights_container_key); ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, - prepacked_weights_container_->GetWeight(prepacked_weights_container_key), + prepacked_shared, node.Name())); ++used_shared_pre_packed_weights_counter_; - } else { // container doesn't contain the pre-packed weight - so write into it for sharing across kernel instances - if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key, std::move(weights_to_be_filled_in))) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to write the provided PrePackedWeights instance into the container"); + // Write references to what is stored in the shared container + // and release memory mapped entries this container may have loaded from disk + std::ignore = prepacked_for_graph->ReplaceWithReferenceIfSaving(input_name, + prepacked_weights_container_key, + prepacked_shared); + + } else { + // container doesn't contain the pre-packed weight - so write into it for sharing across + // kernel instances + + // Check if we loaded it from disk, then put it into the shared container so + // everybody can share the same memory mapped entry + // the shared container takes ownership of the memory mapped entries + + // The next line replaces the existing entry with references to it + // and returns the container that holds the memory mapped entries + // so we can transfer it to shared container. 
+ // if there is not an entry, we replace it with references to weights_to_be_filled_in + // in saving mode and return std::nullopt + auto prepacked_from_disk = prepacked_for_graph->ReplaceWithReferenceIfSaving( + input_name, + prepacked_weights_container_key, + weights_to_be_filled_in); + + if (prepacked_from_disk.has_value()) { + weights_to_be_filled_in = std::move(*prepacked_from_disk); } + if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key, + std::move(weights_to_be_filled_in))) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, FAIL, + "Unable to write the provided PrePackedWeights instance into the container"); + } + + const auto& shared_prepacked = prepacked_weights_container_->GetWeight( + prepacked_weights_container_key); ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, - prepacked_weights_container_->GetWeight(prepacked_weights_container_key), + shared_prepacked, node.Name())); } } - } else { // caching of pre-packed weights' turned OFF + } else { + // cross session caching of pre-packed weights' turned OFF + // we use serialization container to share weights loaded from disk + // within this session. Or if the weight is not present on disk, + // we store the newly minted pre-packed data. + AllocatorPtr session_cpu_alloc = GetAllocator(kernel->Info().GetDevice(OrtMemType::OrtMemTypeDefault)); - ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, - session_cpu_alloc, // use allocator tied to this session + PrePackedWeights weights_to_be_filled_in; + // The reason we invoke PrePack() before looking into the container for any pre-packed weight + // cached by another instance of the same op_type (for the same constant initializer) is because + // to truly know if we can use a cached pre-packed weight, we would have to compare the cached + // pre-packed weight with the pre-packed weight generated by this instance of the same op_type because + // other static properties of the node like node attributes could play a role in the pre-packed + // weights' contents. + ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, session_cpu_alloc, is_packed, - nullptr // no caching required - )); + &weights_to_be_filled_in)); + + // Some kernels (matmul_nbits and non-CPU related kernels) do not share their pre-packed results + // even though they set is_packed = true so we leave it up to them. + // We can change their behavior if we wish do so in a separate PR + // XXX: Interestingly enough, matmul_nbits does accept shared pre-packs, but does not + // produce them. 
+ if (is_packed && !weights_to_be_filled_in.buffers_.empty()) { + const auto& op_type = node.OpType(); + const std::string prepacked_weights_container_key = GenerateKeyForPrepackedWeightsMap( + op_type, + weights_to_be_filled_in); + + // See if we can use pre-packed data from disk + const auto* weights_to_use = prepacked_for_graph->GetPrepackedWeights( + prepacked_weights_container_key); + + if (weights_to_use == nullptr) { + // In this case pre-packed container owns the data + prepacked_for_graph->WritePackedMaybeForSave(input_name, prepacked_weights_container_key, + std::move(weights_to_be_filled_in)); + weights_to_use = prepacked_for_graph->GetPrepackedWeights(prepacked_weights_container_key); + assert(weights_to_use != nullptr); + } + + ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, + *weights_to_use, + node.Name())); + } } + if (is_packed) { ++number_of_prepacks_counter_; @@ -504,6 +594,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapParent(); + prepacked_for_graph = &st->graph_.GetPrepacked(); } while (st); } input_idx++; @@ -525,7 +616,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap& tensor_inputs) { +static int64_t +CalculateMemoryPatternsKey(const gsl::span& tensor_inputs) { int64_t key = 0; for (const auto& input : tensor_inputs) { for (auto dim : input.Get().Shape().GetDims()) key ^= dim; @@ -1068,9 +1160,12 @@ Status SessionState::CreateSubgraphSessionState() { // Calculate the use count of a constant initialized tensor, including the use in subgraph. // Note: This function doesn't handle the case below: -// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X, which overrides the X from main graph. -// For case like this, the current implementation will calculate the use count as 2, but they could contain completely different values so each should have a use count of 1. -// This is a very rare case. If it happens and X is prepacked, the consequence is that X won't be released and memory usage of X won't be saved. This will be fine. +// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X, +// which overrides the X from main graph. +// For case like this, the current implementation will calculate the use count as 2, but they could contain completely +// different values so each should have a use count of 1. +// This is a very rare case. If it happens and X is prepacked, the consequence is that X won't be released and memory +// usage of X won't be saved. This will be fine. 
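When the cross-session weight container is disabled, the new flow above pre-packs first and then consults the per-graph container, so a blob already memory-mapped from disk (or produced by an earlier kernel instance) wins over the freshly computed one. A reduced sketch of that lookup-or-store-then-share step, with placeholder types rather than the real kernel API:

```cpp
#include <string>
#include <unordered_map>
#include <utility>

// Placeholders for the real types.
struct PackedBlob { std::string bytes; };
using GraphPrepacks = std::unordered_map<std::string, PackedBlob>;

// Returns the blob the kernel should use for this pre-pack key.
const PackedBlob& LookupOrStore(GraphPrepacks& per_graph,
                                const std::string& key,
                                PackedBlob&& freshly_packed) {
  auto it = per_graph.find(key);
  if (it == per_graph.end()) {
    // Nothing on disk / from an earlier kernel instance: keep the new blob.
    it = per_graph.emplace(key, std::move(freshly_packed)).first;
  }
  // Either the memory-mapped blob or the one just stored; the kernel then
  // shares this buffer instead of keeping a private copy.
  return it->second;
}
```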
static void ComputeConstantInitializerUseCount(const Graph& graph, InlinedHashMap& constant_initializers_use_count) { for (const auto& node : graph.Nodes()) { for (const auto* arg : node.InputDefs()) { @@ -1189,7 +1284,30 @@ Status SessionState::FinalizeSessionState(const std::basic_string constant_initializers_use_count; ComputeConstantInitializerUseCount(graph_, constant_initializers_use_count); return FinalizeSessionStateImpl(graph_location, kernel_registry_manager, nullptr, sess_options_, - remove_initializers, constant_initializers_use_count); + remove_initializers, + GetSaveModeForPrepacks(!remove_initializers, saving_ort_format), + constant_initializers_use_count); +} + +bool SessionState::GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format) { + bool save_prepacked_constant_initializers = + sess_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsSavePrePackedConstantInitializers, + "0") == "1"; + + if (save_prepacked_constant_initializers && !saving_model) { + save_prepacked_constant_initializers = false; + LOGS(logger_, WARNING) + << "SavePrePackedConstantInitializers is set to true but the model is not being saved. Ignoring the flag."; + } + + if (save_prepacked_constant_initializers && saving_ort_format) { + save_prepacked_constant_initializers = false; + LOGS(logger_, WARNING) + << "Serializing optimized model in ORT format with external pre-packed constant initializers is not supported." + << " Ignoring the flag."; + } + + return save_prepacked_constant_initializers; } static Status Index(const OrtValueNameIdxMap& ort_value_name_idx_map, @@ -1322,11 +1440,12 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string& constant_initializers_use_count, const InlinedHashMap& outer_scope_node_arg_to_location_map, bool graph_info_already_created) { if (!graph_info_already_created) { - CreateGraphInfo(); + CreateGraphInfo(save_prepacked_initializers); } #if defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1475,21 +1594,20 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string Status { - ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse)); - if (remove_initializers) { - graph_.RemoveInitializedTensor(name); - } - return Status::OK(); - }, - logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options, - memory_profile_func, name_to_buffered_tensor_)); + ORT_RETURN_IF_ERROR(session_state_utils::SaveInitializedTensors( + Env::Default(), graph_location, *graph_viewer_, + GetAllocator(OrtDevice()), + ort_value_name_idx_map_, initializer_allocation_order, *tensor_allocator, + [this, remove_initializers](const std::string& name, int idx, const OrtValue& value, const OrtCallback& d, + bool constant, bool sparse) -> Status { + ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse)); + if (remove_initializers) { + graph_.RemoveInitializedTensor(name); + } + return Status::OK(); + }, + logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options, + memory_profile_func, name_to_buffered_tensor_, graph_.GetPrepacked())); #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) // Record Weight allocation info on device @@ -1537,15 +1655,17 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string subgraph_outer_scope_node_arg_to_location_map; ORT_RETURN_IF_ERROR(OuterScopeNodeArgLocationAccumulator(*p_seq_exec_plan_, GetOrtValueNameIdxMap(), node, subgraph_session_state.GetGraphViewer(), 
subgraph_outer_scope_node_arg_to_location_map)); + ORT_RETURN_IF_ERROR(subgraph_session_state.FinalizeSessionStateImpl( graph_location, kernel_registry_manager, &node, subgraph_session_options, remove_initializers, + save_prepacked_initializers, constant_initializers_use_count, subgraph_outer_scope_node_arg_to_location_map, true)); // setup all the info for handling the feeds and fetches used in subgraph execution diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index e1674ba4b690b..82f520f4a4252 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -164,6 +164,8 @@ class SessionState { */ const std::unordered_map& GetConstantInitializedTensors() const; + const PrepackedWeightsForGraph& GetPrepackedIniitializersForGraph() const; + #if !defined(DISABLE_SPARSE_TENSORS) bool IsSparseInitializer(int ort_value_index) const; #endif @@ -364,11 +366,20 @@ class SessionState { const SessionOptions& GetSessionOptions() const { return sess_options_; } + /// + /// Deduce the flag whether we need to enable or disable + /// saving for pre-packed weights serialization. + /// + /// + /// + /// true of false + bool GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format); + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState); // Populate OrtValueNameIdxMap and create the graph viewer. - void CreateGraphInfo(); + void CreateGraphInfo(bool save_prepacked_on); // create kernels using info in kernel_create_info_map_ Status CreateKernels(const KernelRegistryManager& custom_registry_manager); @@ -399,6 +410,7 @@ class SessionState { _In_opt_ const Node* parent_node, const SessionOptions& session_options, bool remove_initializers, + bool save_prepacked_initializers, InlinedHashMap& constant_initializers_use_count, const InlinedHashMap& outer_scope_node_arg_to_location_map = {}, bool graph_info_already_created = false); diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index 2c74805c57dce..83a353615bc35 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -68,18 +68,19 @@ struct ExtDataValueDeleter { // buffered_tensor is not null, buffered_tensor holds the real buffer pointed // by tensor_proto. buffered_tensor must be the owner of the buffer and deleter // should release the buffer when tensor_proto is released. 
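Related to the GetSaveModeForPrepacks() change above: the flag comes from a session config entry and is demoted with a warning when the optimized model is not actually being written, or is written in ORT format. A sketch of that decision logic under assumptions; the config-key string and the logging call here are placeholders, not the real ORT names:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Placeholder config lookup; the real key string lives in the ORT session options headers.
bool GetSaveModeForPrepacks(const std::unordered_map<std::string, std::string>& config,
                            bool saving_model, bool saving_ort_format) {
  auto it = config.find("save_prepacked_constant_initializers");  // hypothetical key name
  bool save = it != config.end() && it->second == "1";

  if (save && !saving_model) {
    std::cerr << "SavePrePackedConstantInitializers set but the model is not being saved; ignoring.\n";
    save = false;
  }
  if (save && saving_ort_format) {
    std::cerr << "Pre-packed external initializers are not supported with ORT format; ignoring.\n";
    save = false;
  }
  return save;
}
```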
-static inline common::Status ExtDataTensorProtoToTensor(const Env& env, - const std::basic_string& proto_path, - const ONNX_NAMESPACE::TensorProto& tensor_proto, - Tensor& tensor, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor = nullptr) { +static common::Status ExtDataTensorProtoToTensor(const Env& env, + const std::basic_string& proto_path, + const ONNX_NAMESPACE::TensorProto& tensor_proto, + Tensor& tensor, OrtCallback& ext_data_deleter, + PrepackedWeightsForGraph& prepacked_for_graph, + Tensor* buffered_tensor = nullptr) { ORT_ENFORCE(utils::HasExternalData(tensor_proto)); void* ext_data_buf = nullptr; SafeInt ext_data_len = 0; ORT_RETURN_IF_ERROR(utils::GetExtDataFromTensorProto(env, proto_path.c_str(), tensor_proto, ext_data_buf, ext_data_len, ext_data_deleter, - buffered_tensor)); + buffered_tensor, &prepacked_for_graph)); // NB: creating a do-nothing allocator per tensor is wasteful; can perhaps be // avoided if the Tensor class implements the do-nothing behavior when given a @@ -100,6 +101,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st const AllocatorPtr& alloc, const AllocatorPtr& default_cpu_alloc, OrtValue& ort_value, const DataTransferManager& data_transfer_mgr, const ExternalDataLoaderManager& external_data_loader_mgr, + PrepackedWeightsForGraph& prepacked_for_graph, bool use_device_allocator_for_initializers = false, Tensor* buffered_tensor = nullptr) { if (bool(alloc) == (m != nullptr)) { @@ -127,8 +129,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st ORT_RETURN_IF_ERROR(utils::LoadExtDataToTensorFromTensorProto(env, proto_path, tensor_proto, *external_data_loader, *p_tensor)); - auto ml_tensor = DataTypeImpl::GetType(); - ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc()); + Tensor::InitOrtValue(std::move(*p_tensor), ort_value); return common::Status::OK(); } else if (device_type == OrtDevice::CPU) { // for external initializer on CPU we will use mmap for large initializers so don't need to allocate memory in advance @@ -139,7 +140,8 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st // TensorProtoToTensor it would copy the data, causing unnecessary overhead OrtCallback ext_data_deleter; ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_tensor, - ext_data_deleter, buffered_tensor)); + ext_data_deleter, prepacked_for_graph, + buffered_tensor)); ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()}; MLDataType ml_tensor_type = DataTypeImpl::GetType(); @@ -163,8 +165,9 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st OrtCallback ext_data_deleter; std::optional scoped_ort_callback_invoker; ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor, - ext_data_deleter, buffered_tensor)); - scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter); + ext_data_deleter, prepacked_for_graph, + buffered_tensor)); + scoped_ort_callback_invoker.emplace(ext_data_deleter); // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation. 
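A small detail in the hunk above: the ScopedOrtCallbackInvoker held in a std::optional is now constructed with emplace() rather than assigned from a temporary, which avoids requiring the guard type to be movable. A minimal illustration with a hypothetical RAII guard:

```cpp
#include <functional>
#include <optional>
#include <utility>

// Hypothetical RAII guard that runs a callback when destroyed.
class ScopedInvoker {
 public:
  explicit ScopedInvoker(std::function<void()> cb) : cb_(std::move(cb)) {}
  ScopedInvoker(const ScopedInvoker&) = delete;
  ScopedInvoker& operator=(const ScopedInvoker&) = delete;
  ~ScopedInvoker() { if (cb_) cb_(); }
 private:
  std::function<void()> cb_;
};

void Example(bool need_cleanup, std::function<void()> deleter) {
  std::optional<ScopedInvoker> guard;
  if (need_cleanup) {
    // emplace() constructs in place; assignment would require a movable type.
    guard.emplace(std::move(deleter));
  }
  // ... use the externally owned buffer; the deleter runs when 'guard' goes out of scope.
}
```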
return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value); @@ -272,13 +275,14 @@ common::Status SaveInitializedTensors( const ExecutionPlanBase& exec_plan, const SessionOptions& session_options, const MemoryProfileFunction& memory_profile_func, - std::unordered_map>& buffered_tensors) { + std::unordered_map>& buffered_tensors, + PrepackedWeightsForGraph& prepacked_for_graph) { LOGS(logger, INFO) << "Saving initialized tensors."; ORT_ENFORCE(ort_value_name_idx_map.MaxIdx() > -1, "OrtValue indexes should have been populated."); // Determine if an intializer was supplied by the user for the purpose of sharing and if it requires a cross-device // copy. In case a cross-device copy is required, sharing cannot be accomplished since we allocate our own buffer - // for the destn device which cannot be shared between sessions. + // for the destination device which cannot be shared between sessions. auto use_user_supplied_initializer = [&session_options, &exec_plan, &logger, &ort_value_name_idx_map](const std::string& name) -> bool { bool retval = false; @@ -401,6 +405,7 @@ common::Status SaveInitializedTensors( Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, (m.has_value()) ? &*m : nullptr, alloc, default_cpu_alloc, ort_value, data_transfer_mgr, external_data_loader_mgr, + prepacked_for_graph, use_device_allocator_for_initializers, p_tensor); if (!st.IsOK()) { std::ostringstream oss; diff --git a/onnxruntime/core/framework/session_state_utils.h b/onnxruntime/core/framework/session_state_utils.h index af27f5caba0f4..17400c45e5f32 100644 --- a/onnxruntime/core/framework/session_state_utils.h +++ b/onnxruntime/core/framework/session_state_utils.h @@ -9,6 +9,7 @@ #include "core/common/const_pointer_container.h" #include "core/framework/allocator.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensor.h" #include "core/framework/tensor_allocator.h" #include "core/framework/session_options.h" @@ -50,7 +51,8 @@ common::Status SaveInitializedTensors( const ExecutionPlanBase& exec_plan, const SessionOptions& session_options, const MemoryProfileFunction& memory_profile_func, - std::unordered_map>& buffered_tensors); + std::unordered_map>& buffered_tensors, + PrepackedWeightsForGraph& prepacked_for_graph); common::Status AllocateTensor( const onnxruntime::MemBuffer* m, diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc index 93146e66d9f24..ec8b25e9f4afe 100644 --- a/onnxruntime/core/framework/tensor_external_data_info.cc +++ b/onnxruntime/core/framework/tensor_external_data_info.cc @@ -3,8 +3,13 @@ #include "tensor_external_data_info.h" #include "core/common/common.h" +#include "core/common/narrow.h" +#include "core/common/safeint.h" +#include "core/common/string_utils.h" #include "core/platform/path_lib.h" +#include + #ifdef _WIN32 #include #endif @@ -14,8 +19,24 @@ using ::ONNX_NAMESPACE::StringStringEntryProto; namespace onnxruntime { Status ExternalDataInfo::Create(const RepeatedPtrField& input, std::unique_ptr& out) { + auto str_to_int = [](const std::string& s, OFFSET_TYPE& result) -> Status { + char* end; +#ifdef _WIN32 + result = _strtoi64(s.c_str(), &end, 10); +#else + result = OrtStrToPtrDiff(s.c_str(), &end); +#endif + if (end != s.c_str() + s.length()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", s, " failed"); + } + return Status::OK(); + }; + out = std::make_unique(); + PrepackedInfos prepacked_infos; + const int 
input_size = input.size(); + for (int i = 0; i != input_size; ++i) { StringStringEntryProto stringmap = input[i]; if (!stringmap.has_key()) @@ -25,28 +46,112 @@ Status ExternalDataInfo::Create(const RepeatedPtrField& if (stringmap.key() == "location" && !stringmap.value().empty()) { out->rel_path_ = ToWideString(stringmap.value()); } else if (stringmap.key() == "offset" && !stringmap.value().empty()) { - char* end; -#ifdef _WIN32 - out->offset_ = _strtoi64(stringmap.value().c_str(), &end, 10); -#else - out->offset_ = OrtStrToPtrDiff(stringmap.value().c_str(), &end); -#endif - if (end != stringmap.value().c_str() + stringmap.value().length()) - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed"); + ORT_RETURN_IF_ERROR(str_to_int(stringmap.value(), out->offset_)); } else if (stringmap.key() == "length" && !stringmap.value().empty()) { char* end; - out->length_ = static_cast(OrtStrToPtrDiff(stringmap.value().c_str(), &end)); + out->length_ = narrow(OrtStrToPtrDiff(stringmap.value().c_str(), &end)); if (end != stringmap.value().c_str() + stringmap.value().length()) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed"); } else if (stringmap.key() == "checksum" && !stringmap.value().empty()) { out->checksum_ = stringmap.value(); + } else if (stringmap.key().find("prepacked", 0) == 0) { + // Starts with 'prepacked', each has its own key. + // Each prepacked entry may have multiple blobs with the same key + // we output them with the same key + // format = key|offset;length;checksum[|offset;length;checksum] + // We are ignoring invalid entries (should not be any), and rely + // on in memory pre-packs regenerated in this case. + // users can over-write this file with the correct pre-packed info. + const std::string& prepacked = stringmap.value(); + if (!prepacked.empty()) { + auto split_fields = utils::SplitString(prepacked, "|", false); + if (split_fields.size() > 1) { + const std::string key{split_fields[0]}; + auto& blob_infos = prepacked_infos[key]; + for (size_t f = 1; f < split_fields.size(); ++f) { + const auto& blob = split_fields[f]; + auto blob_fields = utils::SplitString(blob, ";", false); + if (blob_fields.size() == 3) { + OFFSET_TYPE offset, len; + ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[0]), offset)); + ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[1]), len)); + blob_infos.push_back(std::make_tuple(offset, narrow(len), std::string(blob_fields[2]))); + } + } + if (blob_infos.empty()) { + prepacked_infos.erase(key); + } + } + } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error!"); } } + if (out->rel_path_.empty()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error! 
Missing 'location'"); } + + if (!prepacked_infos.empty()) { + out->prepacked_infos_ = std::move(prepacked_infos); + } + return Status::OK(); } +void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& external_file_path, + int64_t external_offset, size_t tensor_bytes_size, + ::ONNX_NAMESPACE::TensorProto& proto) { + proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + + auto* location = proto.add_external_data(); + location->set_key("location"); + location->set_value(ToUTF8String(external_file_path.native())); + + auto* offset = proto.add_external_data(); + offset->set_key("offset"); + offset->set_value(std::to_string(external_offset)); + + auto* length = proto.add_external_data(); + length->set_key("length"); + length->set_value(std::to_string(tensor_bytes_size)); +} + +std::ostream& ExternalDataInfo::WritePrepackedToFileAndAddToProto( + const PrepackedWeightsForGraph& prepacked_for_graph, + const InlinedHashSet& blob_keys, bool align, + int64_t align_threshold, int64_t allocation_granularity, + std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto) { + size_t key_count = 0; + for (const auto& key : blob_keys) { + size_t prepack_count = 0; + const auto* prepacked_weights = prepacked_for_graph.GetPrepackedWeights(key); + ORT_ENFORCE(prepacked_weights != nullptr, "Prepacked weights not found for key ", key); + std::stringstream prepacked_entry; + prepacked_entry << key << "|"; + for (size_t i = 0, size = prepacked_weights->buffers_.size(); i < size; ++i) { + const auto size_in_bytes = prepacked_weights->buffer_sizes_[i]; + if (align && static_cast(size_in_bytes) > align_threshold) { + // return early on error + if (!AlignAndPad(os, allocation_granularity, external_offset)) { + return os; + } + } + if (prepack_count++ > 0) { + prepacked_entry << "|"; + } + // Checksum is currently not validated + prepacked_entry << external_offset << ";" << size_in_bytes << ";0"; + if (!os.write(reinterpret_cast(prepacked_weights->buffers_[i].get()), size_in_bytes)) { + return os; + } + external_offset = SafeInt(external_offset) + size_in_bytes; + } + auto* prepacked = proto.add_external_data(); + std::string prepacked_key("prepacked_"); + prepacked_key.append(std::to_string(key_count++)); + prepacked->set_key(std::move(prepacked_key)); + prepacked->set_value(prepacked_entry.str()); + } + return os; +} } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h index afc8fda6c3037..1b185b8c5da7d 100644 --- a/onnxruntime/core/framework/tensor_external_data_info.h +++ b/onnxruntime/core/framework/tensor_external_data_info.h @@ -2,12 +2,21 @@ // Licensed under the MIT License. 
#pragma once +#include +#include +#include #include +#include + +#include +#include "core/common/path_string.h" +#include "core/common/safeint.h" #include "core/common/status.h" +#include "core/framework/prepacked_weights_container.h" #include "core/graph/onnx_protobuf.h" -#include "core/session/onnxruntime_c_api.h" namespace onnxruntime { + class ExternalDataInfo { public: #ifdef _WIN32 @@ -16,7 +25,7 @@ class ExternalDataInfo { using OFFSET_TYPE = off_t; #endif - const std::basic_string& GetRelPath() const { return rel_path_; } + const PathString& GetRelPath() const { return rel_path_; } OFFSET_TYPE GetOffset() const { return offset_; } size_t GetLength() const { return length_; } @@ -29,12 +38,58 @@ class ExternalDataInfo { const ::google::protobuf::RepeatedPtrField<::ONNX_NAMESPACE::StringStringEntryProto>& input, std::unique_ptr& out); + static void SetExternalLocationToProto(const std::filesystem::path& external_file_path, + int64_t offset, + size_t tensor_bytes_size, + ::ONNX_NAMESPACE::TensorProto& proto); + + // Pads the output with zeros according to the specified allocation_granularity + // It updates external_offset for alignment. + // need to do padding before write actual tensor data as we do offset alignment at the begin of + // large tensors (offset need to be page aligned and allocation granularity aligned) like below: + // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX + // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| + static std::ostream& AlignAndPad(std::ostream& stream, int64_t allocation_granularity, int64_t& external_offset) { + // Align to the larger of the page size or the allocation granularity + int64_t alignment_factor = std::max(static_cast(4096), allocation_granularity); + // Align to the next page or alloc granularity boundary + SafeInt safe_external_offset = external_offset; + int64_t new_external_offset = ((safe_external_offset + alignment_factor - 1) / alignment_factor) * + alignment_factor; + + // padding tensor with zeros for alignment + for (int64_t index = external_offset; index != new_external_offset; ++index) { + stream << '\0'; + } + external_offset = new_external_offset; + return stream; + } + + static std::ostream& WritePrepackedToFileAndAddToProto( + const PrepackedWeightsForGraph& prepacked_for_graph, + const InlinedHashSet& blob_keys, + bool align, int64_t align_threshold, int64_t allocation_granularity, + std::ostream& os, + int64_t& external_offset, + ::ONNX_NAMESPACE::TensorProto& proto); + + using PrepackedInfo = std::tuple; + using PrepackedInfos = std::unordered_map>; + + bool HasPrepackedInfo() const noexcept { return !prepacked_infos_.empty(); } + + PrepackedInfos&& TakePrepackedInfos() { return std::move(prepacked_infos_); } + private: - std::basic_string rel_path_; + PathString rel_path_; OFFSET_TYPE offset_ = 0; // 0 means the whole file size_t length_ = 0; std::string checksum_; + + // Pre-packed blobs found associated with this TensorProto if present + // format key, offset, length, checksum + PrepackedInfos prepacked_infos_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 2af9f95ad059e..097ce436f4419 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -234,7 +234,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, const 
std::filesystem::path& tensor_proto_dir, std::basic_string& external_file_path, onnxruntime::FileOffsetType& file_offset, - SafeInt& tensor_byte_size) { + SafeInt& tensor_byte_size, + ExternalDataInfo::PrepackedInfos* prepacked_infos) { ORT_RETURN_IF_NOT(onnxruntime::utils::HasExternalData(tensor_proto), "Tensor does not have external data to read from."); @@ -258,6 +259,10 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, file_offset = external_data_info->GetOffset(); + if (prepacked_infos != nullptr && external_data_info->HasPrepackedInfo()) { + *prepacked_infos = external_data_info->TakePrepackedInfos(); + } + return Status::OK(); } @@ -988,7 +993,8 @@ static Status GetFileContent(const Env& env, const std::filesystem::path& file_p Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& model_path, const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor) { + Tensor* buffered_tensor, + PrepackedWeightsForGraph* prepacked_info) { ORT_ENFORCE(utils::HasExternalData(tensor_proto)); std::basic_string tensor_proto_dir; if (!model_path.empty()) { @@ -997,8 +1003,13 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo std::basic_string external_data_file_path; FileOffsetType file_offset; SafeInt raw_data_safe_len = 0; + std::optional prepacked_infos; + if (prepacked_info != nullptr) { + prepacked_infos.emplace(); + } ORT_RETURN_IF_ERROR( - GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, raw_data_safe_len)); + GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, + raw_data_safe_len, (prepacked_info != nullptr) ? 
&*prepacked_infos : nullptr)); if (external_data_file_path == onnxruntime::utils::kTensorProtoMemoryAddressTag) { // the value in location is the memory address of the data @@ -1042,6 +1053,33 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), file_offset, raw_data_safe_len, ext_data_buf, ext_data_deleter)); ext_data_len = raw_data_safe_len; + + if (prepacked_info != nullptr && !prepacked_infos->empty()) { + for (const auto& [key, blobs] : *prepacked_infos) { + PrePackedWeights prepacked_weights; + prepacked_weights.buffers_.reserve(blobs.size()); + prepacked_weights.buffer_sizes_.reserve(blobs.size()); + for (const auto& blob : blobs) { + const auto blob_offset = std::get<0>(blob); + const auto blob_length = std::get<1>(blob); + SafeInt end_of_blob{blob_offset}; + end_of_blob += blob_length; + ORT_RETURN_IF(blob_offset < 0 || static_cast(end_of_blob) > file_length, + "Pre-packed blob: ", key, " offset: ", blob_offset, " file_length: ", file_length, + " is out of bounds and can not read in full"); + void* data_ptr; + OrtCallback data_deleter; + ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), blob_offset, blob_length, + data_ptr, data_deleter)); + IAllocatorUniquePtr data_ptr_unique{data_ptr, OrtCallbackInvoker(data_deleter)}; + prepacked_weights.buffers_.push_back(std::move(data_ptr_unique)); + prepacked_weights.buffer_sizes_.push_back(blob_length); + } + if (!blobs.empty()) { + prepacked_info->InsertPrepackedWeights(key, std::move(prepacked_weights)); + } + } + } #endif } diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index 262f7adaca1cb..7b9a47842388c 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -3,20 +3,21 @@ #pragma once -#include -#include -#include #include +#include +#include +#include #ifndef SHARED_PROVIDER #include "core/common/common.h" #include "core/common/status.h" #include "core/common/safeint.h" -#include "core/framework/endian_utils.h" #include "core/framework/allocator.h" +#include "core/framework/endian_utils.h" #include "core/framework/external_data_loader.h" -#include "core/framework/ort_value.h" #include "core/framework/mem_buffer.h" +#include "core/framework/ort_value.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensor_external_data_info.h" #include "core/graph/onnx_protobuf.h" #include "core/platform/env.h" @@ -36,7 +37,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, const std::filesystem::path& tensor_proto_dir, std::basic_string& external_file_path, onnxruntime::FileOffsetType& file_offset, - SafeInt& tensor_byte_size); + SafeInt& tensor_byte_size, + ExternalDataInfo::PrepackedInfos* prepacked_infos = nullptr); /** * This function is used to convert the endianess of Tensor data. * Mostly, will be used in big endian system to support the model file @@ -172,7 +174,8 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem:: const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor = nullptr); + Tensor* buffered_tensor = nullptr, + PrepackedWeightsForGraph* prepacked_for_graph = nullptr); // Given a tensor proto with external data obtain a tensor using the specified custom external data loader. 
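The blob re-load path above rejects any offset/length pair falling outside the external file. A tiny sketch of an overflow-safe version of that bounds check (a stand-in for the SafeInt-based code in the diff):

```cpp
#include <cstdint>

// True if [offset, offset + length) lies within a file of file_length bytes.
// Written so that offset + length cannot overflow a signed 64-bit value.
bool BlobInBounds(int64_t offset, int64_t length, int64_t file_length) {
  if (offset < 0 || length < 0 || file_length < 0) return false;
  return length <= file_length && offset <= file_length - length;
}
```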
common::Status LoadExtDataToTensorFromTensorProto(const Env& env, const std::filesystem::path& model_path, diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index c7a0793c4748f..d78fe7111c9be 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3371,7 +3371,8 @@ void RegisterContribSchemas() { "tensor(uint64)", "tensor(float16)", "tensor(float)", - "tensor(double)"}, + "tensor(double)", + "tensor(bfloat16)"}, "Constrain input and output types."); static const char* BitmaskDropout_ver1_doc = R"DOC( diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index e8a5855b36496..0b6610db5e007 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -18,6 +18,7 @@ #include "core/flatbuffers/flatbuffers_utils.h" #include "core/flatbuffers/schema/ort.fbs.h" #include "core/framework/tensor_shape.h" +#include "core/framework/tensor_external_data_info.h" #include "core/framework/tensorprotoutils.h" #include "core/framework/utils.h" #include "core/graph/graph_flatbuffers_utils.h" @@ -25,6 +26,7 @@ #include "core/graph/indexed_sub_graph.h" #include "core/graph/model.h" #include "core/graph/model_load_utils.h" +#include "core/graph/model_saving_options.h" #include "core/graph/node_attr_utils.h" #include "core/graph/op.h" #include "core/graph/runtime_optimization_record_container.h" @@ -1543,6 +1545,17 @@ Status Graph::VerifyNoDuplicateName() { #endif // !defined(ORT_MINIMAL_BUILD) +void Graph::ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on) { + if (parent_graph_ == nullptr) { + prepacked_key_to_blobs_.emplace(); + prepacked_weights_for_graph_.emplace(*prepacked_key_to_blobs_, saving_mode_on); + } else { + // Subgraph + prepacked_weights_for_graph_.emplace(parent_graph_->prepacked_weights_for_graph_->GetKeyToBlob(), + saving_mode_on); + } +} + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) void Graph::AddEdge(NodeIndex src_node_index, NodeIndex dst_node_index, int src_arg_slot, int dst_arg_slot) { if (nodes_.size() <= src_node_index || src_arg_slot < 0 || nodes_.size() <= dst_node_index || dst_arg_slot < 0 || @@ -4084,82 +4097,103 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const { return result; } -ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, - const std::filesystem::path& model_file_path, - size_t initializer_size_threshold, - const OffsetAlignmentInfo& align_info) const { - GraphProto result; - ToGraphProtoInternal(result); - ORT_ENFORCE(external_file_path.is_relative()); - // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could - // be empty. Else, save external data file in same directory as the model. - const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path; +Status Graph::AddExternalInitializersToGraphProtoImpl( + const std::filesystem::path& model_path, + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_external_file_path, + const ModelSavingOptions& model_saving_options, + ONNX_NAMESPACE::GraphProto& output_graph_proto, + std::ostream& external_stream, + int64_t& external_offset) const { + // Process initializers in a subgraph, check their size and + // write to an external file. 
This function also saves pre-packed + // blobs for the initializer being saved to disk, if the initializer has any pre-packs. + // This function is invoked by ToGraphProtoWithExternalInitiallizers() and processes subgraphs + // bottom up. + for (const auto& node : Nodes()) { + if (node.ContainsSubgraph()) { + // Let find this node in the output_graph_proto + auto hit = std::find_if(output_graph_proto.mutable_node()->begin(), + output_graph_proto.mutable_node()->end(), + [&node](const ONNX_NAMESPACE::NodeProto& proto) { + return proto.name() == node.Name(); + }); + ORT_RETURN_IF_NOT(hit != output_graph_proto.mutable_node()->end(), "Node ", node.Name(), + " not found in output_graph_proto"); + auto& result_node = *hit; + for (const auto& e : node.GetAttributeNameToSubgraphMap()) { + const auto& name = e.first; + const auto& subgraph = e.second; + // Lets find this subgraph in the result_node + auto sub_hit = std::find_if(result_node.mutable_attribute()->begin(), + result_node.mutable_attribute()->end(), + [&name](const ONNX_NAMESPACE::AttributeProto& proto) { + return proto.name() == name; + }); + ORT_RETURN_IF_NOT(sub_hit != result_node.mutable_attribute()->end() && utils::HasGraph(*sub_hit), + "Subgraph ", name, " is referred to in GetAttributeNameToSubgraphMap, but not found in node ", + node.Name(), " while attempting to recurse into it."); + auto& result_subgraph = *sub_hit->mutable_g(); + ORT_RETURN_IF_ERROR(subgraph->AddExternalInitializersToGraphProtoImpl( + model_path, external_file_path, + model_external_file_path, model_saving_options, + result_subgraph, + external_stream, external_offset)); + } + } + } - std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary); - ORT_ENFORCE(external_stream.is_open()); - int64_t external_offset = 0; + // Used only when pre-packed weights are serialized + InlinedHashSet processed_weights; + // prepacked_weights_for_graph_ is present only when SessionState is finalized. + const bool process_prepacks = prepacked_weights_for_graph_.has_value() && + prepacked_weights_for_graph_->GetNumberOfWeightsForWriting() > 0; + if (process_prepacks) { + processed_weights.reserve(graph_proto_->initializer_size()); + } // Add the initializers to the result graph. - const auto& model_path = ModelPath(); -#if !defined(DISABLE_SPARSE_TENSORS) - const auto sparse_end = sparse_tensor_names_.end(); -#endif - for (const auto& initializer : graph_proto_->initializer()) { #if !defined(DISABLE_SPARSE_TENSORS) - if (sparse_end != sparse_tensor_names_.find(initializer.name())) { + if (IsSparseInitializer(initializer.name())) { // Sparse tensors are added to the ONNX file. - auto& sparse_initializer = *result.add_sparse_initializer(); + auto& sparse_initializer = *output_graph_proto.add_sparse_initializer(); auto status = utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer); - ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse"); + ORT_RETURN_IF_NOT(status.IsOK(), "Failed to convert dense initializer to sparse"); } else { #endif // Dense tensors larger than the threshold are added to the external file. 
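AddExternalInitializersToGraphProtoImpl() above works bottom-up: for each node that owns subgraphs it finds the matching NodeProto by name in the already-serialized output, then the graph attribute by name, and recurses into it. A structural sketch of that matching step with simplified stand-in types (not protobuf):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Simplified stand-ins for NodeProto / AttributeProto with a graph payload.
struct GraphLike;
struct AttrLike { std::string name; GraphLike* graph = nullptr; };
struct NodeLike { std::string name; std::vector<AttrLike> attributes; };
struct GraphLike { std::vector<NodeLike> nodes; };

// Finds the serialized subgraph corresponding to (node_name, attr_name), or nullptr.
GraphLike* FindSubgraph(GraphLike& out, const std::string& node_name, const std::string& attr_name) {
  auto node_it = std::find_if(out.nodes.begin(), out.nodes.end(),
                              [&](const NodeLike& n) { return n.name == node_name; });
  if (node_it == out.nodes.end()) return nullptr;
  auto attr_it = std::find_if(node_it->attributes.begin(), node_it->attributes.end(),
                              [&](const AttrLike& a) { return a.name == attr_name; });
  return attr_it == node_it->attributes.end() ? nullptr : attr_it->graph;
}
```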
- TensorProto* output_proto = result.add_initializer(); + TensorProto* output_proto = output_graph_proto.add_initializer(); std::vector raw_data; - ORT_THROW_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data)); + ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data)); size_t tensor_bytes_size = raw_data.size(); - if (tensor_bytes_size < initializer_size_threshold) { + if (tensor_bytes_size < model_saving_options.initializer_size_threshold) { *output_proto = initializer; + if (process_prepacks) { + // These pre-packs will reside in memory + processed_weights.insert(initializer.name()); + } continue; } // update external_offset for alignment // need to do padding before write actual tensor data as we do offset alignment at the begin of - // large tensors (offset need to be page aligned and alloction granularity aligned) like below: + // large tensors (offset need to be page aligned and allocation granularity aligned) like below: // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX - // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->| - if (align_info.align_offset && static_cast(tensor_bytes_size) > align_info.align_threshold) { - // Align to the larger of the page size or the allocation granularity - int64_t alignment_factor = std::max(static_cast(4096), align_info.allocation_granularity); - // Align to the next page or alloc granularity boundary - int64_t new_external_offset = static_cast( - std::floor((external_offset + alignment_factor - 1) / alignment_factor)) * - alignment_factor; - - // padding tensor with zeros for alignment - for (int64_t index = external_offset; index != new_external_offset; ++index) { - external_stream << '0'; - } - - external_offset = new_external_offset; + // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| + if (model_saving_options.align_offset && static_cast(tensor_bytes_size) > + model_saving_options.align_threshold) { + ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.allocation_granularity, + external_offset), + "Failed writing external data to: ", model_external_file_path); } - for (size_t index = 0; index != tensor_bytes_size; ++index) { - external_stream << raw_data[index]; - } + ORT_RETURN_IF_NOT(external_stream.write(reinterpret_cast(raw_data.data()), tensor_bytes_size), + "Failed to write external initializers to file: ", model_external_file_path); - output_proto->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); - ONNX_NAMESPACE::StringStringEntryProto* location = output_proto->add_external_data(); - location->set_key("location"); - location->set_value(ToUTF8String(external_file_path.native())); - ONNX_NAMESPACE::StringStringEntryProto* offset = output_proto->add_external_data(); - offset->set_key("offset"); - offset->set_value(std::to_string(external_offset)); - ONNX_NAMESPACE::StringStringEntryProto* length = output_proto->add_external_data(); - length->set_key("length"); - length->set_value(std::to_string(tensor_bytes_size)); + ExternalDataInfo::SetExternalLocationToProto(external_file_path, external_offset, + tensor_bytes_size, *output_proto); output_proto->set_name(initializer.name()); output_proto->set_data_type(initializer.data_type()); @@ -4168,12 +4202,74 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std } 
output_proto->set_doc_string(initializer.doc_string()); - external_offset += tensor_bytes_size; + external_offset = SafeInt(external_offset) + tensor_bytes_size; + + if (process_prepacks) { + // check if this weight was referred to in subgraphs + InlinedHashSet blob_keys_to_external_data; + + // See if this weight has any pre-prepacks referred to in this graph. + const auto* blobs_keys_for_weight = prepacked_weights_for_graph_->GetKeysForWeightForSaving(initializer.name()); + if (blobs_keys_for_weight != nullptr && !blobs_keys_for_weight->empty()) { + // Add all the blob_keys to the set of keys to process + blob_keys_to_external_data.insert(blobs_keys_for_weight->begin(), blobs_keys_for_weight->end()); + } + + if (!blob_keys_to_external_data.empty()) { + auto& os = ExternalDataInfo::WritePrepackedToFileAndAddToProto( + *prepacked_weights_for_graph_, blob_keys_to_external_data, + model_saving_options.align_offset, model_saving_options.align_threshold, + model_saving_options.allocation_granularity, + external_stream, external_offset, *output_proto); + ORT_RETURN_IF_NOT(os.good(), "Failed to write pre-packed blobs to external file"); + } + + processed_weights.insert(initializer.name()); + } + #if !defined(DISABLE_SPARSE_TENSORS) } #endif } + // Check if there are any pre-packed weights this graph refers to, but they have + // not been processed. + if (process_prepacks) { + const auto& sorted_by_weights = prepacked_weights_for_graph_->GetWeightToPrepack(); + for (const auto& [weight_name, blob_keys] : sorted_by_weights) { + ORT_ENFORCE(processed_weights.find(weight_name) != processed_weights.end()); + } + } + + return Status::OK(); +} + +ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers( + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_file_path, + const ModelSavingOptions& model_saving_options) const { + GraphProto result; + ToGraphProtoInternal(result); + ORT_ENFORCE(external_file_path.is_relative()); + // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could + // be empty. Else, save external data file in same directory as the model. + const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path; + const auto& model_path = ModelPath(); + + // Create the external file. 
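The rewritten initializer loop above keeps tensors below the size threshold inline, and for large ones optionally pads the stream to a page/allocation-granularity boundary before appending the raw bytes and recording the external location on the proto. A compact sketch of that sequence with simplified record keeping (threshold handling and the record format are reduced):

```cpp
#include <algorithm>
#include <cstdint>
#include <ostream>
#include <string>
#include <vector>

struct ExtRecord { std::string name; int64_t offset; int64_t length; };

// Pads 'os' with zero bytes so that 'offset' becomes a multiple of the alignment.
bool AlignAndPad(std::ostream& os, int64_t granularity, int64_t& offset) {
  const int64_t align = std::max<int64_t>(4096, granularity);
  const int64_t aligned = ((offset + align - 1) / align) * align;
  for (int64_t i = offset; i < aligned; ++i) os.put('\0');
  offset = aligned;
  return static_cast<bool>(os);
}

// Writes one initializer; tensors under the threshold stay inside the model proto.
bool WriteInitializer(std::ostream& os, const std::string& name,
                      const std::vector<char>& bytes, int64_t threshold,
                      int64_t granularity, int64_t& offset,
                      std::vector<ExtRecord>& records) {
  const int64_t size = static_cast<int64_t>(bytes.size());
  if (size < threshold) return true;  // kept inline, nothing written externally
  if (!AlignAndPad(os, granularity, offset)) return false;
  if (!os.write(bytes.data(), size)) return false;
  records.push_back({name, offset, size});
  offset += size;
  return true;
}
```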
+ std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary); + ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path); + int64_t external_offset = 0; + + ORT_THROW_IF_ERROR(AddExternalInitializersToGraphProtoImpl(model_path, external_file_path, + modified_external_file_path, model_saving_options, + result, + external_stream, external_offset)); + + if (!external_stream.flush()) { + ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path); + } + return result; } diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index 1bae63b510563..be0531e6473fb 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -383,14 +383,12 @@ ModelProto Model::ToProto() const { ModelProto Model::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) const { + const ModelSavingOptions& model_saving_options) const { ModelProto result(model_proto_); const auto& graph = *graph_; *(result.mutable_graph()) = graph.ToGraphProtoWithExternalInitializers(external_file_name, file_path, - initializer_size_threshold, - align_info); + model_saving_options); return result; } @@ -607,16 +605,13 @@ template static Status SaveModelWithExternalInitializers(Model& model, const T& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& save_options) { int fd = 0; Status status = Env::Default().FileOpenWr(file_path, fd); ORT_RETURN_IF_ERROR(status); ORT_TRY { - status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, - initializer_size_threshold, - align_info); + status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, save_options); } ORT_CATCH(const std::exception& ex) { ORT_HANDLE_EXCEPTION([&]() { @@ -646,10 +641,8 @@ Status Model::Load(const PathString& file_path, std::shared_ptr& p_model, Status Model::SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { - return SaveModelWithExternalInitializers(model, file_path, external_file_name, initializer_size_threshold, - align_info); + const ModelSavingOptions& save_options) { + return SaveModelWithExternalInitializers(model, file_path, external_file_name, save_options); } Status Model::LoadFromBytes(int count, const void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { @@ -765,8 +758,7 @@ Status Model::SaveWithExternalInitializers(Model& model, int fd, const std::filesystem::path& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& model_saving_options) { if (fd < 0) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, " is less than 0."); } @@ -774,8 +766,7 @@ Status Model::SaveWithExternalInitializers(Model& model, ORT_RETURN_IF_ERROR(model.MainGraph().Resolve()); auto model_proto = model.ToGraphProtoWithExternalInitializers(external_file_name, file_path, - initializer_size_threshold, - align_info); + model_saving_options); google::protobuf::io::FileOutputStream output(fd); const bool 
result = model_proto.SerializeToZeroCopyStream(&output) && output.Flush(); if (result) { diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 9bcec6f78ca08..2d2086aef41fd 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -20,6 +20,8 @@ namespace onnxruntime { +class PrepackedShareableWeightsContainer; + namespace fbs { struct Model; } // namespace fbs @@ -190,15 +192,7 @@ class Model { // initializer offset could be page aligned and allocation granularity aligned for mmap support. ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) const; - - ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, - const std::filesystem::path& file_path, - size_t initializer_size_threshold) const { - Graph::OffsetAlignmentInfo default_align_info; - return ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold, default_align_info); - } + const ModelSavingOptions& model_saving_options) const; static common::Status Save(Model& model, const PathString& file_path); @@ -209,32 +203,13 @@ class Model { static common::Status SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path, const std::filesystem::path& external_file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info); - - static common::Status SaveWithExternalInitializers(Model& model, - const std::filesystem::path& file_path, - const std::filesystem::path& external_file_path, - size_t initializer_size_threshold) { - Graph::OffsetAlignmentInfo default_align_info; - return SaveWithExternalInitializers(model, file_path, external_file_path, initializer_size_threshold, default_align_info); - } - - static common::Status SaveWithExternalInitializers(Model& model, - int fd, - const std::filesystem::path& file_path, - const std::filesystem::path& external_file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info); + const ModelSavingOptions& save_options); static common::Status SaveWithExternalInitializers(Model& model, int fd, const std::filesystem::path& file_path, const std::filesystem::path& external_file_path, - size_t initializer_size_threshold) { - Graph::OffsetAlignmentInfo default_align_info; - return SaveWithExternalInitializers(model, fd, file_path, external_file_path, initializer_size_threshold, default_align_info); - } + const ModelSavingOptions& save_options); static common::Status Load(std::istream& model_istream, ONNX_NAMESPACE::ModelProto* p_model_proto); diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 76ccd361761a7..ede8d67e64381 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -42,6 +42,8 @@ using ProviderType = const std::string&; class RandomGenerator; class IOnnxRuntimeOpSchemaCollection; +struct ModelSavingOptions; + #ifdef ENABLE_TRAINING_TORCH_INTEROP namespace contrib { class PythonOpBase; @@ -964,7 +966,11 @@ struct ProviderHost { virtual void Model__operator_delete(Model* p) = 0; virtual Graph& Model__MainGraph(Model* p) = 0; virtual std::unique_ptr Model__ToProto(Model* p) = 0; - virtual 
std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) = 0; + virtual std::unique_ptr Model__ToGraphProtoWithExternalInitializers( + Model* p, + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, + const ModelSavingOptions&) = 0; virtual const ModelMetaData& Model__MetaData(const Model* p) const noexcept = 0; virtual Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index b7817e98377eb..6a74221428fc3 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -993,6 +993,8 @@ struct NodeUnit final { void operator=(const NodeUnit& v) = delete; }; +struct ModelSavingOptions; + struct Model final { static std::unique_ptr Create(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { @@ -1007,7 +1009,12 @@ struct Model final { Graph& MainGraph() { return g_host->Model__MainGraph(this); } std::unique_ptr ToProto() { return g_host->Model__ToProto(this); } - std::unique_ptr ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) { return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path, initializer_size_threshold); } + std::unique_ptr ToGraphProtoWithExternalInitializers( + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, const ModelSavingOptions& model_saving_options) { + return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path, + model_saving_options); + } const ModelMetaData& MetaData() const noexcept { return g_host->Model__MetaData(this); } Model() = delete; diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index 191d26f3ab269..e7b39546fda6a 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -9,6 +9,7 @@ #include #include +#include "core/graph/model_saving_options.h" #include "core/providers/shared_library/provider_api.h" #include "./vai_assert.h" @@ -111,7 +112,9 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri if (initializer_size_threshold == std::numeric_limits::max()) { model_proto = model->ToProto(); } else { - model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), initializer_size_threshold); + ModelSavingOptions model_saving_options{initializer_size_threshold}; + model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), + model_saving_options); } auto& metadata = model->MetaData(); if (!metadata.empty()) { diff --git a/onnxruntime/core/providers/webgpu/buffer_manager.cc b/onnxruntime/core/providers/webgpu/buffer_manager.cc index 45eb123943de9..233bb24083289 100644 --- a/onnxruntime/core/providers/webgpu/buffer_manager.cc +++ b/onnxruntime/core/providers/webgpu/buffer_manager.cc @@ -321,8 +321,8 @@ void 
BufferManager::Download(WGPUBuffer src, void* dst, size_t size) { // TODO: revise wait in whole project - ORT_ENFORCE(context_.Wait(staging_buffer.MapAsync(wgpu::MapMode::Read, 0, buffer_size, wgpu::CallbackMode::WaitAnyOnly, [](wgpu::MapAsyncStatus status, const char* message) { - ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", message); + ORT_ENFORCE(context_.Wait(staging_buffer.MapAsync(wgpu::MapMode::Read, 0, buffer_size, wgpu::CallbackMode::WaitAnyOnly, [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", std::string_view{message}); })) == Status::OK()); auto mapped_data = staging_buffer.GetConstMappedRange(); diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_elements.cc b/onnxruntime/core/providers/webgpu/tensor/gather_elements.cc new file mode 100644 index 0000000000000..00d8caf2624a9 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/gather_elements.cc @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/inlined_containers.h" +#include "core/providers/webgpu/tensor/gather_elements.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +namespace onnxruntime { +namespace webgpu { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + GatherElements, + kOnnxDomain, + 11, 12, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedFloatTypes()), + GatherElements); + +ONNX_OPERATOR_KERNEL_EX( + GatherElements, + kOnnxDomain, + 13, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedFloatTypes()), + GatherElements); + +Status GatherElementsProgram::GenerateShaderCode(ShaderHelper& shader) const { + const ShaderVariableHelper& input = shader.AddInput("input", ShaderUsage::UseUniform); + const ShaderVariableHelper& indices = shader.AddInput("indices", ShaderUsage::UseUniform); + const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform); + + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << "let output_indices = " << output.OffsetToIndices("global_idx") << ";\n" + << "var idx = " << indices.GetByOffset("global_idx") << ";\n" + << "if (idx < 0) {\n" + << " idx = idx + uniforms.axis_dim_limit;\n" + << "}\n" + << "var input_indices = output_indices;\n" + << input.IndicesSet("input_indices", "uniforms.axis", "u32(idx)") << ";\n" + << "let value = " << input.GetByIndices("input_indices") << ";\n" + << output.SetByOffset("global_idx", "value") << ";\n"; + + return Status::OK(); +} + +Status GatherElements::ComputeInternal(ComputeContext& context) const { + const auto* input_tensor = context.Input(0); + const TensorShape& input_shape = input_tensor->Shape(); + int64_t input_rank = input_shape.NumDimensions(); + + const auto* indices_tensor = context.Input(1); + const TensorShape& indices_shape = indices_tensor->Shape(); + + // Handle negative axis + int64_t axis = axis_; + if (axis < 0) { + axis += input_rank; + } + + auto axis_dim_limit = input_shape[axis]; + + auto output_dims = indices_shape.AsShapeVector(); + TensorShape output_shape(output_dims); + auto* output_tensor = context.Output(0, output_shape); + int64_t output_size = output_tensor->Shape().Size(); + + if (output_size == 0) { + 
return Status::OK(); + } + + GatherElementsProgram program{}; + program + .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddInputs({{indices_tensor, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddOutputs({output_tensor}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({{static_cast(output_size)}, + {static_cast(axis_dim_limit)}, + {static_cast(axis)}}); + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/tensor/gather_elements.h b/onnxruntime/core/providers/webgpu/tensor/gather_elements.h new file mode 100644 index 0000000000000..f70bbda84c933 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/gather_elements.h @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/program.h" + +namespace onnxruntime { +namespace webgpu { + +class GatherElementsProgram final : public Program { + public: + GatherElementsProgram() : Program{"GatherElements"} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}, + {"axis_dim_limit", ProgramUniformVariableDataType::Int32}, + {"axis", ProgramUniformVariableDataType::Int32}); +}; + +class GatherElements final : public WebGpuKernel { + public: + GatherElements(const OpKernelInfo& info) : WebGpuKernel(info) { + axis_ = info.GetAttrOrDefault("axis", 0); + } + + Status ComputeInternal(ComputeContext& context) const override; + + private: + int64_t axis_; +}; + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index d66c2a79d28a8..b2f7748a54743 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -10,6 +10,8 @@ #endif #include "core/common/common.h" +#include "core/common/path_string.h" +#include "core/platform/env.h" #include "core/providers/webgpu/compute_context.h" #include "core/providers/webgpu/webgpu_context.h" @@ -23,37 +25,38 @@ namespace onnxruntime { namespace webgpu { -void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info, const void* dawn_proc_table) { - std::call_once(init_flag_, [this, &webgpu_ep_info, dawn_proc_table]() { - // Initialization.Step.1 - Create wgpu::Instance - if (instance_ == nullptr) { - const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); -#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) - ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); -#else -#if !defined(USE_EXTERNAL_DAWN) - if (dawn_procs == nullptr) { - dawn_procs = &dawn::native::GetProcs(); +void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_config, int backend_type) { + std::call_once(init_flag_, [this, &buffer_cache_config, backend_type]() { + // Create wgpu::Adapter + if (adapter_ == nullptr) { +#if !defined(__EMSCRIPTEN__) && defined(_MSC_VER) && defined(DAWN_ENABLE_D3D12) && !defined(USE_EXTERNAL_DAWN) + // If we are using the D3D12 backend on Windows and the build does not use external Dawn, dxil.dll and dxcompiler.dll are required. 
+ // + // Dawn will try to load them later, but if they are in the different directory to the executable, it may fail to find them. + // To avoid this issue, we try to load them from the same directory as current module (usually onnxruntime.dll). + auto runtime_path = Env::Default().GetRuntimePath(); + if (!runtime_path.empty()) { + Status status; + void* module_handle = nullptr; + + PathString dxil_path = runtime_path + ToPathString(L"dxil.dll"); + status = Env::Default().LoadDynamicLibrary(dxil_path, false, &module_handle); + if (status.IsOK() && module_handle != nullptr) { + modules_.Add(dxil_path, module_handle); + } + + PathString dxcompiler_path = runtime_path + ToPathString(L"dxcompiler.dll"); + status = Env::Default().LoadDynamicLibrary(dxcompiler_path, false, &module_handle); + if (status.IsOK() && module_handle != nullptr) { + modules_.Add(dxcompiler_path, module_handle); + } } -#else - ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); -#endif - dawnProcSetProcs(dawn_procs); #endif - wgpu::InstanceDescriptor instance_desc{}; - instance_desc.features.timedWaitAnyEnable = true; - instance_ = wgpu::CreateInstance(&instance_desc); - - ORT_ENFORCE(instance_ != nullptr, "Failed to create wgpu::Instance."); - } - - // Initialization.Step.2 - Create wgpu::Adapter - if (adapter_ == nullptr) { wgpu::RequestAdapterOptions req_adapter_options = {}; wgpu::DawnTogglesDescriptor adapter_toggles_desc = {}; req_adapter_options.nextInChain = &adapter_toggles_desc; - req_adapter_options.backendType = static_cast(webgpu_ep_info.backend_type); + req_adapter_options.backendType = static_cast(backend_type); req_adapter_options.powerPreference = wgpu::PowerPreference::HighPerformance; auto enabled_adapter_toggles = GetEnabledAdapterToggles(); @@ -72,7 +75,7 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info ORT_ENFORCE(adapter_ != nullptr, "Failed to get a WebGPU adapter."); } - // Initialization.Step.3 - Create wgpu::Device + // Create wgpu::Device if (device_ == nullptr) { wgpu::DeviceDescriptor device_desc = {}; wgpu::DawnTogglesDescriptor device_toggles_desc = {}; @@ -124,7 +127,10 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info device_limits_ = device_supported_limits.limits; // create buffer manager - buffer_mgr_ = BufferManagerFactory::Create(*this, webgpu_ep_info.storage_buffer_cache_mode, webgpu_ep_info.uniform_buffer_cache_mode, webgpu_ep_info.query_resolve_buffer_cache_mode); + buffer_mgr_ = BufferManagerFactory::Create(*this, + buffer_cache_config.storage.mode, + buffer_cache_config.uniform.mode, + buffer_cache_config.query_resolve.mode); // create program manager program_mgr_ = std::make_unique(Device(), DeviceLimits()); @@ -526,8 +532,8 @@ void WebGpuContext::CollectProfilingData(profiling::Events& events) { 0, query_read_buffer.GetSize(), wgpu::CallbackMode::WaitAnyOnly, - [](wgpu::MapAsyncStatus status, const char* message) { - ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", message); + [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + ORT_ENFORCE(status == wgpu::MapAsyncStatus::Success, "Failed to download data from buffer: ", std::string_view{message}); })) == Status::OK()); auto mapped_data = static_cast(query_read_buffer.GetConstMappedRange()); @@ -635,18 +641,46 @@ void WebGpuContext::Flush() { num_pending_dispatches_ = 0; } -std::unordered_map> WebGpuContextFactory::contexts_; +std::unordered_map WebGpuContextFactory::contexts_; 
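// Illustrative sketch, not part of this change: the GatherElements kernel added earlier in this
// patch emits WGSL that, for every output element, replaces the `axis` coordinate of the output
// index with the gathered index, normalizing negative indices by adding the axis dimension
// (`idx = idx + uniforms.axis_dim_limit`). A plain C++ reference of those semantics, useful for
// checking the shader against the ONNX definition (names here are hypothetical, not ORT code):
#include <cstdint>
#include <vector>

std::vector<float> GatherElementsReference(const std::vector<float>& input,
                                           const std::vector<int64_t>& input_shape,
                                           const std::vector<int64_t>& indices,
                                           const std::vector<int64_t>& indices_shape,
                                           int64_t axis) {
  const int64_t rank = static_cast<int64_t>(input_shape.size());
  if (axis < 0) axis += rank;  // negative axis handled the same way as in ComputeInternal()

  // Row-major strides; the output shape equals the indices shape.
  std::vector<int64_t> in_strides(rank, 1), out_strides(rank, 1);
  for (int64_t d = rank - 2; d >= 0; --d) {
    in_strides[d] = in_strides[d + 1] * input_shape[d + 1];
    out_strides[d] = out_strides[d + 1] * indices_shape[d + 1];
  }

  std::vector<float> output(indices.size());
  for (size_t out_offset = 0; out_offset < indices.size(); ++out_offset) {
    int64_t idx = indices[out_offset];
    if (idx < 0) idx += input_shape[axis];  // same normalization as the shader's axis_dim_limit add

    // Decompose the flat output offset into coordinates, swap in `idx` on `axis`,
    // then re-flatten against the input strides.
    int64_t remaining = static_cast<int64_t>(out_offset);
    int64_t in_offset = 0;
    for (int64_t d = 0; d < rank; ++d) {
      const int64_t coord = remaining / out_strides[d];
      remaining %= out_strides[d];
      in_offset += (d == axis ? idx : coord) * in_strides[d];
    }
    output[out_offset] = input[in_offset];
  }
  return output;
}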
std::mutex WebGpuContextFactory::mutex_; +std::once_flag WebGpuContextFactory::init_default_flag_; +wgpu::Instance WebGpuContextFactory::default_instance_; + +WebGpuContext& WebGpuContextFactory::CreateContext(const WebGpuContextConfig& config) { + const int context_id = config.context_id; + WGPUInstance instance = config.instance; + WGPUAdapter adapter = config.adapter; + WGPUDevice device = config.device; -WebGpuContext& WebGpuContextFactory::CreateContext(int context_id, - WGPUInstance instance, - WGPUAdapter adapter, - WGPUDevice device, - ValidationMode validation_mode) { if (context_id == 0) { // context ID is preserved for the default context. User cannot use context ID 0 as a custom context. ORT_ENFORCE(instance == nullptr && adapter == nullptr && device == nullptr, "WebGPU EP default context (contextId=0) must not have custom WebGPU instance, adapter or device."); + + std::call_once(init_default_flag_, [dawn_proc_table = config.dawn_proc_table]() { + // Step.1 - setup dawn proc table + const DawnProcTable* dawn_procs = reinterpret_cast(dawn_proc_table); +#if defined(BUILD_DAWN_MONOLITHIC_LIBRARY) + ORT_ENFORCE(dawn_procs == nullptr, "setting DawnProcTable is not allowed when dynamically linked to webgpu_dawn."); +#else +#if !defined(USE_EXTERNAL_DAWN) + if (dawn_procs == nullptr) { + dawn_procs = &dawn::native::GetProcs(); + } +#else + ORT_ENFORCE(dawn_procs != nullptr, "DawnProcTable must be provided."); +#endif + dawnProcSetProcs(dawn_procs); +#endif + + // Step.2 - Create wgpu::Instance + wgpu::InstanceDescriptor instance_desc{}; + instance_desc.features.timedWaitAnyEnable = true; + default_instance_ = wgpu::CreateInstance(&instance_desc); + + ORT_ENFORCE(default_instance_ != nullptr, "Failed to create wgpu::Instance."); + }); + instance = default_instance_.Get(); } else { // for context ID > 0, user must provide custom WebGPU instance, adapter and device. 
ORT_ENFORCE(instance != nullptr && adapter != nullptr && device != nullptr, @@ -658,13 +692,16 @@ WebGpuContext& WebGpuContextFactory::CreateContext(int context_id, auto it = contexts_.find(context_id); if (it == contexts_.end()) { GSL_SUPPRESS(r.11) - auto context = std::unique_ptr(new WebGpuContext(instance, adapter, device, validation_mode)); - it = contexts_.emplace(context_id, std::move(context)).first; + auto context = std::unique_ptr(new WebGpuContext(instance, adapter, device, config.validation_mode)); + it = contexts_.emplace(context_id, WebGpuContextFactory::WebGpuContextInfo{std::move(context), 0}).first; } else if (context_id != 0) { - ORT_ENFORCE(it->second->instance_.Get() == instance && it->second->adapter_.Get() == adapter && it->second->device_.Get() == device, + ORT_ENFORCE(it->second.context->instance_.Get() == instance && + it->second.context->adapter_.Get() == adapter && + it->second.context->device_.Get() == device, "WebGPU EP context ID ", context_id, " is already created with different WebGPU instance, adapter or device."); } - return *it->second; + it->second.ref_count++; + return *it->second.context; } WebGpuContext& WebGpuContextFactory::GetContext(int context_id) { @@ -673,12 +710,24 @@ WebGpuContext& WebGpuContextFactory::GetContext(int context_id) { auto it = contexts_.find(context_id); ORT_ENFORCE(it != contexts_.end(), "WebGPU EP context ID ", context_id, " is not found."); - return *it->second; + return *it->second.context; +} + +void WebGpuContextFactory::ReleaseContext(int context_id) { + std::lock_guard lock(mutex_); + + auto it = contexts_.find(context_id); + ORT_ENFORCE(it != contexts_.end(), "WebGPU EP context ID ", context_id, " is not found."); + + if (--it->second.ref_count == 0) { + contexts_.erase(it); + } } void WebGpuContextFactory::Cleanup() { std::lock_guard lock(mutex_); contexts_.clear(); + default_instance_ = nullptr; } void CleanupWebGpuContexts() { diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h index be05b06523b9c..d1f43cdc4ddff 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.h +++ b/onnxruntime/core/providers/webgpu/webgpu_context.h @@ -13,6 +13,7 @@ #include #include "core/common/common.h" +#include "core/framework/library_handles.h" #include "core/providers/webgpu/webgpu_execution_provider.h" #include "core/providers/webgpu/buffer_manager.h" #include "core/providers/webgpu/program_manager.h" @@ -25,28 +26,53 @@ class WebGpuContext; class ComputeContext; class ProgramBase; +struct WebGpuContextConfig { + int context_id; + WGPUInstance instance; + WGPUAdapter adapter; + WGPUDevice device; + const void* dawn_proc_table; + ValidationMode validation_mode; +}; + +struct WebGpuBufferCacheConfig { + struct ConfigEntry { + BufferCacheMode mode; + std::string config_string; + }; + ConfigEntry storage; + ConfigEntry uniform; + ConfigEntry query_resolve; + ConfigEntry default_entry; +}; + class WebGpuContextFactory { public: - static WebGpuContext& CreateContext(int context_id, - WGPUInstance instance, - WGPUAdapter adapter, - WGPUDevice device, - ValidationMode validation_mode); + struct WebGpuContextInfo { + std::unique_ptr context; + int ref_count; + }; + + static WebGpuContext& CreateContext(const WebGpuContextConfig& config); static WebGpuContext& GetContext(int context_id); + static void ReleaseContext(int context_id); + static void Cleanup(); private: WebGpuContextFactory() {} - static std::unordered_map> contexts_; + static std::unordered_map contexts_; 
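// Illustrative sketch, not part of this change: with the ref-counted registry above, every
// CreateContext() call must eventually be paired with a ReleaseContext() for the same context
// ID; the context is destroyed once its count drops to zero (the WebGPU EP destructor further
// down in this patch does exactly that). Roughly the intended lifecycle, mirroring how the
// provider factory uses the default context (context_id 0):
#include "core/providers/webgpu/webgpu_context.h"

void WebGpuContextLifecycleSketch() {
  using namespace onnxruntime::webgpu;

  // Default context: no custom instance/adapter/device and no external DawnProcTable.
  WebGpuContextConfig config{/*context_id*/ 0, /*instance*/ nullptr, /*adapter*/ nullptr,
                             /*device*/ nullptr, /*dawn_proc_table*/ nullptr,
                             ValidationMode::Basic};

  WebGpuContext& context = WebGpuContextFactory::CreateContext(config);  // ref_count becomes 1
  // ... context.Initialize(...), create the execution provider, run sessions ...
  WebGpuContextFactory::ReleaseContext(0);  // ref_count drops to 0 and the context is destroyed
}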
static std::mutex mutex_; + static std::once_flag init_default_flag_; + static wgpu::Instance default_instance_; }; // Class WebGpuContext includes all necessary resources for the context. class WebGpuContext final { public: - void Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info, const void* dawn_proc_table); + void Initialize(const WebGpuBufferCacheConfig& buffer_cache_config, int backend_type); Status Wait(wgpu::Future f); @@ -153,6 +179,8 @@ class WebGpuContext final { std::once_flag init_flag_; + LibraryHandles modules_; + wgpu::Instance instance_; wgpu::Adapter adapter_; wgpu::Device device_; diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 66209adf6f1a9..76a55b7ce4f2e 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -649,8 +649,8 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, @@ -743,13 +743,13 @@ using namespace webgpu; WebGpuExecutionProvider::WebGpuExecutionProvider(int context_id, WebGpuContext& context, - WebGpuExecutionProviderInfo&& info) + WebGpuExecutionProviderConfig&& config) : IExecutionProvider{kWebGpuExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)}, context_id_{context_id}, context_{context}, - preferred_data_layout_{info.data_layout}, - force_cpu_node_names_{std::move(info.force_cpu_node_names)}, - enable_graph_capture_{info.enable_graph_capture} { + preferred_data_layout_{config.data_layout}, + force_cpu_node_names_{std::move(config.force_cpu_node_names)}, + enable_graph_capture_{config.enable_graph_capture} { } std::vector WebGpuExecutionProvider::CreatePreferredAllocators() { @@ -824,6 +824,7 @@ std::unique_ptr WebGpuExecutionProvider::GetDataTran } WebGpuExecutionProvider::~WebGpuExecutionProvider() { + WebGpuContextFactory::ReleaseContext(context_id_); } std::unique_ptr WebGpuExecutionProvider::GetProfiler() { diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h index f9c43c6bfd7d0..ad81924e06901 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h @@ -22,32 +22,22 @@ enum class BufferCacheMode; class WebGpuProfiler; } // namespace webgpu -struct WebGpuExecutionProviderInfo { - WebGpuExecutionProviderInfo(DataLayout data_layout, bool enable_graph_capture) +struct WebGpuExecutionProviderConfig { + WebGpuExecutionProviderConfig(DataLayout data_layout, bool enable_graph_capture) : data_layout{data_layout}, - enable_graph_capture{enable_graph_capture}, - backend_type{}, - storage_buffer_cache_mode{}, - uniform_buffer_cache_mode{}, - query_resolve_buffer_cache_mode{}, - default_buffer_cache_mode{} {} - WebGpuExecutionProviderInfo(WebGpuExecutionProviderInfo&&) = default; - WebGpuExecutionProviderInfo& operator=(WebGpuExecutionProviderInfo&&) = default; - ORT_DISALLOW_COPY_AND_ASSIGNMENT(WebGpuExecutionProviderInfo); + enable_graph_capture{enable_graph_capture} {} + WebGpuExecutionProviderConfig(WebGpuExecutionProviderConfig&&) = default; + WebGpuExecutionProviderConfig& operator=(WebGpuExecutionProviderConfig&&) = default; + 
ORT_DISALLOW_COPY_AND_ASSIGNMENT(WebGpuExecutionProviderConfig); DataLayout data_layout; bool enable_graph_capture; - int backend_type; - webgpu::BufferCacheMode storage_buffer_cache_mode; - webgpu::BufferCacheMode uniform_buffer_cache_mode; - webgpu::BufferCacheMode query_resolve_buffer_cache_mode; - webgpu::BufferCacheMode default_buffer_cache_mode; std::vector force_cpu_node_names; }; class WebGpuExecutionProvider : public IExecutionProvider { public: - WebGpuExecutionProvider(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderInfo&& info); + WebGpuExecutionProvider(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderConfig&& config); ~WebGpuExecutionProvider() override; std::vector> GetCapability( diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc index 6cfe9aac0b0e9..64eb80b26fbf9 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc @@ -17,25 +17,25 @@ using namespace onnxruntime::webgpu::options; namespace onnxruntime { struct WebGpuProviderFactory : IExecutionProviderFactory { - WebGpuProviderFactory(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderInfo&& webgpu_ep_info) - : context_id_{context_id}, context_{context}, info_{std::move(webgpu_ep_info)} { + WebGpuProviderFactory(int context_id, webgpu::WebGpuContext& context, WebGpuExecutionProviderConfig&& webgpu_ep_config) + : context_id_{context_id}, context_{context}, config_{std::move(webgpu_ep_config)} { } std::unique_ptr CreateProvider() override { - return std::make_unique(context_id_, context_, std::move(info_)); + return std::make_unique(context_id_, context_, std::move(config_)); } private: int context_id_; webgpu::WebGpuContext& context_; - WebGpuExecutionProviderInfo info_; + WebGpuExecutionProviderConfig config_; }; std::shared_ptr WebGpuProviderFactoryCreator::Create(const ConfigOptions& config_options) { // - // STEP.1 - prepare WebGpuExecutionProviderInfo + // STEP.1 - prepare WebGpuExecutionProviderConfig // - WebGpuExecutionProviderInfo webgpu_ep_info{ + WebGpuExecutionProviderConfig webgpu_ep_config{ // preferred layout is NHWC by default DataLayout::NHWC, // graph capture feature is disabled by default @@ -45,109 +45,33 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( std::string preferred_layout_str; if (config_options.TryGetConfigEntry(kPreferredLayout, preferred_layout_str)) { if (preferred_layout_str == kPreferredLayout_NHWC) { - webgpu_ep_info.data_layout = DataLayout::NHWC; + webgpu_ep_config.data_layout = DataLayout::NHWC; } else if (preferred_layout_str == kPreferredLayout_NCHW) { - webgpu_ep_info.data_layout = DataLayout::NCHW; + webgpu_ep_config.data_layout = DataLayout::NCHW; } else { ORT_THROW("Invalid preferred layout: ", preferred_layout_str); } } - LOGS_DEFAULT(VERBOSE) << "WebGPU EP preferred layout: " << int(webgpu_ep_info.data_layout) << " (parsed from \"" + LOGS_DEFAULT(VERBOSE) << "WebGPU EP preferred layout: " << int(webgpu_ep_config.data_layout) << " (parsed from \"" << preferred_layout_str << "\")"; std::string enable_graph_capture_str; if (config_options.TryGetConfigEntry(kEnableGraphCapture, enable_graph_capture_str)) { if (enable_graph_capture_str == kEnableGraphCapture_ON) { - webgpu_ep_info.enable_graph_capture = true; + webgpu_ep_config.enable_graph_capture = true; } else if (enable_graph_capture_str == kEnableGraphCapture_OFF) { - 
webgpu_ep_info.enable_graph_capture = false; + webgpu_ep_config.enable_graph_capture = false; } else { ORT_THROW("Invalid enable graph capture: ", enable_graph_capture_str); } } - LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_info.enable_graph_capture; - - std::string backend_type_str; - if (config_options.TryGetConfigEntry(kDawnBackendType, backend_type_str)) { -#ifdef _WIN32 - // Setup Windows default backend type based on the build configuration -#if defined(onnxruntime_ENABLE_DAWN_BACKEND_D3D12) - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12); -#elif defined(onnxruntime_ENABLE_DAWN_BACKEND_VULKAN) - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan); -#endif -#endif - if (backend_type_str == kDawnBackendType_D3D12) { - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_D3D12); - } else if (backend_type_str == kDawnBackendType_Vulkan) { - webgpu_ep_info.backend_type = static_cast(WGPUBackendType_Vulkan); - } else { - ORT_THROW("Invalid Dawn backend type: ", backend_type_str); - } - } - LOGS_DEFAULT(VERBOSE) << "WebGPU EP Dawn backend type: " << webgpu_ep_info.backend_type; - - auto parse_buffer_cache_mode = [&config_options](const std::string& config_entry_str, - webgpu::BufferCacheMode default_value) -> webgpu::BufferCacheMode { - std::string buffer_cache_mode_str; - if (config_options.TryGetConfigEntry(config_entry_str, buffer_cache_mode_str)) { - if (buffer_cache_mode_str == kBufferCacheMode_Disabled) { - return webgpu::BufferCacheMode::Disabled; - } else if (buffer_cache_mode_str == kBufferCacheMode_LazyRelease) { - return webgpu::BufferCacheMode::LazyRelease; - } else if (buffer_cache_mode_str == kBufferCacheMode_Simple) { - return webgpu::BufferCacheMode::Simple; - } else if (buffer_cache_mode_str == kBufferCacheMode_Bucket) { - return webgpu::BufferCacheMode::Bucket; - } else { - ORT_THROW("Invalid buffer cache mode: ", config_entry_str); - } - } else { - return default_value; - } - }; - - webgpu_ep_info.storage_buffer_cache_mode = parse_buffer_cache_mode(kStorageBufferCacheMode, webgpu::BufferCacheMode::Bucket); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP storage buffer cache mode: " << webgpu_ep_info.storage_buffer_cache_mode; - - webgpu_ep_info.uniform_buffer_cache_mode = parse_buffer_cache_mode(kUniformBufferCacheMode, webgpu::BufferCacheMode::Simple); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP uniform buffer cache mode: " << webgpu_ep_info.uniform_buffer_cache_mode; - - webgpu_ep_info.query_resolve_buffer_cache_mode = parse_buffer_cache_mode(kQueryResolveBufferCacheMode, webgpu::BufferCacheMode::Disabled); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP query resolve buffer cache mode: " << webgpu_ep_info.query_resolve_buffer_cache_mode; - - webgpu_ep_info.default_buffer_cache_mode = parse_buffer_cache_mode(kDefaultBufferCacheMode, webgpu::BufferCacheMode::Disabled); - LOGS_DEFAULT(VERBOSE) << "WebGPU EP default buffer cache mode: " << webgpu_ep_info.default_buffer_cache_mode; - - webgpu::ValidationMode validation_mode = -#ifndef NDEBUG - webgpu::ValidationMode::Full // for debug build, enable full validation by default -#else - webgpu::ValidationMode::Basic // for release build, enable basic validation by default -#endif // !NDEBUG - ; - std::string validation_mode_str; - if (config_options.TryGetConfigEntry(kValidationMode, validation_mode_str)) { - if (validation_mode_str == kValidationMode_Disabled) { - validation_mode = webgpu::ValidationMode::Disabled; - } else if (validation_mode_str == kValidationMode_wgpuOnly) { - 
validation_mode = webgpu::ValidationMode::WGPUOnly; - } else if (validation_mode_str == kValidationMode_basic) { - validation_mode = webgpu::ValidationMode::Basic; - } else if (validation_mode_str == kValidationMode_full) { - validation_mode = webgpu::ValidationMode::Full; - } else { - ORT_THROW("Invalid validation mode: ", validation_mode_str); - } - } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP graph capture enable: " << webgpu_ep_config.enable_graph_capture; // parse force CPU node names // The force CPU node names are separated by EOL (\n or \r\n) in the config entry. // each line is a node name that will be forced to run on CPU. std::string force_cpu_node_names_str; if (config_options.TryGetConfigEntry(kForceCpuNodeNames, force_cpu_node_names_str)) { - std::vector force_cpu_node_names; - // split the string by EOL (\n or \r\n) std::istringstream ss(force_cpu_node_names_str); std::string line; @@ -157,14 +81,13 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( continue; } - force_cpu_node_names.push_back(line); + webgpu_ep_config.force_cpu_node_names.push_back(line); } - - webgpu_ep_info.force_cpu_node_names = std::move(force_cpu_node_names); } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP force CPU node count: " << webgpu_ep_config.force_cpu_node_names.size(); // - // STEP.2 - prepare WebGpuContext + // STEP.2 - prepare WebGpuContextConfig // int context_id = 0; std::string context_id_str; @@ -204,14 +127,110 @@ std::shared_ptr WebGpuProviderFactoryCreator::Create( std::from_chars(dawn_proc_table_str.data(), dawn_proc_table_str.data() + dawn_proc_table_str.size(), dawn_proc_table).ec); } - auto& context = webgpu::WebGpuContextFactory::CreateContext(context_id, - reinterpret_cast(webgpu_instance), - reinterpret_cast(webgpu_adapter), - reinterpret_cast(webgpu_device), - validation_mode); - context.Initialize(webgpu_ep_info, reinterpret_cast(dawn_proc_table)); + webgpu::ValidationMode validation_mode = +#ifndef NDEBUG + webgpu::ValidationMode::Full // for debug build, enable full validation by default +#else + webgpu::ValidationMode::Basic // for release build, enable basic validation by default +#endif // !NDEBUG + ; + std::string validation_mode_str; + if (config_options.TryGetConfigEntry(kValidationMode, validation_mode_str)) { + if (validation_mode_str == kValidationMode_Disabled) { + validation_mode = webgpu::ValidationMode::Disabled; + } else if (validation_mode_str == kValidationMode_wgpuOnly) { + validation_mode = webgpu::ValidationMode::WGPUOnly; + } else if (validation_mode_str == kValidationMode_basic) { + validation_mode = webgpu::ValidationMode::Basic; + } else if (validation_mode_str == kValidationMode_full) { + validation_mode = webgpu::ValidationMode::Full; + } else { + ORT_THROW("Invalid validation mode: ", validation_mode_str); + } + } + + webgpu::WebGpuContextConfig context_config{ + context_id, + reinterpret_cast(webgpu_instance), + reinterpret_cast(webgpu_adapter), + reinterpret_cast(webgpu_device), + reinterpret_cast(dawn_proc_table), + validation_mode, + }; + + // + // STEP.3 - prepare parameters for WebGPU context initialization. 
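// Illustrative sketch, not part of this change: condensed, the creation path assembled in the
// remaining steps below boils down to the sequence sketched here. The parameters stand in for
// the values parsed from ConfigOptions, and the helper name is hypothetical:
#include <memory>
#include "core/providers/webgpu/webgpu_context.h"
#include "core/providers/webgpu/webgpu_execution_provider.h"

std::shared_ptr<onnxruntime::IExecutionProviderFactory> CreateWebGpuFactorySketch(
    const onnxruntime::webgpu::WebGpuContextConfig& context_config,
    const onnxruntime::webgpu::WebGpuBufferCacheConfig& buffer_cache_config,
    int backend_type,
    onnxruntime::WebGpuExecutionProviderConfig&& ep_config) {
  // For the default context this sets up the Dawn proc table and wgpu::Instance (once),
  // registers the shared context in the factory, and bumps its reference count.
  auto& context = onnxruntime::webgpu::WebGpuContextFactory::CreateContext(context_config);

  // Creates the adapter/device and the buffer/program managers for the chosen backend.
  context.Initialize(buffer_cache_config, backend_type);

  // The EP created from this factory releases the context again in its destructor.
  return std::make_shared<onnxruntime::WebGpuProviderFactory>(context_config.context_id, context,
                                                              std::move(ep_config));
}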
+ // + + int backend_type = 0; +#ifdef _WIN32 + // Setup Windows default backend type based on the build configuration +#if defined(DAWN_ENABLE_D3D12) + backend_type = static_cast(WGPUBackendType_D3D12); +#elif defined(DAWN_ENABLE_VULKAN) + backend_type = static_cast(WGPUBackendType_Vulkan); +#endif +#endif + + std::string backend_type_str; + if (config_options.TryGetConfigEntry(kDawnBackendType, backend_type_str)) { + if (backend_type_str == kDawnBackendType_D3D12) { + backend_type = static_cast(WGPUBackendType_D3D12); + } else if (backend_type_str == kDawnBackendType_Vulkan) { + backend_type = static_cast(WGPUBackendType_Vulkan); + } else { + ORT_THROW("Invalid Dawn backend type: ", backend_type_str); + } + } + LOGS_DEFAULT(VERBOSE) << "WebGPU EP Dawn backend type: " << backend_type; + + // buffer cache modes + auto parse_buffer_cache_mode = [&config_options](const std::string& config_entry_str, + webgpu::BufferCacheMode default_value) -> webgpu::BufferCacheMode { + std::string buffer_cache_mode_str; + if (config_options.TryGetConfigEntry(config_entry_str, buffer_cache_mode_str)) { + if (buffer_cache_mode_str == kBufferCacheMode_Disabled) { + return webgpu::BufferCacheMode::Disabled; + } else if (buffer_cache_mode_str == kBufferCacheMode_LazyRelease) { + return webgpu::BufferCacheMode::LazyRelease; + } else if (buffer_cache_mode_str == kBufferCacheMode_Simple) { + return webgpu::BufferCacheMode::Simple; + } else if (buffer_cache_mode_str == kBufferCacheMode_Bucket) { + return webgpu::BufferCacheMode::Bucket; + } else { + ORT_THROW("Invalid buffer cache mode: ", config_entry_str); + } + } else { + return default_value; + } + }; + + webgpu::WebGpuBufferCacheConfig buffer_cache_config; + + buffer_cache_config.storage.mode = parse_buffer_cache_mode(kStorageBufferCacheMode, webgpu::BufferCacheMode::Bucket); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP storage buffer cache mode: " << buffer_cache_config.storage.mode; + + buffer_cache_config.uniform.mode = parse_buffer_cache_mode(kUniformBufferCacheMode, webgpu::BufferCacheMode::Simple); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP uniform buffer cache mode: " << buffer_cache_config.uniform.mode; + + buffer_cache_config.query_resolve.mode = parse_buffer_cache_mode(kQueryResolveBufferCacheMode, webgpu::BufferCacheMode::Disabled); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP query resolve buffer cache mode: " << buffer_cache_config.query_resolve.mode; + + buffer_cache_config.default_entry.mode = parse_buffer_cache_mode(kDefaultBufferCacheMode, webgpu::BufferCacheMode::Disabled); + LOGS_DEFAULT(VERBOSE) << "WebGPU EP default buffer cache mode: " << buffer_cache_config.default_entry.mode; + + // + // STEP.4 - start initialization. + // + + // Load the Dawn library and create the WebGPU instance and adapter. + auto& context = webgpu::WebGpuContextFactory::CreateContext(context_config); + + // Create WebGPU device and initialize the context. + context.Initialize(buffer_cache_config, backend_type); - return std::make_shared(context_id, context, std::move(webgpu_ep_info)); + // Create WebGPU EP factory. 
+ return std::make_shared(context_id, context, std::move(webgpu_ep_config)); } } // namespace onnxruntime diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index a60ee500a9898..223eed248800e 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -38,6 +38,7 @@ #include "core/framework/utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/optimizer/graph_transformer_utils.h" #include "core/optimizer/graph_transformer.h" #include "core/optimizer/layout_transformation/layout_transformation.h" @@ -2099,13 +2100,12 @@ common::Status InferenceSession::Initialize() { const size_t optimized_model_external_initializers_min_size_in_bytes = ParseStringWithClassicLocale(session_options_.config_options.GetConfigOrDefault( kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes, "1024")); - Graph::OffsetAlignmentInfo align_info; - align_info.align_offset = true; + ModelSavingOptions model_saving_options{optimized_model_external_initializers_min_size_in_bytes}; + model_saving_options.align_offset = true; ORT_RETURN_IF_ERROR_SESSIONID_(Model::SaveWithExternalInitializers(*model_, session_options_.optimized_model_filepath, optimized_model_external_initializers_file_name, - optimized_model_external_initializers_min_size_in_bytes, - align_info)); + model_saving_options)); } } } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 78c441efea856..53770df228f5a 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1200,7 +1200,14 @@ struct ProviderHostImpl : ProviderHost { void Model__operator_delete(Model* p) override { delete p; } Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); } std::unique_ptr Model__ToProto(Model* p) override { return std::make_unique(p->ToProto()); } - std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) override { return std::make_unique(p->ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold)); }; + std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, + const ModelSavingOptions& model_saving_options) override { + return std::make_unique(p->ToGraphProtoWithExternalInitializers(external_file_name, + file_path, + model_saving_options)); + }; const ModelMetaData& Model__MetaData(const Model* p) const noexcept override { return p->MetaData(); }; Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) override { return Model::Load(file_path, model_proto); } diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 3ebc33c02592d..541dc4978dad1 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -15,10 +15,10 @@ from typing import List, Optional TRT_DOCKER_FILES = { - "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", - "8.6.cuda_12_3_cudnn_9": 
"tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", - "10.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", - "10.5.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", + "8.6_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", + "8.6_cuda12.3_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", + "10.7_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", + "10.7_cuda12.5_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin", } diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h index d3e069237217e..6f3e460628566 100644 --- a/onnxruntime/test/common/cuda_op_test_utils.h +++ b/onnxruntime/test/common/cuda_op_test_utils.h @@ -5,11 +5,6 @@ #include "test/util/include/default_providers.h" -#define SKIP_CUDA_TEST_WITH_DML \ - if (DefaultCudaExecutionProvider() == nullptr) { \ - GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled"; \ - } - namespace onnxruntime { namespace test { @@ -18,10 +13,6 @@ namespace test { int GetCudaArchitecture(); inline bool HasCudaEnvironment(int min_cuda_architecture) { - if (DefaultCudaExecutionProvider() == nullptr) { - return false; - } - if (DefaultCudaExecutionProvider().get() == nullptr) { return false; } diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc index 8c69e2d9810b8..9f4ee071925b4 100644 --- a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -75,9 +75,6 @@ TEST(BeamSearchTest, GptBeamSearchFp32) { const char* const output_names[] = {"sequences"}; Ort::SessionOptions session_options; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; cuda_options.use_tf32 = false; @@ -171,9 +168,6 @@ TEST(BeamSearchTest, GptBeamSearchFp16) { bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; cuda_options.use_tf32 = false; diff --git a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc index 297629b015796..027d4b3fff1b0 100644 --- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc @@ -181,9 +181,6 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector& input_s t.SetCustomOutputVerifier(output_verifier); std::vector> t_eps; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } t_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM t_eps.emplace_back(DefaultRocmExecutionProvider()); diff --git a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc index 26b0e3a4dd7a9..7ca4e1004066c 100644 --- a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc @@ -61,9 +61,7 @@ void RunTestForInference(const std::vector& input_dims, bool has_ratio std::vector> test_eps; #ifdef USE_CUDA 
- if (DefaultCudaExecutionProvider() != nullptr) { - test_eps.emplace_back(DefaultCudaExecutionProvider()); - } + test_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM test_eps.emplace_back(DefaultRocmExecutionProvider()); #endif @@ -124,9 +122,6 @@ void RunTestForTraining(const std::vector& input_dims) { std::vector> dropout_eps; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } dropout_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM dropout_eps.emplace_back(DefaultRocmExecutionProvider()); diff --git a/onnxruntime/test/contrib_ops/layer_norm_test.cc b/onnxruntime/test/contrib_ops/layer_norm_test.cc index b414a98c4e756..46082e1b0cd31 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_test.cc @@ -2,7 +2,6 @@ // Licensed under the MIT License. #include "test/providers/compare_provider_test_utils.h" -#include "test/util/include/default_providers.h" namespace onnxruntime { namespace test { @@ -80,20 +79,14 @@ static void TestLayerNorm(const std::vector& x_dims, #endif #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - test.CompareWithCPU(kCudaExecutionProvider); - } + test.CompareWithCPU(kCudaExecutionProvider); #elif USE_ROCM test.CompareWithCPU(kRocmExecutionProvider); +#elif USE_DML + test.CompareWithCPU(kDmlExecutionProvider); #elif USE_WEBGPU test.CompareWithCPU(kWebGpuExecutionProvider); #endif - -#ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - test.CompareWithCPU(kDmlExecutionProvider); - } -#endif } TEST(CudaKernelTest, LayerNorm_NullInput) { diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 6dedce24e7e07..eebe9197573c6 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -490,17 +490,13 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura std::vector> execution_providers; if (use_float16) { #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultCudaExecutionProvider()); - } + execution_providers.push_back(DefaultCudaExecutionProvider()); #endif #ifdef USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); #endif #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultDmlExecutionProvider()); - } + execution_providers.push_back(DefaultDmlExecutionProvider()); #endif #ifdef USE_WEBGPU execution_providers.push_back(DefaultWebGpuExecutionProvider()); @@ -518,11 +514,8 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura } // namespace TEST(MatMulNBits, Float16Cuda) { -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - std::vector has_gidx_options = {true, false}; - if (DefaultDmlExecutionProvider() != nullptr) { - has_gidx_options.assign(1, false); - } +#if defined(USE_CUDA) || defined(USE_ROCM) + auto has_gidx_options = {true, false}; #else auto has_gidx_options = {false}; #endif @@ -533,9 +526,7 @@ TEST(MatMulNBits, Float16Cuda) { for (auto block_size : {16, 32, 64, 128}) { for (auto has_gidx : has_gidx_options) { #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); - } + RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); #else RunTest(M, N, K, block_size, 0, false, true, has_gidx); RunTest(M, N, K, 
block_size, 0, true, true, has_gidx, false); @@ -548,16 +539,12 @@ TEST(MatMulNBits, Float16Cuda) { } TEST(MatMulNBits, Float16Large) { -#if defined(USE_CUDA) || defined(USE_DML) +#ifdef USE_DML // For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances. - float abs_error = 0.05f; - if (DefaultDmlExecutionProvider() != nullptr) { - // it means the ep is dml in runtime, the abs_error is changed to 0.3f - abs_error = 0.3f; - } + float abs_error = 0.3f; #elif USE_WEBGPU // See Intel A770 to pass these tests with an absolute error of 0.08. float abs_error = 0.08f; @@ -573,6 +560,7 @@ TEST(MatMulNBits, Float16Large) { } } } + #endif // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index d88c3131a4ca5..8d7629b5fda1c 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) { } // DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output -#if defined(USE_DML) && !defined(USE_CUDA) +#if defined(USE_DML) TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) { RunMatMulIntegerToFloatTest(); diff --git a/onnxruntime/test/contrib_ops/tensor_op_test.cc b/onnxruntime/test/contrib_ops/tensor_op_test.cc index d5e2ddebfe67f..bc2ff5f4f724d 100644 --- a/onnxruntime/test/contrib_ops/tensor_op_test.cc +++ b/onnxruntime/test/contrib_ops/tensor_op_test.cc @@ -121,15 +121,7 @@ void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normaliz test.AddAttribute("normalize_variance", normalize_variance ? one : zero); test.AddInput("input", {N, C, H, W}, X); test.AddOutput("output", {N, C, H, W}, result); -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider}); - } else if (DefaultDmlExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider}); - } -#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator. -#endif } void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_variance) { @@ -196,15 +188,7 @@ void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_va test.AddAttribute("normalize_variance", normalize_variance ? 
one : zero); test.AddInput("input", {N, C, H, W}, X); test.AddOutput("output", {N, C, H, W}, result); -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider}); - } else if (DefaultDmlExecutionProvider() == nullptr) { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider}); - } -#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator. -#endif } TEST(MVNContribOpTest, MeanVarianceNormalizationCPUTest_Version1_TO_8) { @@ -246,9 +230,7 @@ TEST(UnfoldTensorOpTest, LastDim) { std::vector> execution_providers; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultCudaExecutionProvider()); - } + execution_providers.push_back(DefaultCudaExecutionProvider()); #endif execution_providers.push_back(DefaultCpuExecutionProvider()); tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index adab93908cdc4..eaebac177ca91 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -28,7 +28,6 @@ using json = nlohmann::json; #ifdef USE_CUDA #include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_provider_factory.h" -#include "test/common/cuda_op_test_utils.h" #endif // USE_CUDA #include "core/session/onnxruntime_session_options_config_keys.h" using namespace ONNX_NAMESPACE; @@ -897,9 +896,6 @@ TEST_F(PlannerTest, LocationPlanningForPassThroughExplicitAndImplicitSubgraphInp SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1042,9 +1038,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersOnlyUsedInANestedSubgraph) { SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1152,9 +1145,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersUsedOnDifferentDevicesInMainG SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1247,9 +1237,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM SessionOptions so; InferenceSession sess{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1282,10 +1269,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM // Test MultiStream scenario for the graph: // node1(CPU ep)->node2(CPU ep)->node3(CUDA ep)->node4(CPU ep) TEST_F(PlannerTest, MultiStream) { -#if defined(USE_CUDA) && defined(USE_DML) - 
SKIP_CUDA_TEST_WITH_DML; -#endif - ONNX_NAMESPACE::TensorProto tensor; tensor.add_dims(1); tensor.add_float_data(1.0f); @@ -1304,7 +1287,6 @@ TEST_F(PlannerTest, MultiStream) { onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA(); auto epFactory = ep.CreateExecutionProviderFactory(epi); std::unique_ptr execution_provider = epFactory->CreateProvider(); - ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider))); CreatePlan({}, false); @@ -1332,9 +1314,6 @@ TEST_F(PlannerTest, MultiStream) { // node3 // All 3 nodes are CUDA EP, node1 is in stream0, node2 is in stream1, node3 is in stream2 TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::unique_ptr<::onnxruntime::KernelDef> cudaKernelAdd = KernelDefBuilder().SetName("Add").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); @@ -1376,9 +1355,6 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { // stream 1: node2 (CPU EP) // node1's output, which is consumed by both node2 and node3, is in CPU. TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json"); EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams"; EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps"; @@ -1400,11 +1376,6 @@ TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { // TODO(leca): there is a bug in the corresponding graph that node2 will be visited twice when traversing node1's output nodes // (see: for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) in BuildExecutionPlan()). 
We can just break the loop and don't need the extra variables once it is fixed TEST_F(PlannerTest, MultiStreamMultiOutput) { -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } -#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("RNN").Provider(kCudaExecutionProvider).SinceVersion(7).Build(); std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"); std::vector input1{Arg(Graph_input1), Arg(Graph_input2), Arg(Graph_input3)}, output1{Arg(Arg1), Arg(Arg2)}, input2{Arg(Arg1), Arg(Arg2)}, output2{Arg(Arg3)}; @@ -1442,9 +1413,6 @@ TEST_F(PlannerTest, MultiStreamMultiOutput) { // TODO(leca): the ideal case is there is only 1 wait step before launching node3, // as there is a specific order between node1 and node2 if they are in the same stream, thus node3 will only need to wait the latter one TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); std::vector input1{Arg(Graph_input1)}, input2{Arg(Graph_input2)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, input3{Arg(Arg1), Arg(Arg2)}, output3{Arg(Arg3)}; @@ -1482,9 +1450,6 @@ TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) #if !defined(__wasm__) && defined(ORT_ENABLE_STREAM) TEST_F(PlannerTest, ParaPlanCreation) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif TypeProto graph_in_type; graph_in_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); auto* graph_in_shape = graph_in_type.mutable_tensor_type()->mutable_shape(); @@ -1926,10 +1891,6 @@ TEST_F(PlannerTest, ParaPlanCreation) { } TEST_F(PlannerTest, TestMultiStreamConfig) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - const char* type = "DeviceBasedPartitioner"; constexpr size_t type_len = 22; @@ -2003,10 +1964,6 @@ TEST_F(PlannerTest, TestMultiStreamSaveConfig) { // Load with partition config where a node is missing, session load expected to fail. 
TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_missing_node.json"; SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; @@ -2027,9 +1984,6 @@ TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { // Load with partition config where streams and devices has mismatch TEST_F(PlannerTest, TestMultiStreamMismatchDevice) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_mismatch_device.json"; SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; @@ -2055,9 +2009,6 @@ TEST_F(PlannerTest, TestCpuIf) { sess_opt.graph_optimization_level = TransformerLevel::Default; InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/cpu_if.onnx")); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(sess.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(sess.Load()); ASSERT_STATUS_OK(sess.Initialize()); @@ -2118,17 +2069,10 @@ TEST_F(PlannerTest, TestCpuIf) { // onnx.save(model, 'issue_19480.onnx') // TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/issue_19480.onnx")); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); status = sess.Load(); status = sess.Initialize(); diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc index 3e5ef30e7ebef..e28327941dda4 100644 --- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc +++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc @@ -115,9 +115,6 @@ TEST(CUDAFenceTests, DISABLED_PartOnCPU) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_TRUE(session.Initialize().IsOK()); ASSERT_TRUE(1 == CountCopyNodes(graph)); @@ -167,9 +164,6 @@ TEST(CUDAFenceTests, TileWithInitializer) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(session.Initialize()); @@ -230,9 +224,6 @@ TEST(CUDAFenceTests, TileWithComputedInput) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_TRUE(session.Initialize().IsOK()); diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 
7f4616c964e33..740c566794f15 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -34,7 +34,6 @@ #ifdef USE_CUDA #include "core/providers/cuda/cuda_provider_factory.h" #include "core/providers/cuda/gpu_data_transfer.h" -#include "test/common/cuda_op_test_utils.h" #endif #ifdef USE_TENSORRT #include "core/providers/tensorrt/tensorrt_provider_options.h" @@ -636,9 +635,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { InferenceSession session_object(so, GetEnvironment()); #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #endif #ifdef USE_ROCM @@ -693,9 +689,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions2) { InferenceSession session_object(so, GetEnvironment()); #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #endif #ifdef USE_ROCM @@ -1049,9 +1042,6 @@ static void TestBindHelper(const std::string& log_str, if (bind_provider_type == kCudaExecutionProvider || bind_provider_type == kRocmExecutionProvider) { #ifdef USE_CUDA auto provider = DefaultCudaExecutionProvider(); - if (provider == nullptr) { - return; - } gpu_provider = provider.get(); ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(provider))); #endif @@ -1647,9 +1637,6 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { #if USE_TENSORRT ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider())); #elif USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #elif USE_ROCM ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider())); @@ -1802,9 +1789,6 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) { #if USE_TENSORRT ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider())); #elif USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #elif USE_ROCM ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider())); @@ -2160,9 +2144,6 @@ TEST(InferenceSessionTests, TestStrictShapeInference) { #ifdef USE_CUDA // disable it, since we are going to enable parallel execution with cuda ep TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; SessionOptions so; @@ -2186,10 +2167,6 @@ TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { } TEST(InferenceSessionTests, TestArenaShrinkageAfterRun) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif - OrtArenaCfg arena_cfg; arena_cfg.arena_extend_strategy = 1; // kSameAsRequested diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc index 2313f00e4d123..6e86e5b58aead 100644 --- a/onnxruntime/test/framework/memcpy_transformer_test.cc +++ b/onnxruntime/test/framework/memcpy_transformer_test.cc @@ -9,9 +9,6 @@ #include "default_providers.h" #include "gtest/gtest.h" 
#include "test_utils.h" -#ifdef USE_CUDA -#include "test/common/cuda_op_test_utils.h" -#endif #include "test/test_environment.h" #include "asserts.h" @@ -77,9 +74,6 @@ void ExpectCopy(const onnxruntime::Node& source, const std::string copy_op, #ifdef USE_CUDA TEST(TransformerTest, MemcpyTransformerTest) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unordered_map domain_to_version; domain_to_version[kOnnxDomain] = 7; auto model = std::make_shared("test", false, ModelMetaData(), PathString(), @@ -112,9 +106,7 @@ TEST(TransformerTest, MemcpyTransformerTest) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -137,9 +129,6 @@ TEST(TransformerTest, MemcpyTransformerTest) { } TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif std::unordered_map domain_to_version; domain_to_version[kOnnxDomain] = 7; auto model = std::make_shared("test", false, ModelMetaData(), PathString(), @@ -172,9 +161,7 @@ TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -294,11 +281,7 @@ TEST(TransformerTest, TestInitializerDuplicationInSubgraph) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -340,11 +323,7 @@ TEST(TransformerTest, MemcpyTransformerTestGraphInputConsumedOnMultipleDevices) KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -446,11 +425,7 @@ TEST(TransformerTest, MemcpyTransformerTestImplicitInputConsumedOnMultipleDevice KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; diff --git a/onnxruntime/test/framework/save_model_with_external_initializers.cc 
b/onnxruntime/test/framework/save_model_with_external_initializers.cc index d0bc088175755..98874874d50e9 100644 --- a/onnxruntime/test/framework/save_model_with_external_initializers.cc +++ b/onnxruntime/test/framework/save_model_with_external_initializers.cc @@ -6,6 +6,7 @@ #include "core/common/path_string.h" #include "core/framework/data_types.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/framework/tensorprotoutils.h" #include "test/test_environment.h" #include "test_utils.h" @@ -23,15 +24,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, const std::filesystem::path& input_external_init_file, const std::filesystem::path& output_onnx, const std::filesystem::path& output_external_init_file, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& model_saving_options) { auto logger = DefaultLoggingManager().CreateLogger("LoadSaveAndCompareModel"); std::shared_ptr model; ORT_RETURN_IF_ERROR(Model::Load(input_onnx, model, nullptr, *logger)); std::filesystem::remove(output_onnx); std::filesystem::remove(output_external_init_file); - ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, initializer_size_threshold, - align_info)); + ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, + model_saving_options)); std::shared_ptr model_from_external; ORT_RETURN_IF_ERROR(Model::Load(output_onnx.native(), model_from_external, nullptr, *logger)); @@ -67,7 +67,7 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(*from_external_tensor_proto, model_path, from_external_tensor_proto_data)); size_t from_external_tensor_proto_size = from_external_tensor_proto_data.size(); - if (from_external_tensor_proto_size < initializer_size_threshold) { + if (from_external_tensor_proto_size < model_saving_options.initializer_size_threshold) { // 'Small' tensors should be embedded in the onnx file. 
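The embed-versus-externalize decision above is now driven entirely by the single ModelSavingOptions argument rather than a separate size threshold plus Graph::OffsetAlignmentInfo. As a rough illustration of the fields involved, here is a minimal sketch; it is not ONNX Runtime's actual implementation, the field names simply mirror those exercised by the tests in this file, and keep_embedded/aligned_offset are hypothetical helpers:

    // Sketch of the options that steer external-initializer saving. Field names mirror
    // those used by the tests (initializer_size_threshold, align_offset, align_threshold,
    // allocation_granularity); the two helper functions are illustrative only.
    #include <cstddef>
    #include <cstdint>

    struct ModelSavingOptionsSketch {
      size_t initializer_size_threshold = 0;  // tensors smaller than this stay embedded in the .onnx file
      bool align_offset = false;              // align offsets of externalized tensors
      int64_t align_threshold = 0;            // only align tensors at least this large
      int64_t allocation_granularity = 4096;  // alignment unit, e.g. a memory-page size
    };

    // A tensor is kept inside the model file when its size is below the threshold.
    inline bool keep_embedded(size_t tensor_byte_size, const ModelSavingOptionsSketch& opts) {
      return tensor_byte_size < opts.initializer_size_threshold;
    }

    // Offsets written into the external-data file are rounded up to the granularity.
    inline int64_t aligned_offset(int64_t offset, const ModelSavingOptionsSketch& opts) {
      if (!opts.align_offset) return offset;
      const int64_t g = opts.allocation_granularity;
      return ((offset + g - 1) / g) * g;
    }

This is what the assertions that follow check: tensors under the threshold keep the DEFAULT data location, while externalized tensors carry an offset that is a multiple of allocation_granularity.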
ORT_RETURN_IF_NOT(from_external_tensor_proto->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_DEFAULT, "location mismatch"); } else { @@ -78,13 +78,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, ORT_RETURN_IF_NOT(tensor_proto_size == from_external_tensor_proto_size, "size mismatch"); ORT_RETURN_IF_NOT(memcmp(tensor_proto_data.data(), from_external_tensor_proto_data.data(), tensor_proto_size) == 0, "data mismatch"); - if (align_info.align_offset) { + if (model_saving_options.align_offset) { for (const StringStringEntryProto& entry : from_external_tensor_proto->external_data()) { if (entry.has_key() && entry.has_value() && entry.key() == "offset") { size_t tensor_offset; std::stringstream stream(entry.value()); stream >> tensor_offset; - ORT_RETURN_IF_NOT(tensor_offset % align_info.allocation_granularity == 0, "tensor offset not align"); + ORT_RETURN_IF_NOT(tensor_offset % model_saving_options.allocation_granularity == 0, + "tensor offset not align"); } } } @@ -97,22 +98,35 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, // Original model does not have external initializers TEST(SaveWithExternalInitializers, Mnist) { - Graph::OffsetAlignmentInfo align_info; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/mnist.onnx"), ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), ORT_TSTR("mnist_external_initializers.bin"), 100, align_info)); + ModelSavingOptions model_saving_options{100}; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/mnist.onnx"), + ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), + ORT_TSTR("mnist_external_initializers.bin"), + model_saving_options)); } // Original model has external initializers TEST(SaveWithExternalInitializers, ModelWithOriginalExternalData) { - Graph::OffsetAlignmentInfo align_info; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info)); + ModelSavingOptions model_saving_options{0}; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), + ORT_TSTR("model_with_orig_ext_data.onnx.data"), + ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), + ORT_TSTR("model_with_new_external_initializers.bin"), + model_saving_options)); } // Original model has external initializers, align offset TEST(SaveWithExternalInitializers, ModelWithOriginalExternalDataAlignOffset) { - Graph::OffsetAlignmentInfo align_info; - align_info.align_offset = true; - align_info.align_threshold = 0; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info)); + ModelSavingOptions model_saving_options{0}; + model_saving_options.align_offset = true; + model_saving_options.align_threshold = 0; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), + ORT_TSTR("model_with_orig_ext_data.onnx.data"), + ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), + ORT_TSTR("model_with_new_external_initializers.bin"), model_saving_options)); } } // namespace test diff --git 
a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index 3e694020f796b..e7f8b1aaa49d8 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -15,6 +15,7 @@ #include "core/graph/graph_utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/graph/op.h" #include "core/providers/cpu/cpu_execution_provider.h" #include "core/session/onnxruntime_session_options_config_keys.h" @@ -22,13 +23,101 @@ #include "gtest/gtest.h" #include "test/test_environment.h" #include "test/util/include/default_providers.h" +#include "test/util/include/file_util.h" #include "core/optimizer/layout_transformation/layout_transformation.h" using namespace ONNX_NAMESPACE; -using namespace std; namespace onnxruntime { - namespace test { + +#ifndef ENABLE_TRAINING_CORE +#ifndef __wasm__ +static void TestSavedPrepacks(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(1U, key_to_blob.size()); + const size_t expected_prepacks_for_writing = (graph.ParentGraph() == nullptr) ? 1U : 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + const size_t expected_blobs_for_writing = (graph.ParentGraph() == nullptr) ? 1U : 0U; + ASSERT_EQ(expected_blobs_for_writing, prepacked_for_graph.GetNumberOfKeyedBlobsForWriting()); + + if (graph.ParentGraph() == nullptr) { + const auto* blob_keys = prepacked_for_graph.GetKeysForWeightForSaving("if_shared"); + ASSERT_TRUE(blob_keys != nullptr); + ASSERT_EQ(blob_keys->size(), 1U); + const auto* prepacked_weights = prepacked_for_graph.GetPrepackedWeights(*blob_keys->cbegin()); + ASSERT_TRUE(prepacked_weights != nullptr); + ASSERT_EQ(prepacked_weights->buffer_sizes_.size(), 1U); + ASSERT_EQ(prepacked_weights->buffer_sizes_[0], sizeof(float) * 2); + } + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +static void TestLoadedSharedUserSupplied(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + constexpr size_t expected_prepacks_for_writing = 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + // We have not loaded anything since this initializer is user supplied + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(0U, key_to_blob.size()); + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +static void TestLoadedSharedNoUserSupplied(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph 
= graph.GetPrepacked(); + constexpr size_t expected_prepacks_for_writing = 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + // The pre-packed blob is expected to be loaded from the model file since this initializer is not user supplied + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(1U, key_to_blob.size()); + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +#endif // __wasm__ +#endif // ENABLE_TRAINING_CORE + class TestOpKernel : public OpKernel { public: TestOpKernel(const OpKernelInfo& p) : OpKernel(p) { @@ -378,7 +467,7 @@ class PrePackingTestOpKernel : public OpKernel { ORT_UNUSED_PARAMETER(tensor); ORT_UNUSED_PARAMETER(input_idx); - size_t weight_packed_len = 8; + constexpr const size_t weight_packed_len = sizeof(float) * 2; weight_packed_ = IAllocator::MakeUniquePtr(alloc, weight_packed_len, true); float* data_weights_packed = reinterpret_cast(weight_packed_.get()); data_weights_packed[0] = 1.2345f; @@ -647,7 +736,8 @@ class SessionStateTestSharedInitalizersWithPrePacking : public ::testing::Test { } }; -// Pre-packing enabled + no shared initializers = no pre-packed weights caching +// Pre-packing enabled + no shared initializers; however, we put all the pre-packs +// in a session_state container for ownership. TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { SessionOptions sess_options; sess_options.enable_mem_pattern = true; @@ -679,10 +769,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { const auto* kernel = reinterpret_cast(session_state_1.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made. However, the sharing call is still made from the serialized container. ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); // Second session/model Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), @@ -706,10 +797,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { kernel = reinterpret_cast(session_state_2.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made. The weights are still shared from the serialized container + // either because they are loaded from disk or because the container takes ownership of them. 
ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); } // Pre-packing enabled + shared initializers + no pre-packed weights container = no pre-packed weights caching @@ -754,10 +846,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) { const auto* kernel = reinterpret_cast(session_state_1.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made, but sharing still takes place from the serialized container ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); // Second session/model Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), @@ -781,10 +873,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) { kernel = reinterpret_cast(session_state_2.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made, but sharing still takes place from the serialized container ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); } // Pre-packing enabled + shared initializers + pre-packed weights container = pre-packed weights caching enabled @@ -999,6 +1091,196 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test4) { ASSERT_EQ(if_node_branches_shared_prepack_counter_2, static_cast(2)); } +#ifndef __wasm__ +// sharing is on +TEST_F(SessionStateTestSharedInitalizersWithPrePacking, TestPrepackedSerialization) { + const std::filesystem::path model_with_external_initializers = + "testdata/test_prepacked_serialization_optimized_model.onnx"; + + const std::filesystem::path external_initializers_file = + "test_prepacked_serialization_optimized_model.bin"; + + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + sess_options.optimized_model_filepath = model_with_external_initializers; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + // Enable saving model with pre-packed weights + sess_options.config_options.configurations[kOrtSessionOptionsSavePrePackedConstantInitializers] = "1"; + + // Enable shared initializer + OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator); + std::vector float_data(1, 1); + auto value = std::make_unique(); + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(std::vector{1}), + reinterpret_cast(float_data.data()), mem_info, *value); + + ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get())); + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + 
DefaultLoggingManager().DefaultLogger()); + + CreateGraphWithSubgraph(model_1.MainGraph()); + PlaceAllNodesToCPUEP(model_1.MainGraph()); + SessionState session_state_1(model_1.MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + constexpr const bool saving_model_true = true; + + ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string(), + kernel_registry_manager, + !saving_model_true)); + + TestSavedPrepacks(model_1); + + ModelSavingOptions model_saving_options{4}; + model_saving_options.align_offset = true; + + ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model_1, model_with_external_initializers, + external_initializers_file, + model_saving_options)); + } + ScopedFileDeleter test_model_deleter(model_with_external_initializers); + ScopedFileDeleter binary_file_deleter(external_initializers_file); + + // Now let's load the model along with the initializers + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + // We are expecting this weight to be loaded from disk along + // with its pre-packed version + // Enable shared initializer + OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator); + std::vector float_data(1, 1); + auto value = std::make_unique(); + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(std::vector{1}), + reinterpret_cast(float_data.data()), mem_info, *value); + + ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get())); + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(std::basic_string(), + kernel_registry_manager, + false)); + + TestLoadedSharedUserSupplied(*model); + } + + // Load again, this time sharing is enabled, but no shared initializer in the map + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + 
&prepacked_weights_container); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers, + kernel_registry_manager, + false)); + + TestLoadedSharedNoUserSupplied(*model); + } + // Load again, sharing is disabled + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + nullptr); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers, + kernel_registry_manager, + false)); + + const auto& prepacked_for_main_graph = model->MainGraph().GetPrepacked(); + ASSERT_FALSE(prepacked_for_main_graph.IsSaveModeOn()); + ASSERT_EQ(1U, prepacked_for_main_graph.GetKeyToBlob().size()); + } +} +#endif // __wasm__ + INSTANTIATE_TEST_SUITE_P(SessionStateTests, SessionStatePrepackingTest, testing::Values(PrepackingTestParam{false, false}, diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index db9592c293fd0..7bd6b47f52b7d 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -1457,9 +1457,6 @@ TEST(SparseTensorConversionTests, CsrConversion) { #ifdef USE_CUDA auto cuda_provider = DefaultCudaExecutionProvider(); - if (cuda_provider == nullptr) { - return; - } auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0]; { auto cuda_transfer = cuda_provider->GetDataTransfer(); @@ -1687,9 +1684,6 @@ TEST(SparseTensorConversionTests, CooConversion) { #ifdef USE_CUDA auto cuda_provider = DefaultCudaExecutionProvider(); - if (cuda_provider == nullptr) { - return; - } auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0]; { auto cuda_transfer = cuda_provider->GetDataTransfer(); diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 6821f582ce2de..229f4f95b8394 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -1,6 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "core/common/inlined_containers.h" +#include "core/framework/prepacked_weights.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/onnx_protobuf.h" #include "test/util/include/asserts.h" @@ -19,6 +22,76 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { namespace test { +// Test ExternalData functionality +TEST(TensorProtoUtilsTest, SetExternalDataInformation) { + ONNX_NAMESPACE::TensorProto tensor_proto; + const std::filesystem::path kExternalDataPath("test.bin"); + constexpr const int64_t init_offset = 100; + constexpr const size_t init_length = 200; + + ExternalDataInfo::SetExternalLocationToProto(kExternalDataPath, init_offset, init_length, tensor_proto); + + ASSERT_EQ(tensor_proto.data_location(), ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + ASSERT_EQ(tensor_proto.external_data_size(), 3); + ASSERT_EQ(tensor_proto.external_data(0).key(), "location"); + ASSERT_EQ(tensor_proto.external_data(0).value(), ToUTF8String(kExternalDataPath.native())); + ASSERT_EQ(tensor_proto.external_data(1).key(), "offset"); + ASSERT_EQ(tensor_proto.external_data(1).value(), std::to_string(init_offset)); + ASSERT_EQ(tensor_proto.external_data(2).key(), "length"); + ASSERT_EQ(tensor_proto.external_data(2).value(), std::to_string(init_length)); + + PrepackedKeyToBlobMap key_to_blob; + constexpr bool save_mode_on = true; + PrepackedWeightsForGraph prepacked_for_graph(key_to_blob, save_mode_on); + PrePackedWeights prepacked_weights; + const std::string init_name = "test_initializer"; + const std::string blob_key = "test_key"; + + std::array kData = {1.2345f, 2.4690f}; + const size_t buffer_size = kData.size() * sizeof(float); + + prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr))); + prepacked_weights.buffer_sizes_.push_back(buffer_size); + // Write a second entry like this + prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr))); + prepacked_weights.buffer_sizes_.push_back(buffer_size); + + prepacked_for_graph.WritePackedMaybeForSave(init_name, blob_key, std::move(prepacked_weights)); + + constexpr const int64_t starting_offset = 300; + int64_t external_offset = starting_offset; + std::stringstream ss; + const auto* blobs_for_weight = prepacked_for_graph.GetKeysForWeightForSaving(init_name); + ASSERT_TRUE(blobs_for_weight != nullptr); + InlinedHashSet blob_keys{blobs_for_weight->begin(), blobs_for_weight->end()}; + ASSERT_TRUE(ExternalDataInfo::WritePrepackedToFileAndAddToProto(prepacked_for_graph, + blob_keys, + true, 1024 * 1024, 0, + ss, external_offset, + tensor_proto)); + + auto external_data_info = std::make_unique(); + ASSERT_STATUS_OK(ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info)); + + // This should have prepacked_data entry with two blobs for a single key. 
+ ASSERT_TRUE(external_data_info->HasPrepackedInfo()); + auto prepacked_infos = external_data_info->TakePrepackedInfos(); + ASSERT_EQ(prepacked_infos.size(), 1U); + ASSERT_TRUE(prepacked_infos.count(blob_key) > 0); + + int64_t final_offset = starting_offset; + for (const auto& blob_info : prepacked_infos[blob_key]) { + int64_t offset = std::get<0>(blob_info); + ASSERT_EQ(offset, final_offset); + size_t length = std::get<1>(blob_info); + std::string checksum = std::get<2>(blob_info); // currently "0" + final_offset = offset + length; + ASSERT_EQ(length, buffer_size); + ASSERT_EQ(checksum, "0"); + } + ASSERT_EQ(final_offset, external_offset); +} + // T must be float for double, and it must match with the 'type' argument template void TestUnpackFloatTensor(TensorProto_DataType type, const std::filesystem::path& model_path) { diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc index 9d8febb453739..e8291a36447ca 100644 --- a/onnxruntime/test/lora/lora_test.cc +++ b/onnxruntime/test/lora/lora_test.cc @@ -201,16 +201,6 @@ TEST(LoraAdapterTest, Load) { #ifdef USE_CUDA TEST(LoraAdapterTest, VerifyDeviceCopy) { - // These checks for CUDA/DML combined Package, Be careful when you want to remove it! - if (DefaultCudaExecutionProvider() == nullptr) { - GTEST_SKIP() << "Skip This Test Due to this EP is null"; - } -#ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - GTEST_FAIL() << "It should not run with DML EP"; - } -#endif - auto cpu_ep = DefaultCpuExecutionProvider(); auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0]; auto cuda_ep = DefaultCudaExecutionProvider(); diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index b0958e05dc373..aa68f68f3e735 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -532,17 +532,6 @@ void BaseTester::Run(ExpectResult expect_result, const std::string& expected_fai so.use_deterministic_compute = use_determinism_; so.graph_optimization_level = TransformerLevel::Default; // 'Default' == off - // remove nullptr in execution_providers. - // it's a little ugly but we need to do this because DefaultXXXExecutionProvider() can return nullptr in Runtime. - // And there're many places adding DefaultXXXExecutionProvider() to execution_providers directly. 
- if (execution_providers != nullptr) { - execution_providers->erase(std::remove(execution_providers->begin(), execution_providers->end(), nullptr), execution_providers->end()); - if (execution_providers->size() == 0) { - // In fact, no ep is needed to run - return; - } - } - Run(so, expect_result, expected_failure_string, excluded_provider_types, run_options, execution_providers, options); } diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc index 9acb37c24ddd0..386a5656d8a01 100644 --- a/onnxruntime/test/providers/compare_provider_test_utils.cc +++ b/onnxruntime/test/providers/compare_provider_test_utils.cc @@ -53,11 +53,6 @@ void CompareOpTester::CompareWithCPU(const std::string& target_provider_type, SetTestFunctionCalled(); std::unique_ptr target_execution_provider = GetExecutionProvider(target_provider_type); -#if defined(USE_CUDA) && defined(USE_DML) - if (target_execution_provider == nullptr) { - return; - } -#endif ASSERT_TRUE(target_execution_provider != nullptr) << "provider_type " << target_provider_type << " is not supported."; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index b46c253fb8ed9..e3c86a137484f 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -491,18 +491,6 @@ ::std::vector<::std::basic_string> GetParameterStrings() { // the number of times these are run to reduce the CI time. provider_names.erase(provider_name_cpu); #endif - -#if defined(USE_CUDA) && defined(USE_DML) - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test == "1") { - provider_names.erase(provider_name_cuda); - } - const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); - if (no_dml_ep_test == "1") { - provider_names.erase(provider_name_dml); - } -#endif - std::vector> v; // Permanently exclude following tests because ORT support only opset starting from 7, // Please make no more changes to the list diff --git a/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc index 5b2d00bb956bf..81e51375b9992 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_elements_op_test.cc @@ -389,9 +389,10 @@ TEST(GatherElementsOpTest, IndicesOutOfBounds) { // skip openvino which will not throw error message but will ensure no out-of-bound access // skip TensorRT because it doesn't support out of bounds indices // skip QNN because it doesn't support out of bounds indices + // skip WebGPU because it doesn't support out of bounds indices test.Run(OpTester::ExpectResult::kExpectFailure, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, kOpenVINOExecutionProvider, - kTensorrtExecutionProvider, kDmlExecutionProvider, kQnnExecutionProvider}); + kTensorrtExecutionProvider, kDmlExecutionProvider, kQnnExecutionProvider, kWebGpuExecutionProvider}); } TEST(GatherElementsOpTest, BigIndices) { diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc index 0f23e4c39d7e2..be79a6d29d539 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc @@ -3,9 +3,6 @@ #include "core/session/onnxruntime_session_options_config_keys.h" 
#include "gtest/gtest.h" -#if USE_CUDA -#include "test/common/cuda_op_test_utils.h" -#endif #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" @@ -125,9 +122,6 @@ TEST(GatherOpTest, Gather_invalid_index_gpu) { 4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.0f, 0.0f, 0.0f}); -#if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DML; -#endif // On GPU, just set the value to 0 instead of report error. exclude all other providers test #if defined(USE_CUDA) diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index 7e1a2384d7fc6..05cfb5c13d689 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -15,13 +15,11 @@ std::vector> GetExecutionProviders(int opset execution_providers.emplace_back(DefaultCpuExecutionProvider()); #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - if (opset_version < 20) { - execution_providers.emplace_back(DefaultCudaExecutionProvider()); + if (opset_version < 20) { + execution_providers.emplace_back(DefaultCudaExecutionProvider()); #ifdef ENABLE_CUDA_NHWC_OPS - execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); + execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); #endif - } } #endif diff --git a/onnxruntime/test/providers/cuda/cuda_provider_test.cc b/onnxruntime/test/providers/cuda/cuda_provider_test.cc index e745e1bcb8171..e57cdd2350fab 100644 --- a/onnxruntime/test/providers/cuda/cuda_provider_test.cc +++ b/onnxruntime/test/providers/cuda/cuda_provider_test.cc @@ -11,7 +11,7 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA_Test(); namespace test { namespace cuda { -TEST(CudaEpUnittest, All) { +TEST(CUDA_EP_Unittest, All) { onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA_Test(); ep.TestAll(); } diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc index ec7c6ec4e1605..b413d04fe81e8 100644 --- a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc @@ -11,7 +11,7 @@ namespace onnxruntime { namespace test { -TEST(CudaEpAllocatorTest, CUDAAllocatorTest) { +TEST(AllocatorTest, CUDAAllocatorTest) { OrtDevice::DeviceId cuda_device_id = 0; // ensure CUDA device is available. 
@@ -77,7 +77,7 @@ TEST(CudaEpAllocatorTest, CUDAAllocatorTest) { } // test that we fallback to smaller allocations if the growth of the arena exceeds the available memory -TEST(CudaEpAllocatorTest, CUDAAllocatorFallbackTest) { +TEST(AllocatorTest, CUDAAllocatorFallbackTest) { OrtDevice::DeviceId cuda_device_id = 0; size_t free = 0; diff --git a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc index ccdc56de5937d..b2e986f680763 100644 --- a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc @@ -17,7 +17,7 @@ using onnxruntime::contrib::attention::AttentionBackend; namespace onnxruntime { namespace test { -TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) { +TEST(AttentionKernelOptionsTest, NonZeroValue) { { AttentionKernelOptions options; int value = static_cast(AttentionBackend::FLASH_ATTENTION) | static_cast(AttentionBackend::EFFICIENT_ATTENTION); @@ -156,7 +156,7 @@ TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) { } // Test all environment variables take effect when option value is 0. -TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) { +TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) { constexpr int value = 0; ScopedEnvironmentVariables scoped_env_vars{ EnvVarMap{ @@ -186,7 +186,7 @@ TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) { } // Test default min sequence lengths when environment variables are not set. -TEST(CudaEpAttentionKernelOptionsTest, DefaultMinSeqLens) { +TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) { constexpr int value = 0; ScopedEnvironmentVariables scoped_env_vars{ EnvVarMap{ diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc index 97d50398a5550..a0d115c41c14b 100644 --- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc +++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc @@ -68,7 +68,7 @@ void ComputeTopKReference(const std::vector& values, } } -TEST(CudaEpTestBeamSearch, TopK) { +TEST(TestBeamSearch, TopK) { int32_t batch_size = 4; int32_t beam_size = 4; int32_t vocab_size = 50257; diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index d8fb3c8256012..3fcb9045ee7e6 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -230,7 +230,7 @@ void testPrepack(int rows, int columns) { } // TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80 -TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) { +TEST(BlkQ4_GEMM, PrepackSm80Test) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -263,7 +263,7 @@ TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) { testPrepack(256, 256); } -TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) { +TEST(BlkQ4_GEMM, Sm80RowBlockingTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -292,7 +292,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) { onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576); } -TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) { +TEST(BlkQ4_GEMM, 
Sm80ColBlockingTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -305,7 +305,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) { onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576); } -TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) { +TEST(BlkQ4_GEMM, Sm80SmallMTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported @@ -326,7 +326,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) { onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576); } -TEST(CudaEpBlkQ4_GEMM, Sm80SmallTileKernelTest) { +TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) { Status status = onnxruntime::cuda::test::sm80_supported(); if (!status.IsOK()) { // skip the test if sm80 is not supported diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index f3222c6f683b5..72357ec7e02d2 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -19,7 +19,7 @@ namespace cuda { namespace test { // TODO: Since the "DeferredRelease" has been migrated to CudaStream class, // we should migrate this test from CudaEP unit test to CudaStream unit test. -TEST(CudaEpTestDeferredRelease, WithArena) { +TEST(TestDeferredRelease, WithArena) { // Create CUDA EP. CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); @@ -52,7 +52,7 @@ TEST(CudaEpTestDeferredRelease, WithArena) { ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts)); } -TEST(CudaEpTestDeferredRelease, WithoutArena) { +TEST(TestDeferredRelease, WithoutArena) { // Create CUDA EP. 
CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc index 3538c7add94d0..7468a5718425e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc @@ -40,7 +40,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) { } } // namespace -TEST(CudaEpUnittest, FillCorrectness) { +TEST(CudaUtilsTest, FillCorrectness) { TestFillCorrectness(1 << 20, 1); TestFillCorrectness(1 << 20, 2); TestFillCorrectness(1 << 20, 3); diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc index 518fde5804b23..6636e15040393 100644 --- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc @@ -10,7 +10,7 @@ namespace onnxruntime { namespace cuda { namespace test { -TEST(CudaEpGemmOptions, TestDefaultOptions) { +TEST(CudaGemmOptions, TestDefaultOptions) { HalfGemmOptions gemm_options; ASSERT_FALSE(gemm_options.IsCompute16F()); #if defined(USE_CUDA) @@ -22,7 +22,7 @@ TEST(CudaEpGemmOptions, TestDefaultOptions) { #endif } -TEST(CudaEpGemmOptions, TestCompute16F) { +TEST(CudaGemmOptions, TestCompute16F) { HalfGemmOptions gemm_options; gemm_options.Initialize(1); ASSERT_TRUE(gemm_options.IsCompute16F()); @@ -35,7 +35,7 @@ TEST(CudaEpGemmOptions, TestCompute16F) { #endif } -TEST(CudaEpGemmOptions, NoReducedPrecision) { +TEST(CudaGemmOptions, NoReducedPrecision) { HalfGemmOptions gemm_options; gemm_options.Initialize(2); ASSERT_FALSE(gemm_options.IsCompute16F()); @@ -48,7 +48,7 @@ TEST(CudaEpGemmOptions, NoReducedPrecision) { #endif } -TEST(CudaEpGemmOptions, Pedantic) { +TEST(CudaGemmOptions, Pedantic) { HalfGemmOptions gemm_options; gemm_options.Initialize(4); ASSERT_FALSE(gemm_options.IsCompute16F()); @@ -61,7 +61,7 @@ TEST(CudaEpGemmOptions, Pedantic) { #endif } -TEST(CudaEpGemmOptions, Compute16F_Pedantic) { +TEST(CudaGemmOptions, Compute16F_Pedantic) { HalfGemmOptions gemm_options; gemm_options.Initialize(5); ASSERT_TRUE(gemm_options.IsCompute16F()); @@ -74,7 +74,7 @@ TEST(CudaEpGemmOptions, Compute16F_Pedantic) { #endif } -TEST(CudaEpGemmOptions, Compute16F_NoReducedPrecision) { +TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) { HalfGemmOptions gemm_options; gemm_options.Initialize(3); ASSERT_TRUE(gemm_options.IsCompute16F()); diff --git a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc index ba24cf858e80f..6b8cd68de0fca 100644 --- a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc +++ b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc @@ -41,7 +41,7 @@ void ComputeTop1Reference(const std::vector& values, } } -TEST(CudaEpTestGreedySearch, TopOne) { +TEST(TestGreedySearch, TopOne) { int32_t batch_size = 4; int32_t vocab_size = 50257; int32_t batch_x_vocab = batch_size * vocab_size; diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc index 09c9c1e5f8f6a..ec7e98528504e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc @@ -179,7 +179,7 @@ void 
TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e } } // namespace -TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) { +TEST(ReductionFunctionsTest, ReduceRowToScalar) { TestReduceRowToScalarApis(3); TestReduceRowToScalarApis(19); TestReduceRowToScalarApis(123); @@ -188,7 +188,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) { TestReduceRowToScalarApis(941736, 2e-4f); } -TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) { +TEST(ReductionFunctionsTest, ReduceRowsToRow) { for (int m : {3, 193, 2945}) { for (int n : {3, 193, 2945}) { TestReduceRowsToRow(m, n, true); @@ -197,7 +197,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) { } } -TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) { +TEST(ReductionFunctionsTest, ReduceColumnsToColumn) { for (int m : {3, 193, 2945}) { for (int n : {3, 193, 2945}) { TestReduceColumnsToColumn(m, n); @@ -205,7 +205,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) { } } -TEST(CudaEpReductionFunctionsTest, BufferOffsets) { +TEST(ReductionFunctionsTest, BufferOffsets) { const int m = 2048; const int n = 1024; const TensorShape shape{m, n}; @@ -240,7 +240,7 @@ TEST(CudaEpReductionFunctionsTest, BufferOffsets) { } } -TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) { +TEST(ReductionFunctionsTest, InvalidBufferSize) { const int m = 2048; const int n = 1024; const TensorShape shape{m, n}; @@ -262,7 +262,7 @@ TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) { ASSERT_FALSE(status.IsOK()); } -TEST(CudaEpReductionFunctionsTest, GetApplicableMatrixReduction) { +TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) { auto test_get_applicable_matrix_reduction = [](cudnnReduceTensorOp_t cudnn_op, const std::vector& dims, const std::vector& axes, diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index a274b90dc042f..8fc76da3495a8 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -105,7 +105,7 @@ def load_jsonc(basename: str): return json.loads("\n".join(lines)) -def create_backend_test(devices: list[str], test_name=None): +def create_backend_test(test_name=None): """Creates an OrtBackendTest and adds its TestCase's to global scope so unittest will find them.""" overrides = load_jsonc("onnx_backend_test_series_overrides.jsonc") @@ -126,29 +126,30 @@ def create_backend_test(devices: list[str], test_name=None): else: filters = load_jsonc("onnx_backend_test_series_filters.jsonc") current_failing_tests = apply_filters(filters, "current_failing_tests") + if platform.architecture()[0] == "32bit": current_failing_tests += apply_filters(filters, "current_failing_tests_x86") - if backend.supports_device("DNNL") or "DNNL" in devices: + if backend.supports_device("DNNL"): current_failing_tests += apply_filters(filters, "current_failing_tests_DNNL") - if backend.supports_device("NNAPI") or "NNAPI" in devices: + if backend.supports_device("NNAPI"): current_failing_tests += apply_filters(filters, "current_failing_tests_NNAPI") - if backend.supports_device("OPENVINO_GPU") or "OPENVINO_GPU" in devices: + if backend.supports_device("OPENVINO_GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_GPU") - if backend.supports_device("OPENVINO_CPU") or "OPENVINO_CPU" in devices: + if backend.supports_device("OPENVINO_CPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP32") current_failing_tests 
+= apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP16") - if backend.supports_device("OPENVINO_NPU") or "OPENVINO_NPU" in devices: + if backend.supports_device("OPENVINO_NPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_NPU") - if backend.supports_device("OPENVINO") or "OPENVINO" in devices: + if backend.supports_device("OPENVINO"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_opset18") - if backend.supports_device("MIGRAPHX") or "MIGRAPHX" in devices: + if backend.supports_device("MIGRAPHX"): current_failing_tests += apply_filters(filters, "current_failing_tests_MIGRAPHX") if backend.supports_device("WEBGPU"): @@ -157,16 +158,8 @@ def create_backend_test(devices: list[str], test_name=None): # Skip these tests for a "pure" DML onnxruntime python wheel. We keep these tests enabled for instances where both DML and CUDA # EPs are available (Windows GPU CI pipeline has this config) - these test will pass because CUDA has higher precedence than DML # and the nodes are assigned to only the CUDA EP (which supports these tests) - if (backend.supports_device("DML") and not backend.supports_device("GPU")) or "DML" in devices: + if backend.supports_device("DML") and not backend.supports_device("GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_pure_DML") - # exclude CUDA EP when DML test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,CUDAExecutionProvider" - elif backend.supports_device("DML") and "DML" not in devices: - # exclude DML EP when CUDA test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,DmlExecutionProvider" - else: - # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" filters = ( current_failing_tests @@ -179,6 +172,9 @@ def create_backend_test(devices: list[str], test_name=None): backend_test.exclude("(" + "|".join(filters) + ")") print("excluded tests:", filters) + # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior + os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" + # import all test cases at global scope to make # them visible to python.unittest. globals().update(backend_test.enable_report().test_cases) @@ -203,15 +199,6 @@ def parse_args(): help="Only run tests that match this value. Matching is regex based, and '.*' is automatically appended", ) - parser.add_argument( - "--devices", - type=str, - choices=["CPU", "CUDA", "MIGRAPHX", "DNNL", "DML", "OPENVINO_GPU", "OPENVINO_CPU", "OPENVINO_NPU", "OPENVINO"], - nargs="+", # allows multiple values - default=["CPU"], # default to ["CPU"] if no input is given - help="Select one or more devices CPU, CUDA, MIGRAPHX, DNNL, DML, OPENVINO_GPU, OPENVINO_CPU, OPENVINO_NPU, OPENVINO", - ) - # parse just our args. 
python unittest has its own args and arg parsing, and that runs inside unittest.main() parsed, unknown = parser.parse_known_args() sys.argv = sys.argv[:1] + unknown @@ -222,5 +209,5 @@ def parse_args(): if __name__ == "__main__": args = parse_args() - create_backend_test(args.devices, args.test_name) + create_backend_test(args.test_name) unittest.main() diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 7ecaab6fedb02..f083ab14ad133 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -750,13 +750,6 @@ "^test_reduce_log_sum_empty_set_cpu", "^test_reduce_log_sum_exp_empty_set_cpu", "^test_reduce_prod_empty_set_cpu", - // Bug: DML EP some how executes these CUDA tests and failed - // TODO: Remove these tests when DML EP is fixed - "^test_convtranspose_autopad_same_cuda", - "^test_asin_example_cuda", - "^test_dynamicquantizelinear_cuda", - "^test_dynamicquantizelinear_expanded_cuda", - "^test_reduce_min_empty_set_cuda", //Bug: DML EP does not execute operators with an empty input tensor //TODO: Resolve as a graph implementation that returns a constant inf tensor with appropriate strides "^test_reduce_min_empty_set_cpu" diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 59926bbcd1c6f..c1564997c42b8 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -122,12 +122,6 @@ std::unique_ptr<IExecutionProvider> DefaultOpenVINOExecutionProvider() { std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() { #ifdef USE_CUDA -#ifdef USE_DML - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test == "1") { - return nullptr; - } -#endif OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; provider_options.use_tf32 = false; @@ -140,12 +134,6 @@ std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() { #ifdef ENABLE_CUDA_NHWC_OPS std::unique_ptr<IExecutionProvider> DefaultCudaNHWCExecutionProvider() { #if defined(USE_CUDA) -#ifdef USE_DML - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test == "1") { - return nullptr; - } -#endif OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; provider_options.use_tf32 = false; @@ -332,12 +320,6 @@ std::unique_ptr<IExecutionProvider> DefaultCannExecutionProvider() { std::unique_ptr<IExecutionProvider> DefaultDmlExecutionProvider() { #ifdef USE_DML -#ifdef USE_CUDA - const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); - if (no_dml_ep_test == "1") { - return nullptr; - } -#endif ConfigOptions config_options{}; if (auto factory = DMLProviderFactoryCreator::CreateFromDeviceOptions(config_options, nullptr, false, false)) { return factory->CreateProvider(); diff --git a/onnxruntime/test/webgpu/delay_load/main.cc b/onnxruntime/test/webgpu/delay_load/main.cc new file mode 100644 index 0000000000000..14300f3b3751b --- /dev/null +++ b/onnxruntime/test/webgpu/delay_load/main.cc @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include <windows.h> +#include <filesystem> +#include <iostream> +#include <string> +#define ORT_API_MANUAL_INIT +#include "core/session/onnxruntime_cxx_api.h" + +// This program is to test the delay loading of onnxruntime.dll.
+// +// To verify the delay loading actually works, we need to do the test in 2 steps: +// +// 1. Prepare a folder structure like below: +// +// ├── webgpu_delay_load_test_root (newly created folder) +// │ ├── dlls +// │ │ ├── onnxruntime.dll +// │ │ ├── webgpu_dawn.dll +// │ │ ├── dxil.dll +// │ │ └── dxcompiler.dll +// │ └── test.exe +// └── onnxruntime_webgpu_delay_load_test.exe (this binary) +// +// This folder structure ensures no DLLs are in the same folder as the executable (test.exe). +// +// 2. Launch the test binary from the root folder of the above structure. +// +// So, there are 2 modes of this program: +// 1. "Prepare" mode: Do the step 1 above. (default) +// 2. "Test" mode: Do the step 2 above. (specified by --test argument) + +int prepare_main(); +int test_main(); + +int wmain(int argc, wchar_t* argv[]) { + if (argc == 2 && wcscmp(argv[1], L"--test") == 0) { + return test_main(); + } else { + return prepare_main(); + } +} + +int prepare_main() { + std::wstring path_str(32768, L'\0'); + GetModuleFileNameW(NULL, path_str.data(), static_cast<DWORD>(path_str.size())); + + namespace fs = std::filesystem; + fs::path exe_full_path{path_str}; // /onnxruntime_webgpu_delay_load_test.exe + fs::path test_dir = exe_full_path.parent_path(); // / + fs::path exe_name = exe_full_path.filename(); // onnxruntime_webgpu_delay_load_test.exe + fs::path root_folder = test_dir / L"webgpu_delay_load_test_root\\"; // /webgpu_delay_load_test_root/ + fs::path dlls_folder = root_folder / L"dlls\\"; // /webgpu_delay_load_test_root/dlls/ + + // ensure the test folder exists and is empty + if (fs::exists(root_folder)) { + fs::remove_all(root_folder); + } + fs::create_directories(dlls_folder); + + fs::current_path(test_dir); + + // copy the required DLLs to the dlls folder + fs::copy_file(L"onnxruntime.dll", dlls_folder / L"onnxruntime.dll"); + fs::copy_file(L"dxil.dll", dlls_folder / L"dxil.dll"); + fs::copy_file(L"dxcompiler.dll", dlls_folder / L"dxcompiler.dll"); + if (fs::exists(L"webgpu_dawn.dll")) { + fs::copy_file(L"webgpu_dawn.dll", dlls_folder / L"webgpu_dawn.dll"); + } + + // copy the test binary to the root folder + fs::copy_file(exe_full_path, root_folder / L"test.exe"); + + // run "test.exe --test" from the test root folder + fs::current_path(root_folder); + return _wsystem(L"test.exe --test"); +} + +int run() { + Ort::Env env{nullptr}; + int retval = 0; + try { + env = Ort::Env{ORT_LOGGING_LEVEL_WARNING, "Default"}; + + // model is https://github.com/onnx/onnx/blob/v1.15.0/onnx/backend/test/data/node/test_abs/model.onnx + constexpr uint8_t MODEL_DATA[] = {8, 7, 18, 12, 98, 97, 99, 107, 101, 110, + 100, 45, 116, 101, 115, 116, 58, 73, 10, 11, + 10, 1, 120, 18, 1, 121, 34, 3, 65, 98, + 115, 18, 8, 116, 101, 115, 116, 95, 97, 98, + 115, 90, 23, 10, 1, 120, 18, 18, 10, 16, + 8, 1, 18, 12, 10, 2, 8, 3, 10, 2, + 8, 4, 10, 2, 8, 5, 98, 23, 10, 1, + 121, 18, 18, 10, 16, 8, 1, 18, 12, 10, + 2, 8, 3, 10, 2, 8, 4, 10, 2, 8, + 5, 66, 4, 10, 0, 16, 13}; + + Ort::SessionOptions session_options; + session_options.DisableMemPattern(); + std::unordered_map<std::string, std::string> provider_options; + session_options.AppendExecutionProvider("WebGPU", provider_options); + Ort::Session session{env, MODEL_DATA, sizeof(MODEL_DATA), session_options}; + + // successfully initialized + std::cout << "Successfully initialized WebGPU EP." << std::endl; + retval = 0; + } catch (const std::exception& ex) { + std::cerr << ex.what() << std::endl; + + std::cerr << "Unexpected exception."
<< std::endl; + retval = -1; + } + + return retval; +} + +int test_main() { + HMODULE hModule = LoadLibraryA("dlls\\onnxruntime.dll"); + if (hModule == NULL) { + std::cout << "Failed to load dlls\\onnxruntime.dll" << std::endl; + std::cout << "Error code: " << GetLastError() << std::endl; + return 1; + } + + int retval = 0; + + using OrtGetApiBaseFunction = decltype(&OrtGetApiBase); + auto fnOrtGetApiBase = (OrtGetApiBaseFunction)GetProcAddress(hModule, "OrtGetApiBase"); + if (fnOrtGetApiBase == NULL) { + std::cout << "Failed to get OrtGetApiBase" << std::endl; + retval = 1; + goto cleanup; + } + Ort::InitApi(fnOrtGetApiBase()->GetApi(ORT_API_VERSION)); + + retval = run(); + +cleanup: + if (hModule != NULL) { + FreeLibrary(hModule); + } + return retval; +} diff --git a/onnxruntime/test/webgpu/external_dawn/main.cc b/onnxruntime/test/webgpu/external_dawn/main.cc index ed8d2eab94ce9..1cb22b131d76b 100644 --- a/onnxruntime/test/webgpu/external_dawn/main.cc +++ b/onnxruntime/test/webgpu/external_dawn/main.cc @@ -1,5 +1,4 @@ // Copyright (c) Microsoft Corporation. All rights reserved. -// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates // Licensed under the MIT License. #include diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc index f1545e96481fa..b03f1b1eadb3b 100644 --- a/orttraining/orttraining/core/session/training_session.cc +++ b/orttraining/orttraining/core/session/training_session.cc @@ -5,6 +5,7 @@ #include "core/framework/data_transfer_utils.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/session/IOBinding.h" #include "core/optimizer/rule_based_graph_transformer.h" #include "core/providers/cpu/controlflow/utils.h" @@ -1003,7 +1004,8 @@ Status TrainingSession::SaveWithExternalInitializers(const PathString& model_uri std::remove(ToUTF8String(model_uri).c_str()); std::remove(external_file_name.c_str()); - return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, initializer_size_threshold); + ModelSavingOptions model_saving_options{initializer_size_threshold}; + return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, model_saving_options); } Status TrainingSession::Save(const PathString& model_uri, TrainingSession::SaveOption opt) { diff --git a/orttraining/orttraining/training_api/module.cc b/orttraining/orttraining/training_api/module.cc index 939e1de334e52..60708b05626c5 100644 --- a/orttraining/orttraining/training_api/module.cc +++ b/orttraining/orttraining/training_api/module.cc @@ -11,6 +11,7 @@ #include "core/session/inference_session.h" #include "core/session/environment.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/graph/model_saving_options.h" #include "core/graph/graph_utils.h" #include "orttraining/training_api/checkpoint.h" @@ -689,8 +690,10 @@ Status Module::ExportModelForInferencing(const std::string& inference_model_path std::string external_data_name = ORT_TSTR_CONVERT_TO_PRINTABLE_STRING(ExternalCheckpointDataPath(ToPathString(inference_model_path))); PathString inference_model_pathstring = ToPathString(inference_model_path); + ModelSavingOptions model_saving_options{64}; ORT_THROW_IF_ERROR( - Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name, 64)); + Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name, + model_saving_options)); } 
else { ORT_THROW_IF_ERROR(Model::Save(*inference_model, ToPathString(inference_model_path))); } diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 3527a89ca7a7b..53dcdc6e0c6fa 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -260,6 +260,7 @@ def convert_arg_line_to_args(self, arg_line): ) parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable CUDA NHWC ops in build.") + parser.add_argument("--enable_cuda_minimal_build", action="store_true", help="Enable CUDA minimal build.") # Python bindings parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.") @@ -1093,6 +1094,7 @@ def generate_build_tree( "-Donnxruntime_DISABLE_FLOAT8_TYPES=" + ("ON" if disable_float8_types else "OFF"), "-Donnxruntime_DISABLE_SPARSE_TENSORS=" + ("ON" if disable_sparse_tensors else "OFF"), "-Donnxruntime_DISABLE_OPTIONAL_TYPE=" + ("ON" if disable_optional_type else "OFF"), + "-Donnxruntime_CUDA_MINIMAL=" + ("ON" if args.enable_cuda_minimal_build else "OFF"), ] if args.rv64: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml new file mode 100644 index 0000000000000..2a32dd1a62408 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -0,0 +1,108 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### +parameters: + - name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - template: templates/common-variables.yml + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: ${{ variables.linux_trt_version_cuda11 }} + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: ${{ variables.linux_trt_version_cuda12 }} + +jobs: +- job: Linux_Build + timeoutInMinutes: 180 + variables: + skipComponentGovernanceDetection: true + ALLOW_RELEASED_ONNX_OPSET_ONLY: '1' + ORT_CACHE_DIR: '$(Agent.TempDirectory)/ort/ccache' + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + workspace: + clean: all + pool: onnxruntime-tensorrt-linuxbuild-T4 + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=${{ variables.docker_base_image }} + --build-arg TRT_VERSION=${{ 
variables.linux_trt_version }} + --build-arg BUILD_UID=$( id -u ) + " + Repository: onnxruntimetensorrtcudaminimalbuild + + - template: templates/linux-build-step-with-cache.yml + parameters: + WithCache: true + Today: $(TODAY) + AdditionalKey: gpu_tensorrt_cuda_minimal + CacheDir: '$(ORT_CACHE_DIR)' + BuildStep: + - task: CmdLine@2 + inputs: + script: | + docker run --gpus all --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + --volume $(ORT_CACHE_DIR):/cache \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + -e CCACHE_DIR=/cache -w /onnxruntime_src \ + onnxruntimetensorrtcudaminimalbuild tools/ci_build/github/linux/build_tensorrt_ci.sh --cuda_minimal=ON + workingDirectory: $(Build.SourcesDirectory) + + - template: templates/explicitly-defined-final-tasks.yml diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 83cf26614a285..9286b5a54ac27 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -8,12 +8,12 @@ parameters: - name: TrtVersion displayName: TensorRT Version type: string - default: 10.5.cuda_12_5_cudnn_9 + default: 10.7_cuda12.5_cudnn9 values: - - 8.6.cuda_11_8_cudnn_8 - - 8.6.cuda_12_3_cudnn_9 - - 10.5.cuda_11_8_cudnn_8 - - 10.5.cuda_12_5_cudnn_9 + - 8.6_cuda11.8_cudnn8 + - 8.6_cuda12.3_cudnn9 + - 10.7_cuda11.8_cudnn8 + - 10.7_cuda12.5_cudnn9 - BIN - name: UseTensorrtOssParser @@ -198,4 +198,4 @@ jobs: parameters : condition : 'succeeded' - - template: templates/clean-agent-build-directory-step.yml + - template: templates/clean-agent-build-directory-step.yml \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml index 9296928ad97e0..cf434e4eadf0d 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml @@ -19,6 +19,6 @@ stages: python_wheel_suffix: '_gpu' timeout: 480 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 - trt_version: '10.6.0.26-1.cuda11.8' + trt_version: '10.7.0.23-1.cuda11.8' cuda_version: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml deleted file mode 100644 index 9a721c65de332..0000000000000 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml +++ /dev/null @@ -1,21 +0,0 @@ -parameters: -- name: EP_NAME - type: string - default: CPU - -- name: PYTHON_VERSION - type: string - -steps: -- powershell: | - python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq - Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} - mkdir -p $(Agent.TempDirectory)\ort_test_data - Copy-Item -Path 
$(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data - Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data - cd $(Agent.TempDirectory)\ort_test_data - python onnx_backend_test_series.py --devices ${{ parameters.EP_NAME }} -v - cd $(Agent.TempDirectory) - Remove-Item -Path $(Agent.TempDirectory)\ort_test_data -Recurse -Force - workingDirectory: '$(Build.sourcesDirectory)' - displayName: 'Run Python Tests with ${{ parameters.EP_NAME }} EP' \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml index 0b3eac0110abc..9c7fbc24ab1b6 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml @@ -50,8 +50,6 @@ stages: win_trt_home: ${{ parameters.win_trt_home }} win_cuda_home: ${{ parameters.win_cuda_home }} buildJava: ${{ parameters.buildJava }} - SpecificArtifact: ${{ parameters.SpecificArtifact }} - BuildId: ${{ parameters.BuildId }} - template: nuget-cuda-packaging-stage.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index d6b25c98936f0..445066f08995a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -34,7 +34,7 @@ parameters: displayName: Specific Artifact's BuildId type: string default: '0' - + - name: buildJava type: boolean @@ -50,14 +50,13 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} - ComboTests: true # Windows CUDA with TensorRT Packaging - template: ../templates/win-ci.yml parameters: @@ -69,7 +68,7 @@ stages: msbuildPlatform: x64 CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --parallel + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index 
f7235e3ad2076..947e4f99b984f 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -56,7 +56,7 @@ stages: PYTHON_VERSION: ${{ python_version }} EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} - EP_BUILD_FLAGS: --use_dml --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" use_tensorrt: True - ${{ if eq(parameters.enable_linux_cuda, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index dd0539f751c89..aa7f2845fc0fa 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -33,7 +33,7 @@ parameters: - Release - RelWithDebInfo - MinSizeRel - + - name: use_tensorrt type: boolean default: false @@ -134,7 +134,7 @@ stages: --cmake_generator "$(VSGenerator)" --enable_pybind --enable_onnx_tests - --parallel 4 --use_binskim_compliant_compile_flags --update --build + --parallel --use_binskim_compliant_compile_flags --update --build $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }} workingDirectory: '$(Build.BinariesDirectory)' @@ -206,20 +206,19 @@ stages: DownloadTRT: ${{ parameters.use_tensorrt }} - task: PowerShell@2 - displayName: 'Install Third Party Dependencies' + displayName: 'Install ONNX' inputs: filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/install_third_party_deps.ps1' workingDirectory: '$(Build.BinariesDirectory)' arguments: -cpu_arch x64 -install_prefix $(Build.BinariesDirectory)\${{ parameters.cmake_build_type }}\installed -build_config ${{ parameters.cmake_build_type }} - - template: jobs/steps/py_packaging_test_step.yml - parameters: - EP_NAME: DML - PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }} - - - template: jobs/steps/py_packaging_test_step.yml - parameters: - EP_NAME: CUDA - PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }} - - + - powershell: | + python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq + Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + mkdir -p $(Agent.TempDirectory)\ort_test_data + Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data + Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data + cd $(Agent.TempDirectory)\ort_test_data + python onnx_backend_test_series.py + workingDirectory: '$(Build.sourcesDirectory)' + displayName: 'Run Python Tests' diff --git a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml index d35bed69ee409..3d4e5326ae7c6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml +++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml @@ -1,5 +1,5 @@ variables: - common_trt_version: '10.6.0.26' + common_trt_version: '10.7.0.23' # As for 
Debian installation, replace '-1.' by '-1+' when assigning trt version below linux_trt_version_cuda11: ${{ variables.common_trt_version }}-1.cuda11.8 linux_trt_version_cuda12: ${{ variables.common_trt_version }}-1.cuda12.6 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 949479fb8b5e4..8409edb4d0429 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.201 + version: 1.0.202 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.201 + version: 1.0.202 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index ae54b3849a862..14b9c378bec14 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,10 +13,10 @@ parameters: - 12.2 - name: TrtVersion type: string - default: '10.6.0.26' + default: '10.7.0.23' values: - 8.6.1.6 - - 10.6.0.26 + - 10.7.0.23 steps: - ${{ if eq(parameters.DownloadCUDA, true) }}: @@ -42,7 +42,7 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.6.0.26')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.7.0.23')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.6" displayName: Set trtCudaVersion diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index dfaf237a711fe..45572416350c3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -15,10 +15,10 @@ parameters: default: '11.8' - name: win_trt_folder_cuda11 type: string - default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8' + default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8' - name: win_trt_folder_cuda12 type: string - default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6' + default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6' steps: - ${{ if eq(parameters.DownloadCUDA, 'true') }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index 7bdd069de711b..e8f391a73fa7b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -218,32 +218,16 @@ jobs: - powershell: | python3 -m pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml -qq Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname} + workingDirectory: 
'$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' displayName: 'Install onnxruntime wheel' - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}: - - ${{ if and(contains(parameters.additionalBuildFlags, 'use_cuda'), contains(parameters.additionalBuildFlags, 'use_dml')) }}: - - powershell: | - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }} - workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' - displayName: 'Run tests excluding CUDA tests' - env: - NO_CUDA_TEST: '1' - GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*' # Exclude CUDA EP tests under providers/cuda/ and cpu models test - PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' # For onnxruntime4j_test to find dependent dlls - - powershell: | - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }} - workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' - displayName: 'Run tests excluding DML tests' - env: - NO_DML_TEST: '1' - GTEST_FILTER: '-*cpu_*models*' - PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' - - ${{ else }}: - - powershell: | - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }} - workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' - displayName: 'Run tests' + - powershell: | + python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }} + + workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' + displayName: 'Run tests' - ${{ if eq(parameters.GenerateDocumentation, true) }}: - task: PythonScript@0 diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index e046997b4f49a..59950433b3d40 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -25,7 +25,7 @@ parameters: - name: runTests type: boolean - default: false + default: true - name: buildJava type: boolean @@ -71,10 +71,6 @@ parameters: - 11.8 - 12.2 -- name: ComboTests - type: boolean - default: false - - name: SpecificArtifact displayName: Use Specific Artifact type: boolean @@ -226,7 +222,7 @@ stages: condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags 
--enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --test --skip_submodule_sync --build_shared_lib --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' workingDirectory: '$(Build.BinariesDirectory)' - ${{ else }}: - powershell: | @@ -338,10 +334,6 @@ stages: displayName: 'Clean Agent Directories' condition: always() - - script: - echo ${{ parameters.SpecificArtifact }} - displayName: 'Print Specific Artifact' - - checkout: self clean: true submodules: none @@ -407,35 +399,13 @@ stages: displayName: 'Append dotnet x86 Directory to PATH' condition: and(succeeded(), eq('${{ parameters.buildArch}}', 'x86')) - - ${{ if eq(parameters.ComboTests, 'true') }}: - - task: PythonScript@0 - displayName: 'test excludes CUDA' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: '$(Build.BinariesDirectory)' - env: - NO_CUDA_TEST: '1' - GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*' # Exclude CUDA EP tests under providers/cuda/ - - task: PythonScript@0 - displayName: 'test excludes DML' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: '$(Build.BinariesDirectory)' - env: - NO_DML_TEST: '1' - - ${{ else }}: - - task: PythonScript@0 - displayName: 'test' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: '$(Build.BinariesDirectory)' - + - task: PythonScript@0 + displayName: 'test' + condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' + workingDirectory: '$(Build.BinariesDirectory)' # Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine - ${{ if eq(parameters.buildJava, 'true') }}: - template: make_java_win_binaries.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml index 67fd47c3150af..47ece37e66e09 100644 --- 
a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml @@ -62,28 +62,4 @@ stages: RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} ORT_EP_NAME: CUDA WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 - -- stage: cuda_dml - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda.bat - buildArch: x64 - additionalBuildFlags: >- - --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" - --enable_cuda_profiling --enable_transformers_tool_test - --use_dml - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON - --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: CUDA - EnablePython: false - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 + MachinePool: onnxruntime-Win2022-GPU-A10 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml index 911d99cd2adf3..94b0aa680d54d 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml @@ -43,11 +43,11 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml + additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} ORT_EP_NAME: DML WITH_CACHE: false - MachinePool: onnxruntime-Win2022-GPU-dml-A10 + MachinePool: onnxruntime-Win2022-GPU-dml-A10 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml new file mode 100644 index 0000000000000..c68ba01485db2 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -0,0 +1,86 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - template: templates/common-variables.yml + - name: win_trt_folder + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: ${{ variables.win_trt_folder_cuda11 }} + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: ${{ variables.win_trt_folder_cuda12 }} + +jobs: +- job: 'build' + pool: 'onnxruntime-Win2022-GPU-A10' + variables: + MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary' + EnvSetupScript: setup_env_trt.bat + 
skipComponentGovernanceDetection: true + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + timeoutInMinutes: 150 + workspace: + clean: all + steps: + - template: templates/jobs/win-ci-prebuild-steps.yml + parameters: + EnvSetupScript: $(EnvSetupScript) + DownloadCUDA: true + DownloadTRT: true + BuildArch: 'x64' + BuildConfig: RelWithDebInfo + MachinePool: 'onnxruntime-Win2022-GPU-A10' + WithCache: true + Today: $(Today) + + - template: templates/jobs/win-ci-build-steps.yml + parameters: + WithCache: True + Today: $(TODAY) + AdditionalKey: "gpu_tensorrt_cuda_minimal | RelWithDebInfo" + BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --enable_cuda_minimal_build' + MsbuildArguments: $(MsbuildArguments) + BuildArch: 'x64' + Platform: 'x64' + BuildConfig: RelWithDebInfo + + - task: PythonScript@0 + displayName: 'Build wheel' + inputs: + scriptPath: '$(Build.SourcesDirectory)\setup.py' + arguments: 'bdist_wheel' + workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml index 06f374afca57a..8460df2ec3799 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml @@ -48,7 +48,7 @@ stages: --enable_pybind --build_nodejs --use_webgpu - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=ON msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo diff --git a/tools/ci_build/github/linux/build_tensorrt_ci.sh b/tools/ci_build/github/linux/build_tensorrt_ci.sh index 5b206bc0a92d9..ccf7a6f4ea630 100755 --- a/tools/ci_build/github/linux/build_tensorrt_ci.sh +++ b/tools/ci_build/github/linux/build_tensorrt_ci.sh @@ -21,6 +21,19 @@ BUILD_ARGS=('--config' 'Release' "CMAKE_CUDA_ARCHITECTURES=75" "onnxruntime_BUILD_UNIT_TESTS=ON" "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON") + +# Parse external args +for arg in "$@"; do + case $arg in + --cuda_minimal=ON) + # Replace onnxruntime_BUILD_UNIT_TESTS=ON with OFF + BUILD_ARGS=("${BUILD_ARGS[@]/onnxruntime_BUILD_UNIT_TESTS=ON/onnxruntime_BUILD_UNIT_TESTS=OFF}") + BUILD_ARGS+=("--enable_cuda_minimal_build") + BUILD_ARGS+=("--skip_tests") + ;; + esac +done + if [ -x "$(command -v ninja)" ]; then BUILD_ARGS+=('--cmake_generator' 'Ninja') fi diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index c2bae5fd7ee59..df5112dc38af4 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 -ARG TRT_VERSION=10.6.0.26-1.cuda12.6 +ARG TRT_VERSION=10.7.0.23-1.cuda12.6 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV 
PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch index 2ecc6d1918b1a..fef95b8574520 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.6.0.26-1.cuda11.8 +ARG TRT_VERSION=10.7.0.23-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 81aeada6a4a46..e91f14ff955b9 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg index 4298dd53e4c66..0b08d4b3024b8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv index 1312475ceca3a..3a7e064686ae5 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 index 22d5e3b0248a8..01f08ff41e2cc 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install psutil setuptools>=68.2.2 # Install TensorRT -RUN TRT_VERSION="10.6.0.26-1+cuda11.8" &&\ +RUN TRT_VERSION="10.7.0.23-1+cuda11.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get 
install -y \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 index 819d9bab7be75..781f0647a084b 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 psutil # Install TensorRT -RUN TRT_VERSION="10.6.0.26-1+cuda12.6" &&\ +RUN TRT_VERSION="10.7.0.23-1+cuda12.6" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile index a69b98f86ba1b..5f10607b11626 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 FROM $BASEIMAGE -ARG TRT_VERSION=10.6.0.26-1.cuda11.8 +ARG TRT_VERSION=10.7.0.23-1.cuda11.8 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "${TRT_VERSION}" ]; then \ diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index 34ddd75da16fc..4e2bd8f8386e2 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH% @REM The default version is still cuda v12.2, because set cuda v11.8 after it -set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8\lib +set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 03734293be5c4..6a602e46661e7 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false set CUDA_MODULE_LOADING=LAZY