diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index 475f75b5bf19b..ced418e0f4cc9 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -196,7 +196,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "bc0d2e35909b8456abe32f3b30a49bb0c125e8b7",
+          "commitHash": "9c69a24bc2e20c8a511a4e6b06fd49639ec5300a",
           "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git"
         },
         "comments": "onnx_tensorrt"
diff --git a/cmake/deps.txt b/cmake/deps.txt
index ed41ad5b0ceb1..cb5a5910fb3d0 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -36,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z
 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.17.0.zip;13a60ac5217c104139ce0fd024f48628e7bcf5bc
-# Use the latest commit of 10.6-GA-ORT-DDS
-onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bc0d2e35909b8456abe32f3b30a49bb0c125e8b7.zip;f233ae871ad82c023da62e5dd620639f00bc2d15
+# Use the latest commit of 10.7-GA
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/9c69a24bc2e20c8a511a4e6b06fd49639ec5300a.zip;ff1fe9af78eb129b4a4cdcb7450b7390b4436dd3
 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
 protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
 protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 732c0511d400f..d72b61a0859b2 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -77,6 +77,7 @@ if(WIN32)
   onnxruntime_add_shared_library(onnxruntime
     ${SYMBOL_FILE}
     "${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc"
+    "${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc"
    "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc"
   )
 elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake
index 376d895be34a9..355575be3bcf7 100644
--- a/cmake/onnxruntime_nodejs.cmake
+++ b/cmake/onnxruntime_nodejs.cmake
@@ -60,15 +60,26 @@ else()
   endif()
 endif()

+# a list of DLLs that the Node.js binding depends on
+set(NODEJS_DLL_DEPS)
+
 # setup providers
 if (onnxruntime_USE_CUDA)
   set(NODEJS_BINDING_USE_CUDA "--use_cuda")
 endif()
 if (onnxruntime_USE_DML)
   set(NODEJS_BINDING_USE_DML "--use_dml")
+  list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/DirectML.dll")
 endif()
 if (onnxruntime_USE_WEBGPU)
   set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu")
+  if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+    list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/dxil.dll")
+    list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/dxcompiler.dll")
+  endif()
+  if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
+    list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE:dawn::webgpu_dawn>")
+  endif()
 endif()
 if (onnxruntime_USE_TENSORRT)
   set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt")
@@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL

 add_custom_target(nodejs_binding_wrapper ALL
   COMMAND ${NPM_CLI} ci
-  COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR}
-  --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT}
-  ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
+  COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}"
+          --config=${CMAKE_BUILD_TYPE}
+          "--onnxruntime-generator=${CMAKE_GENERATOR}"
+          "--dll_deps=${NODEJS_DLL_DEPS}"
+          --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU}
+          ${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
   WORKING_DIRECTORY ${JS_NODE_ROOT}
   COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")
diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake
index fea5964f0dda9..e527d538d8757 100644
--- a/cmake/onnxruntime_providers_webgpu.cmake
+++ b/cmake/onnxruntime_providers_webgpu.cmake
@@ -23,19 +23,18 @@
   onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)

+  set(onnxruntime_providers_webgpu_dll_deps)
+
   if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
     target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)
-    if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
-      list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
-    endif()
+    if (WIN32)
+      if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
+        list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
+      endif()

-    # Copy webgpu_dawn.dll to the output directory
-    add_custom_command(
-      TARGET onnxruntime_providers_webgpu
-      POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:dawn::webgpu_dawn>" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
-      VERBATIM )
+      list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE:dawn::webgpu_dawn>")
+    endif()
   else()
     if (NOT onnxruntime_USE_EXTERNAL_DAWN)
       target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
@@ -43,4 +42,23 @@
     target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
   endif()

+  if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
+    # Ensure dxil.dll and dxcompiler.dll exist in the output directory $<TARGET_FILE_DIR:onnxruntime_providers_webgpu>
+    add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll)
+    add_dependencies(onnxruntime_providers_webgpu dxcompiler)
+
+    list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>/dxil.dll")
+    list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>/dxcompiler.dll")
+  endif()
+
+  if (onnxruntime_providers_webgpu_dll_deps)
+    # Copy dependency DLLs to the output directory
+    add_custom_command(
+      TARGET onnxruntime_providers_webgpu
+      POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different "${onnxruntime_providers_webgpu_dll_deps}" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
+      COMMAND_EXPAND_LISTS
+      VERBATIM )
+  endif()
+
   set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index e822f0a3655fc..9e3ab4d41f416 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC
 set (onnxruntime_webgpu_external_dawn_test_SRC
   ${TEST_SRC_DIR}/webgpu/external_dawn/main.cc)

+set (onnxruntime_webgpu_delay_load_test_SRC
+  ${TEST_SRC_DIR}/webgpu/delay_load/main.cc)
+
 # tests from lowest level library up.
 # the order of libraries should be maintained, with higher libraries being added first in the list

@@ -1864,4 +1867,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN)
   onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers)
 endif()

+if (onnxruntime_USE_WEBGPU AND WIN32 AND onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD)
+  AddTest(DYN
+          TARGET onnxruntime_webgpu_delay_load_test
+          SOURCES ${onnxruntime_webgpu_delay_load_test_SRC}
+          LIBS ${SYS_PATH_LIB}
+          DEPENDS ${all_dependencies}
+  )
+endif()
+
 include(onnxruntime_fuzz_test.cmake)
diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt
index d79a82c572dc2..c78b40a3e7429 100644
--- a/js/node/CMakeLists.txt
+++ b/js/node/CMakeLists.txt
@@ -113,10 +113,12 @@ endif()
 if (WIN32)
   file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll
        DESTINATION ${dist_folder})
-  if (USE_DML)
-    file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll
-         DESTINATION ${dist_folder})
-  endif ()
+  if (ORT_NODEJS_DLL_DEPS)
+    foreach(dll ${ORT_NODEJS_DLL_DEPS})
+      file(COPY ${dll} DESTINATION ${dist_folder})
+    endforeach()
+  endif()
+
 elseif (APPLE)
   file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib
        DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN)
diff --git a/js/node/script/build.ts b/js/node/script/build.ts
index dcdcb93377b4c..b557368ed58c6 100644
--- a/js/node/script/build.ts
+++ b/js/node/script/build.ts
@@ -39,6 +39,8 @@ const USE_TENSORRT = !!buildArgs.use_tensorrt;
 const USE_COREML = !!buildArgs.use_coreml;
 // --use_qnn
 const USE_QNN = !!buildArgs.use_qnn;
+// --dll_deps=
+const DLL_DEPS = buildArgs.dll_deps;

 // build path
 const ROOT_FOLDER = path.join(__dirname, '..');
@@ -82,6 +84,9 @@ if (USE_COREML) {
 if (USE_QNN) {
   args.push('--CDUSE_QNN=ON');
 }
+if (DLL_DEPS) {
+  args.push(`--CDORT_NODEJS_DLL_DEPS=${DLL_DEPS}`);
+}

 // set CMAKE_OSX_ARCHITECTURES for macOS build
 if (os.platform() === 'darwin') {
diff --git a/js/node/src/directml_load_helper.cc b/js/node/src/directml_load_helper.cc
deleted file mode 100644
index 6aafe4d5fa788..0000000000000
--- a/js/node/src/directml_load_helper.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#ifdef _WIN32
-#include "common.h"
-#include "windows.h"
-
-void LoadDirectMLDll(Napi::Env env) {
-  DWORD pathLen = MAX_PATH;
-  std::wstring path(pathLen, L'\0');
-  HMODULE moduleHandle = nullptr;
-
-  GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
-                    reinterpret_cast<LPCWSTR>(&LoadDirectMLDll), &moduleHandle);
-
-  DWORD getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast<wchar_t*>(path.c_str()), pathLen);
-  while (getModuleFileNameResult == 0 || getModuleFileNameResult == pathLen) {
-    int ret = GetLastError();
-    if (ret == ERROR_INSUFFICIENT_BUFFER && pathLen < 32768) {
-      pathLen *= 2;
-      path.resize(pathLen);
-      getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast<wchar_t*>(path.c_str()), pathLen);
-    } else {
-      ORT_NAPI_THROW_ERROR(env, "Failed getting path to load DirectML.dll, error code: ", ret);
-    }
-  }
-
-  path.resize(path.rfind(L'\\') + 1);
-  path.append(L"DirectML.dll");
-  HMODULE libraryLoadResult = LoadLibraryW(path.c_str());
-
-  if (!libraryLoadResult) {
-    int ret = GetLastError();
-    ORT_NAPI_THROW_ERROR(env, "Failed loading bundled DirectML.dll, error code: ", ret);
-  }
-}
-#endif
diff --git a/js/node/src/directml_load_helper.h b/js/node/src/directml_load_helper.h
deleted file mode 100644
index 074a4f95ed476..0000000000000
--- a/js/node/src/directml_load_helper.h
+++ /dev/null
@@ -1,6 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#if defined(USE_DML) && defined(_WIN32)
-void LoadDirectMLDll(Napi::Env env);
-#endif
diff --git a/js/node/src/inference_session_wrap.cc b/js/node/src/inference_session_wrap.cc
index 23d859351f426..04ab71dc48ec2 100644
--- a/js/node/src/inference_session_wrap.cc
+++ b/js/node/src/inference_session_wrap.cc
@@ -4,7 +4,6 @@
 #include "onnxruntime_cxx_api.h"

 #include "common.h"
-#include "directml_load_helper.h"
 #include "inference_session_wrap.h"
 #include "run_options_helper.h"
 #include "session_options_helper.h"
@@ -19,9 +18,6 @@ Napi::FunctionReference& InferenceSessionWrap::GetTensorConstructor() {
 }

 Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
-#if defined(USE_DML) && defined(_WIN32)
-  LoadDirectMLDll(env);
-#endif
   // create ONNX runtime env
   Ort::InitApi();
   ORT_NAPI_THROW_ERROR_IF(
diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc
index 27eb9b65c62d3..12b1a79793ff3 100644
--- a/js/node/src/tensor_helper.cc
+++ b/js/node/src/tensor_helper.cc
@@ -53,24 +53,24 @@ constexpr size_t DATA_TYPE_ELEMENT_SIZE_MAP[] = {
 static_assert(sizeof(DATA_TYPE_ELEMENT_SIZE_MAP) == sizeof(size_t) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT,
               "definition not matching");

-constexpr napi_typedarray_type DATA_TYPE_TYPEDARRAY_MAP[] = {
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported
-    napi_float32_array,          // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT
-    napi_uint8_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8
-    napi_int8_array,             // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8
-    napi_uint16_array,           // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16
-    napi_int16_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16
-    napi_int32_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32
-    napi_bigint64_array,         // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported
-    napi_uint8_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL
-    napi_uint16_array,           // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array
-    napi_float64_array,          // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE
-    napi_uint32_array,           // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32
-    napi_biguint64_array,        // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported
-    (napi_typedarray_type)(-1)   // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported
+constexpr std::underlying_type_t<napi_typedarray_type> DATA_TYPE_TYPEDARRAY_MAP[] = {
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported
+    napi_float32_array,    // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT
+    napi_uint8_array,      // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8
+    napi_int8_array,       // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8
+    napi_uint16_array,     // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16
+    napi_int16_array,      // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16
+    napi_int32_array,      // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32
+    napi_bigint64_array,   // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported
+    napi_uint8_array,      // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL
+    napi_uint16_array,     // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array
+    napi_float64_array,    // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE
+    napi_uint32_array,     // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32
+    napi_biguint64_array,  // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported
+    std::underlying_type_t<napi_typedarray_type>(-1)   // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported
+};
 static_assert(sizeof(DATA_TYPE_TYPEDARRAY_MAP) == sizeof(napi_typedarray_type) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT,
               "definition not matching");
@@ -98,7 +98,20 @@ static_assert(sizeof(DATA_TYPE_ID_TO_NAME_MAP) == sizeof(const char*) * ONNX_TEN
               "definition not matching");

 const std::unordered_map<std::string, ONNXTensorElementDataType> DATA_TYPE_NAME_TO_ID_MAP = {
-    {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT}, {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8}, {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8}, {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16}, {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16}, {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32}, {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64}, {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING}, {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL}, {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16}, {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE}, {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32}, {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}};
+    {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT},
+    {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8},
+    {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8},
+    {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16},
+    {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16},
+    {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32},
+    {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64},
+    {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING},
+    {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL},
+    {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16},
+    {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE},
+    {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32},
+    {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64},
+};

 // currently only support tensor
 Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo* cpu_memory_info, OrtMemoryInfo* webgpu_memory_info) {
@@ -181,7 +194,7 @@ Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo*
                               "Tensor.data must be a typed array for numeric tensor.");

   auto tensorDataTypedArray = tensorDataValue.As<Napi::TypedArray>();
-  auto typedArrayType = tensorDataValue.As<Napi::TypedArray>().TypedArrayType();
+  std::underlying_type_t<napi_typedarray_type> typedArrayType = tensorDataValue.As<Napi::TypedArray>().TypedArrayType();
   ORT_NAPI_THROW_TYPEERROR_IF(DATA_TYPE_TYPEDARRAY_MAP[elemType] != typedArrayType, env,
                               "Tensor.data must be a typed array (", DATA_TYPE_TYPEDARRAY_MAP[elemType], ") for ",
                               tensorTypeString, " tensors, but got typed array (", typedArrayType, ").");
@@ -294,7 +307,7 @@ Napi::Value OrtValueToNapiValue(Napi::Env env, Ort::Value&& value) {
   }
   napi_value typedArrayData;
   napi_status status =
-      napi_create_typedarray(env, DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData);
+      napi_create_typedarray(env, (napi_typedarray_type)DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData);
   NAPI_THROW_IF_FAILED(env, status, Napi::Value);

   // new Tensor(type, typedArrayData, dims)
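A note on the tensor_helper.cc hunks above: `napi_typedarray_type` is an unscoped enum without a fixed underlying type, so a `(napi_typedarray_type)(-1)` sentinel in a constexpr initializer falls outside the enum's value range and can be rejected by stricter compilers; storing the underlying integer type sidesteps that while keeping the table layout identical (hence the unchanged static_assert). A minimal sketch of the pattern, using a stand-in enum rather than the real N-API type:

    #include <type_traits>

    enum sample_enum { kA, kB };  // stand-in for napi_typedarray_type

    // the old form: -1 is outside the enum's value range, so this sentinel is
    // not reliably usable in a constant expression:
    //   constexpr sample_enum kBad = (sample_enum)(-1);

    // the new form: the sentinel lives in the enum's underlying integer type,
    // which is always well-formed; convert back explicitly at the use site:
    constexpr std::underlying_type_t<sample_enum> kUnsupported =
        std::underlying_type_t<sample_enum>(-1);
    static_assert(sizeof(kUnsupported) == sizeof(sample_enum), "same storage size");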
diff --git a/onnxruntime/core/dll/delay_load_hook.cc b/onnxruntime/core/dll/delay_load_hook.cc
new file mode 100644
index 0000000000000..23fc8bca7368e
--- /dev/null
+++ b/onnxruntime/core/dll/delay_load_hook.cc
@@ -0,0 +1,83 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// == workaround for delay loading of dependencies of onnxruntime.dll ==
+//
+// Problem:
+//
+// When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using LoadLibraryEx,
+// which searches the directory of the process (.exe) instead of that of this library (onnxruntime.dll). This is a
+// problem for usages of the Node.js binding and the Python binding, because Windows will try to find the
+// dependencies in the directory of node.exe or python.exe, which is not the directory of onnxruntime.dll.
+//
+// Solution:
+//
+// By using the delay load hook `__pfnDliNotifyHook2`, we can intervene in the loading procedure by loading from an
+// absolute path. The absolute path is constructed by appending the name of the DLL to load to the directory of
+// onnxruntime.dll. This way, we can ensure that the dependencies are loaded from the same directory as
+// onnxruntime.dll.
+//
+// See also:
+// - https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions
+// - https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#alternate-search-order-for-unpackaged-apps
+//
+// The DLL delay load hook is only enabled when the compiler is MSVC and at least one of the following is true:
+// - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined
+// - USE_DML is defined
+//
+#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL (defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY))
+#define ORT_DELAY_LOAD_DIRECTML_DLL defined(USE_DML)
+#if defined(_MSC_VER) && (ORT_DELAY_LOAD_WEBGPU_DAWN_DLL || ORT_DELAY_LOAD_DIRECTML_DLL)
+
+#include <Windows.h>
+#include <delayimp.h>
+#include <stdlib.h>
+#include <string>
+
+#include "core/platform/env.h"
+
+namespace {
+
+#define DEFINE_KNOWN_DLL(name) {#name ".dll", L#name L".dll"}
+
+constexpr struct {
+  const char* str;
+  const wchar_t* wstr;
+} known_dlls[] = {
+#if ORT_DELAY_LOAD_WEBGPU_DAWN_DLL
+    DEFINE_KNOWN_DLL(webgpu_dawn),
+#endif
+#if ORT_DELAY_LOAD_DIRECTML_DLL
+    DEFINE_KNOWN_DLL(DirectML),
+#endif
+};
+}  // namespace
+
+FARPROC WINAPI delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) {
+  if (dliNotify == dliNotePreLoadLibrary) {
+    for (size_t i = 0; i < _countof(known_dlls); ++i) {
+      if (_stricmp(pdli->szDll, known_dlls[i].str) == 0) {
+        // Try to load the DLL from the same directory as onnxruntime.dll
+
+        // First, get the path to onnxruntime.dll
+        auto path = Env::Default().GetRuntimePath();
+        if (path.empty()) {
+          // Failed to get the path to onnxruntime.dll. In this case, we will just return NULL and let the system
+          // search for the DLL in the default search order.
+          return NULL;
+        }
+
+        // Append the name of the DLL. Now `path` is the absolute path to the DLL to load.
+        path.append(known_dlls[i].wstr);
+
+        // Load the DLL
+        return FARPROC(LoadLibraryExW(path.c_str(), NULL,
+                                      LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR));
+      }
+    }
+  }
+  return NULL;
+}
+
+extern "C" const PfnDliHook __pfnDliNotifyHook2 = delay_load_hook;
+
+#endif
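For readers unfamiliar with MSVC delay loading, here is a minimal standalone sketch (illustrative names, not ORT code) of the contract delay_load_hook.cc plugs into. A binary linked with /DELAYLOAD:some_dep.dll and delayimp.lib defers loading until first use; the delay-load helper then calls the notify hook with dliNotePreLoadLibrary before its own LoadLibrary, and a non-NULL return value is used as the module handle instead:

    #include <Windows.h>
    #include <delayimp.h>

    static FARPROC WINAPI my_delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) {
      if (dliNotify == dliNotePreLoadLibrary) {
        // pdli->szDll carries the DLL name from the import table; returning an
        // HMODULE (cast to FARPROC) short-circuits the helper's default lookup.
        return reinterpret_cast<FARPROC>(
            LoadLibraryExW(L"C:\\example\\some_dep.dll", NULL,
                           LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR));
      }
      return NULL;  // any other notification: fall back to the default behavior
    }

    // the delay-load helper discovers the hook through this well-known symbol:
    extern "C" const PfnDliHook __pfnDliNotifyHook2 = my_delay_load_hook;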
diff --git a/onnxruntime/core/dll/dllmain.cc b/onnxruntime/core/dll/dllmain.cc
index 2e7bdafd0599f..ac5dcd9c96084 100644
--- a/onnxruntime/core/dll/dllmain.cc
+++ b/onnxruntime/core/dll/dllmain.cc
@@ -13,7 +13,7 @@
 #pragma GCC diagnostic pop
 #endif

-// dllmain.cpp : Defines the entry point for the DLL application.
+// dllmain.cc : Defines the entry point for the DLL application.
 BOOL APIENTRY DllMain(HMODULE /*hModule*/,
                       DWORD ul_reason_for_call,
                       LPVOID /*lpReserved*/
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index d66c2a79d28a8..c85a15017659c 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -10,6 +10,8 @@
 #endif

 #include "core/common/common.h"
+#include "core/common/path_string.h"
+#include "core/platform/env.h"

 #include "core/providers/webgpu/compute_context.h"
 #include "core/providers/webgpu/webgpu_context.h"
@@ -50,6 +52,30 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info

   // Initialization.Step.2 - Create wgpu::Adapter
   if (adapter_ == nullptr) {
+#if !defined(__EMSCRIPTEN__) && defined(_MSC_VER) && defined(DAWN_ENABLE_D3D12) && !defined(USE_EXTERNAL_DAWN)
+    // If we are using the D3D12 backend on Windows and the build does not use external Dawn, dxil.dll and
+    // dxcompiler.dll are required.
+    //
+    // Dawn will try to load them later, but if they are in a different directory from the executable, it may fail
+    // to find them. To avoid this issue, we try to load them from the same directory as the current module
+    // (usually onnxruntime.dll).
+    auto runtime_path = Env::Default().GetRuntimePath();
+    if (!runtime_path.empty()) {
+      Status status;
+      void* module_handle = nullptr;
+
+      PathString dxil_path = runtime_path + ToPathString(L"dxil.dll");
+      status = Env::Default().LoadDynamicLibrary(dxil_path, false, &module_handle);
+      if (status.IsOK() && module_handle != nullptr) {
+        modules_.Add(dxil_path, module_handle);
+      }
+
+      PathString dxcompiler_path = runtime_path + ToPathString(L"dxcompiler.dll");
+      status = Env::Default().LoadDynamicLibrary(dxcompiler_path, false, &module_handle);
+      if (status.IsOK() && module_handle != nullptr) {
+        modules_.Add(dxcompiler_path, module_handle);
+      }
+    }
+#endif
+
     wgpu::RequestAdapterOptions req_adapter_options = {};
     wgpu::DawnTogglesDescriptor adapter_toggles_desc = {};
     req_adapter_options.nextInChain = &adapter_toggles_desc;
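The preload above leans on a Windows loader property: once a module is loaded (here, by absolute path next to onnxruntime.dll), a later request for the bare base name resolves to the already-loaded module instead of walking the search order. A minimal sketch of that behavior (illustrative path, not ORT code):

    #include <Windows.h>

    int main() {
      // load by absolute path first, as WebGpuContext::Initialize does for dxil.dll
      HMODULE preloaded = LoadLibraryW(L"C:\\example\\bin\\dxil.dll");
      // a later by-name load maps to the same loaded module, with no path search
      HMODULE by_name = LoadLibraryW(L"dxil.dll");
      return (preloaded != NULL && preloaded == by_name) ? 0 : 1;
    }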
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h
index be05b06523b9c..c41ef3e211264 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.h
@@ -13,6 +13,7 @@
 #include <webgpu/webgpu_cpp.h>

 #include "core/common/common.h"
+#include "core/framework/library_handles.h"
 #include "core/providers/webgpu/webgpu_execution_provider.h"
 #include "core/providers/webgpu/buffer_manager.h"
 #include "core/providers/webgpu/program_manager.h"
@@ -153,6 +154,8 @@ class WebGpuContext final {

   std::once_flag init_flag_;

+  LibraryHandles modules_;
+
   wgpu::Instance instance_;
   wgpu::Adapter adapter_;
   wgpu::Device device_;
diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
index 3ebc33c02592d..541dc4978dad1 100644
--- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
+++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
@@ -15,10 +15,10 @@ from typing import List, Optional

 TRT_DOCKER_FILES = {
-    "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6",
-    "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6",
-    "10.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10",
-    "10.5.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10",
+    "8.6_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6",
+    "8.6_cuda12.3_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6",
+    "10.7_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10",
+    "10.7_cuda12.5_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10",
     "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin",
 }
diff --git a/onnxruntime/test/webgpu/delay_load/main.cc b/onnxruntime/test/webgpu/delay_load/main.cc
new file mode 100644
index 0000000000000..f909b4a6916b4
--- /dev/null
+++ b/onnxruntime/test/webgpu/delay_load/main.cc
@@ -0,0 +1,142 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <Windows.h>
+#include <filesystem>
+#include <iostream>
+#include <string>
+#define ORT_API_MANUAL_INIT
+#include "core/session/onnxruntime_cxx_api.h"
+
+// This program is to test the delay loading of onnxruntime.dll.
+//
+// To verify the delay loading actually works, we need to do the test in 2 steps:
+//
+// 1. Prepare a folder structure like below:
+//
+//    ├── webgpu_delay_load_test_root (newly created folder)
+//    │   ├── dlls
+//    │   │   ├── onnxruntime.dll
+//    │   │   ├── webgpu_dawn.dll
+//    │   │   ├── dxil.dll
+//    │   │   └── dxcompiler.dll
+//    │   └── test.exe
+//    └── onnxruntime_webgpu_delay_load_test.exe (this binary)
+//
+//    This folder structure ensures no DLLs are in the same folder as the executable (test.exe).
+//
+// 2. Launch the test binary from the root folder of the above structure.
+//
+// So, there are 2 modes of this program:
+// 1. "Prepare" mode: Do the step 1 above. (default)
+// 2. "Test" mode: Do the step 2 above. (specified by --test argument)
+
+int prepare_main();
+int test_main();
+
+int wmain(int argc, wchar_t* argv[]) {
+  if (argc == 2 && wcscmp(argv[1], L"--test") == 0) {
+    return test_main();
+  } else {
+    return prepare_main();
+  }
+}
+
+int prepare_main() {
+  std::wstring path_str(32768, L'\0');
+  GetModuleFileNameW(NULL, path_str.data(), static_cast<DWORD>(path_str.size()));
+
+  namespace fs = std::filesystem;
+  fs::path exe_full_path{path_str};                                    // <build dir>/onnxruntime_webgpu_delay_load_test.exe
+  fs::path test_dir = exe_full_path.parent_path();                     // <build dir>/
+  fs::path exe_name = exe_full_path.filename();                        // onnxruntime_webgpu_delay_load_test.exe
+  fs::path root_folder = test_dir / L"webgpu_delay_load_test_root\\";  // <build dir>/webgpu_delay_load_test_root/
+  fs::path dlls_folder = root_folder / L"dlls\\";                      // <build dir>/webgpu_delay_load_test_root/dlls/
+
+  // ensure the test folder exists and is empty
+  if (fs::exists(root_folder)) {
+    fs::remove_all(root_folder);
+  }
+  fs::create_directories(dlls_folder);
+
+  fs::current_path(test_dir);
+
+  // copy the required DLLs to the dlls folder
+  fs::copy_file(L"onnxruntime.dll", dlls_folder / L"onnxruntime.dll");
+  fs::copy_file(L"dxil.dll", dlls_folder / L"dxil.dll");
+  fs::copy_file(L"dxcompiler.dll", dlls_folder / L"dxcompiler.dll");
+  if (fs::exists(L"webgpu_dawn.dll")) {
+    fs::copy_file(L"webgpu_dawn.dll", dlls_folder / L"webgpu_dawn.dll");
+  }
+
+  // copy the test binary to the root folder
+  fs::copy_file(exe_full_path, root_folder / L"test.exe");
+
+  // run "test.exe --test" from the test root folder
+  fs::current_path(root_folder);
+  return _wsystem(L"test.exe --test");
+}
+
+int run() {
+  Ort::Env env{nullptr};
+  int retval = 0;
+  try {
+    env = Ort::Env{ORT_LOGGING_LEVEL_WARNING, "Default"};
+
+    // model is https://github.com/onnx/onnx/blob/v1.15.0/onnx/backend/test/data/node/test_abs/model.onnx
+    constexpr uint8_t MODEL_DATA[] = {8, 7, 18, 12, 98, 97, 99, 107, 101, 110,
+                                      100, 45, 116, 101, 115, 116, 58, 73, 10, 11,
+                                      10, 1, 120, 18, 1, 121, 34, 3, 65, 98,
+                                      115, 18, 8, 116, 101, 115, 116, 95, 97, 98,
+                                      115, 90, 23, 10, 1, 120, 18, 18, 10, 16,
+                                      8, 1, 18, 12, 10, 2, 8, 3, 10, 2,
+                                      8, 4, 10, 2, 8, 5, 98, 23, 10, 1,
+                                      121, 18, 18, 10, 16, 8, 1, 18, 12, 10,
+                                      2, 8, 3, 10, 2, 8, 4, 10, 2, 8,
+                                      5, 66, 4, 10, 0, 16, 13};
+
+    Ort::SessionOptions session_options;
+    session_options.DisableMemPattern();
+    std::unordered_map<std::string, std::string> provider_options;
+    session_options.AppendExecutionProvider("WebGPU", provider_options);
+    Ort::Session session{env, MODEL_DATA, sizeof(MODEL_DATA), session_options};
+
+    // successfully initialized
+    std::cout << "Successfully initialized WebGPU EP." << std::endl;
+    retval = 0;
+  } catch (const std::exception& ex) {
+    std::cerr << ex.what() << std::endl;
+
+    std::cerr << "Unexpected exception." << std::endl;
+    retval = -1;
+  }
+
+  return retval;
+}
+
+int test_main() {
+  HMODULE hModule = LoadLibraryA("dlls\\onnxruntime.dll");
+  if (hModule == NULL) {
+    std::cout << "Failed to load dlls\\onnxruntime.dll" << std::endl;
+    return 1;
+  }
+
+  int retval = 0;
+
+  using OrtGetApiBaseFunction = decltype(&OrtGetApiBase);
+  auto fnOrtGetApiBase = (OrtGetApiBaseFunction)GetProcAddress(hModule, "OrtGetApiBase");
+  if (fnOrtGetApiBase == NULL) {
+    std::cout << "Failed to get OrtGetApiBase" << std::endl;
+    retval = 1;
+    goto cleanup;
+  }
+  Ort::InitApi(fnOrtGetApiBase()->GetApi(ORT_API_VERSION));
+
+  retval = run();
+
+cleanup:
+  if (hModule != NULL) {
+    FreeLibrary(hModule);
+  }
+  return retval;
+}
diff --git a/onnxruntime/test/webgpu/external_dawn/main.cc b/onnxruntime/test/webgpu/external_dawn/main.cc
index ed8d2eab94ce9..1cb22b131d76b 100644
--- a/onnxruntime/test/webgpu/external_dawn/main.cc
+++ b/onnxruntime/test/webgpu/external_dawn/main.cc
@@ -1,5 +1,4 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
 // Licensed under the MIT License.

 #include
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 3527a89ca7a7b..53dcdc6e0c6fa 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -260,6 +260,7 @@ def convert_arg_line_to_args(self, arg_line):
     )

     parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable CUDA NHWC ops in build.")
+    parser.add_argument("--enable_cuda_minimal_build", action="store_true", help="Enable CUDA minimal build.")

     # Python bindings
     parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.")
@@ -1093,6 +1094,7 @@ def generate_build_tree(
         "-Donnxruntime_DISABLE_FLOAT8_TYPES=" + ("ON" if disable_float8_types else "OFF"),
         "-Donnxruntime_DISABLE_SPARSE_TENSORS=" + ("ON" if disable_sparse_tensors else "OFF"),
         "-Donnxruntime_DISABLE_OPTIONAL_TYPE=" + ("ON" if disable_optional_type else "OFF"),
+        "-Donnxruntime_CUDA_MINIMAL=" + ("ON" if args.enable_cuda_minimal_build else "OFF"),
     ]

     if args.rv64:
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
new file mode 100644
index 0000000000000..2a32dd1a62408
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
@@ -0,0 +1,108 @@
+##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
+### please do rerun set-trigger-rules.py ###
+trigger:
+  branches:
+    include:
+      - main
+      - rel-*
+  paths:
+    exclude:
+      - docs/**
+      - README.md
+      - CONTRIBUTING.md
+      - BUILD.md
+      - 'js/web'
+      - 'onnxruntime/core/providers/js'
+pr:
+  branches:
+    include:
+      - main
+      - rel-*
+  paths:
+    exclude:
+      - docs/**
+      - README.md
+      - CONTRIBUTING.md
+      - BUILD.md
+      - 'js/web'
+      - 'onnxruntime/core/providers/js'
+#### end trigger ####
+parameters:
+  - name: CudaVersion
+    displayName: CUDA version
+    type: string
+    default: '12.2'
+    values:
+      - 11.8
+      - 12.2
+
+variables:
+  - template: templates/common-variables.yml
+  - name: docker_base_image
+    ${{ if eq(parameters.CudaVersion, '11.8') }}:
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3
+    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3
+  - name: linux_trt_version
+    ${{ if eq(parameters.CudaVersion, '11.8') }}:
+      value: ${{ variables.linux_trt_version_cuda11 }}
+    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      value: ${{ variables.linux_trt_version_cuda12 }}
+
+jobs:
+- job: Linux_Build
+  timeoutInMinutes: 180
+  variables:
+    skipComponentGovernanceDetection: true
+    ALLOW_RELEASED_ONNX_OPSET_ONLY: '1'
+    ORT_CACHE_DIR: '$(Agent.TempDirectory)/ort/ccache'
+    TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
+  workspace:
+    clean: all
+  pool: onnxruntime-tensorrt-linuxbuild-T4
+  steps:
+  - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
+    displayName: 'Clean Agent Directories'
+    condition: always()
+
+  - checkout: self
+    clean: true
+    submodules: none
+
+  - template: templates/get-docker-image-steps.yml
+    parameters:
+      Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
+      Context: tools/ci_build/github/linux/docker
+      DockerBuildArgs: "
+        --network=host
+        --build-arg BASEIMAGE=${{ variables.docker_base_image }}
+        --build-arg TRT_VERSION=${{ variables.linux_trt_version }}
+        --build-arg BUILD_UID=$( id -u )
+        "
+      Repository: onnxruntimetensorrtcudaminimalbuild
+
+  - template: templates/linux-build-step-with-cache.yml
+    parameters:
+      WithCache: true
+      Today: $(TODAY)
+      AdditionalKey: gpu_tensorrt_cuda_minimal
+      CacheDir: '$(ORT_CACHE_DIR)'
+      BuildStep:
+        - task: CmdLine@2
+          inputs:
+            script: |
+              docker run --gpus all --rm \
+                --volume /data/onnx:/data/onnx:ro \
+                --volume $(Build.SourcesDirectory):/onnxruntime_src \
+                --volume $(Build.BinariesDirectory):/build \
+                --volume /data/models:/build/models:ro \
+                --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
+                --volume $(ORT_CACHE_DIR):/cache \
+                -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
+                -e NIGHTLY_BUILD \
+                -e BUILD_BUILDNUMBER \
+                -e CCACHE_DIR=/cache -w /onnxruntime_src \
+                onnxruntimetensorrtcudaminimalbuild tools/ci_build/github/linux/build_tensorrt_ci.sh --cuda_minimal=ON
+            workingDirectory: $(Build.SourcesDirectory)
+
+  - template: templates/explicitly-defined-final-tasks.yml
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
index 83cf26614a285..9286b5a54ac27 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml
@@ -8,12 +8,12 @@ parameters:
 - name: TrtVersion
   displayName: TensorRT Version
   type: string
-  default: 10.5.cuda_12_5_cudnn_9
+  default: 10.7_cuda12.5_cudnn9
   values:
-  - 8.6.cuda_11_8_cudnn_8
-  - 8.6.cuda_12_3_cudnn_9
-  - 10.5.cuda_11_8_cudnn_8
-  - 10.5.cuda_12_5_cudnn_9
+  - 8.6_cuda11.8_cudnn8
+  - 8.6_cuda12.3_cudnn9
+  - 10.7_cuda11.8_cudnn8
+  - 10.7_cuda12.5_cudnn9
   - BIN

 - name: UseTensorrtOssParser
@@ -198,4 +198,4 @@ jobs:
     parameters :
       condition : 'succeeded'

-  - template: templates/clean-agent-build-directory-step.yml
+  - template: templates/clean-agent-build-directory-step.yml
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml
index 9296928ad97e0..cf434e4eadf0d 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml
@@ -19,6 +19,6 @@ stages:
       python_wheel_suffix: '_gpu'
       timeout: 480
       docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3
-      trt_version: '10.6.0.26-1.cuda11.8'
+      trt_version: '10.7.0.23-1.cuda11.8'
       cuda_version: '11.8'
diff --git a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml
index d35bed69ee409..3d4e5326ae7c6 100644
--- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml
@@ -1,5 +1,5 @@
 variables:
-  common_trt_version: '10.6.0.26'
+  common_trt_version: '10.7.0.23'
   # As for Debian installation, replace '-1.' by '-1+' when assigning trt version below
   linux_trt_version_cuda11: ${{ variables.common_trt_version }}-1.cuda11.8
  linux_trt_version_cuda12: ${{ variables.common_trt_version }}-1.cuda12.6
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
index ae54b3849a862..14b9c378bec14 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml
@@ -13,10 +13,10 @@ parameters:
   - 12.2
 - name: TrtVersion
   type: string
-  default: '10.6.0.26'
+  default: '10.7.0.23'
   values:
   - 8.6.1.6
-  - 10.6.0.26
+  - 10.7.0.23

 steps:
 - ${{ if eq(parameters.DownloadCUDA, true) }}:
@@ -42,7 +42,7 @@ steps:
     - powershell: |
        Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0"
      displayName: Set trtCudaVersion
-  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.6.0.26')) }}:
+  - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.7.0.23')) }}:
     - powershell: |
        Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.6"
      displayName: Set trtCudaVersion
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
index dfaf237a711fe..45572416350c3 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml
@@ -15,10 +15,10 @@ parameters:
     default: '11.8'
   - name: win_trt_folder_cuda11
     type: string
-    default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8'
+    default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8'
   - name: win_trt_folder_cuda12
     type: string
-    default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6'
+    default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6'

 steps:
 - ${{ if eq(parameters.DownloadCUDA, 'true') }}:
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
new file mode 100644
index 0000000000000..c68ba01485db2
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
@@ -0,0 +1,86 @@
+##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
+### please do rerun set-trigger-rules.py ###
+trigger:
+  branches:
+    include:
+      - main
+      - rel-*
+  paths:
+    exclude:
+      - docs/**
+      - README.md
+      - CONTRIBUTING.md
+      - BUILD.md
+      - 'js/web'
+      - 'onnxruntime/core/providers/js'
+pr:
+  branches:
+    include:
+      - main
+      - rel-*
+  paths:
+    exclude:
+      - docs/**
+      - README.md
+      - CONTRIBUTING.md
+      - BUILD.md
+      - 'js/web'
+      - 'onnxruntime/core/providers/js'
+#### end trigger ####
+parameters:
+- name: CudaVersion
+  displayName: CUDA version
+  type: string
+  default: '12.2'
+  values:
+    - 11.8
+    - 12.2
+
+variables:
+  - template: templates/common-variables.yml
+  - name: win_trt_folder
+    ${{ if eq(parameters.CudaVersion, '11.8') }}:
+      value: ${{ variables.win_trt_folder_cuda11 }}
+    ${{ if eq(parameters.CudaVersion, '12.2') }}:
+      value: ${{ variables.win_trt_folder_cuda12 }}
+
+jobs:
+- job: 'build'
+  pool: 'onnxruntime-Win2022-GPU-A10'
+  variables:
+    MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary'
+    EnvSetupScript: setup_env_trt.bat
+    skipComponentGovernanceDetection: true
+    TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)]
+  timeoutInMinutes: 150
+  workspace:
+    clean: all
+  steps:
+  - template: templates/jobs/win-ci-prebuild-steps.yml
+    parameters:
+      EnvSetupScript: $(EnvSetupScript)
+      DownloadCUDA: true
+      DownloadTRT: true
+      BuildArch: 'x64'
+      BuildConfig: RelWithDebInfo
+      MachinePool: 'onnxruntime-Win2022-GPU-A10'
+      WithCache: true
+      Today: $(Today)
+
+  - template: templates/jobs/win-ci-build-steps.yml
+    parameters:
+      WithCache: True
+      Today: $(TODAY)
+      AdditionalKey: "gpu_tensorrt_cuda_minimal | RelWithDebInfo"
+      BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --enable_cuda_minimal_build'
+      MsbuildArguments: $(MsbuildArguments)
+      BuildArch: 'x64'
+      Platform: 'x64'
+      BuildConfig: RelWithDebInfo
+
+  - task: PythonScript@0
+    displayName: 'Build wheel'
+    inputs:
+      scriptPath: '$(Build.SourcesDirectory)\setup.py'
+      arguments: 'bdist_wheel'
+      workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo'
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
index 06f374afca57a..8460df2ec3799 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
@@ -48,7 +48,7 @@ stages:
           --enable_pybind
           --build_nodejs
           --use_webgpu
-          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=ON
         msbuildPlatform: x64
         isX86: false
         job_name_suffix: x64_RelWithDebInfo
diff --git a/tools/ci_build/github/linux/build_tensorrt_ci.sh b/tools/ci_build/github/linux/build_tensorrt_ci.sh
index 5b206bc0a92d9..ccf7a6f4ea630 100755
--- a/tools/ci_build/github/linux/build_tensorrt_ci.sh
+++ b/tools/ci_build/github/linux/build_tensorrt_ci.sh
@@ -21,6 +21,19 @@ BUILD_ARGS=('--config' 'Release'
   "CMAKE_CUDA_ARCHITECTURES=75"
   "onnxruntime_BUILD_UNIT_TESTS=ON"
   "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON")
+
+# Parse external args
+for arg in "$@"; do
+  case $arg in
+    --cuda_minimal=ON)
+      # Replace onnxruntime_BUILD_UNIT_TESTS=ON with OFF
+      BUILD_ARGS=("${BUILD_ARGS[@]/onnxruntime_BUILD_UNIT_TESTS=ON/onnxruntime_BUILD_UNIT_TESTS=OFF}")
+      BUILD_ARGS+=("--enable_cuda_minimal_build")
+      BUILD_ARGS+=("--skip_tests")
+      ;;
+  esac
+done
+
 if [ -x "$(command -v ninja)" ]; then
   BUILD_ARGS+=('--cmake_generator' 'Ninja')
 fi
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
index c2bae5fd7ee59..df5112dc38af4 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8
-ARG TRT_VERSION=10.6.0.26-1.cuda12.6
+ARG TRT_VERSION=10.7.0.23-1.cuda12.6
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
 ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
index 2ecc6d1918b1a..fef95b8574520 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
-ARG TRT_VERSION=10.6.0.26-1.cuda11.8
+ARG TRT_VERSION=10.7.0.23-1.cuda11.8
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
 ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
index 81aeada6a4a46..e91f14ff955b9 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
-ARG TRT_VERSION=10.6.0.26-1+cuda11.8
+ARG TRT_VERSION=10.7.0.23-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg
index 4298dd53e4c66..0b08d4b3024b8 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-ARG TRT_VERSION=10.6.0.26-1+cuda11.8
+ARG TRT_VERSION=10.7.0.23-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv
index 1312475ceca3a..3a7e064686ae5 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv
@@ -6,7 +6,7 @@
 # Build base image with required system packages
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-ARG TRT_VERSION=10.6.0.26-1+cuda11.8
+ARG TRT_VERSION=10.7.0.23-1+cuda11.8
 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64
 FROM $BASEIMAGE AS base
 ARG TRT_VERSION
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
index 22d5e3b0248a8..01f08ff41e2cc 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10
@@ -31,7 +31,7 @@ RUN pip install --upgrade pip
 RUN pip install psutil setuptools>=68.2.2

 # Install TensorRT
-RUN TRT_VERSION="10.6.0.26-1+cuda11.8" &&\
+RUN TRT_VERSION="10.7.0.23-1+cuda11.8" &&\
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
     apt-get update &&\
     apt-get install -y \
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
index 819d9bab7be75..781f0647a084b 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10
@@ -31,7 +31,7 @@ RUN pip install --upgrade pip
 RUN pip install setuptools>=68.2.2 psutil

 # Install TensorRT
-RUN TRT_VERSION="10.6.0.26-1+cuda12.6" &&\
+RUN TRT_VERSION="10.7.0.23-1+cuda12.6" &&\
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\
     apt-get update &&\
     apt-get install -y \
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
index a69b98f86ba1b..5f10607b11626 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile
@@ -5,7 +5,7 @@
 ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
 FROM $BASEIMAGE
-ARG TRT_VERSION=10.6.0.26-1.cuda11.8
+ARG TRT_VERSION=10.7.0.23-1.cuda11.8

 #Install TensorRT only if TRT_VERSION is not empty
 RUN if [ -n "${TRT_VERSION}" ]; then \
diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat
index 34ddd75da16fc..4e2bd8f8386e2 100644
--- a/tools/ci_build/github/windows/setup_env_gpu.bat
+++ b/tools/ci_build/github/windows/setup_env_gpu.bat
@@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
 ) else (
   set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH%

 @REM The default version is still cuda v12.2, because set cuda v11.8 after it
-set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8\lib
+set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8\lib
 if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
   set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
 ) else (
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 03734293be5c4..6a602e46661e7 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
 ) else (
   set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
 set CUDA_MODULE_LOADING=LAZY