diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 5de48bfcd1719..4a47d879facb3 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -257,12 +257,24 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external)
 #2. if ONNX_CUSTOM_PROTOC_EXECUTABLE is not set, Compile everything(including protoc) from source code.
 
+if (onnxruntime_USE_PREINSTALLED_PROTOBUF)
+  add_executable(protoc IMPORTED GLOBAL)
+  find_program(PROTOC_EXE protoc PATHS ${protobuf_INSTALL_PATH}/bin)
+  set_property(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOC_EXE})
+  foreach(proto_libname protobuf protobuf-lite)
+    add_library(lib${proto_libname} SHARED IMPORTED GLOBAL)
+    find_library(${proto_libname}_path ${proto_libname} PATHS ${protobuf_INSTALL_PATH}/lib NO_DEFAULT_PATH)
+    set_property(TARGET lib${proto_libname} PROPERTY IMPORTED_LOCATION ${${proto_libname}_path})
+    target_include_directories(lib${proto_libname} INTERFACE "${protobuf_INSTALL_PATH}/include")
+  endforeach(proto_libname)
+else()
 # use protobuf as a submodule
 add_subdirectory(${PROJECT_SOURCE_DIR}/external/protobuf/cmake EXCLUDE_FROM_ALL)
 set_target_properties(libprotobuf PROPERTIES FOLDER "External/Protobuf")
 set_target_properties(libprotobuf-lite PROPERTIES FOLDER "External/Protobuf")
 set_target_properties(libprotoc PROPERTIES FOLDER "External/Protobuf")
 set_target_properties(protoc PROPERTIES FOLDER "External/Protobuf")
+endif()
 
 if (onnxruntime_USE_FULL_PROTOBUF)
   add_library(protobuf::libprotobuf ALIAS libprotobuf)
 else()
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
index a3accc70b86e3..43e4d5b670074 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
@@ -92,11 +92,6 @@ class DeepCpuAttnLstmOp final : public OpKernel {
 
   ActivationFuncs activation_funcs_;
 
-  // Threadpool for operator. If concurrent Compute calls are possible, it will be shared
-  // across them. mutable due to this.
-  // The alternative would be to create a threadpool in each call to Compute but that would incur thread creation
-  // cost on every call.
-  mutable onnxruntime::concurrency::ThreadPool ttp_{"DEEPCPU_ATTN_LSTM", (int)std::thread::hardware_concurrency()};
 };
 
 }  // namespace contrib
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index dd722be009bbe..34f2a499fadde 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -16,6 +16,7 @@ Module Name:
 --*/
 
 #include "mlasi.h"
+#include <cstdlib>
 
 //
 // Stores the platform information.
@@ -142,7 +143,11 @@ Return Value:
 
     uint64_t xcr0 = MlasReadExtendedControlRegister(_XCR_XFEATURE_ENABLED_MASK);
 
-    if ((xcr0 & 0x6) == 0x6) {
+    const char *cpu_opt = std::getenv("MLAS_DYNAMIC_CPU_ARCH");
+    if (cpu_opt == nullptr) cpu_opt = "99";
+    auto opt = std::stoi(cpu_opt);
+
+    if (opt > 0 && (xcr0 & 0x6) == 0x6) {
 
         this->GemmFloatKernel = MlasGemmFloatKernelAvx;
 
@@ -171,7 +176,7 @@ Return Value:
         __cpuid_count(7, 0, Cpuid7[0], Cpuid7[1], Cpuid7[2], Cpuid7[3]);
 #endif
 
-        if (((Cpuid1[2] & 0x1000) != 0) && ((Cpuid7[1] & 0x20) != 0)) {
+        if (opt > 1 && ((Cpuid1[2] & 0x1000) != 0) && ((Cpuid7[1] & 0x20) != 0)) {
 
             this->GemmU8S8CopyPackARoutine = MlasGemmU8S8CopyPackAAvx2;
             this->GemmU8S8CopyPackBRoutine = MlasGemmU8S8CopyPackBAvx2;
@@ -198,7 +203,7 @@ Return Value:
         // operating system supports saving AVX512F state.
         //
 
-        if (((Cpuid7[1] & 0x10000) != 0) && ((xcr0 & 0xE0) == 0xE0)) {
+        if (opt > 2 && ((Cpuid7[1] & 0x10000) != 0) && ((xcr0 & 0xE0) == 0xE0)) {
 
             this->GemmFloatKernel = MlasGemmFloatKernelAvx512F;
             this->GemmDoubleKernel = MlasGemmDoubleKernelAvx512F;
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
index af0bdbce199f4..aad4f35c52bbf 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
@@ -196,7 +196,6 @@ class UniDirectionalLstm {
                      const gsl::span<const T>& initial_hidden_state, const gsl::span<const T>& initial_cell_state,
                      const ActivationFuncs::Entry& activation_func_f, const ActivationFuncs::Entry& activation_func_g,
                      const ActivationFuncs::Entry& activation_func_h, float clip,
-                     concurrency::ThreadPool& lstm_tp_,
                      concurrency::ThreadPool* mlas_tp_);
 
   void Compute(const gsl::span<const T>& inputs, const gsl::span<const int>& sequence_lengths, int num_directions,
@@ -279,7 +278,6 @@ class UniDirectionalLstm {
   ActivationInfo<deepcpu::ActivationFuncPtr> activation_g_;
   ActivationInfo<deepcpu::ActivationFuncPtr> activation_h_;
 
-  concurrency::ThreadPool& lstm_tp_;
   concurrency::ThreadPool* mlas_tp_;
 };
 
@@ -459,7 +457,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
                                      activation_funcs_.Entries()[0],
                                      activation_funcs_.Entries()[1],
                                      activation_funcs_.Entries()[2],
-                                     clip_, lstm_tp_, mlas_thread_pool);
+                                     clip_, mlas_thread_pool);
 
     detail::UniDirectionalLstm<T> bw(alloc, logger, seq_length, batch_size, input_size,
                                      hidden_size_, Direction::kReverse, input_forget_,
@@ -467,7 +465,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
                                      activation_funcs_.Entries()[3],
                                      activation_funcs_.Entries()[4],
                                      activation_funcs_.Entries()[5],
-                                     clip_, lstm_tp_, mlas_thread_pool);
+                                     clip_, mlas_thread_pool);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1,
                hidden_output_1, last_cell_1);
@@ -480,7 +478,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
                                      activation_funcs_.Entries()[0],
                                      activation_funcs_.Entries()[1],
                                      activation_funcs_.Entries()[2],
-                                     clip_, lstm_tp_, mlas_thread_pool);
+                                     clip_, mlas_thread_pool);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1,
                hidden_output_1, last_cell_1);
@@ -553,7 +551,6 @@ UniDirectionalLstm<T>::UniDirectionalLstm(AllocatorPtr allocator,
                                           const ActivationFuncs::Entry& activation_func_g,
                                           const ActivationFuncs::Entry& activation_func_h,
                                           const float clip,
-                                          concurrency::ThreadPool& lstm_tp,
                                           concurrency::ThreadPool* mlas_tp)
     : allocator_(allocator),
       logger_(logger),
@@ -566,7 +563,6 @@ UniDirectionalLstm<T>::UniDirectionalLstm(AllocatorPtr allocator,
       clip_(clip),
       use_bias_(!bias.empty()),
       use_peepholes_(!peephole_weights.empty()),
-      lstm_tp_(lstm_tp),
       mlas_tp_(mlas_tp) {
   activation_f_ = {deepcpu::ActivationFuncByName(activation_func_f.name),
                    activation_func_f.alpha,
@@ -884,7 +880,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
       }
     };
 
-    ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, lstm_tp_, logger_);
+    ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, nullptr, logger_);
 
   } else {
     span_T_const_iter previous_state_end = batched_hidden_state_one_step.cend();
@@ -1123,10 +1119,7 @@ void UniDirectionalLstm<T>::GateComputations(span_T_iter& out, span_T_iter& out_
 
 template <typename T>
 void UniDirectionalLstm<T>::SetNumThreads() {
-  int threads = std::thread::hardware_concurrency() - 1;
-
-  if (threads < 1)
-    threads = 1;
+  int threads = 1;
 
   hidden_num_threads_ = threads;
   batch_parallel_ = false;
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
index faf32e3a77a2f..0ebab72e2b042 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
@@ -77,13 +77,6 @@ class DeepCpuLstmOp final : public OpKernel {
   bool input_forget_ = false;
 
   rnn::detail::ActivationFuncs activation_funcs_;
-
-  // Threadpool for operator. If concurrent Compute calls are possible, it will be shared
-  // across them. mutable due to this.
-  // The alternative would be to create a threadpool in each call to Compute but that would incur thread creation
-  // cost on every call.
-  mutable onnxruntime::concurrency::ThreadPool lstm_tp_{"DEEPCPU_LSTM",
-                                                        static_cast<int>(std::thread::hardware_concurrency())};
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
index 0ef37b5126cdb..3c5564e0d8ca9 100644
--- a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
+++ b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
@@ -212,78 +212,72 @@ T* SafeRawPointer(typename gsl::span<T> span, size_t offset, size_t size) {
 
 template <typename TLambda>
 void ExecuteLambdaInParallel(const std::string& name, TLambda lambda, int max, int step,
-                             onnxruntime::concurrency::ThreadPool& ttp,
+                             onnxruntime::concurrency::ThreadPool* ttp,
                              const ::onnxruntime::logging::Logger& logger) {
-  // #define NOTHREADS to execute the lambdas directly and in order if you need to do that to debug
-
-#ifdef NOTHREADS
-  ORT_UNUSED_PARAMETER(ttp);
-  ORT_UNUSED_PARAMETER(logger);
-
-  for (int i = 0; i < max; i += step) {
-    (void)name;
-    std::bind(lambda, i)();
-  }
-#else
-
+  ORT_UNUSED_PARAMETER(name);
   ORT_UNUSED_PARAMETER(logger);
 
-  // ORT_ENFORCE may and does throw at times from within the tasks that run
-  // on a thread-pool. Without propagating exceptions the process exits silently
-  // which will make diagnosing bugs more difficult.
-
-  // \! UGLY
-  // We have a problem here with the current thread-pool is that it takes std::function
-  // by value and copies it more than once (even though it is movable).
-  //
-  // To report status and exceptions properly it's better to use
-  // futures and promises but they are not copyable, so we can't come up with a functor
-  // with a promise member and we are downgrading to C++11 where we can't have captures that moved in.
-  //
-  // At the same time promises MUST live in the child thread so if we throw from the main thread
-  // we don't destroy any promises that are on the main thread stack which children threads may still be using.
-  //
-  // The only solution with the current Eigen that comes to mind is to have shared_ptr to with std::promise.
-  //
-  const int total_tasks = max / (step > 0 ? step : 1) + (max % step > 0 ? 1 : 0);
-  std::vector<std::future<void> > futures;
-  futures.reserve(total_tasks);
-
-  for (int i = 0, t = 0; i < max; i += step, ++t) {
-    auto p_ptr = std::make_shared<std::promise<void> >();
-    futures.push_back(p_ptr->get_future());
-    ttp.Schedule([p_ptr, lambda, i]() {
+  if (ttp == nullptr){
+    for (int i = 0; i < max; i += step) {
+      std::bind(lambda, i)();
+    }
+  } else {
+    // ORT_ENFORCE may and does throw at times from within the tasks that run
+    // on a thread-pool. Without propagating exceptions the process exits silently
+    // which will make diagnosing bugs more difficult.
+
+    // \! UGLY
+    // We have a problem here with the current thread-pool is that it takes std::function
+    // by value and copies it more than once (even though it is movable).
+    //
+    // To report status and exceptions properly it's better to use
+    // futures and promises but they are not copyable, so we can't come up with a functor
+    // with a promise member and we are downgrading to C++11 where we can't have captures that moved in.
+    //
+    // At the same time promises MUST live in the child thread so if we throw from the main thread
+    // we don't destroy any promises that are on the main thread stack which children threads may still be using.
+    //
+    // The only solution with the current Eigen that comes to mind is to have shared_ptr to with std::promise.
+    //
+    const int total_tasks = max / (step > 0 ? step : 1) + (max % step > 0 ? 1 : 0);
+    std::vector<std::future<void> > futures;
+    futures.reserve(total_tasks);
+
+    for (int i = 0, t = 0; i < max; i += step, ++t) {
+      auto p_ptr = std::make_shared<std::promise<void> >();
+      futures.push_back(p_ptr->get_future());
+      ttp->Schedule([p_ptr, lambda, i]() {
+        try {
+          lambda(i);
+          p_ptr->set_value();
+        } catch (...) {
+          p_ptr->set_exception(std::current_exception());
+        }
+      });
+    }
+
+    // We'd like to wait until all of the tasks have finished
+    // even though one or more have already thrown. We will store
+    // the first exception and then will re-throw at the end.
+    std::exception_ptr pending_exception;
+    for (auto& fut : futures) {
       try {
-        lambda(i);
-        p_ptr->set_value();
+        // get() will re-throw any exceptions
+        // the running task may throw
+        fut.get();
       } catch (...) {
-        p_ptr->set_exception(std::current_exception());
-      }
-    });
-  }
-
-  // We'd like to wait until all of the tasks have finished
-  // even though one or more have already thrown. We will store
-  // the first exception and then will re-throw at the end.
-  std::exception_ptr pending_exception;
-  for (auto& fut : futures) {
-    try {
-      // get() will re-throw any exceptions
-      // the running task may throw
-      fut.get();
-    } catch (...) {
-      if (!pending_exception) {
-        pending_exception = std::current_exception();
+        if (!pending_exception) {
+          pending_exception = std::current_exception();
+        }
       }
     }
-  }
-  if (pending_exception) {
-    std::rethrow_exception(pending_exception);
+    if (pending_exception) {
+      std::rethrow_exception(pending_exception);
+    }
   }
-#endif
 }
 
 void DumpMatrixImpl(const std::string& name, const float* src, int row, int col,