Cms/v1.0.0 #2

Merged
merged 3 commits on Nov 13, 2019
12 changes: 12 additions & 0 deletions cmake/CMakeLists.txt
@@ -257,12 +257,24 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external)
#2. if ONNX_CUSTOM_PROTOC_EXECUTABLE is not set, Compile everything(including protoc) from source code.


if (onnxruntime_USE_PREINSTALLED_PROTOBUF)
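# Import a preinstalled protobuf (protoc and the shared libraries) from
# ${protobuf_INSTALL_PATH} instead of building the submodule below.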
add_executable(protoc IMPORTED GLOBAL)
find_program(PROTOC_EXE protoc PATHS ${protobuf_INSTALL_PATH}/bin)
set_property(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOC_EXE})
foreach(proto_libname protobuf protobuf-lite)
add_library(lib${proto_libname} SHARED IMPORTED GLOBAL)
find_library(${proto_libname}_path ${proto_libname} PATHS ${protobuf_INSTALL_PATH}/lib NO_DEFAULT_PATH)
set_property(TARGET lib${proto_libname} PROPERTY IMPORTED_LOCATION ${${proto_libname}_path})
target_include_directories(lib${proto_libname} INTERFACE "${protobuf_INSTALL_PATH}/include")
endforeach(proto_libname)
else()
# use protobuf as a submodule
add_subdirectory(${PROJECT_SOURCE_DIR}/external/protobuf/cmake EXCLUDE_FROM_ALL)
set_target_properties(libprotobuf PROPERTIES FOLDER "External/Protobuf")
set_target_properties(libprotobuf-lite PROPERTIES FOLDER "External/Protobuf")
set_target_properties(libprotoc PROPERTIES FOLDER "External/Protobuf")
set_target_properties(protoc PROPERTIES FOLDER "External/Protobuf")
endif()
if (onnxruntime_USE_FULL_PROTOBUF)
add_library(protobuf::libprotobuf ALIAS libprotobuf)
else()
5 changes: 0 additions & 5 deletions onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
@@ -92,11 +92,6 @@ class DeepCpuAttnLstmOp final : public OpKernel {

ActivationFuncs activation_funcs_;

// Threadpool for operator. If concurrent Compute calls are possible, it will be shared
// across them. mutable due to this.
// The alternative would be to create a threadpool in each call to Compute but that would incur thread creation
// cost on every call.
mutable onnxruntime::concurrency::ThreadPool ttp_{"DEEPCPU_ATTN_LSTM", (int)std::thread::hardware_concurrency()};
};

} // namespace contrib
11 changes: 8 additions & 3 deletions onnxruntime/core/mlas/lib/platform.cpp
@@ -16,6 +16,7 @@ Module Name:
--*/

#include "mlasi.h"
#include <string>

//
// Stores the platform information.
@@ -142,7 +143,11 @@ Return Value:

uint64_t xcr0 = MlasReadExtendedControlRegister(_XCR_XFEATURE_ENABLED_MASK);

if ((xcr0 & 0x6) == 0x6) {
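// MLAS_DYNAMIC_CPU_ARCH caps the ISA tier the dispatch below may pick:
// >0 allows AVX, >1 AVX2, >2 AVX512F; unset defaults to 99 (no cap).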
const char *cpu_opt = std::getenv("MLAS_DYNAMIC_CPU_ARCH");
if (cpu_opt == nullptr) cpu_opt = "99";
auto opt = std::stoi(cpu_opt);

if (opt > 0 && (xcr0 & 0x6) == 0x6) {

this->GemmFloatKernel = MlasGemmFloatKernelAvx;

@@ -171,7 +176,7 @@
__cpuid_count(7, 0, Cpuid7[0], Cpuid7[1], Cpuid7[2], Cpuid7[3]);
#endif

if (((Cpuid1[2] & 0x1000) != 0) && ((Cpuid7[1] & 0x20) != 0)) {
if (opt > 1 && ((Cpuid1[2] & 0x1000) != 0) && ((Cpuid7[1] & 0x20) != 0)) {

this->GemmU8S8CopyPackARoutine = MlasGemmU8S8CopyPackAAvx2;
this->GemmU8S8CopyPackBRoutine = MlasGemmU8S8CopyPackBAvx2;
@@ -198,7 +203,7 @@
// operating system supports saving AVX512F state.
//

if (((Cpuid7[1] & 0x10000) != 0) && ((xcr0 & 0xE0) == 0xE0)) {
if (opt > 2 && ((Cpuid7[1] & 0x10000) != 0) && ((xcr0 & 0xE0) == 0xE0)) {

this->GemmFloatKernel = MlasGemmFloatKernelAvx512F;
this->GemmDoubleKernel = MlasGemmDoubleKernelAvx512F;
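Taken together, this hunk makes MLAS's runtime CPU dispatch overridable: each successively wider ISA tier now requires both the existing CPUID/XCR0 hardware checks and a large enough value parsed from the MLAS_DYNAMIC_CPU_ARCH environment variable. A minimal standalone sketch of that tiered gating follows; the names are hypothetical, and the guard around std::stoi is an addition the diff itself does not have:

#include <cstdlib>  // std::getenv
#include <string>   // std::stoi

// Hypothetical tier labels mirroring the opt > 0 / opt > 1 / opt > 2 gates above.
enum class IsaTier { Baseline, Avx, Avx2, Avx512F };

// Stand-in for the CPUID/XCR0 hardware checks performed in platform.cpp.
static bool HardwareSupports(IsaTier) { return true; }

// Highest tier the user allows; unset or unparsable means "no cap" (99).
static int AllowedTier() {
  const char* env = std::getenv("MLAS_DYNAMIC_CPU_ARCH");
  if (env == nullptr) return 99;
  try {
    return std::stoi(env);
  } catch (...) {
    return 99;
  }
}

IsaTier SelectKernelTier() {
  const int opt = AllowedTier();
  IsaTier tier = IsaTier::Baseline;
  if (opt > 0 && HardwareSupports(IsaTier::Avx)) tier = IsaTier::Avx;
  if (opt > 1 && HardwareSupports(IsaTier::Avx2)) tier = IsaTier::Avx2;
  if (opt > 2 && HardwareSupports(IsaTier::Avx512F)) tier = IsaTier::Avx512F;
  return tier;
}

Under this scheme, setting MLAS_DYNAMIC_CPU_ARCH=1 caps dispatch at AVX even on AVX512-capable hardware, and 0 forces the baseline kernels.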
17 changes: 5 additions & 12 deletions onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
@@ -196,7 +196,6 @@ class UniDirectionalLstm {
const gsl::span<const T>& initial_hidden_state, const gsl::span<const T>& initial_cell_state,
const ActivationFuncs::Entry& activation_func_f, const ActivationFuncs::Entry& activation_func_g,
const ActivationFuncs::Entry& activation_func_h, float clip,
concurrency::ThreadPool& lstm_tp_,
concurrency::ThreadPool* mlas_tp_);

void Compute(const gsl::span<const T>& inputs, const gsl::span<const int>& sequence_lengths, int num_directions,
@@ -279,7 +278,6 @@ class UniDirectionalLstm {
ActivationInfo<deepcpu::ActivationFuncPtr> activation_g_;
ActivationInfo<deepcpu::LstmMergeGatesFuncPtr> activation_h_;

concurrency::ThreadPool& lstm_tp_;
concurrency::ThreadPool* mlas_tp_;
};

@@ -459,15 +457,15 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
activation_funcs_.Entries()[0],
activation_funcs_.Entries()[1],
activation_funcs_.Entries()[2],
clip_, lstm_tp_, mlas_thread_pool);
clip_, mlas_thread_pool);

detail::UniDirectionalLstm<T> bw(alloc, logger, seq_length, batch_size, input_size,
hidden_size_, Direction::kReverse, input_forget_,
bias_2, peephole_weights_2, initial_hidden_2, initial_cell_2,
activation_funcs_.Entries()[3],
activation_funcs_.Entries()[4],
activation_funcs_.Entries()[5],
clip_, lstm_tp_, mlas_thread_pool);
clip_, mlas_thread_pool);

fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1,
output_1, hidden_output_1, last_cell_1);
@@ -480,7 +478,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
activation_funcs_.Entries()[0],
activation_funcs_.Entries()[1],
activation_funcs_.Entries()[2],
clip_, lstm_tp_, mlas_thread_pool);
clip_, mlas_thread_pool);

fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1,
output_1, hidden_output_1, last_cell_1);
@@ -553,7 +551,6 @@ UniDirectionalLstm<T>::UniDirectionalLstm(AllocatorPtr allocator,
const ActivationFuncs::Entry& activation_func_g,
const ActivationFuncs::Entry& activation_func_h,
const float clip,
concurrency::ThreadPool& lstm_tp,
concurrency::ThreadPool* mlas_tp)
: allocator_(allocator),
logger_(logger),
@@ -566,7 +563,6 @@ UniDirectionalLstm<T>::UniDirectionalLstm(AllocatorPtr allocator,
clip_(clip),
use_bias_(!bias.empty()),
use_peepholes_(!peephole_weights.empty()),
lstm_tp_(lstm_tp),
mlas_tp_(mlas_tp) {
activation_f_ = {deepcpu::ActivationFuncByName(activation_func_f.name),
activation_func_f.alpha,
@@ -884,7 +880,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
}
};

ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, lstm_tp_, logger_);
ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, nullptr, logger_);

} else {
span_T_const_iter previous_state_end = batched_hidden_state_one_step.cend();
@@ -1123,10 +1119,7 @@ void UniDirectionalLstm<T>::GateComputations(span_T_iter& out, span_T_iter& out_

template <typename T>
void UniDirectionalLstm<T>::SetNumThreads() {
int threads = std::thread::hardware_concurrency() - 1;

if (threads < 1)
threads = 1;
int threads = 1;
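// With the per-operator thread pool removed, hidden-state processing is pinned to a single thread.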

hidden_num_threads_ = threads;
batch_parallel_ = false;
7 changes: 0 additions & 7 deletions onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
@@ -77,13 +77,6 @@ class DeepCpuLstmOp final : public OpKernel {
bool input_forget_ = false;

rnn::detail::ActivationFuncs activation_funcs_;

// Threadpool for operator. If concurrent Compute calls are possible, it will be shared
// across them. mutable due to this.
// The alternative would be to create a threadpool in each call to Compute but that would incur thread creation
// cost on every call.
mutable onnxruntime::concurrency::ThreadPool lstm_tp_{"DEEPCPU_LSTM",
static_cast<int>(std::thread::hardware_concurrency())};
};

} // namespace onnxruntime
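The comment deleted just above records the trade-off the removed member was making: one thread pool per kernel instance, shared across any concurrent Compute calls (hence mutable, since Compute is const), rather than paying thread-creation cost on every call. A minimal sketch of that pattern, where SimpleThreadPool is a hypothetical stand-in for onnxruntime::concurrency::ThreadPool:

#include <thread>

// Hypothetical stand-in for onnxruntime::concurrency::ThreadPool.
class SimpleThreadPool {
 public:
  SimpleThreadPool(const char* /*name*/, int /*num_threads*/) {
    // A real pool would spin up its worker threads here, exactly once.
  }
};

class SomeOpKernel {
 public:
  // Compute() is const in the kernel interface, so a pool that is shared
  // across concurrent Compute() calls has to be a mutable member.
  void Compute() const {
    // ... schedule per-batch work on pool_ instead of creating threads ...
  }

 private:
  mutable SimpleThreadPool pool_{"OP_POOL",
                                 static_cast<int>(std::thread::hardware_concurrency())};
};

Dropping this member is consistent with the LSTM changes above, which remove lstm_tp_ and keep only the mlas_tp_ pool passed in from outside.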
116 changes: 55 additions & 61 deletions onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
@@ -212,78 +212,72 @@ T* SafeRawPointer(typename gsl::span<T> span, size_t offset, size_t size) {

template <typename TLambda>
void ExecuteLambdaInParallel(const std::string& name, TLambda lambda, int max, int step,
onnxruntime::concurrency::ThreadPool& ttp,
onnxruntime::concurrency::ThreadPool* ttp,
const ::onnxruntime::logging::Logger& logger) {
// #define NOTHREADS to execute the lambdas directly and in order if you need to do that to debug

#ifdef NOTHREADS
ORT_UNUSED_PARAMETER(ttp);
ORT_UNUSED_PARAMETER(logger);

for (int i = 0; i < max; i += step) {
(void)name;
std::bind(lambda, i)();
}
#else


ORT_UNUSED_PARAMETER(name);
ORT_UNUSED_PARAMETER(logger);

// ORT_ENFORCE may and does throw at times from within the tasks that run
// on a thread-pool. Without propagating exceptions the process exits silently
// which will make diagnosing bugs more difficult.

// \! UGLY
// A problem with the current thread-pool is that it takes std::function
// by value and copies it more than once (even though it is movable).
//
// To report status and exceptions properly it's better to use
// futures and promises, but they are not copyable, so we can't build a functor
// with a promise member, and we are limited to C++11, which has no move captures.
//
// At the same time promises MUST live in the child thread, so if we throw from the main thread
// we don't destroy any promises on the main thread's stack which child threads may still be using.
//
// The only solution with the current Eigen that comes to mind is to hold each std::promise
// through a shared_ptr.
//
const int total_tasks = max / (step > 0 ? step : 1) + (max % step > 0 ? 1 : 0);
std::vector<std::future<void> > futures;
futures.reserve(total_tasks);

for (int i = 0, t = 0; i < max; i += step, ++t) {
auto p_ptr = std::make_shared<std::promise<void> >();
futures.push_back(p_ptr->get_future());
ttp.Schedule([p_ptr, lambda, i]() {
if (ttp == nullptr) {
// No thread pool was supplied: run each chunk inline on the calling thread.
for (int i = 0; i < max; i += step) {
lambda(i);
}
} else {
// ORT_ENFORCE may and does throw at times from within the tasks that run
// on a thread-pool. Without propagating exceptions the process exits silently
// which will make diagnosing bugs more difficult.

// \! UGLY
// A problem with the current thread-pool is that it takes std::function
// by value and copies it more than once (even though it is movable).
//
// To report status and exceptions properly it's better to use
// futures and promises, but they are not copyable, so we can't build a functor
// with a promise member, and we are limited to C++11, which has no move captures.
//
// At the same time promises MUST live in the child thread, so if we throw from the main thread
// we don't destroy any promises on the main thread's stack which child threads may still be using.
//
// The only solution with the current Eigen that comes to mind is to hold each std::promise
// through a shared_ptr.
//
const int total_tasks = max / (step > 0 ? step : 1) + (max % step > 0 ? 1 : 0);
std::vector<std::future<void> > futures;
futures.reserve(total_tasks);

for (int i = 0, t = 0; i < max; i += step, ++t) {
auto p_ptr = std::make_shared<std::promise<void> >();
futures.push_back(p_ptr->get_future());
ttp->Schedule([p_ptr, lambda, i]() {
try {
lambda(i);
p_ptr->set_value();
} catch (...) {
p_ptr->set_exception(std::current_exception());
}
});
}

// We'd like to wait until all of the tasks have finished
// even though one or more have already thrown. We will store
// the first exception and then will re-throw at the end.
std::exception_ptr pending_exception;
for (auto& fut : futures) {
try {
// get() will re-throw any exceptions
// the running task may throw
fut.get();
} catch (...) {
if (!pending_exception) {
pending_exception = std::current_exception();
}
}
}

if (pending_exception) {
std::rethrow_exception(pending_exception);
}
}

#endif
}

void DumpMatrixImpl(const std::string& name, const float* src, int row, int col,
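The \! UGLY comment is the heart of this hunk: because the current pool copies its std::function tasks, each task owns its std::promise through a shared_ptr, and the waiting side drains every future before re-throwing the first stored exception, so no promise living on another thread's stack is destroyed early. A self-contained sketch of that pattern, using one std::thread per task as a hypothetical stand-in for ttp->Schedule:

#include <exception>
#include <future>
#include <memory>
#include <stdexcept>
#include <thread>
#include <vector>

// Runs lambda(i) for i = 0, step, 2*step, ... < max, propagating the first
// exception only after all tasks complete. One std::thread per task stands in
// for the Eigen pool's Schedule() used in the real code.
template <typename TLambda>
void ExecuteInParallelAndPropagate(TLambda lambda, int max, int step) {
  std::vector<std::thread> workers;
  std::vector<std::future<void>> futures;

  for (int i = 0; i < max; i += step) {
    // shared_ptr keeps the promise alive through every copy the pool
    // makes of the task, and for the task's whole lifetime.
    auto p = std::make_shared<std::promise<void>>();
    futures.push_back(p->get_future());
    workers.emplace_back([p, lambda, i]() {
      try {
        lambda(i);
        p->set_value();
      } catch (...) {
        p->set_exception(std::current_exception());
      }
    });
  }

  for (auto& w : workers) w.join();

  // Drain every future so all tasks finish, then surface the first failure.
  std::exception_ptr first;
  for (auto& f : futures) {
    try {
      f.get();
    } catch (...) {
      if (!first) first = std::current_exception();
    }
  }
  if (first) std::rethrow_exception(first);
}

For example, ExecuteInParallelAndPropagate([](int i) { if (i == 4) throw std::runtime_error("boom"); }, 8, 2); joins all four tasks and only then rethrows the runtime_error, mirroring the wait-then-rethrow loop in the diff.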