diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 5de48bfcd1719..4a47d879facb3 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -257,12 +257,24 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external)
 #2. if ONNX_CUSTOM_PROTOC_EXECUTABLE is not set, Compile everything(including protoc) from source code.
 
+if (onnxruntime_USE_PREINSTALLED_PROTOBUF)
+  add_executable(protoc IMPORTED GLOBAL)
+  find_program(PROTOC_EXE protoc PATHS ${protobuf_INSTALL_PATH}/bin)
+  set_property(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOC_EXE})
+  foreach(proto_libname protobuf protobuf-lite)
+    add_library(lib${proto_libname} SHARED IMPORTED GLOBAL)
+    find_library(${proto_libname}_path ${proto_libname} PATHS ${protobuf_INSTALL_PATH}/lib NO_DEFAULT_PATH)
+    set_property(TARGET lib${proto_libname} PROPERTY IMPORTED_LOCATION ${${proto_libname}_path})
+    target_include_directories(lib${proto_libname} INTERFACE "${protobuf_INSTALL_PATH}/include")
+  endforeach(proto_libname)
+else()
 # use protobuf as a submodule
 add_subdirectory(${PROJECT_SOURCE_DIR}/external/protobuf/cmake EXCLUDE_FROM_ALL)
 set_target_properties(libprotobuf PROPERTIES FOLDER "External/Protobuf")
 set_target_properties(libprotobuf-lite PROPERTIES FOLDER "External/Protobuf")
 set_target_properties(libprotoc PROPERTIES FOLDER "External/Protobuf")
 set_target_properties(protoc PROPERTIES FOLDER "External/Protobuf")
+endif()
 
 if (onnxruntime_USE_FULL_PROTOBUF)
   add_library(protobuf::libprotobuf ALIAS libprotobuf)
 else()
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
index a3accc70b86e3..43e4d5b670074 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.h
@@ -92,11 +92,6 @@ class DeepCpuAttnLstmOp final : public OpKernel {
 
   ActivationFuncs activation_funcs_;
 
-  // Threadpool for operator. If concurrent Compute calls are possible, it will be shared
-  // across them. mutable due to this.
-  // The alternative would be to create a threadpool in each call to Compute but that would incur thread creation
-  // cost on every call.
-  mutable onnxruntime::concurrency::ThreadPool ttp_{"DEEPCPU_ATTN_LSTM", (int)std::thread::hardware_concurrency()};
 };
 
 }  // namespace contrib
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index dd722be009bbe..34f2a499fadde 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -16,6 +16,7 @@ Module Name:
 --*/
 
 #include "mlasi.h"
+#include <cstdlib>
 
 //
 // Stores the platform information.
@@ -142,7 +143,11 @@ Return Value:
 
     uint64_t xcr0 = MlasReadExtendedControlRegister(_XCR_XFEATURE_ENABLED_MASK);
 
-    if ((xcr0 & 0x6) == 0x6) {
+    const char *cpu_opt = std::getenv("MLAS_DYNAMIC_CPU_ARCH");
+    if (cpu_opt == nullptr) cpu_opt = "99";
+    auto opt = std::stoi(cpu_opt);
+
+    if (opt > 0 && (xcr0 & 0x6) == 0x6) {
 
         this->GemmFloatKernel = MlasGemmFloatKernelAvx;
 
@@ -171,7 +176,7 @@ Return Value:
         __cpuid_count(7, 0, Cpuid7[0], Cpuid7[1], Cpuid7[2], Cpuid7[3]);
 #endif
 
-        if (((Cpuid1[2] & 0x1000) != 0) && ((Cpuid7[1] & 0x20) != 0)) {
+        if (opt > 1 && ((Cpuid1[2] & 0x1000) != 0) && ((Cpuid7[1] & 0x20) != 0)) {
 
             this->GemmU8S8CopyPackARoutine = MlasGemmU8S8CopyPackAAvx2;
             this->GemmU8S8CopyPackBRoutine = MlasGemmU8S8CopyPackBAvx2;
@@ -198,7 +203,7 @@ Return Value:
         // operating system supports saving AVX512F state.
         //
 
-        if (((Cpuid7[1] & 0x10000) != 0) && ((xcr0 & 0xE0) == 0xE0)) {
+        if (opt > 2 && ((Cpuid7[1] & 0x10000) != 0) && ((xcr0 & 0xE0) == 0xE0)) {
 
             this->GemmFloatKernel = MlasGemmFloatKernelAvx512F;
             this->GemmDoubleKernel = MlasGemmDoubleKernelAvx512F;
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
index af0bdbce199f4..aad4f35c52bbf 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
@@ -196,7 +196,6 @@ class UniDirectionalLstm {
                      const gsl::span<const T>& initial_hidden_state, const gsl::span<const T>& initial_cell_state,
                      const ActivationFuncs::Entry& activation_func_f, const ActivationFuncs::Entry& activation_func_g,
                      const ActivationFuncs::Entry& activation_func_h, float clip,
-                     concurrency::ThreadPool& lstm_tp_,
                      concurrency::ThreadPool* mlas_tp_);
 
   void Compute(const gsl::span<const T>& inputs, const gsl::span<const int>& sequence_lengths, int num_directions,
@@ -279,7 +278,6 @@ class UniDirectionalLstm {
   ActivationInfo<deepcpu::ActivationFuncPtr> activation_g_;
   ActivationInfo<deepcpu::ActivationFuncPtr> activation_h_;
 
-  concurrency::ThreadPool& lstm_tp_;
   concurrency::ThreadPool* mlas_tp_;
 };
 
@@ -459,7 +457,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
                                      activation_funcs_.Entries()[0],
                                      activation_funcs_.Entries()[1],
                                      activation_funcs_.Entries()[2],
-                                     clip_, lstm_tp_, mlas_thread_pool);
+                                     clip_, mlas_thread_pool);
 
     detail::UniDirectionalLstm<T> bw(alloc, logger, seq_length, batch_size, input_size,
                                      hidden_size_, Direction::kReverse, input_forget_,
@@ -467,7 +465,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
                                      activation_funcs_.Entries()[3],
                                      activation_funcs_.Entries()[4],
                                      activation_funcs_.Entries()[5],
-                                     clip_, lstm_tp_, mlas_thread_pool);
+                                     clip_, mlas_thread_pool);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1,
                hidden_output_1, last_cell_1);
@@ -480,7 +478,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
                                      activation_funcs_.Entries()[0],
                                      activation_funcs_.Entries()[1],
                                      activation_funcs_.Entries()[2],
-                                     clip_, lstm_tp_, mlas_thread_pool);
+                                     clip_, mlas_thread_pool);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1,
                hidden_output_1, last_cell_1);
@@ -553,7 +551,6 @@ UniDirectionalLstm<T>::UniDirectionalLstm(AllocatorPtr allocator,
                                           const ActivationFuncs::Entry& activation_func_g,
                                           const ActivationFuncs::Entry& activation_func_h,
                                           const float clip,
-                                          concurrency::ThreadPool& lstm_tp,
                                           concurrency::ThreadPool* mlas_tp)
     : allocator_(allocator),
       logger_(logger),
@@ -566,7 +563,6 @@ UniDirectionalLstm<T>::UniDirectionalLstm(AllocatorPtr allocator,
       clip_(clip),
       use_bias_(!bias.empty()),
       use_peepholes_(!peephole_weights.empty()),
-      lstm_tp_(lstm_tp),
       mlas_tp_(mlas_tp) {
   activation_f_ = {deepcpu::ActivationFuncByName(activation_func_f.name),
                    activation_func_f.alpha,
@@ -884,7 +880,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
       }
     };
 
-    ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, lstm_tp_, logger_);
+    ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, nullptr, logger_);
 
   } else {
     span_T_const_iter previous_state_end = batched_hidden_state_one_step.cend();
@@ -1123,10 +1119,7 @@ void UniDirectionalLstm<T>::GateComputations(span_T_iter& out, span_T_iter& out_
 
 template <typename T>
 void UniDirectionalLstm<T>::SetNumThreads() {
-  int threads = std::thread::hardware_concurrency() - 1;
-
-  if (threads < 1)
-    threads = 1;
+  int threads = 1;
 
   hidden_num_threads_ = threads;
   batch_parallel_ = false;
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
index faf32e3a77a2f..0ebab72e2b042 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
@@ -77,13 +77,6 @@ class DeepCpuLstmOp final : public OpKernel {
   bool input_forget_ = false;
 
   rnn::detail::ActivationFuncs activation_funcs_;
-
-  // Threadpool for operator. If concurrent Compute calls are possible, it will be shared
-  // across them. mutable due to this.
-  // The alternative would be to create a threadpool in each call to Compute but that would incur thread creation
-  // cost on every call.
-  mutable onnxruntime::concurrency::ThreadPool lstm_tp_{"DEEPCPU_LSTM",
-                                                        static_cast<int>(std::thread::hardware_concurrency())};
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
index 0ef37b5126cdb..3c5564e0d8ca9 100644
--- a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
+++ b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
@@ -212,78 +212,72 @@ T* SafeRawPointer(typename gsl::span<T> span, size_t offset, size_t size) {
 
 template <typename TLambda>
 void ExecuteLambdaInParallel(const std::string& name, TLambda lambda, int max, int step,
-                             onnxruntime::concurrency::ThreadPool& ttp,
+                             onnxruntime::concurrency::ThreadPool* ttp,
                              const ::onnxruntime::logging::Logger& logger) {
-  // #define NOTHREADS to execute the lambdas directly and in order if you need to do that to debug
-
-#ifdef NOTHREADS
-  ORT_UNUSED_PARAMETER(ttp);
-  ORT_UNUSED_PARAMETER(logger);
-
-  for (int i = 0; i < max; i += step) {
-    (void)name;
-    std::bind(lambda, i)();
-  }
-#else
-
+  ORT_UNUSED_PARAMETER(name);
   ORT_UNUSED_PARAMETER(logger);
 
-  // ORT_ENFORCE may and does throw at times from within the tasks that run
-  // on a thread-pool. Without propagating exceptions the process exits silently
-  // which will make diagnosing bugs more difficult.
-
-  // \! UGLY
-  // We have a problem here with the current thread-pool is that it takes std::function
-  // by value and copies it more than once (even though it is movable).
-  //
-  // To report status and exceptions properly it's better to use
-  // futures and promises but they are not copyable, so we can't come up with a functor
-  // with a promise member and we are downgrading to C++11 where we can't have captures that moved in.
-  //
-  // At the same time promises MUST live in the child thread so if we throw from the main thread
-  // we don't destroy any promises that are on the main thread stack which children threads may still be using.
-  //
-  // The only solution with the current Eigen that comes to mind is to have shared_ptr to with std::promise.
-  //
-  const int total_tasks = max / (step > 0 ? step : 1) + (max % step > 0 ? 1 : 0);
-  std::vector<std::future<void> > futures;
-  futures.reserve(total_tasks);
-
-  for (int i = 0, t = 0; i < max; i += step, ++t) {
-    auto p_ptr = std::make_shared<std::promise<void> >();
-    futures.push_back(p_ptr->get_future());
-    ttp.Schedule([p_ptr, lambda, i]() {
+  if (ttp == nullptr){
+    for (int i = 0; i < max; i += step) {
+      std::bind(lambda, i)();
+    }
+  } else {
+    // ORT_ENFORCE may and does throw at times from within the tasks that run
+    // on a thread-pool. Without propagating exceptions the process exits silently
+    // which will make diagnosing bugs more difficult.
+
+    // \! UGLY
+    // We have a problem here with the current thread-pool is that it takes std::function
+    // by value and copies it more than once (even though it is movable).
+    //
+    // To report status and exceptions properly it's better to use
+    // futures and promises but they are not copyable, so we can't come up with a functor
+    // with a promise member and we are downgrading to C++11 where we can't have captures that moved in.
+    //
+    // At the same time promises MUST live in the child thread so if we throw from the main thread
+    // we don't destroy any promises that are on the main thread stack which children threads may still be using.
+    //
+    // The only solution with the current Eigen that comes to mind is to have shared_ptr to with std::promise.
+    //
+    const int total_tasks = max / (step > 0 ? step : 1) + (max % step > 0 ? 1 : 0);
+    std::vector<std::future<void> > futures;
+    futures.reserve(total_tasks);
+
+    for (int i = 0, t = 0; i < max; i += step, ++t) {
+      auto p_ptr = std::make_shared<std::promise<void> >();
+      futures.push_back(p_ptr->get_future());
+      ttp->Schedule([p_ptr, lambda, i]() {
+        try {
+          lambda(i);
+          p_ptr->set_value();
+        } catch (...) {
+          p_ptr->set_exception(std::current_exception());
+        }
+      });
+    }
+
+    // We'd like to wait until all of the tasks have finished
+    // even though one or more have already thrown. We will store
+    // the first exception and then will re-throw at the end.
+    std::exception_ptr pending_exception;
+    for (auto& fut : futures) {
       try {
-        lambda(i);
-        p_ptr->set_value();
+        // get() will re-throw any exceptions
+        // the running task may throw
+        fut.get();
       } catch (...) {
-        p_ptr->set_exception(std::current_exception());
-      }
-    });
-  }
-
-  // We'd like to wait until all of the tasks have finished
-  // even though one or more have already thrown. We will store
-  // the first exception and then will re-throw at the end.
-  std::exception_ptr pending_exception;
-  for (auto& fut : futures) {
-    try {
-      // get() will re-throw any exceptions
-      // the running task may throw
-      fut.get();
-    } catch (...) {
-      if (!pending_exception) {
-        pending_exception = std::current_exception();
+        if (!pending_exception) {
+          pending_exception = std::current_exception();
+        }
       }
     }
-  }
-  if (pending_exception) {
-    std::rethrow_exception(pending_exception);
+    if (pending_exception) {
+      std::rethrow_exception(pending_exception);
+    }
   }
-#endif
 }
 
 void DumpMatrixImpl(const std::string& name, const float* src, int row, int col,