Commit

init
alvoron committed Jan 24, 2025
1 parent d2ecd45 commit dbdca52
Showing 3 changed files with 68 additions and 30 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
@@ -64,7 +64,7 @@
 	url = https://github.com/google/snappy.git
 [submodule "ARMComputeLibrary"]
 	path = src/plugins/intel_cpu/thirdparty/ComputeLibrary
-	url = https://github.com/ARM-software/ComputeLibrary.git
+	url = https://review.mlplatform.org/ml/ComputeLibrary
 	ignore = dirty
 [submodule "src/plugins/intel_cpu/thirdparty/mlas"]
 	path = src/plugins/intel_cpu/thirdparty/mlas
94 changes: 66 additions & 28 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp
@@ -7,6 +7,8 @@
 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "src/runtime/SchedulerUtils.h"
+
 #include "openvino/core/parallel.hpp"
 
 namespace ov {
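For reference, the call site added in the next hunk relies on ComputeLibrary's internal scheduler helper pulled in by the new include. A declaration along the following lines is assumed (a sketch for orientation only; the authoritative signature lives in ComputeLibrary's src/runtime/SchedulerUtils.h):

    #include <cstddef>
    #include <utility>

    namespace arm_compute {
    namespace scheduler_utils {
    // Factor `max_threads` workers across a 2D iteration space of m x n
    // iterations, returning the per-dimension thread counts
    // (m_threads, n_threads).
    std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n);
    }  // namespace scheduler_utils
    }  // namespace arm_compute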
@@ -24,38 +26,74 @@ void ACLScheduler::set_num_threads(unsigned int num_threads) {}
 
 void ACLScheduler::schedule_custom(ICPPKernel* kernel, const Hints& hints, const Window& window, ITensorPack& tensors) {
     const Window& max_window = window;
-    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
-#if OV_THREAD == OV_THREAD_OMP
-    // In the OpenMP case parallel_get_num_threads() returns 1 here because it is called outside a parallel section.
-    // That is why it is not used to initialize _num_threads.
-    const auto _num_threads = num_iterations;
-#else
-    const auto _num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));
-#endif
-    std::function<void(const Window& window, const ThreadInfo& info)> main_run;
-    if (tensors.empty()) {
-        main_run = [&](const Window& window, const ThreadInfo& info) {
-            kernel->run(window, info);
-        };
-    } else {
-        main_run = [&](const Window& window, const ThreadInfo& info) {
-            kernel->run_op(tensors, window, info);
-        };
-    }
+    if (hints.split_dimension() == IScheduler::split_dimensions_all) {
+        const std::size_t m = max_window.num_iterations(Window::DimX);
+        const std::size_t n = max_window.num_iterations(Window::DimY);
+        const unsigned int num_iterations = m * n;
+#if OV_THREAD == OV_THREAD_OMP
+        // In the OpenMP case parallel_get_num_threads() returns 1 here because it is called outside a parallel section.
+        // That is why it is not used to initialize num_threads.
+        const auto num_threads = num_iterations;
+#else
+        const auto num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));
+#endif
 
-    if (!kernel->is_parallelisable() || _num_threads == 1) {
-        ThreadInfo info;
-        info.cpu_info = &cpu_info();
-        main_run(max_window, info);
-    } else {
-        const auto num_windows = _num_threads;
-        const auto hints_split_dimension = hints.split_dimension();
+        unsigned m_threads, n_threads;
+        std::tie(m_threads, n_threads) = scheduler_utils::split_2d(num_threads, m, n);
+        unsigned int max_parallelism = std::min<unsigned int>(m, m_threads) * std::min<unsigned int>(n, n_threads);
+        if (max_parallelism < num_threads)
+        {
+            m_threads = std::min<unsigned int>(m, m_threads);
+            n_threads = std::min<unsigned int>(n, n_threads);
+        }
 
-        ov::parallel_for(num_windows, [&](int wid) {
-            Window win = max_window.split_window(hints_split_dimension, wid, num_windows);
+        ov::parallel_for2d(m_threads, n_threads, [&](int mi, int ni) {
+            Window win = max_window.split_window(Window::DimX, mi, m_threads)
+                             .split_window(Window::DimY, ni, n_threads);
             win.validate();
-            main_run(win, {wid, static_cast<int>(_num_threads), &cpu_info()});
+            Window thread_locator;
+            thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+            thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+            thread_locator.validate();
+
+            ThreadInfo info;
+            info.cpu_info = &cpu_info();
+            kernel->run_nd(win, info, thread_locator);
         });
+    } else {
+        const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+#if OV_THREAD == OV_THREAD_OMP
+        // In the OpenMP case parallel_get_num_threads() returns 1 here because it is called outside a parallel section.
+        // That is why it is not used to initialize _num_threads.
+        const auto _num_threads = num_iterations;
+#else
+        const auto _num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));
+#endif
+        std::function<void(const Window& window, const ThreadInfo& info)> main_run;
+        if (tensors.empty()) {
+            main_run = [&](const Window& window, const ThreadInfo& info) {
+                kernel->run(window, info);
+            };
+        } else {
+            main_run = [&](const Window& window, const ThreadInfo& info) {
+                kernel->run_op(tensors, window, info);
+            };
+        }
+
+        if (!kernel->is_parallelisable() || _num_threads == 1) {
+            ThreadInfo info;
+            info.cpu_info = &cpu_info();
+            main_run(max_window, info);
+        } else {
+            const auto num_windows = _num_threads;
+            const auto hints_split_dimension = hints.split_dimension();
+
+            ov::parallel_for(num_windows, [&](int wid) {
+                Window win = max_window.split_window(hints_split_dimension, wid, num_windows);
+                win.validate();
+                main_run(win, {wid, static_cast<int>(_num_threads), &cpu_info()});
+            });
+        }
+    }
 }
 
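The new split_dimensions_all branch factors the thread count across both window dimensions and hands every (mi, ni) sub-window to kernel->run_nd() together with a thread_locator window describing that thread's position in the grid; the max_parallelism check trims thread counts that exceed the iterations available in a dimension. The stand-alone sketch below illustrates the kind of factorization such a split performs; split_2d_sketch, its utilization heuristic, and the example numbers are illustrative assumptions, not ComputeLibrary's actual implementation:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <utility>

    // Hypothetical stand-in for scheduler_utils::split_2d: factor `max_threads`
    // into (m_threads, n_threads) so that the number of grid cells that map to
    // real iterations, std::min(m, m_threads) * std::min(n, n_threads), is as
    // close to max_threads as possible.
    static std::pair<unsigned, unsigned> split_2d_sketch(unsigned max_threads, std::size_t m, std::size_t n) {
        unsigned best_m = 1;
        unsigned best_n = max_threads;
        double best_util = 0.0;
        for (unsigned mt = 1; mt <= max_threads; ++mt) {
            if (max_threads % mt != 0) {
                continue;  // consider exact factorizations only
            }
            const unsigned nt = max_threads / mt;
            // Fraction of the mt x nt thread grid that lands on real iterations.
            const double util = (static_cast<double>(std::min<std::size_t>(m, mt)) / mt) *
                                (static_cast<double>(std::min<std::size_t>(n, nt)) / nt);
            if (util > best_util) {
                best_util = util;
                best_m = mt;
                best_n = nt;
            }
        }
        return {best_m, best_n};
    }

    int main() {
        // A 4 x 128 iteration space with 16 threads: splitting only DimX would
        // cap parallelism at 4 threads, whereas a 2D factorization keeps all 16
        // busy (this sketch settles on the first fully utilized grid, 1 x 16).
        const auto [mt, nt] = split_2d_sketch(16, 4, 128);
        std::printf("m_threads=%u n_threads=%u\n", mt, nt);
        return 0;
    }

This is also why the diff clamps m_threads and n_threads when max_parallelism < num_threads: if a dimension has fewer iterations than threads assigned to it, the surplus threads would otherwise be handed sub-windows with no work.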
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/thirdparty/ComputeLibrary
Submodule ComputeLibrary updated from c61bd3 to 614917
