Commit

init
alvoron committed Jan 24, 2025
1 parent d2ecd45 commit dbdca52
Showing 3 changed files with 68 additions and 30 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
@@ -64,7 +64,7 @@
 	url = https://github.com/google/snappy.git
 [submodule "ARMComputeLibrary"]
 	path = src/plugins/intel_cpu/thirdparty/ComputeLibrary
-	url = https://github.com/ARM-software/ComputeLibrary.git
+	url = https://review.mlplatform.org/ml/ComputeLibrary
 	ignore = dirty
 [submodule "src/plugins/intel_cpu/thirdparty/mlas"]
 	path = src/plugins/intel_cpu/thirdparty/mlas
94 changes: 66 additions & 28 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp
@@ -7,6 +7,8 @@
 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "src/runtime/SchedulerUtils.h"
+
 #include "openvino/core/parallel.hpp"
 
 namespace ov {
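For reference, the call site added in the next hunk relies on ComputeLibrary's internal scheduler helper pulled in by the new include. A declaration along the following lines is assumed (a sketch for orientation only; the authoritative signature lives in ComputeLibrary's src/runtime/SchedulerUtils.h):

    #include <cstddef>
    #include <utility>

    namespace arm_compute {
    namespace scheduler_utils {
    // Factor `max_threads` workers across a 2D iteration space of m x n
    // iterations, returning the per-dimension thread counts
    // (m_threads, n_threads).
    std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n);
    }  // namespace scheduler_utils
    }  // namespace arm_compute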
@@ -24,38 +26,74 @@ void ACLScheduler::set_num_threads(unsigned int num_threads) {}
 
 void ACLScheduler::schedule_custom(ICPPKernel* kernel, const Hints& hints, const Window& window, ITensorPack& tensors) {
     const Window& max_window = window;
-    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
-#if OV_THREAD == OV_THREAD_OMP
-    // In the OpenMP case parallel_get_num_threads() returns 1 here because it is called outside a parallel section.
-    // That is why it is not used to initialize _num_threads.
-    const auto _num_threads = num_iterations;
-#else
-    const auto _num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));
-#endif
-    std::function<void(const Window& window, const ThreadInfo& info)> main_run;
-    if (tensors.empty()) {
-        main_run = [&](const Window& window, const ThreadInfo& info) {
-            kernel->run(window, info);
-        };
-    } else {
-        main_run = [&](const Window& window, const ThreadInfo& info) {
-            kernel->run_op(tensors, window, info);
-        };
-    }
+    if (hints.split_dimension() == IScheduler::split_dimensions_all) {
+        const std::size_t m = max_window.num_iterations(Window::DimX);
+        const std::size_t n = max_window.num_iterations(Window::DimY);
+        const unsigned int num_iterations = m * n;
+#if OV_THREAD == OV_THREAD_OMP
+        // In the OpenMP case parallel_get_num_threads() returns 1 here because it is called outside a parallel section.
+        // That is why it is not used to initialize num_threads.
+        const auto num_threads = num_iterations;
+#else
+        const auto num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));
+#endif
 
-    if (!kernel->is_parallelisable() || _num_threads == 1) {
-        ThreadInfo info;
-        info.cpu_info = &cpu_info();
-        main_run(max_window, info);
-    } else {
-        const auto num_windows = _num_threads;
-        const auto hints_split_dimension = hints.split_dimension();
+        unsigned m_threads, n_threads;
+        std::tie(m_threads, n_threads) = scheduler_utils::split_2d(num_threads, m, n);
+        unsigned int max_parallelism = std::min<unsigned int>(m, m_threads) * std::min<unsigned int>(n, n_threads);
+        if (max_parallelism < num_threads)
+        {
+            m_threads = std::min<unsigned int>(m, m_threads);
+            n_threads = std::min<unsigned int>(n, n_threads);
+        }
 
-        ov::parallel_for(num_windows, [&](int wid) {
-            Window win = max_window.split_window(hints_split_dimension, wid, num_windows);
+        ov::parallel_for2d(m_threads, n_threads, [&](int mi, int ni) {
+            Window win = max_window.split_window(Window::DimX, mi, m_threads)
+                             .split_window(Window::DimY, ni, n_threads);
             win.validate();
-            main_run(win, {wid, static_cast<int>(_num_threads), &cpu_info()});
+            Window thread_locator;
+            thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+            thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+            thread_locator.validate();
+
+            ThreadInfo info;
+            info.cpu_info = &cpu_info();
+            kernel->run_nd(win, info, thread_locator);
         });
+    } else {
+        const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+#if OV_THREAD == OV_THREAD_OMP
+        // In the OpenMP case parallel_get_num_threads() returns 1 here because it is called outside a parallel section.
+        // That is why it is not used to initialize _num_threads.
+        const auto _num_threads = num_iterations;
+#else
+        const auto _num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));
+#endif
+        std::function<void(const Window& window, const ThreadInfo& info)> main_run;
+        if (tensors.empty()) {
+            main_run = [&](const Window& window, const ThreadInfo& info) {
+                kernel->run(window, info);
+            };
+        } else {
+            main_run = [&](const Window& window, const ThreadInfo& info) {
+                kernel->run_op(tensors, window, info);
+            };
+        }
+
+        if (!kernel->is_parallelisable() || _num_threads == 1) {
+            ThreadInfo info;
+            info.cpu_info = &cpu_info();
+            main_run(max_window, info);
+        } else {
+            const auto num_windows = _num_threads;
+            const auto hints_split_dimension = hints.split_dimension();
+
+            ov::parallel_for(num_windows, [&](int wid) {
+                Window win = max_window.split_window(hints_split_dimension, wid, num_windows);
+                win.validate();
+                main_run(win, {wid, static_cast<int>(_num_threads), &cpu_info()});
+            });
+        }
+    }
 }
 
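The new split_dimensions_all branch factors the thread count across both window dimensions and hands every (mi, ni) sub-window to kernel->run_nd() together with a thread_locator window describing that thread's position in the grid; the max_parallelism check trims thread counts that exceed the iterations available in a dimension. The stand-alone sketch below illustrates the kind of factorization such a split performs; split_2d_sketch, its utilization heuristic, and the example numbers are illustrative assumptions, not ComputeLibrary's actual implementation:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <utility>

    // Hypothetical stand-in for scheduler_utils::split_2d: factor `max_threads`
    // into (m_threads, n_threads) so that the number of grid cells that map to
    // real iterations, std::min(m, m_threads) * std::min(n, n_threads), is as
    // close to max_threads as possible.
    static std::pair<unsigned, unsigned> split_2d_sketch(unsigned max_threads, std::size_t m, std::size_t n) {
        unsigned best_m = 1;
        unsigned best_n = max_threads;
        double best_util = 0.0;
        for (unsigned mt = 1; mt <= max_threads; ++mt) {
            if (max_threads % mt != 0) {
                continue;  // consider exact factorizations only
            }
            const unsigned nt = max_threads / mt;
            // Fraction of the mt x nt thread grid that lands on real iterations.
            const double util = (static_cast<double>(std::min<std::size_t>(m, mt)) / mt) *
                                (static_cast<double>(std::min<std::size_t>(n, nt)) / nt);
            if (util > best_util) {
                best_util = util;
                best_m = mt;
                best_n = nt;
            }
        }
        return {best_m, best_n};
    }

    int main() {
        // A 4 x 128 iteration space with 16 threads: splitting only DimX would
        // cap parallelism at 4 threads, whereas a 2D factorization keeps all 16
        // busy (this sketch settles on the first fully utilized grid, 1 x 16).
        const auto [mt, nt] = split_2d_sketch(16, 4, 128);
        std::printf("m_threads=%u n_threads=%u\n", mt, nt);
        return 0;
    }

This is also why the diff clamps m_threads and n_threads when max_parallelism < num_threads: if a dimension has fewer iterations than threads assigned to it, the surplus threads would otherwise be handed sub-windows with no work.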
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/thirdparty/ComputeLibrary
Submodule ComputeLibrary updated from c61bd3 to 614917
