[CPU][ARM] KleidiAI integration and KleidiAI MM executor #28830

Merged · 14 commits · Feb 17, 2025
3 changes: 3 additions & 0 deletions .gitmodules
@@ -87,3 +87,6 @@
[submodule "src/plugins/intel_cpu/thirdparty/shl"]
path = src/plugins/intel_cpu/thirdparty/shl
url = https://github.com/openvinotoolkit/shl.git
[submodule "src/plugins/intel_cpu/thirdparty/kleidiai"]
path = src/plugins/intel_cpu/thirdparty/kleidiai
url = https://git.gitlab.arm.com/kleidi/kleidiai.git
30 changes: 30 additions & 0 deletions src/plugins/intel_cpu/CMakeLists.txt
@@ -8,6 +8,15 @@ endif()

set(TARGET_NAME "openvino_intel_cpu_plugin")

#TODO: Patch KleidiAI CMakeLists to remove absolute include path
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/kleidiai_include_directories_patch.diff
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/kleidiai
RESULT_VARIABLE patch_result
OUTPUT_VARIABLE patch_output
ERROR_VARIABLE patch_error
)

if((CMAKE_COMPILER_IS_GNUCXX OR OV_COMPILER_IS_CLANG) AND CMAKE_CXX_STANDARD GREATER_EQUAL 20)
set(CMAKE_CXX_FLAGS "-Wno-error=deprecated ${CMAKE_CXX_FLAGS}")
endif()
@@ -150,6 +159,13 @@ else()
endif()
ov_dependent_option(ENABLE_SHL_FOR_CPU "Enable SHL for OpenVINO CPU Plugin" ${ENABLE_SHL_FOR_CPU_DEFAULT} "RISCV64" OFF)

if(ARM OR AARCH64)
set(ENABLE_KLEIDIAI_FOR_CPU_DEFAULT ON)
else()
set(ENABLE_KLEIDIAI_FOR_CPU_DEFAULT OFF)
endif()
ov_dependent_option(ENABLE_KLEIDIAI_FOR_CPU "Enable KleidiAI for OpenVINO CPU Plugin" ${ENABLE_KLEIDIAI_FOR_CPU_DEFAULT} "ARM OR AARCH64" OFF)

add_subdirectory(thirdparty)

if(WIN32)
@@ -175,6 +191,11 @@ if(DNNL_USE_ACL)
set(OV_CPU_WITH_ACL ON)
endif()

if(ENABLE_KLEIDIAI_FOR_CPU)
add_definitions(-DOV_CPU_WITH_KLEIDIAI)
set(OV_CPU_WITH_KLEIDIAI ON)
endif()
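For context, this compile definition is what the C++ sources key off; a minimal sketch of the consuming side, mirroring the guards that appear later in this diff:

// Sketch: KleidiAI code paths compile in only when ENABLE_KLEIDIAI_FOR_CPU is ON,
// which defines OV_CPU_WITH_KLEIDIAI for the plugin's translation units.
#if defined(OV_CPU_WITH_KLEIDIAI)
#    include "nodes/executors/kleidiai/kleidiai_mm.hpp"
#endif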

if (ENABLE_SHL_FOR_CPU)
add_definitions(-DOV_CPU_WITH_SHL)
set(OV_CPU_WITH_SHL ON)
@@ -210,6 +231,7 @@ if(NOT (AARCH64 OR ARM))
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/kleidiai/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/aarch64/*)
endif()

@@ -269,6 +291,10 @@ endif ()
if(ENABLE_SHL_FOR_CPU)
target_link_libraries(${TARGET_NAME} PRIVATE shl)
endif()
if(ENABLE_KLEIDIAI_FOR_CPU)
target_link_libraries(${TARGET_NAME} PRIVATE kleidiai)
endif()

target_include_directories(${TARGET_NAME} SYSTEM PRIVATE $<TARGET_PROPERTY:dnnl,INCLUDE_DIRECTORIES>)

# Temporary solution to use template reference implementations in cases where optimized implementation
@@ -372,6 +398,10 @@ if(BUILD_SHARED_LIBS)
target_include_directories(${TARGET_NAME}_obj SYSTEM PUBLIC $<TARGET_PROPERTY:shl,INTERFACE_INCLUDE_DIRECTORIES>)
endif()

if(ENABLE_KLEIDIAI_FOR_CPU)
target_include_directories(${TARGET_NAME}_obj SYSTEM PUBLIC $<TARGET_PROPERTY:kleidiai,INTERFACE_INCLUDE_DIRECTORIES>)
endif()

ov_set_threading_interface_for(${TARGET_NAME}_obj)

target_compile_definitions(${TARGET_NAME}_obj PRIVATE USE_STATIC_IE)
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/node.cpp
@@ -638,6 +638,7 @@ std::string Node::getPrimitiveDescriptorType() const {
SEARCH_TYPE(sparse);
SEARCH_TYPE(acl);
SEARCH_TYPE(shl);
SEARCH_TYPE(kleidiai);
SEARCH_TYPE(_dw);
SEARCH_TYPE(_1x1);

@@ -1331,6 +1332,7 @@ const std::vector<impl_desc_type>& Node::getDefaultImplPriority() {
#endif
impl_desc_type::gemm_any, impl_desc_type::gemm_blas, impl_desc_type::gemm_avx512, impl_desc_type::gemm_avx2,
impl_desc_type::gemm_avx, impl_desc_type::gemm_sse42, impl_desc_type::gemm_acl, impl_desc_type::acl,
impl_desc_type::gemm_kleidiai, impl_desc_type::kleidiai,
impl_desc_type::jit_gemm, impl_desc_type::ref_any, impl_desc_type::ref,
};
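The two new entries rank KleidiAI below the ACL gemm variants but ahead of jit_gemm and the reference implementations, so on ARM a KleidiAI primitive is preferred over ref while still yielding to ACL. Illustratively (a hypothetical sketch, not the plugin's actual selection code), a priority list like this is consulted by taking the first entry the node supports:

#include <functional>
#include <vector>

// Hypothetical selector: return the first priority entry the node can implement.
template <typename ImplType>
ImplType chooseImpl(const std::vector<ImplType>& priorities,
                    const std::function<bool(ImplType)>& nodeSupports,
                    ImplType fallback) {
    for (auto type : priorities) {
        if (nodeSupports(type))
            return type;  // e.g. gemm_kleidiai is reached before ref_any on ARM
    }
    return fallback;
}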

2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/executor.cpp
@@ -23,6 +23,7 @@ std::string ExecutorTypeToString(const ExecutorType type) {
CASE(Mlas);
CASE(jit_aarch64);
CASE(Shl);
CASE(Kleidiai);
}
#undef CASE
return "Undefined";
@@ -42,6 +43,7 @@ ExecutorType ExecutorTypeFromString(const std::string& typeStr) {
CASE(Mlas);
CASE(jit_aarch64);
CASE(Shl);
CASE(Kleidiai);
#undef CASE
return ExecutorType::Undefined;
}
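With the new CASE entries in both helpers, the executor name round-trips between its enum and string forms; a quick sanity check, assuming the two functions above are in scope:

#include <cassert>

// Assumes ExecutorTypeToString / ExecutorTypeFromString from above are visible.
void checkKleidiaiRoundTrip() {
    assert(ExecutorTypeFromString("Kleidiai") == ExecutorType::Kleidiai);
    assert(ExecutorTypeToString(ExecutorType::Kleidiai) == "Kleidiai");
}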
8 changes: 7 additions & 1 deletion src/plugins/intel_cpu/src/nodes/executors/executor.hpp
@@ -52,6 +52,12 @@ namespace intel_cpu {
# define OV_CPU_INSTANCE_DNNL(...)
#endif

#if defined(OV_CPU_WITH_KLEIDIAI)
# define OV_CPU_INSTANCE_KLEIDIAI(...) {__VA_ARGS__},
#else
# define OV_CPU_INSTANCE_KLEIDIAI(...)
#endif
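Like the other OV_CPU_INSTANCE_* helpers around it, the macro turns its arguments into one brace-initialized element of an implementation list when the backend is compiled in, and into nothing otherwise. A hypothetical illustration of the expansion (Entry is a stand-in type, not plugin code):

#include <vector>

struct Entry { const char* name; int priority; };

// With OV_CPU_WITH_KLEIDIAI defined, the macro line expands to {"kleidiai", 1},
// contributing an element; without it, the line vanishes and only "fallback" remains.
static const std::vector<Entry> entries = {
    OV_CPU_INSTANCE_KLEIDIAI("kleidiai", 1)
    Entry{"fallback", 0},
};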

#if defined(OPENVINO_ARCH_X86_64)
# define OV_CPU_INSTANCE_X64(...) {__VA_ARGS__},
#else
@@ -75,7 +81,7 @@ namespace intel_cpu {
// @todo another option is to determine shape relation by executor type
enum class ShapeTolerance { Agnostic, Dependant };

enum class ExecutorType { Undefined, Graph, Common, jit_x64, Dnnl, Acl, Mlas, jit_aarch64, Shl };
enum class ExecutorType { Undefined, Graph, Common, jit_x64, Dnnl, Acl, Mlas, jit_aarch64, Shl, Kleidiai };

enum class OperationType { FullyConnected, MatMul, Convolution };

@@ -30,6 +30,11 @@
#include "utils/cpp/maybe_unused.hpp"
#include "utils/debug_capabilities.h"

#if defined(OV_CPU_WITH_KLEIDIAI) && defined(OPENVINO_ARCH_ARM64)
// kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h, pulled in via kleidiai_mm.hpp, supports aarch64 only
#    include "nodes/executors/kleidiai/kleidiai_mm.hpp"
#endif

#if defined(OV_CPU_WITH_ACL)
# include "nodes/executors/acl/acl_fullyconnected.hpp"
# include "nodes/executors/acl/acl_lowp_fullyconnected.hpp"
@@ -225,6 +230,35 @@ OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noPostOps(const FCConfig& config
template <>
const std::vector<ExecutorImplementation<FCAttrs>>& getImplementations() {
static const std::vector<ExecutorImplementation<FCAttrs>> fullyconnectedImplementations {
#if defined(OPENVINO_ARCH_ARM64)
OV_CPU_INSTANCE_KLEIDIAI(
"fullyconnected_kleidiai",
ExecutorType::Kleidiai,
OperationType::MatMul,
ShapeTolerance::Agnostic,
// supports
[](const FCConfig& config) -> bool {
VERIFY(noPostOps(config), UNSUPPORTED_POST_OPS);
VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS);
VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION);
VERIFY(everyone_is(f32, srcType(config), weiType(config), dstType(config)), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(srcRank(config) == 2U, UNSUPPORTED_SRC_RANK);
VERIFY(weiRank(config) == 2U, UNSUPPORTED_WEI_RANK);
return MatMulKleidiAIExecutor::supports(config);
},
// requiresFallback
[](const FCConfig& config) -> ov::optional<executor::Config<FCAttrs>> {
return {};
},
// acceptsShapes
[](const MemoryArgs& memory) -> bool {
return true;
},
// create
[](const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, ExecutorContext::CPtr context) {
return std::make_shared<MatMulKleidiAIExecutor>(attrs, postOps, memory, context);
})
#endif
OV_CPU_INSTANCE_MLAS_X64(
"fullyconnected_mlas",
ExecutorType::Mlas,
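The four lambdas in the KleidiAI entry above follow the contract shared by all backends: supports runs the static checks (f32 src/wei/dst, rank-2 tensors, no post-ops, sparse weights, or weight decompression, plus MatMulKleidiAIExecutor::supports), requiresFallback never requests a reconfiguration, acceptsShapes admits any shapes, and create constructs the executor. A simplified, hypothetical sketch of how a selector might walk these callbacks (types and signatures are stand-ins, not the plugin's):

#include <functional>
#include <memory>
#include <optional>

struct Config {};
struct Memory {};
struct Executor {};

struct Impl {
    std::function<bool(const Config&)> supports;
    std::function<std::optional<Config>(const Config&)> requiresFallback;
    std::function<bool(const Memory&)> acceptsShapes;
    std::function<std::shared_ptr<Executor>(const Config&, const Memory&)> create;
};

// Returns an executor if this implementation can serve the request as-is.
std::shared_ptr<Executor> trySelect(const Impl& impl, const Config& cfg, const Memory& mem) {
    if (!impl.supports(cfg))
        return nullptr;  // static checks failed (precision, rank, post-ops, ...)
    if (impl.requiresFallback(cfg))
        return nullptr;  // would need a config rewrite; the KleidiAI entry never does
    if (!impl.acceptsShapes(mem))
        return nullptr;  // shapes rejected at runtime; the KleidiAI entry accepts all
    return impl.create(cfg, mem);
}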
169 changes: 169 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/kleidiai/kleidiai_mm.cpp
@@ -0,0 +1,169 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "kleidiai_mm.hpp"

#include <cstdint>
#include <functional>
#include <memory>
#include <numeric>

#include "cpu_memory.h"
#include "memory_desc/cpu_blocked_memory_desc.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/fullyconnected_config.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
#include "utils/cpu_utils.hpp"

#include "openvino/core/parallel.hpp"

#define FLOAT_MAX 3.4028235e38f
#define FLOAT_MIN -3.4028235e38f

namespace ov {
namespace intel_cpu {

using namespace executor;
using namespace ov::element;

template <typename T>
static std::vector<T> normalizeDimsTo2D(const std::vector<T>& dims) {
return {std::accumulate(dims.begin(), dims.end() - 1, (T)1, std::multiplies<T>()), dims[dims.size() - 1]};
}
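normalizeDimsTo2D folds every leading dimension into the row count so that higher-rank activations can feed the rank-2 kernel. For example (assuming the helper above is visible):

#include <cstddef>
#include <vector>

// {2, 3, 4} collapses to {6, 4}: 2 * 3 batch rows, 4 columns.
static const std::vector<size_t> dims3d = normalizeDimsTo2D<size_t>({2, 3, 4});
// An already rank-2 shape is unchanged: {5, 16} stays {5, 16}.
static const std::vector<size_t> dims2d = normalizeDimsTo2D<size_t>({5, 16});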

bool MatMulKleidiAIExecutor::supports(const FCConfig& config) {
return true;
}

MatMulKleidiAIExecutor::MatMulKleidiAIExecutor(const FCAttrs& attrs,
const PostOps& postOps,
const MemoryArgs& memory,
const ExecutorContext::CPtr& context)
: m_attrs(attrs),
m_memoryArgs(memory) {
auto srcMem = memory.at(ARG_SRC);
auto weiMem = memory.at(ARG_WEI);
auto weiDims = weiMem->getDesc().getShape().getDims();
auto N = weiDims[0];
auto K = weiDims[1];

if (memory.at(ARG_BIAS)->getDataAs<float>() == nullptr) {
auto biasDesc = std::make_shared<CpuBlockedMemoryDesc>(f32, Shape({N}));
biasMem = std::make_shared<Memory>(context->getEngine(), biasDesc);
biasMem->nullify();
} else {
biasMem = memory.at(ARG_BIAS);
}
if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) {
aclfcAttrs.isConvertedWeights = true;
}
auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr();
const auto& wgtDims = originalWeightsDesc->getShape().getStaticDims();
const VectorDims wgtDims2D = reshapeDownToRank<2>(wgtDims);
originalWeightsDesc = std::make_shared<CpuBlockedMemoryDesc>(originalWeightsDesc->getPrecision(), Shape{wgtDims2D});
auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc);
auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(memory.at(ARG_SRC)->getDescPtr()->getPrecision());
auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc);
if (!attrs.weightsNonTransposed) {
dnnlDstDesc = acl_fc_executor::makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc);
aclfcAttrs.isWeightsRepacked = true;
}
packedWeights = acl_fc_executor::reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
const size_t rhsPackedSize = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K);
auto rhsPackedDesc = std::make_shared<CpuBlockedMemoryDesc>(f32, Shape({rhsPackedSize}));
rhsPackedMem = std::make_shared<Memory>(context->getEngine(), rhsPackedDesc);

float* bias = biasMem->getDataAs<float>();
float* rhs_packed = static_cast<float*>(rhsPackedMem->getData());
float* rhs = static_cast<float*>(packedWeights->getData());
const size_t rhs_stride = N * sizeof(float);

const size_t nr = ukernel.get_nr();
const size_t kr = ukernel.get_kr();
const size_t sr = ukernel.get_sr();

kai_run_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(
1, N, K, nr, kr, sr, // Packing arguments
rhs_stride, // RHS stride
rhs, // RHS
bias, // Bias
nullptr, // Scale
rhs_packed, // RHS packed
0, nullptr);
}
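The constructor deliberately front-loads the expensive work: the weights are reordered (and transposed when weightsNonTransposed is false), the packed buffer is sized by the library, and the bias is folded into the packed RHS. execute() then only runs the matmul kernel against rhsPackedMem, so repeated inferences pay no repacking cost. A comment-level schematic of the two phases, restating the calls used in this file:

// Phase 1 -- constructor, once per weight tensor:
//   size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K);
//   kai_run_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(1, N, K, nr, kr, sr,
//       N * sizeof(float), rhs, bias, nullptr, rhs_packed, 0, nullptr);
// Phase 2 -- execute(), every inference:
//   ukernel.run_matmul(M, n_block_size, K, lhs, lhs_stride, rhs_ptr,
//       dst_ptr, dst_stride_row, dst_stride_col, FLOAT_MIN, FLOAT_MAX);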

bool MatMulKleidiAIExecutor::update(const MemoryArgs& memory) {
const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr();
const auto& dstDesc = memory.at(ARG_DST)->getDescPtr();
const auto& wgtDims = weiDesc->getShape().getStaticDims();
// Weights are transposed by MatMulConstTransposesExtraction:
// K is the input-channel (IC) dimension of the weights;
// the weights are reshaped to [-1, K] in ConvertMatMulToFC.
K = wgtDims[1];
N = wgtDims[0];

const auto& outDims = dstDesc->getShape().getStaticDims();
if (outDims.size() > 2) {
M = std::accumulate(outDims.begin(), outDims.end() - 1, 1, std::multiplies<size_t>());
} else {
M = outDims[0];
}
return true;
}

void MatMulKleidiAIExecutor::execute(const MemoryArgs& memory) {
size_t BLOCK_SIZE = 8;
auto srcMem = memory.at(ARG_SRC);
auto weiMem = memory.at(ARG_WEI);
auto dstMem = memory.at(ARG_DST);
auto srcDims = normalizeDimsTo2D(srcMem->getDesc().getShape().getDims());
auto weiDims = weiMem->getDesc().getShape().getDims();
auto M = srcDims[0];
auto K = srcDims[1];
auto N = weiDims[0];

const size_t lhs_stride = K * sizeof(float);
const size_t dst_stride_row = N * sizeof(float);
const size_t dst_stride_col = sizeof(float);

float* lhs = srcMem->getDataAs<float>();
float* rhs_packed = static_cast<float*>(rhsPackedMem->getData());
float* dst = dstMem->getDataAs<float>();

size_t n_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
parallel_for(n_blocks, [&](size_t n_block) {
size_t n_start = (n_block * BLOCK_SIZE);
size_t n_end = std::min(n_start + BLOCK_SIZE, N);
size_t n_block_size = n_end - n_start;
const size_t rhs_packed_offset = ukernel.get_rhs_packed_offset(n_start, K);
const size_t dst_offset = ukernel.get_dst_offset(0, n_start, dst_stride_row);
const float* rhs_ptr = (rhs_packed + rhs_packed_offset / sizeof(float));
float* dst_ptr = (dst + dst_offset / (sizeof(float)));
ukernel.run_matmul(
M,
n_block_size,
K,
lhs,
lhs_stride,
rhs_ptr,
dst_ptr,
dst_stride_row,
dst_stride_col,
FLOAT_MIN,
FLOAT_MAX);
});
}
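Parallelism in execute() is over column blocks of the packed RHS: N is split into ceil(N / BLOCK_SIZE) chunks, and each task computes all M rows of its chunk using block-local offsets queried from the ukernel. For instance, N = 20 with BLOCK_SIZE = 8 yields three blocks: [0, 8), [8, 16), [16, 20). A standalone sketch of the same blocking arithmetic:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t N = 20, BLOCK_SIZE = 8;
    const size_t n_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil(20 / 8) == 3
    for (size_t b = 0; b < n_blocks; ++b) {
        const size_t n_start = b * BLOCK_SIZE;
        const size_t n_end = std::min(n_start + BLOCK_SIZE, N);  // last block may be short
        std::printf("block %zu: [%zu, %zu)\n", b, n_start, n_end);
    }
    return 0;
}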

void MatMulKleidiAIExecutor::moveMemToNumaNode(int numaNodeID) {
if (curNumaNode == numaNodeID)
return;
curNumaNode = numaNodeID;
mbind_move(packedWeights, numaNodeID);
if (m_attrs.withBias) {
mbind_move(m_memoryArgs.at(ARG_BIAS), numaNodeID);
}
}

} // namespace intel_cpu
} // namespace ov