[CPU][ARM] KleidiAI integration and KleidiAI MM executor #28830

Merged · 14 commits · Feb 17, 2025
3 changes: 3 additions & 0 deletions .gitmodules
@@ -87,3 +87,6 @@
[submodule "src/plugins/intel_cpu/thirdparty/shl"]
path = src/plugins/intel_cpu/thirdparty/shl
url = https://github.com/openvinotoolkit/shl.git
[submodule "src/plugins/intel_cpu/thirdparty/kleidiai"]
path = src/plugins/intel_cpu/thirdparty/kleidiai
url = https://git.gitlab.arm.com/kleidi/kleidiai.git
30 changes: 30 additions & 0 deletions src/plugins/intel_cpu/CMakeLists.txt
@@ -8,6 +8,15 @@ endif()

set(TARGET_NAME "openvino_intel_cpu_plugin")

#TODO: Patch KleidiAI CMakeLists to remove absolute include path
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/kleidiai_include_directories_patch.diff
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/kleidiai
RESULT_VARIABLE patch_result
OUTPUT_VARIABLE patch_output
ERROR_VARIABLE patch_error
)

if((CMAKE_COMPILER_IS_GNUCXX OR OV_COMPILER_IS_CLANG) AND CMAKE_CXX_STANDARD GREATER_EQUAL 20)
set(CMAKE_CXX_FLAGS "-Wno-error=deprecated ${CMAKE_CXX_FLAGS}")
endif()
@@ -150,6 +159,13 @@ else()
endif()
ov_dependent_option(ENABLE_SHL_FOR_CPU "Enable SHL for OpenVINO CPU Plugin" ${ENABLE_SHL_FOR_CPU_DEFAULT} "RISCV64" OFF)

if(ARM OR AARCH64)
set(ENABLE_KLEIDIAI_FOR_CPU_DEFAULT ON)
else()
set(ENABLE_KLEIDIAI_FOR_CPU_DEFAULT OFF)
endif()
ov_dependent_option(ENABLE_KLEIDIAI_FOR_CPU "Enable KleidiAI for OpenVINO CPU Plugin" ${ENABLE_KLEIDIAI_FOR_CPU_DEFAULT} "ARM OR AARCH64" OFF)

add_subdirectory(thirdparty)

if(WIN32)
@@ -175,6 +191,11 @@ if(DNNL_USE_ACL)
set(OV_CPU_WITH_ACL ON)
endif()

if(ENABLE_KLEIDIAI_FOR_CPU)
add_definitions(-DOV_CPU_WITH_KLEIDIAI)
set(OV_CPU_WITH_KLEIDIAI ON)
endif()
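For context, this compile definition is what the C++ sources key off; a minimal sketch of the consuming side, mirroring the guards that appear later in this diff:

// Sketch: KleidiAI code paths compile in only when ENABLE_KLEIDIAI_FOR_CPU is ON,
// which defines OV_CPU_WITH_KLEIDIAI for the plugin's translation units.
#if defined(OV_CPU_WITH_KLEIDIAI)
#    include "nodes/executors/kleidiai/kleidiai_mm.hpp"
#endif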

if (ENABLE_SHL_FOR_CPU)
add_definitions(-DOV_CPU_WITH_SHL)
set(OV_CPU_WITH_SHL ON)
@@ -210,6 +231,7 @@ if(NOT (AARCH64 OR ARM))
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*
${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/aarch64/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/kleidiai/*
${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/aarch64/*)
endif()

@@ -269,6 +291,10 @@ endif ()
if(ENABLE_SHL_FOR_CPU)
target_link_libraries(${TARGET_NAME} PRIVATE shl)
endif()
if(ENABLE_KLEIDIAI_FOR_CPU)
target_link_libraries(${TARGET_NAME} PRIVATE kleidiai)
endif()

target_include_directories(${TARGET_NAME} SYSTEM PRIVATE $<TARGET_PROPERTY:dnnl,INCLUDE_DIRECTORIES>)

# Temporary solution to use template reference implementations in cases where optimized implementation
@@ -372,6 +398,10 @@ if(BUILD_SHARED_LIBS)
target_include_directories(${TARGET_NAME}_obj SYSTEM PUBLIC $<TARGET_PROPERTY:shl,INTERFACE_INCLUDE_DIRECTORIES>)
endif()

if(ENABLE_KLEIDIAI_FOR_CPU)
target_include_directories(${TARGET_NAME}_obj SYSTEM PUBLIC $<TARGET_PROPERTY:kleidiai,INTERFACE_INCLUDE_DIRECTORIES>)
endif()

ov_set_threading_interface_for(${TARGET_NAME}_obj)

target_compile_definitions(${TARGET_NAME}_obj PRIVATE USE_STATIC_IE)
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/node.cpp
@@ -638,6 +638,7 @@ std::string Node::getPrimitiveDescriptorType() const {
SEARCH_TYPE(sparse);
SEARCH_TYPE(acl);
SEARCH_TYPE(shl);
SEARCH_TYPE(kleidiai);
SEARCH_TYPE(_dw);
SEARCH_TYPE(_1x1);

@@ -1331,6 +1332,7 @@ const std::vector<impl_desc_type>& Node::getDefaultImplPriority() {
#endif
impl_desc_type::gemm_any, impl_desc_type::gemm_blas, impl_desc_type::gemm_avx512, impl_desc_type::gemm_avx2,
impl_desc_type::gemm_avx, impl_desc_type::gemm_sse42, impl_desc_type::gemm_acl, impl_desc_type::acl,
impl_desc_type::gemm_kleidiai, impl_desc_type::kleidiai,
impl_desc_type::jit_gemm, impl_desc_type::ref_any, impl_desc_type::ref,
};
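The two new entries rank KleidiAI below the ACL gemm variants but ahead of jit_gemm and the reference implementations, so on ARM a KleidiAI primitive is preferred over ref while still yielding to ACL. Illustratively (a hypothetical sketch, not the plugin's actual selection code), a priority list like this is consulted by taking the first entry the node supports:

#include <functional>
#include <vector>

// Hypothetical selector: return the first priority entry the node can implement.
template <typename ImplType>
ImplType chooseImpl(const std::vector<ImplType>& priorities,
                    const std::function<bool(ImplType)>& nodeSupports,
                    ImplType fallback) {
    for (auto type : priorities) {
        if (nodeSupports(type))
            return type;  // e.g. gemm_kleidiai is reached before ref_any on ARM
    }
    return fallback;
}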

2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/executor.cpp
@@ -23,6 +23,7 @@ std::string ExecutorTypeToString(const ExecutorType type) {
CASE(Mlas);
CASE(jit_aarch64);
CASE(Shl);
CASE(Kleidiai);
}
#undef CASE
return "Undefined";
@@ -42,6 +43,7 @@ ExecutorType ExecutorTypeFromString(const std::string& typeStr) {
CASE(Mlas);
CASE(jit_aarch64);
CASE(Shl);
CASE(Kleidiai);
#undef CASE
return ExecutorType::Undefined;
}
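With the new CASE entries in both helpers, the executor name round-trips between its enum and string forms; a quick sanity check, assuming the two functions above are in scope:

#include <cassert>

// Assumes ExecutorTypeToString / ExecutorTypeFromString from above are visible.
void checkKleidiaiRoundTrip() {
    assert(ExecutorTypeFromString("Kleidiai") == ExecutorType::Kleidiai);
    assert(ExecutorTypeToString(ExecutorType::Kleidiai) == "Kleidiai");
}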
8 changes: 7 additions & 1 deletion src/plugins/intel_cpu/src/nodes/executors/executor.hpp
@@ -52,6 +52,12 @@ namespace intel_cpu {
# define OV_CPU_INSTANCE_DNNL(...)
#endif

#if defined(OV_CPU_WITH_KLEIDIAI)
# define OV_CPU_INSTANCE_KLEIDIAI(...) {__VA_ARGS__},
#else
# define OV_CPU_INSTANCE_KLEIDIAI(...)
#endif
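Like the other OV_CPU_INSTANCE_* helpers around it, the macro turns its arguments into one brace-initialized element of an implementation list when the backend is compiled in, and into nothing otherwise. A hypothetical illustration of the expansion (Entry is a stand-in type, not plugin code):

#include <vector>

struct Entry { const char* name; int priority; };

// With OV_CPU_WITH_KLEIDIAI defined, the macro line expands to {"kleidiai", 1},
// contributing an element; without it, the line vanishes and only "fallback" remains.
static const std::vector<Entry> entries = {
    OV_CPU_INSTANCE_KLEIDIAI("kleidiai", 1)
    Entry{"fallback", 0},
};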

#if defined(OPENVINO_ARCH_X86_64)
# define OV_CPU_INSTANCE_X64(...) {__VA_ARGS__},
#else
@@ -75,7 +81,7 @@ namespace intel_cpu {
// @todo another option is to determine shape relation by executor type
enum class ShapeTolerance { Agnostic, Dependant };

enum class ExecutorType { Undefined, Graph, Common, jit_x64, Dnnl, Acl, Mlas, jit_aarch64, Shl };
enum class ExecutorType { Undefined, Graph, Common, jit_x64, Dnnl, Acl, Mlas, jit_aarch64, Shl, Kleidiai };

enum class OperationType { FullyConnected, MatMul, Convolution };

@@ -30,6 +30,11 @@
#include "utils/cpp/maybe_unused.hpp"
#include "utils/debug_capabilities.h"

#if defined(OV_CPU_WITH_KLEIDIAI) && defined(OPENVINO_ARCH_ARM64)
// kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h, pulled in via kleidiai_mm.hpp, supports aarch64 only
#    include "nodes/executors/kleidiai/kleidiai_mm.hpp"
#endif

#if defined(OV_CPU_WITH_ACL)
# include "nodes/executors/acl/acl_fullyconnected.hpp"
# include "nodes/executors/acl/acl_lowp_fullyconnected.hpp"
@@ -225,6 +230,35 @@ OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noPostOps(const FCConfig& config
template <>
const std::vector<ExecutorImplementation<FCAttrs>>& getImplementations() {
static const std::vector<ExecutorImplementation<FCAttrs>> fullyconnectedImplementations {
#if defined(OPENVINO_ARCH_ARM64)
OV_CPU_INSTANCE_KLEIDIAI(
"fullyconnected_kleidiai",
ExecutorType::Kleidiai,
OperationType::MatMul,
ShapeTolerance::Agnostic,
// supports
[](const FCConfig& config) -> bool {
VERIFY(noPostOps(config), UNSUPPORTED_POST_OPS);
VERIFY(noSparseDecompression(config), UNSUPPORTED_SPARSE_WEIGHTS);
VERIFY(noWeightsDecompression(config), UNSUPPORTED_WEIGHTS_DECOMPRESSION);
VERIFY(everyone_is(f32, srcType(config), weiType(config), dstType(config)), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(srcRank(config) == 2U, UNSUPPORTED_SRC_RANK);
VERIFY(weiRank(config) == 2U, UNSUPPORTED_WEI_RANK);
return MatMulKleidiAIExecutor::supports(config);
},
// requiresFallback
[](const FCConfig& config) -> ov::optional<executor::Config<FCAttrs>> {
return {};
},
// acceptsShapes
[](const MemoryArgs& memory) -> bool {
return true;
},
// create
[](const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, ExecutorContext::CPtr context) {
return std::make_shared<MatMulKleidiAIExecutor>(attrs, postOps, memory, context);
})
#endif
OV_CPU_INSTANCE_MLAS_X64(
"fullyconnected_mlas",
ExecutorType::Mlas,
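The four lambdas in the KleidiAI entry above follow the contract shared by all backends: supports runs the static checks (f32 src/wei/dst, rank-2 tensors, no post-ops, sparse weights, or weight decompression, plus MatMulKleidiAIExecutor::supports), requiresFallback never requests a reconfiguration, acceptsShapes admits any shapes, and create constructs the executor. A simplified, hypothetical sketch of how a selector might walk these callbacks (types and signatures are stand-ins, not the plugin's):

#include <functional>
#include <memory>
#include <optional>

struct Config {};
struct Memory {};
struct Executor {};

struct Impl {
    std::function<bool(const Config&)> supports;
    std::function<std::optional<Config>(const Config&)> requiresFallback;
    std::function<bool(const Memory&)> acceptsShapes;
    std::function<std::shared_ptr<Executor>(const Config&, const Memory&)> create;
};

// Returns an executor if this implementation can serve the request as-is.
std::shared_ptr<Executor> trySelect(const Impl& impl, const Config& cfg, const Memory& mem) {
    if (!impl.supports(cfg))
        return nullptr;  // static checks failed (precision, rank, post-ops, ...)
    if (impl.requiresFallback(cfg))
        return nullptr;  // would need a config rewrite; the KleidiAI entry never does
    if (!impl.acceptsShapes(mem))
        return nullptr;  // shapes rejected at runtime; the KleidiAI entry accepts all
    return impl.create(cfg, mem);
}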
169 changes: 169 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/kleidiai/kleidiai_mm.cpp
@@ -0,0 +1,169 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "kleidiai_mm.hpp"

#include <cstdint>
#include <functional>
#include <memory>
#include <numeric>

#include "cpu_memory.h"
#include "memory_desc/cpu_blocked_memory_desc.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/fullyconnected_config.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
#include "utils/cpu_utils.hpp"

#include "openvino/core/parallel.hpp"

#define FLOAT_MAX 3.4028235e38f
#define FLOAT_MIN -3.4028235e38f

namespace ov {
namespace intel_cpu {

using namespace executor;
using namespace ov::element;

template <typename T>
static std::vector<T> normalizeDimsTo2D(const std::vector<T>& dims) {
return {std::accumulate(dims.begin(), dims.end() - 1, (T)1, std::multiplies<T>()), dims[dims.size() - 1]};
}
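normalizeDimsTo2D folds every leading dimension into the row count so that higher-rank activations can feed the rank-2 kernel. For example (assuming the helper above is visible):

#include <cstddef>
#include <vector>

// {2, 3, 4} collapses to {6, 4}: 2 * 3 batch rows, 4 columns.
static const std::vector<size_t> dims3d = normalizeDimsTo2D<size_t>({2, 3, 4});
// An already rank-2 shape is unchanged: {5, 16} stays {5, 16}.
static const std::vector<size_t> dims2d = normalizeDimsTo2D<size_t>({5, 16});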

bool MatMulKleidiAIExecutor::supports(const FCConfig& config) {
return true;
}

MatMulKleidiAIExecutor::MatMulKleidiAIExecutor(const FCAttrs& attrs,
const PostOps& postOps,
const MemoryArgs& memory,
const ExecutorContext::CPtr& context)
: m_attrs(attrs),
m_memoryArgs(memory) {
auto srcMem = memory.at(ARG_SRC);
auto weiMem = memory.at(ARG_WEI);
auto weiDims = weiMem->getDesc().getShape().getDims();
auto N = weiDims[0];
auto K = weiDims[1];

if (memory.at(ARG_BIAS)->getDataAs<float>() == nullptr) {
auto biasDesc = std::make_shared<CpuBlockedMemoryDesc>(f32, Shape({N}));
biasMem = std::make_shared<Memory>(context->getEngine(), biasDesc);
biasMem->nullify();
} else {
biasMem = memory.at(ARG_BIAS);
}
if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) {
aclfcAttrs.isConvertedWeights = true;
}
auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr();
const auto& wgtDims = originalWeightsDesc->getShape().getStaticDims();
const VectorDims wgtDims2D = reshapeDownToRank<2>(wgtDims);
originalWeightsDesc = std::make_shared<CpuBlockedMemoryDesc>(originalWeightsDesc->getPrecision(), Shape{wgtDims2D});
auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc);
auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(memory.at(ARG_SRC)->getDescPtr()->getPrecision());
auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc);
if (!attrs.weightsNonTransposed) {
dnnlDstDesc = acl_fc_executor::makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc);
aclfcAttrs.isWeightsRepacked = true;
}
packedWeights = acl_fc_executor::reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
const size_t rhsPackedSize = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K);
auto rhsPackedDesc = std::make_shared<CpuBlockedMemoryDesc>(f32, Shape({rhsPackedSize}));
rhsPackedMem = std::make_shared<Memory>(context->getEngine(), rhsPackedDesc);

float* bias = biasMem->getDataAs<float>();
float* rhs_packed = static_cast<float*>(rhsPackedMem->getData());
float* rhs = static_cast<float*>(packedWeights->getData());
const size_t rhs_stride = N * sizeof(float);

const size_t nr = ukernel.get_nr();
const size_t kr = ukernel.get_kr();
const size_t sr = ukernel.get_sr();

kai_run_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(
1, N, K, nr, kr, sr, // Packing arguments
rhs_stride, // RHS stride
rhs, // RHS
bias, // Bias
nullptr, // Scale
rhs_packed, // RHS packed
0, nullptr);
}
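The constructor deliberately front-loads the expensive work: the weights are reordered (and transposed when weightsNonTransposed is false), the packed buffer is sized by the library, and the bias is folded into the packed RHS. execute() then only runs the matmul kernel against rhsPackedMem, so repeated inferences pay no repacking cost. A comment-level schematic of the two phases, restating the calls used in this file:

// Phase 1 -- constructor, once per weight tensor:
//   size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K);
//   kai_run_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(1, N, K, nr, kr, sr,
//       N * sizeof(float), rhs, bias, nullptr, rhs_packed, 0, nullptr);
// Phase 2 -- execute(), every inference:
//   ukernel.run_matmul(M, n_block_size, K, lhs, lhs_stride, rhs_ptr,
//       dst_ptr, dst_stride_row, dst_stride_col, FLOAT_MIN, FLOAT_MAX);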

bool MatMulKleidiAIExecutor::update(const MemoryArgs& memory) {
const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr();
const auto& dstDesc = memory.at(ARG_DST)->getDescPtr();
const auto& wgtDims = weiDesc->getShape().getStaticDims();
// Weights are transposed by MatMulConstTransposesExtraction:
// K is the input-channel (IC) dimension of the weights;
// the weights are reshaped to [-1, K] in ConvertMatMulToFC.
K = wgtDims[1];
N = wgtDims[0];

const auto& outDims = dstDesc->getShape().getStaticDims();
if (outDims.size() > 2) {
M = std::accumulate(outDims.begin(), outDims.end() - 1, 1, std::multiplies<size_t>());
} else {
M = outDims[0];
}
return true;
}

void MatMulKleidiAIExecutor::execute(const MemoryArgs& memory) {
size_t BLOCK_SIZE = 8;
auto srcMem = memory.at(ARG_SRC);
auto weiMem = memory.at(ARG_WEI);
auto dstMem = memory.at(ARG_DST);
auto srcDims = normalizeDimsTo2D(srcMem->getDesc().getShape().getDims());
auto weiDims = weiMem->getDesc().getShape().getDims();
auto M = srcDims[0];
auto K = srcDims[1];
auto N = weiDims[0];

const size_t lhs_stride = K * sizeof(float);
const size_t dst_stride_row = N * sizeof(float);
const size_t dst_stride_col = sizeof(float);

float* lhs = srcMem->getDataAs<float>();
float* rhs_packed = static_cast<float*>(rhsPackedMem->getData());
float* dst = dstMem->getDataAs<float>();

size_t n_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
parallel_for(n_blocks, [&](size_t n_block) {
size_t n_start = (n_block * BLOCK_SIZE);
size_t n_end = std::min(n_start + BLOCK_SIZE, N);
size_t n_block_size = n_end - n_start;
const size_t rhs_packed_offset = ukernel.get_rhs_packed_offset(n_start, K);
const size_t dst_offset = ukernel.get_dst_offset(0, n_start, dst_stride_row);
const float* rhs_ptr = (rhs_packed + rhs_packed_offset / sizeof(float));
float* dst_ptr = (dst + dst_offset / (sizeof(float)));
ukernel.run_matmul(
M,
n_block_size,
K,
lhs,
lhs_stride,
rhs_ptr,
dst_ptr,
dst_stride_row,
dst_stride_col,
FLOAT_MIN,
FLOAT_MAX);
});
}
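Parallelism in execute() is over column blocks of the packed RHS: N is split into ceil(N / BLOCK_SIZE) chunks, and each task computes all M rows of its chunk using block-local offsets queried from the ukernel. For instance, N = 20 with BLOCK_SIZE = 8 yields three blocks: [0, 8), [8, 16), [16, 20). A standalone sketch of the same blocking arithmetic:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t N = 20, BLOCK_SIZE = 8;
    const size_t n_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil(20 / 8) == 3
    for (size_t b = 0; b < n_blocks; ++b) {
        const size_t n_start = b * BLOCK_SIZE;
        const size_t n_end = std::min(n_start + BLOCK_SIZE, N);  // last block may be short
        std::printf("block %zu: [%zu, %zu)\n", b, n_start, n_end);
    }
    return 0;
}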

void MatMulKleidiAIExecutor::moveMemToNumaNode(int numaNodeID) {
if (curNumaNode == numaNodeID)
return;
curNumaNode = numaNodeID;
mbind_move(packedWeights, numaNodeID);
if (m_attrs.withBias) {
mbind_move(m_memoryArgs.at(ARG_BIAS), numaNodeID);
}
}

} // namespace intel_cpu
} // namespace ov