move neural_speed gemms to contrib_ops

luoyu-intel · Jan 9, 2024 · 009adb6
1 parent 3b1155e
commit 009adb6
Show file tree

Hide file tree

Showing 14 changed files with 624 additions and 955 deletions.
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -1177,8 +1177,8 @@ if (onnxruntime_USE_DNNL)
   add_compile_definitions(DNNL_OPENMP)
 endif()
 
-
-if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD)
+set(USE_NEURAL_SPEED FALSE)
+if (onnxruntime_USE_NEURAL_SPEED)
   include(neural_speed)
 endif()
 

diff --git a/cmake/external/neural_speed.cmake b/cmake/external/neural_speed.cmake
@@ -1,7 +1,6 @@
-set(BTLA_URL https://github.com/intel/neural-speed.git)
-set(BTLA_TAG 368ccbd2823e7ecef862d09e7b2385e6b2553081) # bestla v0.1
+set(NEURAL_SPEED_URL https://github.com/intel/neural-speed.git)
+set(NEURAL_SPEED_TAG 18720b319d6921c28e59cc9e003e50cee9a85fcc) # kernel-only release v0.2
 
-set(USE_NEURAL_SPEED FALSE)
 if (onnxruntime_USE_NEURAL_SPEED)
   if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND onnxruntime_target_platform STREQUAL "x86_64")
     set(USE_NEURAL_SPEED TRUE)
@@ -10,11 +9,11 @@ if (onnxruntime_USE_NEURAL_SPEED)
   endif()
   if(USE_NEURAL_SPEED)
     FetchContent_Declare(
-        bestla
-        GIT_REPOSITORY ${BTLA_URL}
-        GIT_TAG        ${BTLA_TAG}
+        neural_speed
+        GIT_REPOSITORY ${NEURAL_SPEED_URL}
+        GIT_TAG        ${NEURAL_SPEED_TAG}
     )
-    FetchContent_MakeAvailable(bestla)
-    add_compile_definitions(MLAS_NEURAL_SPEED)
+    FetchContent_MakeAvailable(neural_speed)
+    add_compile_definitions(ORT_NEURAL_SPEED)
   endif()
 endif()
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
@@ -45,14 +45,6 @@ endif()
 
 set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas)
 
-function(add_neural_speed)
-    target_link_libraries(onnxruntime_mlas PRIVATE bestla::bestla)
-    target_sources(onnxruntime_mlas PRIVATE
-        ${MLAS_SRC_DIR}/bestla_gemm.cpp
-     )
-    set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR OFF)
-endfunction()
-
 #TODO: set MASM flags properly
 function(setup_mlas_source_for_windows)
 
@@ -611,10 +603,6 @@ else()
     target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
 endif()
 
-if(USE_NEURAL_SPEED)
-  add_neural_speed()
-endif()
-
 foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
     target_include_directories(${mlas_target} PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
     onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})

diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake
@@ -60,6 +60,13 @@ if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
       "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/aten_ops/aten_op_executor.cc"
     )
   endif()
+  if(NOT USE_NEURAL_SPEED)
+    list(REMOVE_ITEM onnxruntime_cpu_contrib_ops_srcs
+        "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/bestla_defs.h"
+        "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/bestla_gemm.cc"
+        "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/bestla_gemm.h"
+    )
+  endif()
   # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
   source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cpu_contrib_ops_srcs})
   list(APPEND onnxruntime_providers_src ${onnxruntime_cpu_contrib_ops_srcs})
@@ -144,6 +151,11 @@ if (HAS_BITWISE_INSTEAD_OF_LOGICAL)
   target_compile_options(onnxruntime_providers PRIVATE "-Wno-bitwise-instead-of-logical")
 endif()
 
+if(USE_NEURAL_SPEED)
+  target_link_libraries(onnxruntime_providers PRIVATE bestla::bestla)
+  set_target_properties(onnxruntime_providers PROPERTIES COMPILE_WARNING_AS_ERROR OFF) # ignore warnings inside neural-speed
+endif()
+
 if (MSVC)
    target_compile_options(onnxruntime_providers PRIVATE "/bigobj")
 #   if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)

diff --git a/onnxruntime/core/mlas/lib/bestla_defs.h b/onnxruntime/contrib_ops/cpu/quantization/bestla_defs.h
@@ -11,8 +11,7 @@ Licensed under the MIT License.
 #include "bestla/bestla_prologue_a.h"
 #include "bestla/bestla_wrapper.h"
 
-namespace bestla
-{
+namespace bestla {
 
 using tAVX512F = gemm::SCoreRowNAvx512f<48, 8>;
 using tAMX_BF16 = gemm::HCoreRowNAmxbf16<64, 16>;
@@ -33,14 +32,13 @@ using tWeiNInt = prologue_b::gemm::WeightKBlockNInteger<GC_T, ISA_T>;
 template <class GC_T, BTLA_ISA ISA_T>
 using tWeiNFloat = prologue_b::gemm::WeightKBlockNFloat<GC_T, ISA_T>;
 
-class ORTThreading : public parallel::IThreading
-{
-   public:
-    ORTThreading(void* tp);
-    void parallel_for(const parallel::thread_func& func) const override;
-    void set_threads(int nthreads) override { assert(0); }
-    void sync() const override { assert(0); }
-    void* mTp;
+class ORTThreading : public parallel::IThreading {
+ public:
+  explicit ORTThreading(void* tp);
+  void parallel_for(const parallel::thread_func& func) const override;
+  void set_threads(int nthreads) override { assert(0); }
+  void sync() const override { assert(0); }
+  void* mTp;
 };
 
 }  // namespace bestla