[mlas] add loongarch lsx and lasx optimize code (#17937)

### Description Hello, we (@lixing-star) are developers from the Loongson team. We added 128-bit (LSX) and 256-bit (LASX) vector optimization code for the LoongArch architecture. [100% tests passed, 0 tests failed out of 7](https://cloud.a-boat.cn:2021/api/public/dl/6831z1Bi?inline=true) ### Development Environment ``` CPU: Loongson-3C5000L uname -a: Linux localhost.localdomain 4.19.190-6.4.lns8.loongarch64 #1 SMP Thu Jul 14 12:08:04 CST 2022 loongarch64 loongarch64 loongarch64 GNU/Linux ``` ### LoongArch Documents - [LoongArch Reference Manual - Volume 1: Basic Architecture: This manual describes the basic part of the LoongArch architecture.](https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html) - [LoongArch ELF psABI: This manual describes the LoongArch ELF psABI.](https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html) - [more](https://loongson.github.io/LoongArch-Documentation/README-EN.html)
microsoft · Dec 7, 2023 · 4abec97 · 4abec97
1 parent a045be3
commit 4abec97
Show file tree

Hide file tree

Showing 41 changed files with 7,696 additions and 34 deletions.
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
@@ -284,6 +284,8 @@ else()
           set(X86 TRUE)
         elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
           set(X86_64 TRUE)
+        elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*")
+          set(LOONGARCH64 TRUE)
         endif()
     endif()
 
@@ -575,6 +577,26 @@ else()
           set(MLAS_SOURCE_IS_NOT_SET 0)
         endif()
     endif()
+    if(LOONGARCH64 AND MLAS_SOURCE_IS_NOT_SET)  # LoongArch64, and no platform source list chosen yet
+        set(mlas_platform_srcs  # one C++ qgemm kernel plus the Lsx/Lasx assembly kernels
+          ${MLAS_SRC_DIR}/qgemm_kernel_lsx.cpp
+          ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLasx.S
+          ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLsx.S
+          ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLasx.S
+          ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLsx.S
+          ${MLAS_SRC_DIR}/loongarch64/SconvKernelLsx.S
+          ${MLAS_SRC_DIR}/loongarch64/SconvKernelLasx.S
+          ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLSX.S
+          ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLasx.S
+          ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4LSX.S
+          ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4Lasx.S
+          ${MLAS_SRC_DIR}/loongarch64/SoftmaxKernelLasx.S
+        )
+        string(APPEND CMAKE_CXX_FLAGS " -mlsx -mlasx")  # append the LSX/LASX switches to the C++ flags
+        if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
+          set(MLAS_SOURCE_IS_NOT_SET 0)  # single-arch build: mark the source list as chosen
+        endif()
+    endif()
     if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
         file(GLOB_RECURSE mlas_platform_srcs
           "${MLAS_SRC_DIR}/scalar/*.cpp")

diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
@@ -69,6 +69,9 @@ Module Name:
 #endif
 #endif
 
+#ifdef __loongarch64  // flag the build for the MLAS_TARGET_LARCH64 checks
+#define MLAS_TARGET_LARCH64
+#endif  // __loongarch64
 //
 // Define the support levels for the target architecture.
 //
@@ -87,7 +90,7 @@ Module Name:
 
 #define MLAS_F16VEC_INTRINSICS_SUPPORTED
 
-#endif // 
+#endif //
 #endif // ARM64
 #endif // Visual Studio 16 or earlier does not support fp16 intrinsic
 
@@ -1619,7 +1622,7 @@ MlasHalfGemmConvertPackB(
  * @param Channels      # of input channels
  * @param OutputCount   # of output pixels
  * @param KernelSize    # kernel size
- * @return 
+ * @return
 */
 void
 MLASCALL
@@ -1657,7 +1660,7 @@ MlasTranspose(
  * @param Channels      C in NHWC
  * @param OutputCount   Number of output pixels
  * @param KernelSize    Size of the kernel
- * @return 
+ * @return
 */
 void
 MLASCALL
@@ -1676,7 +1679,7 @@ MlasNhwcMaxPool(
  * @param Channels      C in NHWC
  * @param OutputCount   Number of output pixels
  * @param KernelSize    size of the kernel
- * @return 
+ * @return
 */
 void
 MLASCALL

diff --git a/onnxruntime/core/mlas/lib/activate.cpp b/onnxruntime/core/mlas/lib/activate.cpp
@@ -143,6 +143,8 @@ struct MLAS_ACTIVATION_FUNCTION<MlasLeakyReluActivation>
         return MlasBlendFloat32x4(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value));
 #elif defined(MLAS_VSX_INTRINSICS)
         return vec_sel(ValueTimesAlpha, Value, vec_cmple(ZeroFloat32x4, Value));
+#elif defined(MLAS_LSX_INTRINSICS)
+        return MlasBlendFloat32x4(ValueTimesAlpha, Value, (__m128)__lsx_vfcmp_cle_s(ZeroFloat32x4, Value));
 #else
         return MlasBlendFloat32x4(ValueTimesAlpha, Value, ZeroFloat32x4 < Value);
 #endif

diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp
@@ -148,6 +148,9 @@ Return Value:
     // instead.
     normal = _mm_min_epi16(normal, MaximumExponent);
     normal = _mm_max_epi16(normal, MinimumExponent);
+#elif defined(MLAS_LSX_INTRINSICS)
+    normal = __lsx_vmin_h(normal, MaximumExponent);
+    normal = __lsx_vmax_h(normal, MinimumExponent);
 #else
     normal = MlasMinimumInt32x4(normal, MaximumExponent);
     normal = MlasMaximumInt32x4(normal, MinimumExponent);
@@ -215,6 +218,8 @@ Return Value:
             // N.B. SSE2 lacks a broadcast load instruction, so avoid a shuffle
             // and use zeroes for the upper elements.
             Vector = _mm_load_ss(Input);
+#elif defined(MLAS_LSX_INTRINSICS)
+            Vector = (MLAS_FLOAT32X4)__lsx_vldrepl_w(Input, 0);
 #else
             Vector = MlasBroadcastFloat32x4(Input);
 #endif
@@ -467,6 +472,8 @@ Return Value:
         // N.B. SSE2 lacks a broadcast load instruction, so avoid a shuffle and
         // use zeroes for the upper elements.
         MLAS_FLOAT32X4 Vector = _mm_load_ss(Input);
+#elif defined(MLAS_LSX_INTRINSICS)
+        MLAS_FLOAT32X4 Vector = (MLAS_FLOAT32X4)__lsx_vldrepl_w(Input, 0);
 #else
         MLAS_FLOAT32X4 Vector = MlasBroadcastFloat32x4(Input);
 #endif
@@ -849,7 +856,7 @@ Return Value:
         // Find the maximum value for the row.
         //
 
-#if defined(MLAS_TARGET_AMD64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64)
         float Maximum = GetMlasPlatform().ReduceMaximumF32Kernel(Input, D);
 #else
         float Maximum = MlasReduceMaximumF32Kernel(Input, D);
@@ -874,7 +881,7 @@ Return Value:
 
             float Parameters[] = { NegativeMaximum, std::log(Accumulation)};
 
-#if defined(MLAS_TARGET_AMD64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64)
             GetMlasPlatform().ComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters);
 #else
             MlasComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters);
@@ -899,7 +906,7 @@ Return Value:
 
             float Parameters[] = { 1.0f / Accumulation };
 
-#if defined(MLAS_TARGET_AMD64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64)
             GetMlasPlatform().ComputeSoftmaxOutputF32Kernel(Output, D, Parameters);
 #else
             MlasComputeSoftmaxOutputF32Kernel(Output, D, Parameters);

diff --git a/onnxruntime/core/mlas/lib/dgemm.cpp b/onnxruntime/core/mlas/lib/dgemm.cpp
@@ -530,7 +530,7 @@ Return Value:
 
         size_t RowsHandled;
 
-#if defined(MLAS_TARGET_AMD64_IX86) || defined (MLAS_TARGET_POWER)
+#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64)
         RowsHandled = GetMlasPlatform().GemmDoubleKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode);
 #else
         if (ZeroMode) {

diff --git a/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelCommon.h b/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelCommon.h
@@ -0,0 +1,27 @@
+/*++
+
+Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    DgemmKernelCommon.h
+
+Abstract:
+
+    This module contains common kernel macros and structures for the double
+    precision matrix/matrix multiply operation (DGEMM).
+
+--*/
+
+#define     LFgemmElementShift      3 /* log2 of the element size in bytes */
+#define     LFgemmElementSize       (1 << LFgemmElementShift) /* 1 << 3 = 8 bytes per element */
+#define     LFgemmYmmElementCount   (32/LFgemmElementSize) /* 32 / 8 = 4 elements; presumably per 32-byte vector register -- confirm */
+
+#include "FgemmKernelCommon.h"
+/* Presumably aliases each untyped mnemonic to its ".d" instruction; FGEMM_TYPED_INSTRUCTION is defined outside this file -- confirm. */
+FGEMM_TYPED_INSTRUCTION(xvfadd, xvfadd.d)
+FGEMM_TYPED_INSTRUCTION(xvfmadd, xvfmadd.d)
+FGEMM_TYPED_INSTRUCTION(xvldrepl, xvldrepl.d)
+FGEMM_TYPED_INSTRUCTION(xvfmul, xvfmul.d)
diff --git a/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLasx.S b/onnxruntime/core/mlas/lib/loongarch64/DgemmKernelLasx.S
@@ -0,0 +1,32 @@
+/*++
+
+Copyright (C) 2023 Loongson Technology Corporation Limited. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    DgemmKernelLasx.S
+
+Abstract:
+
+    This module implements the kernels for the double precision matrix/matrix
+    multiply operation (DGEMM).
+
+    This implementation uses Lasx instructions.
+
+--*/
+
+#include "asmmacro.h"
+#include "DgemmKernelCommon.h"
+#include "FgemmKernelLasxCommon.h"
+
+        .text
+
+// Generate the GEMM kernel: invoke FgemmKernelLasxFunction (defined outside
+// this file) with MlasGemmDoubleKernelLasx as its argument, presumably the
+// name of the emitted function -- confirm against the macro definition.
+
+FgemmKernelLasxFunction MlasGemmDoubleKernelLasx
+
+        .end