From 4ceb69587acfad44cae29ebd1d5a00482f1ffd85 Mon Sep 17 00:00:00 2001 From: youdao Date: Fri, 4 Jun 2021 09:57:53 +0800 Subject: [PATCH] v1.0 --- .gitignore | 4 + CMakeLists.txt | 102 + LICENSE | 201 + ReadMe.md | 70 + ReadMe_ZH.md | 65 + bench/test_sgemm_en1.png | Bin 0 -> 30142 bytes bench/test_sgemm_en2.png | Bin 0 -> 33434 bytes bench/test_sgemm_zh1.png | Bin 0 -> 40646 bytes bench/test_sgemm_zh2.png | Bin 0 -> 38261 bytes doc/EMLL.png | Bin 0 -> 3927 bytes doc/Usage_EN.md | 141 + doc/Usage_ZH.md | 145 + example/CMakeLists.txt | 42 + example/Gemm.c | 195 + example/Usage_EN.md | 61 + example/Usage_ZH.md | 61 + include/Gemm.h | 120 + include/Layer.h | 55 + include/Quant.h | 254 + include/arm_neon/ARMCompareAndSwap.h | 56 + include/arm_neon/ARMCpuType.h | 85 + include/arm_neon/NeonBias.h | 200 + include/arm_neon/NeonExtreme.h | 112 + include/arm_neon/NeonI8I32DotGemmSkinnyDot.h | 153 + include/arm_neon/NeonI8I32MlaGemmCopy.h | 181 + include/arm_neon/NeonI8I32MlaGemmKernel.h | 742 +++ include/arm_neon/NeonI8I32MlaGemmSkinnyDot.h | 200 + include/arm_neon/NeonI8I32MlaGemmSkinnyGer.h | 317 ++ include/arm_neon/NeonIntOpSign.h | 441 ++ include/arm_neon/NeonQuant.h | 814 ++++ include/arm_neon/NeonSgemmCopy.h | 217 + include/arm_neon/NeonSgemmKernel.h | 973 ++++ include/arm_neon/NeonSum.h | 394 ++ include/common/CommonCopy.h | 121 + include/common/CommonDriver.h | 497 ++ include/common/CommonKernel.h | 190 + include/common/CommonLayer.h | 90 + include/common/CommonQuant.h | 311 ++ include/common/CommonSched.h | 265 + include/common/CommonSkinnyDot.h | 586 +++ include/common/CommonSkinnyGer.h | 526 ++ include/common/CommonTest.h | 620 +++ include/common/ExpandMacro.h | 932 ++++ include/neon_armv7a/Bias.h | 35 + include/neon_armv7a/I8I32MlaGemmKernel.h | 242 + include/neon_armv7a/S8S32MlaGemmCopy.h | 31 + include/neon_armv7a/S8S32MlaGemmDriver.h | 28 + include/neon_armv7a/S8S32MlaGemmKernel.h | 29 + include/neon_armv7a/S8S32MlaGemmSkinnyDot.h | 47 + include/neon_armv7a/S8S32MlaGemmSkinnyGer.h | 47 + include/neon_armv7a/SgemmCopy.h | 31 + include/neon_armv7a/SgemmDriver.h | 27 + include/neon_armv7a/SgemmKernel.h | 27 + include/neon_armv7a/SgemmSkinnyDot.h | 67 + include/neon_armv7a/SgemmSkinnyGer.h | 67 + include/neon_armv7a/U8U32MlaGemmCopy.h | 31 + include/neon_armv7a/U8U32MlaGemmDriver.h | 28 + include/neon_armv7a/U8U32MlaGemmKernel.h | 29 + include/neon_armv7a/U8U32MlaGemmSkinnyDot.h | 47 + include/neon_armv7a/U8U32MlaGemmSkinnyGer.h | 47 + include/neon_armv8a/Bias.h | 36 + include/neon_armv8a/HgemmCopy.h | 32 + include/neon_armv8a/HgemmDriver.h | 25 + include/neon_armv8a/HgemmKernel.h | 28 + include/neon_armv8a/HgemmSkinnyDot.h | 116 + include/neon_armv8a/HgemmSkinnyGer.h | 116 + include/neon_armv8a/I8I32DotGemmCopy.h | 454 ++ include/neon_armv8a/I8I32DotGemmKernel.h | 1030 ++++ include/neon_armv8a/I8I32MlaGemmKernel.h | 378 ++ include/neon_armv8a/I8I32MlaGemmSkinnyDot.h | 501 ++ include/neon_armv8a/S8S32DotGemmCopy.h | 31 + include/neon_armv8a/S8S32DotGemmDriver.h | 28 + include/neon_armv8a/S8S32DotGemmKernel.h | 29 + include/neon_armv8a/S8S32DotGemmSkinnyDot.h | 103 + include/neon_armv8a/S8S32MlaGemmCopy.h | 31 + include/neon_armv8a/S8S32MlaGemmDriver.h | 28 + include/neon_armv8a/S8S32MlaGemmKernel.h | 29 + include/neon_armv8a/S8S32MlaGemmSkinnyDot.h | 75 + include/neon_armv8a/S8S32MlaGemmSkinnyGer.h | 74 + include/neon_armv8a/SgemmCopy.h | 31 + include/neon_armv8a/SgemmDriver.h | 27 + include/neon_armv8a/SgemmKernel.h | 26 + include/neon_armv8a/SgemmSkinnyDot.h | 319 ++ 
include/neon_armv8a/SgemmSkinnyGer.h | 91 + include/neon_armv8a/U8U32DotGemmCopy.h | 31 + include/neon_armv8a/U8U32DotGemmDriver.h | 28 + include/neon_armv8a/U8U32DotGemmKernel.h | 29 + include/neon_armv8a/U8U32DotGemmSkinnyDot.h | 115 + include/neon_armv8a/U8U32MlaGemmCopy.h | 31 + include/neon_armv8a/U8U32MlaGemmDriver.h | 28 + include/neon_armv8a/U8U32MlaGemmKernel.h | 29 + include/neon_armv8a/U8U32MlaGemmSkinnyDot.h | 75 + include/neon_armv8a/U8U32MlaGemmSkinnyGer.h | 74 + .../sgemm_skinny_dot_kernel/ReadME.md | 23 + .../SgemmSkinnyDotA35.h | 488 ++ .../SgemmSkinnyDotA53.h | 488 ++ .../SgemmSkinnyDotA7x.h | 488 ++ .../SgemmSkinnyDotCopy.h | 108 + .../SgemmSkinnyDotDriver.h | 485 ++ .../SgemmSkinnyDotKernelA35.h | 2439 ++++++++++ .../SgemmSkinnyDotKernelA53.h | 4306 +++++++++++++++++ .../SgemmSkinnyDotKernelA7x.h | 2556 ++++++++++ src/arm_neon/ARMCompareAndSwap.c | 112 + src/arm_neon/ARMCpuType.c | 451 ++ src/neon_armv7a/Bias.c | 28 + src/neon_armv7a/Layer.c | 24 + src/neon_armv7a/Quant.c | 52 + src/neon_armv7a/S8S32GemmDriver.c | 43 + src/neon_armv7a/S8S32MlaGemmCopy.c | 30 + src/neon_armv7a/S8S32MlaGemmDriver.c | 27 + src/neon_armv7a/S8S32MlaGemmKernel.c | 27 + src/neon_armv7a/S8S32MlaGemmSkinnyDot.c | 29 + src/neon_armv7a/S8S32MlaGemmSkinnyGer.c | 29 + src/neon_armv7a/SgemmCopy.c | 31 + src/neon_armv7a/SgemmDriver.c | 26 + src/neon_armv7a/SgemmKernel.c | 328 ++ src/neon_armv7a/SgemmSkinnyDot.c | 495 ++ src/neon_armv7a/SgemmSkinnyGer.c | 280 ++ src/neon_armv7a/U8U32GemmDriver.c | 42 + src/neon_armv7a/U8U32MlaGemmCopy.c | 30 + src/neon_armv7a/U8U32MlaGemmDriver.c | 27 + src/neon_armv7a/U8U32MlaGemmKernel.c | 27 + src/neon_armv7a/U8U32MlaGemmSkinnyDot.c | 29 + src/neon_armv7a/U8U32MlaGemmSkinnyGer.c | 29 + src/neon_armv8a/Bias.c | 28 + src/neon_armv8a/HgemmDriver.c | 28 + src/neon_armv8a/Layer.c | 24 + src/neon_armv8a/Quant.c | 52 + src/neon_armv8a/S8S32DotGemmDriver.c | 36 + src/neon_armv8a/S8S32GemmDriver.c | 48 + src/neon_armv8a/S8S32MlaGemmCopy.c | 30 + src/neon_armv8a/S8S32MlaGemmDriver.c | 27 + src/neon_armv8a/S8S32MlaGemmKernel.c | 27 + src/neon_armv8a/S8S32MlaGemmSkinnyDot.c | 34 + src/neon_armv8a/S8S32MlaGemmSkinnyGer.c | 32 + src/neon_armv8a/SgemmCopy.c | 30 + src/neon_armv8a/SgemmDriver.c | 26 + src/neon_armv8a/SgemmKernel.c | 1071 ++++ src/neon_armv8a/SgemmSkinnyDot.c | 800 +++ src/neon_armv8a/SgemmSkinnyGer.c | 283 ++ src/neon_armv8a/U8U32DotGemmDriver.c | 36 + src/neon_armv8a/U8U32GemmDriver.c | 48 + src/neon_armv8a/U8U32MlaGemmCopy.c | 30 + src/neon_armv8a/U8U32MlaGemmDriver.c | 27 + src/neon_armv8a/U8U32MlaGemmKernel.c | 27 + src/neon_armv8a/U8U32MlaGemmSkinnyDot.c | 34 + src/neon_armv8a/U8U32MlaGemmSkinnyGer.c | 32 + src/neon_armv8a/extension/HgemmCopy.c | 111 + src/neon_armv8a/extension/HgemmKernel.c | 1420 ++++++ src/neon_armv8a/extension/HgemmSkinnyDot.c | 350 ++ src/neon_armv8a/extension/HgemmSkinnyGer.c | 232 + src/neon_armv8a/extension/S8S32DotGemmCopy.c | 30 + .../extension/S8S32DotGemmKernel.c | 116 + .../extension/S8S32DotGemmSkinnyDot.c | 37 + src/neon_armv8a/extension/U8U32DotGemmCopy.c | 30 + .../extension/U8U32DotGemmKernel.c | 116 + .../extension/U8U32DotGemmSkinnyDot.c | 37 + .../SgemmSkinnyDotA35.c | 72 + .../SgemmSkinnyDotA53.c | 71 + .../SgemmSkinnyDotA7x.c | 71 + .../SgemmSkinnyDotCopy.c | 547 +++ test/TestBias.c | 281 ++ test/TestCompilerOpenMP.c | 12 + test/TestGemm.c | 98 + test/TestQuant.c | 95 + 165 files changed, 35590 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 
ReadMe.md create mode 100644 ReadMe_ZH.md create mode 100644 bench/test_sgemm_en1.png create mode 100644 bench/test_sgemm_en2.png create mode 100644 bench/test_sgemm_zh1.png create mode 100644 bench/test_sgemm_zh2.png create mode 100644 doc/EMLL.png create mode 100644 doc/Usage_EN.md create mode 100644 doc/Usage_ZH.md create mode 100644 example/CMakeLists.txt create mode 100644 example/Gemm.c create mode 100644 example/Usage_EN.md create mode 100644 example/Usage_ZH.md create mode 100644 include/Gemm.h create mode 100644 include/Layer.h create mode 100644 include/Quant.h create mode 100644 include/arm_neon/ARMCompareAndSwap.h create mode 100644 include/arm_neon/ARMCpuType.h create mode 100644 include/arm_neon/NeonBias.h create mode 100644 include/arm_neon/NeonExtreme.h create mode 100644 include/arm_neon/NeonI8I32DotGemmSkinnyDot.h create mode 100644 include/arm_neon/NeonI8I32MlaGemmCopy.h create mode 100644 include/arm_neon/NeonI8I32MlaGemmKernel.h create mode 100644 include/arm_neon/NeonI8I32MlaGemmSkinnyDot.h create mode 100644 include/arm_neon/NeonI8I32MlaGemmSkinnyGer.h create mode 100644 include/arm_neon/NeonIntOpSign.h create mode 100644 include/arm_neon/NeonQuant.h create mode 100644 include/arm_neon/NeonSgemmCopy.h create mode 100644 include/arm_neon/NeonSgemmKernel.h create mode 100644 include/arm_neon/NeonSum.h create mode 100644 include/common/CommonCopy.h create mode 100644 include/common/CommonDriver.h create mode 100644 include/common/CommonKernel.h create mode 100644 include/common/CommonLayer.h create mode 100644 include/common/CommonQuant.h create mode 100644 include/common/CommonSched.h create mode 100644 include/common/CommonSkinnyDot.h create mode 100644 include/common/CommonSkinnyGer.h create mode 100644 include/common/CommonTest.h create mode 100644 include/common/ExpandMacro.h create mode 100644 include/neon_armv7a/Bias.h create mode 100644 include/neon_armv7a/I8I32MlaGemmKernel.h create mode 100644 include/neon_armv7a/S8S32MlaGemmCopy.h create mode 100644 include/neon_armv7a/S8S32MlaGemmDriver.h create mode 100644 include/neon_armv7a/S8S32MlaGemmKernel.h create mode 100644 include/neon_armv7a/S8S32MlaGemmSkinnyDot.h create mode 100644 include/neon_armv7a/S8S32MlaGemmSkinnyGer.h create mode 100644 include/neon_armv7a/SgemmCopy.h create mode 100644 include/neon_armv7a/SgemmDriver.h create mode 100644 include/neon_armv7a/SgemmKernel.h create mode 100644 include/neon_armv7a/SgemmSkinnyDot.h create mode 100644 include/neon_armv7a/SgemmSkinnyGer.h create mode 100644 include/neon_armv7a/U8U32MlaGemmCopy.h create mode 100644 include/neon_armv7a/U8U32MlaGemmDriver.h create mode 100644 include/neon_armv7a/U8U32MlaGemmKernel.h create mode 100644 include/neon_armv7a/U8U32MlaGemmSkinnyDot.h create mode 100644 include/neon_armv7a/U8U32MlaGemmSkinnyGer.h create mode 100644 include/neon_armv8a/Bias.h create mode 100644 include/neon_armv8a/HgemmCopy.h create mode 100644 include/neon_armv8a/HgemmDriver.h create mode 100644 include/neon_armv8a/HgemmKernel.h create mode 100644 include/neon_armv8a/HgemmSkinnyDot.h create mode 100644 include/neon_armv8a/HgemmSkinnyGer.h create mode 100644 include/neon_armv8a/I8I32DotGemmCopy.h create mode 100644 include/neon_armv8a/I8I32DotGemmKernel.h create mode 100644 include/neon_armv8a/I8I32MlaGemmKernel.h create mode 100644 include/neon_armv8a/I8I32MlaGemmSkinnyDot.h create mode 100644 include/neon_armv8a/S8S32DotGemmCopy.h create mode 100644 include/neon_armv8a/S8S32DotGemmDriver.h create mode 100644 include/neon_armv8a/S8S32DotGemmKernel.h 
create mode 100644 include/neon_armv8a/S8S32DotGemmSkinnyDot.h create mode 100644 include/neon_armv8a/S8S32MlaGemmCopy.h create mode 100644 include/neon_armv8a/S8S32MlaGemmDriver.h create mode 100644 include/neon_armv8a/S8S32MlaGemmKernel.h create mode 100644 include/neon_armv8a/S8S32MlaGemmSkinnyDot.h create mode 100644 include/neon_armv8a/S8S32MlaGemmSkinnyGer.h create mode 100644 include/neon_armv8a/SgemmCopy.h create mode 100644 include/neon_armv8a/SgemmDriver.h create mode 100644 include/neon_armv8a/SgemmKernel.h create mode 100644 include/neon_armv8a/SgemmSkinnyDot.h create mode 100644 include/neon_armv8a/SgemmSkinnyGer.h create mode 100644 include/neon_armv8a/U8U32DotGemmCopy.h create mode 100644 include/neon_armv8a/U8U32DotGemmDriver.h create mode 100644 include/neon_armv8a/U8U32DotGemmKernel.h create mode 100644 include/neon_armv8a/U8U32DotGemmSkinnyDot.h create mode 100644 include/neon_armv8a/U8U32MlaGemmCopy.h create mode 100644 include/neon_armv8a/U8U32MlaGemmDriver.h create mode 100644 include/neon_armv8a/U8U32MlaGemmKernel.h create mode 100644 include/neon_armv8a/U8U32MlaGemmSkinnyDot.h create mode 100644 include/neon_armv8a/U8U32MlaGemmSkinnyGer.h create mode 100644 include/neon_armv8a/sgemm_skinny_dot_kernel/ReadME.md create mode 100644 include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.h create mode 100644 include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA53.h create mode 100644 include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA7x.h create mode 100644 include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.h create mode 100644 include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotDriver.h create mode 100644 include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA35.h create mode 100644 include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA53.h create mode 100644 include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA7x.h create mode 100644 src/arm_neon/ARMCompareAndSwap.c create mode 100644 src/arm_neon/ARMCpuType.c create mode 100644 src/neon_armv7a/Bias.c create mode 100644 src/neon_armv7a/Layer.c create mode 100644 src/neon_armv7a/Quant.c create mode 100644 src/neon_armv7a/S8S32GemmDriver.c create mode 100644 src/neon_armv7a/S8S32MlaGemmCopy.c create mode 100644 src/neon_armv7a/S8S32MlaGemmDriver.c create mode 100644 src/neon_armv7a/S8S32MlaGemmKernel.c create mode 100644 src/neon_armv7a/S8S32MlaGemmSkinnyDot.c create mode 100644 src/neon_armv7a/S8S32MlaGemmSkinnyGer.c create mode 100644 src/neon_armv7a/SgemmCopy.c create mode 100644 src/neon_armv7a/SgemmDriver.c create mode 100644 src/neon_armv7a/SgemmKernel.c create mode 100644 src/neon_armv7a/SgemmSkinnyDot.c create mode 100644 src/neon_armv7a/SgemmSkinnyGer.c create mode 100644 src/neon_armv7a/U8U32GemmDriver.c create mode 100644 src/neon_armv7a/U8U32MlaGemmCopy.c create mode 100644 src/neon_armv7a/U8U32MlaGemmDriver.c create mode 100644 src/neon_armv7a/U8U32MlaGemmKernel.c create mode 100644 src/neon_armv7a/U8U32MlaGemmSkinnyDot.c create mode 100644 src/neon_armv7a/U8U32MlaGemmSkinnyGer.c create mode 100644 src/neon_armv8a/Bias.c create mode 100644 src/neon_armv8a/HgemmDriver.c create mode 100644 src/neon_armv8a/Layer.c create mode 100644 src/neon_armv8a/Quant.c create mode 100644 src/neon_armv8a/S8S32DotGemmDriver.c create mode 100644 src/neon_armv8a/S8S32GemmDriver.c create mode 100644 src/neon_armv8a/S8S32MlaGemmCopy.c create mode 100644 src/neon_armv8a/S8S32MlaGemmDriver.c create mode 100644 src/neon_armv8a/S8S32MlaGemmKernel.c create mode 
100644 src/neon_armv8a/S8S32MlaGemmSkinnyDot.c create mode 100644 src/neon_armv8a/S8S32MlaGemmSkinnyGer.c create mode 100644 src/neon_armv8a/SgemmCopy.c create mode 100644 src/neon_armv8a/SgemmDriver.c create mode 100644 src/neon_armv8a/SgemmKernel.c create mode 100644 src/neon_armv8a/SgemmSkinnyDot.c create mode 100644 src/neon_armv8a/SgemmSkinnyGer.c create mode 100644 src/neon_armv8a/U8U32DotGemmDriver.c create mode 100644 src/neon_armv8a/U8U32GemmDriver.c create mode 100644 src/neon_armv8a/U8U32MlaGemmCopy.c create mode 100644 src/neon_armv8a/U8U32MlaGemmDriver.c create mode 100644 src/neon_armv8a/U8U32MlaGemmKernel.c create mode 100644 src/neon_armv8a/U8U32MlaGemmSkinnyDot.c create mode 100644 src/neon_armv8a/U8U32MlaGemmSkinnyGer.c create mode 100644 src/neon_armv8a/extension/HgemmCopy.c create mode 100644 src/neon_armv8a/extension/HgemmKernel.c create mode 100644 src/neon_armv8a/extension/HgemmSkinnyDot.c create mode 100644 src/neon_armv8a/extension/HgemmSkinnyGer.c create mode 100644 src/neon_armv8a/extension/S8S32DotGemmCopy.c create mode 100644 src/neon_armv8a/extension/S8S32DotGemmKernel.c create mode 100644 src/neon_armv8a/extension/S8S32DotGemmSkinnyDot.c create mode 100644 src/neon_armv8a/extension/U8U32DotGemmCopy.c create mode 100644 src/neon_armv8a/extension/U8U32DotGemmKernel.c create mode 100644 src/neon_armv8a/extension/U8U32DotGemmSkinnyDot.c create mode 100644 src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.c create mode 100644 src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA53.c create mode 100644 src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA7x.c create mode 100644 src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.c create mode 100644 test/TestBias.c create mode 100644 test/TestCompilerOpenMP.c create mode 100644 test/TestGemm.c create mode 100644 test/TestQuant.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..59119ab --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*build +*install +.nfs* +toolchain* diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..396239c --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,102 @@ +cmake_minimum_required(VERSION 3.7) + +if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) +endif() + +set(CMAKE_C_STANDARD 99) +if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT CMAKE_C_FLAGS) + set(CMAKE_C_FLAGS "-O2" CACHE STRING "" FORCE) +endif() + +option(EML_ARMV7A "build for armv7a architecture instead of armv8a" OFF) + +if(ANDROID) +#Android build +#ANDROID_NDK must be provided +#ANDROID_PLATFORM is optional + if(NOT DEFINED ANDROID_PLATFORM) + set(ANDROID_PLATFORM 27) + endif() + if(EML_ARMV7A) + set(ANDROID_ABI "armeabi-v7a") + set(ANDROID_ARM_MODE arm) #not to use thumb + set(ANDROID_ARM_NEON ON) #enable NEON on armv7a + else() #armv8a + set(ANDROID_ABI "arm64-v8a") + endif() + include(${ANDROID_NDK}/build/cmake/android.toolchain.cmake) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + add_compile_options(-g0) #disable NDK debug info generation + endif() + set(RUNTIME_LIB dl log) +else() +#Linux build +#CMAKE_C_COMPILER must be provided +#CMAKE_SYSROOT is optional + set(CMAKE_SYSTEM_NAME Linux) + if(EML_ARMV7A) + set(CMAKE_SYSTEM_PROCESSOR arm) + add_compile_options(-marm -march=armv7ve) + add_compile_options(-mfpu=neon-vfpv4 -mfp16-format=ieee) + else() + set(CMAKE_SYSTEM_PROCESSOR aarch64) + endif() + set(RUNTIME_LIB pthread) +endif() + +project(emll + VERSION 1.0 + LANGUAGES C) + 
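+
+# Illustrative configure invocations (paths are placeholders; option names
+# correspond to the variables used in this file -- ANDROID/ANDROID_NDK for
+# NDK builds, EML_ARMV7A to target armv7a instead of armv8a):
+#   Android:    cmake -DANDROID=ON -DANDROID_NDK=<ndk_path> [-DEML_ARMV7A=ON] <source_dir>
+#   Linux arm:  cmake -DCMAKE_C_COMPILER=<arm_gcc_or_clang> [-DCMAKE_SYSROOT=<sysroot>] [-DEML_ARMV7A=ON] <source_dir>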
+try_compile(EMLL_COMPILER_OPENMP_SUPPORTED ${CMAKE_BINARY_DIR} + "${PROJECT_SOURCE_DIR}/test/TestCompilerOpenMP.c" + COMPILE_DEFINITIONS -fopenmp + LINK_LIBRARIES -fopenmp) + +if (EMLL_COMPILER_OPENMP_SUPPORTED) + add_compile_options(-fopenmp) + list(APPEND RUNTIME_LIB -fopenmp) +else() + message(STATUS "The compiler doesn't support OpenMP. Build serial version only.") + add_definitions(-DEMLL_SERIAL_ONLY) +endif() + +include_directories("${PROJECT_SOURCE_DIR}/include") + +file(GLOB interface_header "${PROJECT_SOURCE_DIR}/include/*.h") +file(GLOB arm_src "${PROJECT_SOURCE_DIR}/src/arm_neon/*.c") +if(EML_ARMV7A) + file(GLOB_RECURSE neon_src "${PROJECT_SOURCE_DIR}/src/neon_armv7a/*.c") + add_library(eml-armneon ${arm_src} ${neon_src}) +else() + file(GLOB neon_src "${PROJECT_SOURCE_DIR}/src/neon_armv8a/*.c") + file(GLOB skinny_dot_src + "${PROJECT_SOURCE_DIR}/src/neon_armv8a/sgemm_skinny_dot_kernel/*.c") + file(GLOB ext_src "${PROJECT_SOURCE_DIR}/src/neon_armv8a/extension/*.c") + set_source_files_properties(${arm_src} ${ext_src} + PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+dotprod+fp16") + add_library(eml-armneon ${arm_src} ${ext_src} ${skinny_dot_src} ${neon_src}) +endif() + +option(EML_TEST "build test programs for the library" ON) + +if(EML_TEST) + message(STATUS "Build testing executables for EML") + set(EML_TEST_EXECUTABLES test_emll_gemm test_emll_bias test_emll_quant) + add_executable(test_emll_gemm "${PROJECT_SOURCE_DIR}/test/TestGemm.c") + add_executable(test_emll_bias "${PROJECT_SOURCE_DIR}/test/TestBias.c") + add_executable(test_emll_quant "${PROJECT_SOURCE_DIR}/test/TestQuant.c") + target_link_libraries(test_emll_gemm eml-armneon ${RUNTIME_LIB}) + target_link_libraries(test_emll_bias eml-armneon ${RUNTIME_LIB}) + target_link_libraries(test_emll_quant eml-armneon ${RUNTIME_LIB}) +endif() + +set_target_properties(eml-armneon PROPERTIES PUBLIC_HEADER "${interface_header}") +install(TARGETS eml-armneon ${EML_TEST_EXECUTABLES} + EXPORT EMLLTargets + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + PUBLIC_HEADER DESTINATION include) + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..169249f --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright YouDao, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
diff --git a/ReadMe.md b/ReadMe.md
new file mode 100644
index 0000000..2eb5614
--- /dev/null
+++ b/ReadMe.md
@@ -0,0 +1,70 @@
+![logo](doc/EMLL.png)
+
+[中文介绍](ReadMe_ZH.md)
+
+# Edge ML Library - High-performance Compute Library for On-device Machine Learning Inference
+
+Edge ML Library (EMLL) offers optimized basic routines, such as general matrix multiplication (GEMM) and quantization, to speed up machine learning (ML) inference on ARM-based devices. EMLL supports the fp32, fp16 and int8 data types. EMLL accelerates the on-device NMT, ASR and OCR engines of Youdao, Inc.
+
+## Features
+
+### Performance-Oriented Design
+
+The matrix-multiplication routines are heavily optimized for the matrix shapes common in on-device ML tasks, including "skinny" ones. The matrix-multiplication kernels are tuned for specific CPUs, with a large portion written in inline assembly.
+
+Here are benchmarks of SGEMM on two machines[1]:
+
+| armv8a cortex-A35 4-thread | armv8a cortex-A53 4-thread |
+| -------------------------- | -------------------------- |
+| ![test1](bench/test_sgemm_en1.png) | ![test2](bench/test_sgemm_en2.png) |
+
+[1] The formula of GEMM is C[MxN] = A[MxK] B[KxN]; for each test case, the better performance of the all-row-major and all-column-major layouts is reported.
+
+### Simple Interface
+
+Data and parameters are passed directly, without wrapper structures. Matrices and arrays are passed by base address plus dimensions. GEMM parameters seldom used in on-device inference, such as the leading dimensions LDA-LDC, are excluded from the interface. There is no dependency on any third-party compute library.
+
+### Extensibility
+
+EMLL abstracts the core structures of CPU-based high-performance matrix-multiplication algorithms, as well as the bias/quantization functions, into general macros (see the files under include/common) that can be applied to a variety of processors. When developing for a new architecture, these macros save a large amount of coding work.
+
+## EMLL APIs
+
+EMLL provides a series of C functions. See [Usage_EN.md](doc/Usage_EN.md) for details; a brief usage sketch also follows the tables below.
+
+| Type | Name | Parameters |
+| ---- | ---- | ---------- |
+| Matrix Multiplication | data_type + "gemm" | matrix orders, addresses of matrices, M, N, K, beta, number of threads |
+| Fully-connected Layer (fp32) | "fc" | addresses of src/weight/bias/output, dimensions M/K/N, orders of source matrices, (number of threads) |
+| Quantization | "quantize_" + "symmetric"/"asymmetric" + input_type + output_type | input array, output array, (zero point), scale, size of array, input range |
+| Requantization | "requantize_" + "symmetric"/"asymmetric" + "_XtoY" | input array, output array, (zero point), output scale, size of array, input range |
+| Bias | "bias" + data_type | the matrix to be biased, scalar bias applied to all elements, vector bias along the major direction, vector bias along the minor direction, dimensions of the matrix |
+
+## Supported Architectures and Data Types
+
+| Target CPU | Matrix Multiplication | Bias | Quantization | Requantization |
+| -------------- | ------------------------------------------------ | ----------------- | ------------------------ | ------------------------------------------- |
+| ARMv7a 32-bit | fp32 -> fp32, (u)int8 -> (u)int32 | fp32, int32 | fp32 -> (u)int8/(u)int16 | int32 -> (u)int8/(u)int16, int16 -> (u)int8 |
+| ARMv8a 64-bit | fp32 -> fp32, (u)int8 -> (u)int32, fp16 -> fp16 | fp32, fp16, int32 | fp32 -> (u)int8/(u)int16 | int32 -> (u)int8/(u)int16, int16 -> (u)int8 |
+
+Supported OS: Linux & Android
+
+Supported Compilers: GCC & Clang
+
+## Future Plan
+
+Depending on business requirements, EMLL may add support for on-device GPUs and NPUs and expand its set of available functions in the future.
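As orientation, here is a minimal C sketch of how the interfaces listed in the tables above fit together. The `sgemm` prototype is inferred from the parameter list in the API table and may differ in detail from the authoritative declaration in include/Gemm.h (example/Gemm.c in this patch is the real end-to-end sample); the quantization helper re-implements the symmetric-quantization arithmetic locally for illustration rather than calling the library.

```c
/* Usage sketch; link against the eml-armneon library built above. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Assumed prototype, inferred from the API table (matrix orders, matrix
 * addresses, M, N, K, beta, thread count); the return value is taken to be
 * a status code. Check include/Gemm.h for the authoritative declaration. */
int sgemm(int a_rowmajor, int b_rowmajor, const float *A, const float *B,
          float *C, uint32_t M, uint32_t N, uint32_t K,
          float beta, uint32_t num_threads);

/* Self-contained sketch of symmetric fp32 -> int8 quantization, showing the
 * role of the "scale" and "input range" parameters in the quantize_* rows
 * above (not the library's implementation; real kernels also round to
 * nearest rather than truncate). */
static void quantize_symmetric_sketch(const float *in, int8_t *out,
                                      float *scale, uint32_t size,
                                      float max_abs) {
    if (max_abs <= 0.0f) max_abs = 1.0f;   /* avoid a zero scale */
    *scale = max_abs / 127.0f;             /* map [-max_abs, max_abs] to [-127, 127] */
    for (uint32_t i = 0; i < size; ++i) {
        float q = in[i] / *scale;
        out[i] = (int8_t)(q > 127.0f ? 127.0f : (q < -127.0f ? -127.0f : q));
    }
}

int main(void) {
    const uint32_t M = 1000, N = 8, K = 1000;  /* a typical "skinny" shape */
    float *A = calloc((size_t)M * K, sizeof(float));
    float *B = calloc((size_t)K * N, sizeof(float));
    float *C = calloc((size_t)M * N, sizeof(float));
    int8_t *Q = malloc((size_t)M * N);
    if (!A || !B || !C || !Q) return 1;
    for (size_t i = 0; i < (size_t)M * K; ++i) A[i] = 0.001f * (float)(i % 97);
    for (size_t i = 0; i < (size_t)K * N; ++i) B[i] = 0.002f * (float)(i % 89);

    /* C[MxN] = A[MxK] * B[KxN]: both sources row-major (order flag 1 assumed),
     * beta = 0 so C is overwritten rather than accumulated, 4 threads. */
    int ret = sgemm(1, 1, A, B, C, M, N, K, 0.0f, 4);

    /* Derive the input range from the data, then quantize the result. */
    float max_abs = 0.0f, scale;
    for (size_t i = 0; i < (size_t)M * N; ++i) {
        float a = C[i] < 0.0f ? -C[i] : C[i];
        if (a > max_abs) max_abs = a;
    }
    quantize_symmetric_sketch(C, Q, &scale, M * N, max_abs);
    printf("sgemm returned %d, C[0] = %f, Q[0] = %d, scale = %f\n",
           ret, C[0], Q[0], scale);

    free(A); free(B); free(C); free(Q);
    return ret;
}
```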
+
+## License
+
+Apache 2.0
+
+## References
+
+Eigen: https://eigen.tuxfamily.org
+
+OpenBLAS: https://github.com/xianyi/OpenBLAS
+
diff --git a/ReadMe_ZH.md b/ReadMe_ZH.md
new file mode 100644
index 0000000..b37621c
--- /dev/null
+++ b/ReadMe_ZH.md
@@ -0,0 +1,65 @@
+# EMLL - A High-Performance On-Device Machine Learning Compute Library
+
+EMLL (Edge ML Library) is designed to accelerate machine learning inference on end-side devices, providing a library of high-performance ML compute functions for on-device processors. EMLL supports data types such as fp32, fp16 and int8, and is deployed in the machine translation and speech recognition engines of Youdao hardware products such as the Dictionary Pen, Translator King and Super Dictionary, where it substantially reduces inference latency.
+
+## Features
+
+### High Performance
+
+The matrix-multiplication functions in EMLL are specially optimized for the flat ("skinny") matrices common in on-device AI, with dedicated tuning for the common ARM processors. For the cortex-A35/A53/A55 processors, the library applies assembly-level optimizations targeted at their pipeline characteristics.
+
+Below are benchmark results for single-precision matrix multiplication[1]:
+
+| ARMv8A Cortex-A35, 4 threads | ARMv8A Cortex-A53, 4 threads |
+| ---------------------------- | ---------------------------- |
+| ![result1](bench/test_sgemm_zh1.png) | ![result2](bench/test_sgemm_zh2.png) |
+
+[1] The general form of the matrix multiplication is C[MxN] = A[MxK] B[KxN]; the figures shown are the best performance between the all-row-major and all-column-major layouts.
+
+### Ease of Use
+
+The function interfaces of EMLL keep their parameters simple and direct: matrix multiplication drops the rarely used LD* parameters, and matrices and vectors are passed as a pointer plus integer dimensions. Building and running the library does not depend on any third-party compute library.
+
+### Extensibility
+
+For the matrix-multiplication and quantization functions, EMLL extracts their architecture-independent code into general macros, which greatly reduce the amount of code required when adding support for a new CPU architecture.
+
+## EMLL APIs
+
+EMLL provides C-based interfaces; see [Usage_ZH.md](doc/Usage_ZH.md) for details.
+
+| Function Type | Function Name | Function Parameters |
+| ------------- | ------------- | ------------------- |
+| Matrix multiplication | data_type + "gemm" | orders of source matrices, addresses of matrices, M, N, K, beta, number of threads |
+| Fully-connected layer (fp32) | "fc" | addresses of src/weight/bias/output, M, K, N, orders of source matrices, (number of threads) |
+| Quantization | "quantize_" + "symmetric"/"asymmetric" + input_type + output_type | input array, output array, (output zero point), scale, array size, input range |
+| Requantization | "requantize_" + "symmetric"/"asymmetric" + "_XtoY" | input array, output array, (output zero point), output scale, array size, input range |
+| Bias | "bias" + data_type | the matrix to be biased, scalar bias applied to all elements, vector bias along the major direction, vector bias along the minor direction, matrix dimensions |
+
+## Data Types Supported by Each Function
+
+| Target CPU | Matrix Multiplication | Bias | Quantization | Requantization |
+| -------------- | ------------------------ | ---------------- | --------------- | ------------- |
+| ARMv7a 32-bit | fp32, (u)int8 | fp32, int32 | fp32 -> (u)int16/(u)int8 | int32 -> (u)int16/(u)int8, int16 -> (u)int8 |
+| ARMv8a 64-bit | fp32, fp16, (u)int8 | fp32, int32 | fp32 -> (u)int16/(u)int8 | int32 -> (u)int16/(u)int8, int16 -> (u)int8 |
+
+EMLL runs on Linux and Android.
+
+EMLL can be compiled with GCC and Clang.
+
+## Future Plan
+
+Going forward, EMLL will add support for on-device GPUs and NPUs as demand arises, and will extend the range of supported operators (convolution, activation functions, etc.).
+
+## License
+
+Apache 2.0
+
+## References
+
+Eigen: https://eigen.tuxfamily.org/
+
+OpenBLAS: https://github.com/xianyi/OpenBLAS/
+
diff --git a/bench/test_sgemm_en1.png b/bench/test_sgemm_en1.png
new file mode 100644
index 0000000000000000000000000000000000000000..7373a4393520b0ea29e17d1482522d962f187fe7
GIT binary patch
literal 30142
[base85-encoded binary data of the SGEMM benchmark plot omitted]
zSiYOy5Cc`f!G_t|`pwl6U9QpXw$V2mJ3cPq0v|h1g6g4Ra9O;ok<)iqR6HMBSbpRX z?EbvBZ~4N0UuJc#CVW3A7*5LVBglN(VLtV|=rQsaavgs2L4}XU*SHD`L&>e{1te>y zI51l1@o9n*i4D3?wy)#blAEZ8Zdrx3At#%UZmXj2zI|)cmBxyaB9WD^tb?kPZ6pgE zkoPpz5axO3n^I`qX(x1WVP@-l_IC9a$#zPL^$U1-7kRGVuzsWRs+|F4KY{Mk8%&T1X$JCUhAzrwpFb3s7X{X`$PS%zhsauO$UXO<@ZKvh1`4kV4)J_guvBzpfKpyU?;!V@lr94$=Dg&`XphGNp59g?s*GPibPrvUoYE;1!c&diKsW0!(a4sB?ZAx6n6{uei zV_BFG1v!Y-f1e1}x@Oov$S^r-eILPmfVcRz%UmFC!o6gSyk(NxRW*OfmFhZlwVadg z4L$G(`j%#Lgk*ux9KVo<2N#F)w=ub-`f@6d42y&+xUJXRn4+tmy*sk(X*^K}^mu_x z@Q4BR1%-17RuvPXcFY@xnseM%i{P+S&}U`#V@sSO*QyL1h&BoZZcI(U5YJT_5cP}p z2Yf0*MhlNQ`E^4WycHQd6EgO`{Bd3B>*IqWosJMa=Pd=;^#DPSoc#c57FHoHH8=6wdGuEgE;7CL7N+8KPBrbZ z^%PHDwl?pI=S59?V6d()7wG)n!iwDaU>$E-o5|hFRPc_4v&h@qXvnRHZ%f1ZP6=&uHdnhPKj{chYNCJ2OL)4nQn<8c zGfdUm=mn-bR*o>KH|rw0{C;y2|<6~A_cnVWJe?T{#$UY5UF zRgus7+=a8;kq|~2pgJv`k*b8ca@V~5QB*0n-Lc&jN2{_YS^UJeNnw$9L&}H^tUPx zjt@9%kY2U6;qJ;2#}5bQ+q8NsUsIMgsx}l6aE0}`8mMYK4U%$6(e8b8GB%YXJ!xR= z?jFT59W<3Sf8(_cDQl`VbwWj9InV8e4zyb}gXlnD;uhazpV3?0PnMjegj1S=3nhft zq6bcRRgLuhC_ws%uxvQk5esrK^p{&Jp9A1pR7B8rbvII62si6j?v-lT@hg zmbY8CJj@P}5pg&-c%N#y<$@*JWsMWEM3%90t8r!&P9yYTH)OtCM567|>oEq@*CyS5 z$*aXW;JQ9!H%gGpu0QA#4A*_4MatT&qc*yip6XMVR9fdB z7Mv!yOEo#Kb}`7^Esgad_g3nlAdQ;o-p45f`pBWyXrDi;WF&xxN2YtX#b3s9`LeI) z&4}z$TK`9LWjOv4uHw+@(~9<6Q)%Uf)#JRBZ?21x>UB*&$bu2JrLI;{cOMp(#-TXU zfvv3#36j<;%7*W`3f)*dxC0%dOIp7mhtJvm%=LPtfsoN6(bc>R??ovW10007*VEiZ z>#?p&uO?r9N!=vv8y4?~sbQyNy^XIz!Ok*6dEawtxAWs;>bbr3B*Kty&3S=5-TPTJ zNiHQ0eO9ZO}*T3XK+*bl83;UV*9*EAd3Z7a5>fX81cY$Z8*(EY^(o8T>;L@Vi;GtK=a_j)Az15y zA8-8x`D&-S}R?! z7WA8t}>vxeNH4$$j%JpX(eG1hQG+X8nT2V@Xp8{)J zldPrW&KYW0>&G~bjFarKBs<}iusJx;>E_clzuavXvW~P0;fqh~#ha~rehFdtenzqj zKa1}bk^nk0#7Fl!9DXnA_o-!_tS{D(fbX_~sEEPC|` zE0u*6%7)KyYS~-e7rb0TvpE~!enZbadK>>}E#x(IXbimQ>HeH!PsZWa#Yq-Yn9~!@3%+>Zm#{6b zJx}pklZ%_Vw)(ki2o#s;?q}ouwp!WUpLqLL7Fw*+GsE^HpqX@aKX~Q2gjrsRI5ydr zm|AB`puX`_IW3Fl$LT!ijO(&;M_*5I939=k@JP9>IvJksc!a3Ad)k&q`)gx-lX`kW zXG0ikc-xPSman2n<&mWYkr*63V^HzFqt+?M7C4IyA<3CL*|&1a`<`tTSqbPZeRQ98 zf)QFO>vw;EOX#l|3#opS?6z-<6M6!r$k(y`{JEJRM2g(}kMIyEoq(V~wS&~&(@PI- zna@`9%Yv0w8IWq9;up;cK@TG?sAFAoD&^X+S+Tqo=?IVtuSvc8)=Vki8Sfd z{yrOE8DG}%YcX)JHo|Ze5m>*8Qra|c;mgsbR=S{>V1v>h;H#_P>V0eJuIpSHW|lou ztkOifjt;(1$dQ8g;+>EZH(&p~m&ffE4ZkdpCSE`F3WRf;1<&yYFNw)G!lL45yhhq? 
zSW^%0DdfxVln<3D_%O}5Wj^NFvJ4Ou3beUx^liLrS+v9ALjRj`CIwORwDbq6Z3IQ| z{ypEt*N3GId%XR@W>?ed3GlUcQ{wtN%XyrI(w0ZcItOYNKDgLG1;!^6N@w*|C4$Ty zOD**W1D(_0qIWib1yG1=QyhiNl{#vsMrL^}J{sL4lXI%|@z|fYaN{?tCvVe5l-v7z zS;3kay0Qd|Q5N)N>M<0XIIF`bBWDV})pr9!Uwkw;H3eYn3-tZhgW~la%(l{3dK_4c z^Q^koLMa(6j_4EEscJ1rtgU9)rFx7WvJV7`#AA27O}q4WAQ6ux*+_yf9q+m(w1r78}7JtbG9moIKlsFB@n2S|4XII{-uM1gT=l~{qr#w zcVF)wb3BNzby^*5Y!r#P2+>O=(B@DCGGtE-OD0P77d(g}?xlsYR@T-=85SFx zees*^X`R+51+DW(S!GT0q$hs-2!^+ej%@4%4j{TsyAyphV%tKf$H;d+=&RnNY~goa zJII}?c9iLguU&Ic$Mg2@Sdk+Tu1Q%)8CV)d6%8UEu|IY^Vw%LUmuJvPu4y~`0nbl{ z4}B^pQrItnyh-+BBfEd}ZAj`VzJZXzAV-QDq>hj09pbX$SSoA3Ap*!6jQ6h$JAFtI zk8z7lEUq-zQMWHGvHj3&58%pAVo8r!**ABjt}uM?^DWKMhja<_@@OU)bOy9 zVt9Ee=thPq?<>tRxK5D{u#ReDllrHthNBv*pT(D5R3?~{j(Q<)MEuy)lZC5}LJlKd z92Hj!ZIxvcGN}ICKV51V%k&k)iHtJbOibnZ{Ir@00}H-^p?G@P0{H2vXR(S?#DHMaPN0hA3USBh?rfp- z#_@1}wVoQ$rg6WmcqYbyVTE9|fx4~gABGSPPchQG7laC<&$2v5#zT(bp(#;YHIXlm zEtRiDbDK9ipY$VLZWK6LWSk66Ety%jAFyN|mJHp3Iux;%2C$AA3)<9XB%|6tYpf=9 zmP*S|T72Tzf7dmHH;XsNf*w|N>>C_~)`+v`+_iJ>Unwi)5iGRebILL7K1}XcYOi}z z{@K%_KB!e3f7(J-lKDy>r?g#3J$WDVs1hNxma230dCuUV4GMnn$%u56&0{!YZm}tj zQP#*R+a;HLS*vH&{s}4Je0Xs0*K%EJ;tWom+O(5YZ15aB&BkSWk$2E-x-1!kg+*P_MA)8BX=fU)}K$*$ z{!UYI)jY_gi=rUMZPs3w9KvxKfx{dj7Ew-Up=&~oBIjIu8z2e6tecdSWcpV`P{p)< zO3=piRJLV89P)ykR%Q;WP(Hx&xlm4@`xqTXp@s73BgL0dSex-1@~>%`U066K-Hu!I z;!!F88h-U%U2s_-`N$ka*?Y?Kl-VxUK~}ldl8!?C!4-{);p9^ct7R=AW!lFC*0t0& z14FlUpSd?4FnQ;X_r(qPM~AtqggHpH)4tt%8c(NR?N4HES>bMYxbn2g)yjQmH6bPc zT3_@CPwu;lv^ul;&>-aLtUqSedTqH#qZDbC!pEEP(YC`X%GraB){eRABpk?9 zafe3S-}CG``6e&%At$@t3GAjz?FNeHcua^{2^E z|0|FCHctYKa&BjsPV;Tv8NTQ~8OHN7U?{3YLgq^row7)iSW8q{+n#2-lcrB4dgkxd z4892}AY(B3G1U81j+DuG$Er582SOB$zy;i_7+Owj1j@v|O^M-Fp?t zhc|oYd=_OD8(r(m{L4Zro=g^0s|oF=pM!Ya&@Oz6%?^9U_QG4+p((nIJ;gjf{n%NC zTQ3O`XT;Ae-|T8PWw5eT<1y0`s8+8w3om^?*HWWrE=D_az`G_*X=z{gVM29=R*$kW z&4jIi=zba^NjJapXQ=ByvE_`QVFoctyShHO3l6o}znR-vGolmC(^c=l<9 zZ`~g?6GiG#$XfQNx1qGzJT*Qm2p#@jM2(s!@mVr#nHF)ch$)f^57gkkcZ*Bwd?2a( zlKLIxGNV7N@)Pn{^&Qh}Us7rbRBzKP!>mLW z3GXIuaHW~*6RP=Gax5`TSMw7_$`d(NR{+a(ccI)?3?e1^1AQmQaqF z*N{Ln(nCHQ;u>C$5#P49<`LT#(J6k;sWr{d$t~8Gc9E;xntXbpXpXJ{rSs)19lqPs zrii+lkZ|qC^i-wgL#jY>OBE=*HuzcuJyqG5%)F({0i(vw(;Rl5#Z`0Sojl5iV~H-A zD`h*%F^NUHEt0}u*N|Q=`HdjgyfB12_3(UU8lLr;A?7ykSb7Uhgw~E~Y?u8+EqG2+ zORD13z3BV`opH0Uw{laKlV8lFT6RR9bN=x#*=(#S53jrd#Upjw&T7aCcB1|9rLRWL zqa=rji=$7i<29x64JHM5)QQ~{SFCMh0H?9=PRn4;bt+_dIC<`WCPGvXQ*!bfm7dE_ z!uQmzuFhY0Sws*+M@!gMp&zwidz?FKlpq1L$WAk#&1a>``|F4TwzHi0S)ymI%sT$s z53VfSK2}7lYDyAM>`+mZyvgWyx9f;fpTVsZ)d53F59-&;Z*?XJ1I)a_m#>tctM`Ei zdkXab;B6=EzmTsQvb#i{!{Fs9r#bTiM@7k4i(}+!ZU&05;{70$nY4q$7T)o?;lqgK z$TA$*Vn-=9*^tz3^rBmG+C)p-mrxx><+#;?z1EO`s1HfnZ?Q%IaMAU1yU|g;JOk&p z=Rl=hnRSxsIfy8McHv#C@u;h~tL0JduP*vYq&wgPw(``W@qMB0-FppI<+>RR4V0Iz zf`b@uszFM+yJtEzUPxibMMhk6JvGT5n%KyNVP9E5+>NI_{qD_JYNq_mMuY-$M#B+^ zWZ=*L*T*>iU*zrGz~KQhq~RhsjIxXBnmX>{c$j+|fs?znHotE-2!qAJCiNCo|rd-93n^&W1PmNEeg znN-x&)V}i%bF8@c_x74ttk_cUs<8v9$0+(fZQa}3J3T#}sBvaByiG;*J~N(iEGw9C ze!TW|AC+ZeczE~=+8G>;Ya)Hmh~wF2xC4!wo0sh zzbbM`MB^B|*UnOS1GvP!d-t@ow92VcIvj3jZ%|@w8`VnV>-l~$_85<2f+fW)G&MDq z!xGNcs?J7JEqe)pbB@>l9OMl>pO8>_fAWjuw7Yo6+#E+KGGQ>QuG689{%n0B4vo$> zghXuRU}naT`S=vS7D5d5o4-W4EG?M87+<@&@uLEaoCMp@xVNx#Q)-hl00*GRWN2|Y z^s5ul^s_Un59mqq@77-zb%?#5(vb$XHWX~!*PM}SdhNQhKTxpwuo;pf88*%C6{=Eu z9#gkHYTX~&oH-H>v5Yo^7a$Oy5$82}*@}(v-h-)zVHjy&#(ch6)=h-*2dpDwx06w{ zs#Q_)7(G2kx>iw9(Y%uv=Mmq4hH155GX$%My^S3ShcSgQ9OE1?7#f^-a?N`~0FxAF zag>UwugppjNM#KB2BqQ@i(b4_E&Hv;dA(BlDg+W?>+YU++n~bJpK4xLti7OG-N8J~ zkB~__Ln_s@OfLpL2dwD>V&E8MkByHUu+l%U&wH}!$42p71nbyeFW0@^PG5R9?2y;4 
zoYDc@4jjf>8?PeDV0Sy*$xc+n85tRQ{OHjmaPq3VEWW1SAXl5FL0VS09fY7BUHxu8 z)E(6^wL8o98af?=R#DXwdxOALwDIIavE)EKwFI+yl?XPh& z`1bWF<-T%vzim2bt^iA(j00@#Ub7%p75e#7UZmLHpC z#V>@ohWJ|%)Gr|81R-V{y)MNyG&E3Wm{JLQ=EYh&N4H}~QN->TS@JU|;nDHDsM+1> zz{dBI(LkUX@hV?1m=Z^n=)2DC>?m_W$14dkIPIe~t;*E)r(5xAhMO?%r6-V6FCDn> zkcxsz`=yox;&xXEbopW2eSI2E=j>KhH)5>!!xvH8-E`S{$Mp5mENo)YyK$}#<{gB* zHsfZWpmS{86-{<1Sa|o@fILWsqRJcEg8>V6Y_ei2#nLg~I<+lqd}dvDmm=-(uz|9f zwaM@x5+1;+Hyg-M_CoI8fgPma=tq1r_+K&HfgeNPiXMhFv_kT6q;%TRB?Gb8Viala&dW zdvpVH81fA<>SPN2(;23gii-PQaQuC`O5A6mSOwZ>Zxn4`|)O* zE=U#e3#((oaesJiMl(x9hX_?uJJs{(zfygif~PvWveG%p^o$*hxh>04eZ0jke3@UD zIOu!RL7p8DhP)@~qO^GC^1+-Rx)0$19k+Js_IKXG!Qtf45uXWEr8p}svB#3~6ZIZP0wBU5dZIV9 zUjW!}4J20sHEli{NQcI#xhw;TCd*eY;CITPzyrHV*@lzNEG$T!GXyn8c0ct~UyQ7k zwe$Mq01}Q1dBaD59VEKeb@w}4`K$2ZmJ#u==H_OOi4czo^jufG{^2_hqPdwFu@KXC zAK-}bZ?VH(29;K)!IN^%AS~iR1e<`>j_H2PL<3A%wtF}|=H$d9UI3pB_~!79vkj~| z10J6Y-uL=UObsM?Am|&~Rt|TU;}-9mMS?4x7Zz!kJd_1{w^Lz;F3 z61Bn#D*e(w*VWaXe0%*W)?=5x03VWd4Z8~rOTf7?W*{@zi#(j+lfKl=3U>Z_2@USO zl25%>mX^%@7a;w>;echsiKgf>AVhQLIrn%&$G-S7rZYlYrc8O5 zV!cfkTvg5r*rusZS9`nvoQp?YOYvIm7J6wCO|IUQxQo`E zV$T`xoY8f)EtiUcK>CBQ4z~1g$iwr z8PU{dV|%_Y<^XNpz*>SQX^M{FrPckPMtiyl=>BPj8^saa-Cex(w7V_$9t{Am-q<-& zfT;Uk(r|g)woO?c_-s24i4RHFWU)RVzV5TfUU-Qlg%ot>SZ zKlMya=SMr&*1j+_+b2*7xo3T^k6>sf3Keoxy$7n&xMHW``S%`%LRAiiQ+8K~(%pA+ zN2BEjRD{qdP;NOE`^X^?*lZHLPr_<5QJvmW3);T!MUIusM5V20xb)nj zlBsy5v{jJJkNVFAv^Kh3Lxw=1K450HAE_`7^UQi|2eyX&E-vS@XV$PADwXf^YC6A5hv>)bG>0=oG9dxgZo`T-*u1QPU{)I z)$=4iW;-V*lPZ8%2Ak|#cbA7E1zYAAzwQX^e4vSX9VYuwYxyx6y|~I3sKeN=o;cqv ztfXy&nkMaLFF(kX876{!#RZYs-=Esi*!Z@%1=$wTywjPlRl%h+LMv-}({U@EE4aV& z56ffAbFVG1|^nJoKc2R0jx+R8f01uUK{U42B%Fr*_SV#A!Hlg^kx3A zJIk=}uPHKH+R?||K8}T4VK6NTPI_?B`aowTzOMKxuYW833gOvCTPz`3q4*`tcjHRg zvmA8E6mVMOIcyu(Zq$Gi@N#?EC1QW3=w)M4b&_B<)sjsrj+y`#*_Y}D4kK^kTc!#3 zaQ`wp4S;4!@IoYe7ch4j~nVmnujG-R7%{`2Mg{n}9H;{W0YCaKi$u6Z4=q%v?}i8?95+3OGByl{A{GdT}!s|z_!iSDegV?kt1G30mrN# zgM(2fqiH3^yUH>3om(O2Wn(m$=!+{rdUd3^0!x8jesNQip8X&7HLD&Bt=gr*@%$GQ8YHH&8 zr&j`q?5J*h#OmtmSOwJgp5~`dpUk+XVUJCqP<9l$@zbCcjFmxl{|wRSm8~!o(Md(H zqqlTDbE>ztg_At>t}^`s1npuGx?KISn7LSlXm$8|JDFp(-BtMwnJ^t?Kn@Lw=JtM% z-~)dZ9uqm!qAiYULq9tCN~YZO0P;poA8kH7-of&qm6(+f?W`X{CD{AvpkSMtl5&Lm zmR5uJJ=eR{pBYo|3YiQmho|Wu+o~ymYWnl}^XE62roSQD23OGM`ijAhd}%&_;bpj_ zS~yY8VGsl#9stnb#Ku<848Xv2_tuzjfg&{i>sRtq9|MV)HIZLp0f4}B%IX&8G6Qv> zq8*7e?BZ?phZY;K_Ccoh8_YE>nUefSqL?WRr0o1hI*;T=(2Vti_$ zt9^o5#UzM#`iD*U=nZ_!dcdVY=|%nY;y^UtOFAx^=@&e;YhBpNZEz)Xs%HXxU*8Ce zHKm}D1#+E8>H|Y*6EbP(=i@370FSu&=7ZMP-+Q+#^8iW+SLsBBl9t9K3Kw5HQCQn+wj77)~8>CleJEDo&S!wZE zRl|s!i7|le2S~{1A~QwKZT4x3BmQqFY9*Tht`rm@!*9okVyKs10Vzg*O~TFja7<-} z1nXp=vGU34Q4DyY={omlb}&42Z{I?k_J<4I5;|AHYJs}BHKQ1h#VbW-tYomSKfKvN zY@1$o=Qn=8#Ooha``cxZMh!*vo+s53-bFJFKfal#nI(%aiZdfN6qx;C+xd|T8d~ub zJ7|0ZFMtnqCupmbfd)`E!NK6F8ur5+ea_f$xcm^T@(h??0b0b00RT)v<$sNjphwN_ zza(d1s_;V#zj=%k`8HKT*01_wwXdS@{0+gv8`E_U8boD7xbIyTh;jm2RMU>J>$4^? 
z_A~4m3E1}iuijM@tLUSXXk8RDYeGK&D7y+H+mAb zy{@cwwma`^i+@e@EEuGAavFy_`;$iCx0TZ1>W22)>&VAPl0Yb(Q zD~>+kh!STHVnyeifU1$=Gw~sO=3|UFX#kB=m4rfD)CbOx!8y`bJ5L6zvXgl;_2VWJ zPMlD{74*eOuX2pUw-|^;8~7}|(;Ozf%2!YJQ%!rC;%fxVq>=8Vx$MD_rqk8{&YeyE zhJ*0s!r{U_-6-D>DD4& zG4Ue(i4o)+#y}uiR{lBRERaOZL)e_&f=8!o9l;)g(ZGy4Z86Bp_&X^3er&BErQ#W3x1onf@uXftMXcz+8o}ZL2?jJzofkWea}Z}x z6$4o-vH;Wv?;TDhe+Ik($gY^RL1GBrlQ5q>^e3 z@5XPsuIDb{f>kXrHn$(A8>BwFjt!%FdHf}y;ZwOUMSCuG`+9u+7PyVoQouuvmjFGS ztoNXpOQAfwjix&Z#^C=4IQ)Ik^x6gy@3ZjmIVOu?_w=FpI9)jeJ9`mUi`wBWRJZSV z%2S71@l!{m(YfD%41(FC*e|5)I>W6bHiQ)33yOI+v^hL?C+SEUfMcXSJO!8q>ygp- zrx}WJ?^NFe%M2Xfo=?hVt2iF-PRGb452=WKelH)RWY*A}ZCuQNCw8f{w#QSaz71PR zXf4M`;e#L;>tAjf8>LpDGlF@x7TNDT`YmB`dz|@NgIZFG426>gB=n?;bZ6JK~`}*`Y5khz-)j9Mk_G%bOcU1GBu^+yjt z=q_H~`Pq0OenrWY_)O;?589Y%56{of#}_)#1E{QX=@)1tXo`VjtFGwNjg1}?9^6Px ztg-P1&5+Q4*$iM{fK8O6$Xtn{wFRyakwz$TDfzcD9QA8dcY&P)+63U0leM=@y6oNE zt;{7OD4NGLaC@KGsd6e?h@7n839DfdQZ;4DPab5dJeM#3Fdwn?yj2N)+QHp-H{rY)9F1ka9(=|y>`)u zupP7raNCR6{Hj=nAp!4B8(Y=3cQIH8^O~?k2CJtTLDE#_4?=68PTIK5wTIt3N$uF& zECPB{ln|ozOZ5VMlLOw_{>OoS-|x}5Q#xKdgicIoHbCrwu*DY)-fl!7LYEjReHNY0Qa(4xR@eUW!tKvt$Ii@q*N)0V zZM0YrH%8|70neK{-K)@T@T|+Ongu%2m6qve=mi7r)jNCsY)(oezkcb**!*@0q7=Nr zzkM=v>L3<0Oq|1u%W zpzmq|UM;s@Sr5xDKJU*EZpq@Y^w~?>uF7ryi)Q!mr`1(!LC>891@OvJ;>74^@u5Uf zOOrnpC^0^+q$?xkWwZY|dq1{$fg<%@sa|o<0rBVZ^4!<=(m}%|++APduwTTtu1?VB z1CKS@In7Qc3(?X5F)DpqSM zq(|Kc+cL~N4f9iC|II^5^~nB99N=+WJWyT#lYxHJISy2bG94G(J-Z)yZPvwt5mW6}05WR3fgN$>+fuDqK`} zv?UmsohiwEtG$gHc6;(cN5`t{FE7*?jT&V%x zn;Kwe)crvc?O;7bIgJbwYk~^+ArkPR9_=@80gJJhR?ADFJAu7G^+td3NoYGf4wym( z&D3g($Lcwt=F!Ktk4#hm*^&QENr1wa0Vx@;_}=ZlxC~o5f(f(}`rrXW{HkG7Cjy*< z`)3WnPg9N<6L;JMSNl=Vv9-^8PBU9}V>%Q6(;WfLk#i3p?k$iBzFP`l45k9-T|wYj zKEWnS70@z&tm+&-EPPTHjcot#BfL}rBP_BYYRA^X{~YD#GX^_sy~cpMe})+_ z(0Hy|!HPjgFO<;j^`XfpOwq?6g@1-M5L3lvSFup#KV6Pv7!WngBr}D#^$CgLpiuqi zQ^-IM-wm}j>wvAo7{n&q*xufLaR|M&jSY@idpkRWx0v@IRsYU^z)&bG8|yH_mQL)5;k`cI{+>5I zOZ~PmVwfj5WoLiW8`EY_YKO9R{Ik5^RsSYl9ot?>C#L8|{NtzEvwQfbK^Tz(!{%?M zbo`M3bd^idZ$X_JXhF<^`W;6!MU}w~g+iTmyQ`l3okacE%*ue`7MRkJG=7p^{T)`Z zFU$Z`i)VJzUs7&pGkmLG>s_a~|MsE3M?QT$e7eH-e)Diz{s_bTKWh{zm)67?OA5op zu~@YKBLRW&q)liqv$^;D=mnJ(L?H;a|HHeIRF0TphrQfpcdJ*VrOB zI2f?jo1786hiRm#?gqSpi>BcB0KlANtEBwJ8{rQFsk}35{$<Sb-tPT(QoR8-C9)mMx6bm+L%ArvX<^bqUmU82EQ9;>RX!}} zZXE!d{sszo&!*@|Fd!C__MSS#G6b**o`=(uXdee0xU%dG&!SIc@cvo5W2>==suctX zNbJ9fwv~YK74e9JQ9r6N-3!}V4Kn~@0azg^Dan&JF%m7U)$y-I2bqS7K0pE_^^A5eI+*(&DkWgfZ~lfcz8ur9tokTLFiG#sM*jpO0BOzSoiG$(n8TUd z|Bph)ASPmb^^C9n>CI~g&D5xAe9ss4w_ykC|A_^`n99JD4j+e&)BNp>N}!5YA-m%ET(=VacCt(O zkiSKS-%sSW#4`3$ z8e9F#Mj3{fhQq(+@(vJ=SzJ5{!ntx6a2R9>ywmc2ab!A5cIWFHm=EIFARsuyU&Y*p-eJXTy{&{Z9uoGcI2Z+;^L(!5cC zpW?F^H<4w8d;T%zV2=CIo^_p=4(l0R0TuWYruz~GUtG+lV%D4qrA@{Ks1(Q|6L&LI zMnEe`2GsdI5SJXzJD~FYc1E7z;WNI88$mza!Salhga>p|HDBIz+gdQlmOw1b%+Pf% z`C=Yn;HZQ+6+P{%3ASj91Jm(A_b@-Dh~fzj8YH}1|`UTG*F~ zlShKICb8sew5C6%?qZnjKA@6dj!y03H(6V#c$faKt8HiKpQ za`TPpk`4;rMkXg!g%C)AYQ`e){+{pRU2H*5xA>F(`Jrow<(|cu^^(Y}FrQ8!=)aV_ z9PlB*Xm4lUTC9)R1kj--G71%Ka^OFr3TTZKs2;|NJ)Mg6$HG>b0RZrwOqxJ#0>zO5 zfc%_~EcCW7L3;rsYyBQaI_EORS%{3@8Ud~kc}%4qOt2F+)SG$zANwOpj17c`+-+@bxOCq! zJ;}04fcSGzx}r7#Y$@Z@FrCi1PB|b<0MMpbCM`+oLxe{bgkc~{#+c?QZaaF!0L>XV z@Qk-zeyCrul+)J|La8V)3xm~_zy810{6mnej7uk{cdsqJ2Y!URb7j z`k(`SqJ%c-CC+7(MpG+i+}2*nks{P@Y?u2TjxGinw%fvR^=RjMMh-1WAFRml%*7YvoYH^>Ck=%^>>a7u|+0^7f`v zCWvP{!>6QxIHpKzeK*hW_9r6k!JXtX$m(A%hef?L12%QKOlVyH1bGv;(#Tl2v(1T^Mzy16iW;MO& zOOfrKaxmq;#~-i3FtR^;L{INEYhs`Q#!$M4y-8OtU0vS3R3P-c+JM*?98aq7Tvqs# zuM}br7WDvRE_8Z;EGRpYfh~7PbCAI>!Yrgf0@f?q^2YdfdcUK+{<@)yPoS*z82?r5rv9#E? 
z7$Jv}B`P6=D7&FD*0EGd$yUhN#=h_CU~E~2qp@Z;mNprC_KfYlpBajCe(!(p=l#5& z_rGDDdFH({?mHqi1T8`B+>vxlY{bh5Zaz5kGt-1z(2o2D^M_Y%{P~u!PJlfm!KgrRsaVA zy0TD1;UC5&iS_?v>i26jy3AWV`3hx_HSNTVU4|?g{1e%L5cE-%WxIsG?HKbw4(9p2 zh8m(gVPgSV7!)Z4N{fq`%?+hsFs#QK<{P2VE$jWKG7ZRp#<#rRcO~Fmeu6J(Zd7N} z_?`C5ZvXqEy}8D^?_LA5@hRK5{ZRy{sC%s9FZ-GS%#zuCuTnJ@w6nH%%Q>w7^zI4y zmA>CI_4vc|^lwF`GG2bb#StHYdf+D|d7fr;zM6<;%te~c*%1Q?T)6NM*$qJ6h z(++%vk|g0iW`Lo7<-&)OvmOCv*YhK|Lzn#&I&dFeQXQpnut41rPw(~_XUj~v%9KLb z*&kXj$0E1nO8+wOa^N1}w_nFVSh4znPS zrtD}*IjsN1ZFCL|ti88jPm<1WYl*nglASGpKS_?DfE3zWXo7_jB*-ll)O3u-ZVuqj zJ=$ehMn)AbCaeZIpo+dLk2dj6@Y7xv$|2_W-J3J`3lwD|Hv~l2?(I`T&5>&*QS62d z6lNsFjk7j0>?9v5zYK{*@(lLOGh2K6wQS?U=ttu=q-9hiK*NidXvUUt#wskp6v*x- zz+aGD-6w}Q3BXU!0*!!QDL1~Bq>QSoQ|#_)6CV1*P{|i3Sorezk0h`16uO z;`JsP+i0A#F&e7^90sMmV!1TYeZIV}U~mugqn+e3#8}kUH}l6Nb?{hp@TeE=FIbpcH3*1vU3dV5(*i%JQL3Sk4+k~z>p%Tx z5WIzG78DQ8R`sol9t^Cysndy{bzFHiW2|0p9v}+orRHNE3_n^?T8CNNc>OpmC^B%%4}F zN<|$U`X=!5KRQ^EKX%-z85R&h&${R1e(hBLUVNw5jVfr9LGD`#wd?AH9?&D(W%7ES z-_(q^v_`hRj8ty)&yS96&9CqpK3oSxT_`Goa@BLt7qYC}Gvu((4)Swvxiv!hx4qk* zWse9G0ZpS{*t-ntXlOgPKma#@u{(ei zUn{r5WXz^$cjG@^|LquI&#iI(+!P-IB?)`(>;rQ3!a>oPJsU-Fqth5EodMw%ioe}6 z?iJ86sss)z;Mj#6!}63^H+R68kC|Nj{;3XhCj@wa%IUjO66im%jqMqPLW2XP3Yo7oCk@mSrGGGP_kAGU{RIIC@U-kz%mce1e=tD>N>?dCbA%P!WHE1 zhnQSVZjn*~&8|lQ+EVi$YrFw2{9zUbIQDw)+TEi%iGZS?kyyl(GE&{kZmVFd4VZtQ z$&uN;d3QJ@xqxycAeKPe2}(bAZ3ciH9<8mdsj`;X4IS*f7V30s42&KSc32jh0rBpf z|JI(Jat8v_fQC;FguBLf1);*p!O6enzyj=?)Kz!5$kfou3>++Od_**MK zF8z9P2S~`MBF|4idygSoiBnp(|G^;CrpnKdnQqr$IniK71+R$n0Fz<`F;c&hkwK_J z8%%cvhzR~Uj(N`I%b|eDI8*IE$f6b)6f|}PReBie8dFMcJoy{OvYRVgwVFNJRf~#S zDj%2|je*95(tH9PY@R%MKYq@VS8Pg-03qTdM-V#tH}34S2zBq*G7!24+wO2D%=AaqiAna?$;VaC77s;A;)( z9tKLJ<L zA4;)F4f0S&49Tt$aK-YVE~5M_Ghe6(qvt}nhr(ss54S&dw0V1b1KxJFLP(QD*d9dl zx9>gLF?taoI@5gEsx~V)IYgViLp?KV$;G2LwYC12-~8JYqudZRZvXAOpvt!k1X1WD zyIf`-Iq{=V+b9H=evm~#{Rfcz?uI{5L^+AI-R2SiGDubr*x*LRYfrXT^&ZU}|AiRf zuZ%aAMko=UvL#nvY(0=!EW9>Xy{_a@%?i0(fGw&?nJ?mFwSnq^-@o~sGLY7fXU`XF zc#Ye8NJ~3IBo;~!rb!YfWfvNH=NEk)vNBHU)Q|jc=i^Q7LUH_;6W5zn=2lH?HMEW)?M-#6Xh2G0^!)1 zPh+DzCkgbJglQFv-h&>KyZvxQ{$N7vkUewNDJ>jVAsyCFoXs0lvX z527fL0k#JGfV{$gx5|({gDPCO3q(P%4IhvjXs(X$}Bh z>#ul5si060Cz<6@qfG}86lCfYc)$o=>=1ueLco0#$6FzcP(C zY`%sSgY~Ndky2ppS4aq@O2DTC)- z7ID`&Vmi!DRntRHD~i*QSM>1qtA7ie2KV)T3-}^CPyGWQR()*61e%(6A|A1sb{=fo z{w^ZmDJ5P;C{Uy@W|wzmNgxP>kiL+hPn=G~cQueddpZ%K39TKf zAcPW%W|KTkTO&loNAfhmt506h96-eM)O6Rqi!P(AwlBSnSPdAF`nQLrDl}8x%V4IN zzjaWJsqP;9QNtAJ!%VSS!^GIbgR$T2?-&kDh)3`&}(xmBEmA z44l^r(b`fpQA2H;h1mC7n@d?m~ph^72 zeUT`jM}(4kNZWi)3E}Dmj;tDe1%f^~kZk}FgXK+-ARHCOc4Od2DBd_y&L7Gpa^}CQL*~X2WV12eY-EN zB`MMG%+;$<1)18`3n;ZLNc`QY&~G(7jsZC+7+f-cyb=SDQhL>R{rGOOJyW0t+-E7i zL`?W2o}Xc)W@)Ow@R*nh>9bc&(Cp?!;jLI0xL}_fp#DQ#BC|sq7p0{70Bx+O{l0r~ zeO25kKuMXo3F_6*7s&xGiE+AaPG5ijjhxpYmp+sEY|MNrAuEK%nRpYGx+_i4x#z>DK|BW^LAAovE6|<|B-&AfTtp*YOlTvHW3ZVQFe}v&#NE=n4b}b`&m2J|1Xfk1r=6`Pc>yTU%EuIde^F zrAEy@AI6#VP$5<2>lQB!Fgos|>11#`iR=i6X#*Oq1CO3rqKr__lyP8%@4TsV7 zk5#5sY21u&z$v3*x<7BG%M+u|6f(!+HJv2o)em)4+B7(+60!~^IQ5+RRK_NQGoYFd z&AtvnFMNg=ip4jmAn>V#C&9ufDw7ML_x_==3_)V!4$eTG+R%^N7VdqxvJ!h|W2j@{ z1^OnBG!J|n zjGHUEt!1-et>Wd!L(L=4;fl7yuqL4XT`1V6<=tbBD0RfOV>Mor-C?z-2~|Ce(+IMMWh#;mnG zt@x&St(@xgMsBQ$oxS9@cG9#`bhO!5Kpk^?Z#>W@vl(?xhNhuT4(*-Q8W-{g?&JB* zLZl^BxPvfg$dno4o=}g#>QbTK)Gbb3xPoyW=EY}pzL;oK?w6B?VX8a2>9PEirMYNH zmt_0=XpeU#0aVA*Hw%V2Jr2*r&16VN&MHO-JDdCRy~uP%Mi3U@EpzS!Dz7EE>=Kc# z!vn=LJvw|hXm6tesT)0rm(pD8>Mwt>ft$Epu?f_*v2RGyxImyqu5l|5#;BUNCVSi2 z6wYve@_mUGtX1-pD>t8&VMAcA_Suna^imD)q8%=jw(Tzt>nJe6Hl_+gr;12dCNJsq zjwKuy_k;~tJeXe{h((Kv`xCtwOPB(x!vv1T`qpb`ddoH0qMZuVBo!UF)fZDPW1X*Q 
zT2-}Tz-e9*_65#8zj;c9X{XY6uY{Et#RoA(O=+qb&c*o&AeX=n!5H_12Q+?uWuO0y z-Qvko-zhbB=lXeXnY8i&9lghI!0h$VN=qIKEHldqROQGjAHB%jg8)G{q`R7nxlY0l z+3;|tFFdr<(r@q6W$sF%<>-t?zaGnJoOtGY+*W96BeU`)y`8}YVhTw=2>x|7MI7!g zaW$MMU>voSBSp{uE-3N1%}TaB-|Tx_w7uP$-8bqknEL#;V(S3TLm)1Hj2`{~qZ!CM z!F8YD9w3IW&WrryJG{OkxS{!3r_&gJg%MUakkr@okQnh-j}BT)!A#u?$I&wlB@INnBuDdRQ0MM#g44q7g& zs<`ay7)c=@F+x8ch=iP0Q#T6&*9P=LCTcQ9OcPf1Rfd_q&7+J+<%;v(nN2Z6+94AM zU?ey9Do#|R2!4WFUrFe`O~`pCBxrzZ0pCXjw%Nn@?1Cl(`+I$Xlckw8ODCikousBN z5Z&He4`^lNnjLd|=7e;<8g3W7vQ@U2psz$QpX`-HqViG~J9*8BVKxtGm>6tc=Y0h& z#S_cN$6wDz!XJE?qHRB5DQmqS8OM51^Um!lmlj!;)&1QEB_eXg#^piELN&JNi4j8$ zPrF4&Bg8B67<`5oj5>+`%9ajb*SzAS(E+N^wT-9dhzdc)Y?0}7dDMz8cuoB#pHx5d zdIy!v^?Gvp$8Ro3m=`-vq#j?(8O^r7RWl}(siCQ3ycqY;f?A?g`!LpWegZO&wWYh= z_97jP6)|YSM`-~wp~^*PWzPp|&bpc7<%ysbKXeo3DeFvPqSM}1WKXrT1jJn8_L) z8-jVzhOAu($tpw?fuBwnX2_Rg(voi~H(kl;UsL;Zop}oTOw)&bBdv<1C|3WXedQ)n zmUz4Gd;_)kIA6|O@0B!gWHMNup7NpN1dF8Dvs0UNXwJpH)`vz8`<=q_iyiawbww+f z;Qm3!5s&VC-(mAb^sxj2Vd?oh>f z&7dGOq>iz?vS`U&=5xYe!qRbf3Bb-d$~7;k&}T_M}{=PY~&yU z8`bCY?R=B}_E`4~n=waNJ@V76f`PYex66GbY$PQ`E|+yapzC+!NtMVnWbZ9$Pu24` z)(Fdu)TV++M?Y$qwaLt%Bre*Pps0bOB_={`<|3Cr5Fc$X%A|O_%2rDNnrf=z?wC|= z**+O(qD8o!Y3$_OU0mN-iIHJcyhI;OCQ`f1Wo_g422WHvwW+2+9K1Jcf5^zQ{?#G5 zqXYWd@EP*LWZGA6=`i7957JuNMBFAQpprROiVF?%8qa(9SrzMh8=X?8XQ*Ce9N`S0 zF0pVf9`YTLiv-)o1Vj`wM`Xu=BDl|hQf{)vWr4&mW|gO^Z+Ff0^d0I^g*|YyRW_Ef z(bVIcYqb?HmtajiS={sb!XtIgcdc#Sq)Qd7a#pC{U|0oB{X4Zve8|%-emv~XE$n&% zOwPhc)~j1aV**FKu!bFj_c~lI5lFt3xSS7O!i-9+(Vc60vrb6cG5)n|Uys*)tGZJh zV-l>wE8ID{y(+3uFDakui&#N-m%2yM3q4tGf_?SZLTR@Xl3x4aD;Db zd+eAU-R4OcE!}lyeFJ9E;@EA;5yKYk21#XW3M#j6bpzJ6<1Of&C*t^!HPKjFuz;d zCx7jIE{X^e)+bL{TF#;0UUcz^N$NiN{i9=2GKv!KH90*NAl>a*}xfZo29z?=cVPOCpu#f3!k@^hiqjyLh%&UH;u#S z$k!qGCK>095KuJHnHNiP+{Y;;@%(r_y8rO2-XhDPGrr}fb%UE5HJEd*GSI2rS=82a zt{09Ljhbpc_tjX|nI4|H-`iEr&I%ncvyx^o_#1|8qH-=dI3772Oee`nEpGoHq#Aeq zYyxd7&lr^qj@T8arRlEGr0JKD&ZLysfG>|6anWGdininUmt_?YD|={k`c_nVy^F5C zw8)4bkg#dKUsah#??2V;v2H!pHI<#81@lO&DhYP#2~=BWlT3RkuH`pr5gv%(LUd>` z@(S`l80;17|Hxd4%liEgtS7%hyZBsGdMx9&WkN<7b*~Pg!!Y~pfaNIW<+Xu(&bW}e z?s9OjXir=$IEx7mzj!!`;C8G||E!$_;i1~n+>F7Cxk&4Tg%tZ>@kqyW0e9Bu4T>2B zM8zlSktf}3Y*OHubI-~YF@u(nL9EvUCN5F?7T>Uh&3B+pko2_lbEWP3aO8y4~fi`iV}q~r{h_Gaj0pCEKdq17G4Q2_dl74_Wtb1UYW3QC29JPF#6L< zW1=lxO)&Mg43#g!wc>Ww9!258Hl-ND1+kRa`%CZSnBAR=dLz|a=P^PiZmy2D1&O}g zdcGUSZ>dRmYBU#rS75D7JJ&2b^CguoFzVJ?jx?`bL}mppD>WTgi}<$w4&~;&C|&A- zwXwsTL}8ScBrL}HUVtcTHMvRy4@Ms;T9ZuGgwm|!AK&}abb}j8a>6-?Wm!?_vV_1y zowQ*_`OHQqrHTy!MsD3trmIFTc_liVrw?%2(wQWnD?M&?UPx@q@#T5)qs4UXST!e$ z<^IV!Z>Ng318uT#!S1!HRY8=H8WyN|)0AtS(d)yc((B$S#W7lRb-f%CL~4)G z&#wJT)$?R!C@eBGke5;Ws(w@Jnro+HDM*rAF zozpy>Q?@W&TEUYkBhBAFovhi`i?Q;O&=N91PM;PV?{|w*2#7-mC`Sw^w=hU(gi9n2 zX{+n~@Dhm=0*J+8}@Zt^n{y{p-!4D5F;k;(=PIeSxuDY$mgqrFD=|X&b6`D`6e+SGq{- zqU~4L)>)V+C1|BsOu`Rvrn^^Xc?3uF@T(X&F1)c5ldgpaEHEh2O?j6QoB-!#GImLTVt*G*Y2xs?LUxp?Y|ubT^Pw1 zt=UbphOgs%d2K$wh=~b$-cR*<{b>VY`|WyAgOY$*l^Y!Rlj&MdJ-w_fmoCt#6^?8& z7o3fw5(Xkgs1m~b|KN8n!8HDlo^Jm?yuyDX^1Yy2-fgD*9BIQ0I6-ks9+BxQ8~^I` Qb?7aX%Nj~Kf8KriU$0g)0ssI2 literal 0 HcmV?d00001 diff --git a/bench/test_sgemm_en2.png b/bench/test_sgemm_en2.png new file mode 100644 index 0000000000000000000000000000000000000000..a335514d88135a2a158de05c8a820c309eff1b18 GIT binary patch literal 33434 zcmeFZXH-*b)HNCqR8Rzwq99;Fl&(|(0qFwLdp{yI^xi>H6p*4+0TJmPq=t?lUFn_B zdq;XFcSYa(j_=p~b^qKkZpJv{n2_x3{p@G0HRoJ&B|uSLiinVk5P?7tNk0=;Mj-Gw z5C~kRi@5NL*4NWY_#cjgvXm$yujA$tJi#*+krP263PLZPyugR&1h&t#91sYKd)WVQ zq?PIaAP~On(&8ejuKH_ZM6P7J)%!k{64Fmep8t{0@W=c3a$3dBtmM_ZWYrHhL!_=` zstW$t-1Co6u_5WMjehz~Kada=ND{gibzm#~CL@)!U=inX-v!#hFXxH58gO${K0c-Q 
z!wXD56P^`#{(h|8#HA>1ti8s7a`4Y_dxy1&sfnp~Lj1(TXp;bm{qxI1Dx*Pbn zvi$GCWoi}}`n}VubgtfNW45~dv86Z>>7X{HQF{~_j6+Q1jyFJ9zKrP{B(km&L9Zj(0sWv|^8gPfe4 zYqziUQWD3T%%Eyd*J`|uN1YO5x+_{bItI3GJ~Dh8ytCPIH`H>XW-Jpgsa(6*?30T5 z34_yAyA8LcYU0z!Oze4&j;6mfDA5{)@pMO60*)KpfKn2%;cg~v-QqK}6!myL2!-@A8D zK9x@>L8kk%&3KiTg_Ho!{+ii`>iJj;-+UTOf7X4Z03A0{b)v@0>UqP?xX)fu!mxT) zboKtYht657P}T7x)}h1g1@u9hnTX)gVsfc}>~wAL4V}nw!8f^Ww|-@nmAS$VV>-y^ zK(Ow&GpeL5)^2wQ!18O$uVvl;>MF{5&gR);`&s`GpEw??ks{MhH4VMFpxxwwAZ--# zlbNAl;|+9Pd`P7`H;s_+$)BGJ!l$ZMxHKa1%~VSymENcB={?2Fd1XnaonPfs^(wh7 z28LX&ZS`vyWZw23Hjefy6jQXA95zkpSINm5UKy)I?ML2AW2Ys|^{3Q#6>0ac_SknH z^wZZ;3+@Z+dtPcYp})qnP@wYFW1TO(53jwS6*W4oI#SY`b-0)us=7E@zWrzFrlcqGa(KUEGs3c%8K%&7_k>r}X!6jqZ9=8n5e8YP9p}P`BG;M3iNc(q*6aP#(-L1^q$`d9a5%Gv|J(wiHmy-{&V`KYz*y*%`g zRa9qsVMfMcYP9~**oJp{$&8mk?CVL3`;*B2Yup|y1Gxg*zd4WhH#UNTTf!BD%q`@0 z3Zr%HrSTHlCL=P7_lpWCat6M2hIG6i$njeDRqW{^b!H0QVWag}Er?{+I9k17sdipV zeMEgXgV5*jmx1Bw4IE{X0z7|?#y6OQP|SA*llNC>93TJw^{dDvl?LNDw9R9x=bWNi z{Cy$tE4NwKEg}iNl*Gh3+MbowiQ0+u_LP}y?kcXi?=OC?YMnVg*e%8Fw=8K2H|eK( zy70vzgsL!@`26-B+UB^pf-76dD&ljEYf)dR$4oHqD9?K;UJZS>)L1I*i^Idi_vOEQ zk4mM;obUAsUMyF?4xz%zPj~LZl^q`4(Yc9S9h9PBK(2Mk56f1wt9qWBjj>|Yg_u?D z{%6{i&bieh?Lk&1dpp9I!_M`zLMwEsjn6T9Jd$>kfr_<_m{65zc7A@`_9qcf5BZ07 zNio)IIznt-$~*GAWgIfm;S?CcyJxw9(Ruyo@JujCZ1 zrp8Wptd$Jvukh56vyNIR^e*ml_a!`bT>QvN)Lu0HK|w%gr=jr!C7R1UuoP{PS1u{5 zi4HR+z<=rh0l-v|zI}+ls758kW?rosK(n3JQ(M8Xis8v1)$*ah7Bz5kw;}P^$QzVD71}6$!E6*xiz$fw2hIw?wCLq%Ea?KAJ}Wm zX7u*<)UNY8E~ZF_Tc+4iy86rZB%B=u-Qe7j?jFseYMTyz#Us38v5P7E~yy=*7(VXgIj~k+lS0q zDf?|bhtURYH%N{C4E)5H}IN9B^Zqa`}+*)J(M zqzpSpj)a_D=!Y4hPH10H3_c?1^8gDyFMt;;PXVjL<#YCo~ zdwSe2Zw+$WxJ-DV!m`?s30|-hd4$C8N`(~rey!Z?lXJa`7EgZ0++sD&D{wBibQo{J z`-x4>p$T*G#{pk2qQqj7qxp2nB9*Nf$@a88+I3f#wk|bQ)gjWZWq*lfhw?GSUHR>; zt*W~^?f8_slM%7s+=@bt@MKA!+BT5himlQM7Qp|-J{YfKZ4IAwNI|Gi1QIa~HA57YbmSvN@BC?~s< zrE*Dz8K@+zpDy>Nhm!l6m9#Tl+;5CbPU?T)qB8JJ3%m`L-7M!Hk(>rqdZgb?!c%XI z-|f;VT<`W#vW*6SPX#K3KG%ZTcy+naI!(p(CfB}H))W>M<#vf$`xCB=h76=p%ldv% z5p@0YO(ifaRW=%#VNCsp@By96sNH+NaadK4e1dL%d(*Rov!#Tb$)D&qznWZmRhjV; z-+p6lm~!h_0 zqS*T#%2TcFJCC_9TFzUT_RRgTNt>{)1ba!x1ab-r8axk}nB1|+nltQq9FN>f z+&wMA79yo-1~#AARil}TPa|zIp$bnn-&VK#`r=|_fAz(pL?5L0?ZnDw@{O1e4M&m) zMDMMkU538DP#>_Yr0$38wdTaNfFu05?cv^f)xv{OZ(VnlcD55$>fS#4lk>ehH?ESFxu+U1~JpWcEH%6YohO4;Pna z-I4IY=f`$QCElJH_hz3@NlD3^O87Xd{4y!e4SYp;qY_0NxLQB`xu&dda8T#=WWDF? zC#f%uF8tgEUl?%<6c^ZYu2Jd_oDhCMuQ55s{!HV)@UAjIG8!FrY5Z;r^FW-Of1kY? zk7liRHEPK~O)d3NsL)YHR3ax+#=e5yai6LI!#2RD$7?H%ANtix^Z&q}Pgh8weR$hd zfdy}rL+pznyI&xHQU&~WbtVCvT9u?L*(0S@^Vhv4jV{g!LlG8RTNz` z>+A1k{bt>amR#iJ3GT~`rdQeZbH>Bo*PtQ#PJ&X|dny637+(<#hI`W!k@YPI>;!L|>5bd|~(8 zK=+Q$=Qq((1O8|!BobV6ypXuea^NOMdM9|2;FI*OyuhDrq3pR$<$R)zi?*H=p^P{^5<;MGZTqUwNBsUBR zNSKqQLiK^5DEv}rHZP%ymr0&97g?j_eym#;$H8#_elySswuAhza>oWIOkk%()4EIl zRRhhNy_C^KokD0ve8bwW^K(x~K27>CSHzwwt6MQzX-gUl+)b2|MTcVYHOoXt#giWt zsyr_kw-ilTGaavTQ%RTaw^y;y?Cba{=utX{Q=%9rcstYBB3ts(6Iav&?wt5etCs>g ztTrwLIR>?71`vwGsLwQ&<^(JiHuElph-ti(Yu6lbwMTHbNTd#waO z0jq<+FJv}{a{?|x4w!Yp+{9LI-S}~P>am3DkF@j=L4M5&?ZmZv>jshy&|xQKD0gll7#u*44|O z2WsM;sw&cUJ;TrXUYf)^;i)1fC|Fo&8G3`WX}<0(qs?W^Nn_?N4&G{=@lG0KiS^tH z(G()Imvxb_*AQy6`5T$)=)+_==7Us5L{f+*Zt;1VTVpnAjmO#O_2~7hJHG<2mb}jK z(_SfW89I%s@LQ{ez(0O@1$nuo-k9{09q${G4Yp;kyK<}vP==bNWrNcj`zn-g5u8pF z5flb6_9o$GDYwRaS0|YYp>VlP*@k>6*VP`$|12d27sN!pfLCcI*nl>G3{F^rJpuZl{%US-er7P?w_I z`zY9{m6RHl?KR2d$FzbtgT-Z|6YaL0CJoP2;>^fO=!F{`<1YJoi21eT=v597 zp3||t@q52!UohSA$ZH|i_?L`m`AmX``oh8HOr!TGLyh{|;7ttSQ|i#;J4qjtGiFrf z7rf<5TD6)v>DE`weADz(7@=Z(B^N!(xq+(H)$GLWvTq>$w(DA3?d(yMac_->ckG?) 
zXn8K(vGTy$wS?$sO1lqjiN&j@1iHp1_;+PU)>rl;@56D8x0qAm*SZptUW6UwkwY=uaxuL zIQ6X_Z6pmO5$0gq*U%Mp8)W1C&2{fZX9I34Y=(yS=&@&`^0)X!?>~E~olA$(**o5- zrE6nnean`{zJ7KwmP(;9z(s}8jvCn^ zM9Kj?j+q`Y z$9t%&62|zY`r1lqO)7zgYjm5 zTlo!vC&hxJ9g05R-`#=cx*_OZjQ-}f*+_XUg*h5+DkngB?X0I`&F(IEN2O5u|oT?rJ;>f*ve-~nKOc^AIg+h z2l<`|rFlg9RxaP2`Km}{fiH+T*>3y6beWomC4y6STu1fL0X@A=E;sUK_iGL-_1cO;I+Y$yihnp?-<1sd{`Q;8)OcyO){)M7=I8GnTon^U+>!jF>z+Is z{}6lc-Owjne|{2IPJDG=Qd}rK99>b@LybAdd5n5Z2iy;gJLq2n9gHt^$RQ5bI)Ml? z9HRN$z!MU>^>1cI#%>vcp;$+1y4(#w!7dv6^?sM)M*cJD{Ax9E&tm!(F9)G)L!4Q0 z*?l_Es_6;lv^Y7HDpPZ;Dw04v?~8wFmD#ObQzgBqSe2FBjLPPc(6`p``zze%-t_}4 zt5~zd@Y33P3`FJkTV2hw=&d!mF7Ydh)#2_(eDlWp`*FCuCe$dED2!m4XIVfWI#l+O z(MYkmoYn6mBQ33^dcnJ_+6(}mFS|+yspnDDR66N7v~#QfnQ~sZG$%@fNEG;)7qc>$ zpU{s}{8k%bO)VkCvj3}e;%pDK-XD&9)R`bW(WZ~|KUp;ub_n%PO0UfS7JUjRZPk^X zO4wyxmzkR89VL`}Un;uw+oKcM;N6H`=jTC-=)JeXP|s74lb286tti^KWq#y0-PIIg9Tra(wQ@9;AU}WQ z=3}R2RqCaFWL$7%+ljE8P!+SR4;MSTf<>DtH#r#@7dc-9mu2Crjh6S7Q*OKgV(+gk z(UklA^1(O1lHfW`K)yp(@X0-p^#Va5xgT@Obbny)vWf_2ty%XSYR01{9iJclq?iPX zvRczW9vb8_{c4o=fuEmW=`{yl@@j!dGopp)sceeX_%^%a4DdJjt8hqr}2qdODmMRvB$W$_XSxngXrNr;M*RiwBrTpP2 zLrq^6pAF)l7EakQi>0@AS@v-a+tUR+vC^ddDz!@Z!;qwQHhy&mI{BD!+G-Hz9! z+Rn?fDn^`)ezY^b)4Fb~8Oe%wNTa-O`l*HfsiT)lv&8A(s@HwQ+vy%C+N*`XI`|g8 z08IA*2(~UFZ)HHE-tT-vz}fQSCAHlyV3F^LsMFGqYwP_e^$$Sa+m{kP>J~nm5za!+ z=$V_FCwOiJ2zaT$zE~U&(}Z<>N%O?BiF)k@2#bk4Z$nNoNBI~kTdB>Z^n`UZO5xh# z#WhZiM_!wXW=@0SReb8Q7H#nO!8085ZjODf{ez=<+ zx90Q`wkz_JMbsCW8Gafe&x6B-g#6m1{d3ylH-KL~{ezNXO?P0r3F-b9wu2a8S#}I=VRc;ODFag77H~qHq zwt31~L+vTL*6SEOQTpgiMpkxzeJs1aD^YBWPXF?BAakDofB5D>JoHwx<7>JK-MD=W zDU&p|A`IbShkT>H zT6+z-@2z5Sugg-8gmHrLT+QYy>LMW*PGpIUaJ$B(+Tu-A03o+ z3A0}Wu@w(cq4+WcBDZbLpL~}7+{e#RY?Rn!Fp*3ZpqIJpT zTPzKvh^N`hXjHz_7j%tRrPBr6Fww?LI_(=nz6r}0y2zmI@~tFO%@lt>n)y}3!Z?S~ zpUFCP1c?OpgBH|Kk;XL5#ZJ5h&tHtzTea3`LSIKHfh2%vc&*2#tD{p{vNTbCf%N^I z;;~ic2!Sjde}v04wp~SH`h^we@BiUQ@UWAF4`E;a|DXOpuEyJxq$I>AX1pXP-06+15V0Fg##x&~2kFAwP0sJ)nO z2mpXnx$xEF-Q^pG#3fWEmnM8Lr%*FA)YLL#BO|#jhrRRS23T()Qp<%be$u9@-ZgHe8!86oIKZ{R^QLp*H@9^S0Fia@k3^2X0>&Q z5Ky)B_4TLOJ2e%5XjZvy`d_`<6iT-dwB-t98yXkuUaAE~&}>n{keLNXL`SdKWR8|t z42rxt9}Kdusnd#zqPBL0c`t*NY8JAx0(v~!n4i}Pb*$wZUmoo&VP$v1bv%*Mp}@dE zDLOCJB|cE_y+oH_W#y8k8f<|(b*!VqN6P+SomZSnNl6hW5?`2lo6?lcP1k?2STz5;<}a7rn1{;w&*ZLG#;!?>_$vGPM7|gvtJt%^SnELkqZP=Bv0}l} z6`TlAnpIW?G8-gO)?-|KQ8xY8m9qM_2DQ7EyJWg$`kJ6C()pN}w>gv+ z4xws5`9Q=yu3{qeiq6DQlnbuaP}@tMrUaoa9<3rBZp63tNlR(8-D$CbK)m{16&YuLh0D(cZhrOy<;#N$c~5J{rSLJt@*WO+xr` zR^;8b@3wt7Yn;#a&GsVvPG_plTUZLW8<_C?E;p@F1Gc?AHjEn$5<4=A9y}6Xq zOe$OXYIEd9R%6=Z{U7Iu%VNY;vD&%4aaieW;(J=<2lkbgnZbFiQurGtit6fVQ=cEb z%wakr8_JA)r6ZxNNIIaHK$h)3TrM(4k&TN`#w)2ZS7bUem#BtukS1d^GdR0rNw+6G}x@?p1MF0h)u_fyy&)ZeRE$tyzIA~ELIPpHkRYxb$mvl``o!=^QA?)L2b znygJEGg_P3ylTX>DdxE zqWBxe%f@rm^e7D1vcA(B2OJI73gzx+3Y|fkES1MWF#2iw)()51C6DOj-P(LA_q3#` zPc>G+WgSR$x_mrxP0xaHgbP`oCBo-*OJ2)l`PiiT4uCS~Ix6L`E9Zq}CjdZ6L2dF$;)S@2OXMGp_eaKaf}ZGoANA zqKb`YAjc~Tk3U6pP3@fH>pEvq>;`#mT^1wAeySLdpawFd$~$!we|8V#R1QR3;7UzH zIpy*uMQ{1c5Nhm$aA-xkH|+lZt0tD zO3C}Q(>mW97JC(1v*i4wEh3oxD(=^Tha)u+yF8`|UZpBB&-PwB;Xip{CP4IIlWon9 z>+&PZbBGaALxMBOT!oT6-2qy{mi36qQ%MH^6;xRg#{CeqS;3#<;#R87raUGD+)7U2 zNG56y`(rL10s|IhHWNbG6{9zbD0IZcegaKddJ^>?3!kQk?9SdB-E-euxbnKeHNawh zdR&XC+#XuWxm%<6@;{cbVMCj!++W~5xX#lS$U>xN;DW>O-C z!4ZP3lE_-l(wKY2>bb1he2JCo2HQ7E4U_M^;!0#kI*J!}w2qlFvO4`Lzj;>e4(L!- zeBkoG#8&o$@gDv0kBYsGiH#?diE}Rp<;-!ZlTUJpeR;l49^OE7F6KB- zHna%|x&_5kD^V$}X!REt7pG}w3q@x_xuLAAT$`Ef5XN|>_i>ecE~(*?72579GF2ej z{ixRNwh+G%@gz3ZjXj~MSnc$US()X02_jx@jcj2mgK&lQAjgZjjD`kV!L>dU-;28W z#I@_+)x?e{S}B#?j{vmGz2ohPKxDqr^nIoa?P@Cb+S{dhlFc;( z_%1%4{tyI#5&{-Hy6N8WR(P 
z6lQdK0!|F18@t&_%#~of{14wlHnR8)qTKOL`nSJlaOc0k<@r@i!lnA3EmdQ%Wy(Cy zv)Qh#Vu}51gVVOAgQD@hfzCE}2fu!dzJ-+7MnB_incqdqLaXnEN?tssj*(jRo?sv& zd>hN0dzu@ps;IaroSX7oSy?&BY#HNl=KPWklUof5Pz*y&PfxRQDXS`%_`^5@W#zf2 zZFk@tkxx~*^{Mp?4D#*W+{)XKN_@l#%fnlZlm?pI6w}A7u1}+$5?;C#PqE1e@z~=k zkk}UiI4xT(D`imivZ4hyFRw;SbHx#!<=A;UFmWI=BK!LYt_kc<*S4){-)fIj(`iXx z>83mTbAgD}w8As3n`b^E)+F80u%fzL)a{bKbN%d$B-6qEWa0CqwZjg9jU9vZQvKo= z2RV-gHtoA5D7CoV%29sbTiP4f5g!Zv@A<#oP0>)XJUcl=I@dxw2?X!m(u}?+Wie;C z?K%pzeLBI!36JW`-4hIcC=SQ6%Q(&T&yW>i% zRuNqAXGTxa`3*<9J%56?oOVmBTY`aBcV*FGXwsxjU4qkaPaLT+@qSQJx9k!=w|83v zqug$E(OPp}M}0f4ebP~YMM2+Vp;QZ8y68IF*eJZ85mssAAz%5ZOh>k zBRnf18wpju!_Dal-r8DPhr5FY8{o?_9{hAyYBg{HFl*MB?Jri;&<>QpgV}He9-Hx# zCPs^aobH{8gC<@}ck_vw>LyPB4p3GPPPRMMyyqe{cKAnzR^vPp2D#iBiN-!%Af`QO zWcMzx9^-HSI==?|(H-t3mj*IJ?(@UCw1A3tC77v**r+IsjrBSCvg><0ABcA`pd z)|_npa_Hb|u2G+;bjh!eY}3%S^OL(vBjDU8pk|>%q!oFZuB2tvn_qJz)4neq&M^F} zyRbiD&yAbTy>4}Nb>d{67b(4xZ|2?ibs&4#5;$(959SO><9)K(TuQW|NN?;`P`Q~Mf5tME}yI()r;9b|kK#QC!r#N~npEk=eZP@hWFE_q`i(-&|0Z!-ZJ+jGE`S+sKk7fyL{U zfDtM^N?qI6^9?3L$1YYu|Au%2`+0-oz} zwYpyEvf+Go!uaz&G0J>3cd2vuxZiXQgc6mo^5gZ&m!&Igj%PMamJ2?_?d4Ie378X8 zRyKCl3voJeKPpSi$WPW?YE?vH?p60Cou{vjcMr)D!G#&yb$ zR9?!0UcsA*t+?8#xw$!aKk)W3UtN+&laF71{^{Wk4|g$p>FKa*TpFV@jW1QEc2&m% z;eqf1Dg!^m`IV6U0Z?j@=*wiSSm4lsa%u13va$d?pmpD_&qW5|uqCJj>PtHV` zB_n3AviJ9joOwz{L3Zt&1AcjQby}&>DIfd2M{&1qS$V`-$)yP=X0^3j2d^I?mv2U6 zp7?JMW$sS1QFu|n*)u#+U>S<)DmyQK?GJ6;JQl#owJew*czQCq3bR=0qS0>b2g!(6 zcrNd_UqRfPI^81cO_NipEdBny3;nQ=Z3~%NGTTvamL2%vLqCsN%jMJah6uzhC)-lI zPZ?HK>|J2Yu~2(?H}1597WncX5f%~p9F?}!;i3;f1F7Y}ln0bjOG|6?8P-QSh0U+w z4022l9xM*zqTw5M#9&$s#N8a-k#sC?CWF^$T}Y zG&D5y^t`4U2-2^B758Dh(|{HTQO-<^?~3mE`^LsnaeyiJ=_N0KP4bMpaBEoAw6x4$ z)8D;qbrjGP@o1%`Nbb<0yRO=_-kAS5RzhEL=d3-6eQ?ff$R(Lqht0 zf6cinOY+%iG%G(}L+kb15FPltKaC*)?w_Ty$gf}5pTKiO1EK)0CMzpzu?(!#uLi_f z>LxUsb0NWG12C6pW`(yDUoj-ODMW*+a9o=H_nGSjc=yX`Q2Z|I&;PxH!%#YF8;%+u zFUJ5|?u|6XQxRu&St^}iDvy(+8x2{ofGDS4g-1>Zno&=3|e z1LAc-gB$jw{I3~GBQrBI|1AM2Q31W*-$l%L`uFST$qc1sGP1K(7_d5K+rMKb-Hs~u z8V}KFqyIvWJq7G@yMKS0z8h7n_3xjBx1x&WV2wd!`uBn|ydW}QCmRFz_bw;rzcq#i zmAT4^fb}Qr&H8xIF;wo5e)hM;p;8ARt?kz@l+8q*{a~Kn1tKag``MP?zn^=ks^{zL z>FTbkJ^?rb_i#lOH#0Afjg@t**j&ZRDmM!Rr$JqZ_K&y4R=YK`Gc)--5599qYimA+6Mn_ zYvYHOR;Am6myY0UjcEzhUj}xHF16ADg@Vk9%vDugy!*q>>Tqw&0~4;FAUhwf7+_@n ztzmqGfX){9oFDL%rAvYHQ3()T{mxQvJ~V!tq_NstzJB!)^q2@%#NP{AK@tmBT=RY64 zY|>Pd47#33nV*|GR^gP*$uu&WAfrlkI~+cLOWLm0qY@;l^gdidT9B<&9{$l!zTkz z)aFccE){FwozcJHVI`T>97aUV(>E~Syx3LeaU`)I0EAlx8V8fm)Z(?GBEOON4|GIso7e zZZ784)#=&R;mb3+1A3AD8wKB3rI$37!6l-m770}FnruTgvxbRD8tv&6u}{WUnp~Wv z(c%@_CyNnD_nDbL{Eb#KS!Ld+9B|PCcSlZ704?aqNDIFHNL1|jQ^PATV{sS%QmyiR zl6vnVJ**+TohxdnGtv=;4qjiN{ZsCuZnWyGe|8%T!6IUpu(6E5m1)#8j7fn`6>wft z|Ck1E*9MHF4qK&R4d@va6(dcbx3=tw*ee$AfF=YrDB8Y297n7`JjMui=Q=)p9iw^G z(Q-D^Wh$IRuq_L|@ahv)cgPGa{w;1$$8y;3iO~#;QF~S=H{lRDAZcAd{^s zX~6C^H(zQBrdC@YuRaD_u?K+QjQ>KMN^+Dl1qiMSweqRAnhg#DEM#k38&cPNihmRyD;T0OSlBRwFdpZ&nra+3= z9F2T^Sgt_k4)NE&6NE9F&2$@mnZ3~YiC2PUd2uHS!6eqfQ7bhk=XVqK znzOFZ35}xHY~d`a&SP#X8B_b+Orx%^4Uj#n^t$)EtcoOogb{%i+pnIuljWD774}X$SDiabx@?;riXgWkL4Dc-Wi2hYgFi1Kiy%CUidyNV z>o@{9nu^VPzbMhdWI@0?Az>xMTI63MOOE)+Uurq5tf=@gAtB!vS{waYYMds&yONVy zMaww@7r#$$vvjonlwc8YT`Y@C0U^yg#t1xJI6g0DVLf|o+he)cuZsHh^eB@{hTJrN zX;lA~AnW`SaU8_RW&C%rZ#qLRN`=yvd}5g z1b#g3Ukh5;u$IDMbCbG`q{{394G4;)d>H+u)QdEFs){TH9olRgt>G}QlCN8?oM;SU z+0xPyv|@U3q%RO}3`d?cG0$AObm=`2waH6;??Yn&L$AGIQ~sgZxw+v0%DI~x#7*ZA zVZM7OfrO>P!cc^P*Hs^TW@coR!On2}tXXIj;MJL$4knAAJ~HfboGPfJ=mb*Lh7wMuw|kv0t^nD+_9F5#T*(BiNlI;VZV^ZKp&NYzlvacAPQ= zBPwGK8%#rPyG|oi?>~#2oCb2f8{kE z;#KJ~?W!m+tvldkB#yIcZ!z_f1(;;6uGMlvC~X97sTGX({d 
z(9dUfUsy#rkx1m|KLBHfu5j6)6`|Di2;9G16$&#GM6j1~kgJ{_1n~gg)&aB{kq!%z zHj1#+=tWaKSeK?2$vWHRrdRci! z_AscCm9+)wbuH2MWxy$cA;DXD-HWe6bTa>2=pYbq2~kLT4kFQgP6EJ>^X+k_kAAM( zDH|9FyKNb%_4f8s(-Hr~@kZgLlgLi~@@QsV#~o#$5w7u=Y5K6Y!S1|Fu+j1_I=er_}0DdeH6zfbW3c-X?>EQcs=pr?Rq`vZ;R2i~POn*U4#VO>7L(va-oD>N+K+ z3NJXENAH9qp89%#m5i%cRt~q&U}X0&APj~x{*~4{jaPwbJB=zcv`(gzqVs~O*0El9HR-Atdq(m#x0D<%=unUap>o7sRA5wUP zI~u^;&&|a()W62$5SEfF(iQ(43<{pDpLTN&vjhXso-q@=V3^d(((%;d$b zAv*eSQ2!4p0yQ@UM=bg>CIp=5pYYqI652H9g5R~rE?X`MaNsS9u$_Y>BVT$^7HI-l z3fNyIAv#bBAcu9FGZR~dMB@1SvF)n_8N65YA03-1y#slry92VNVk zO3BSd-;N_Dbo5~B+<7oFeCwxoYMuuj4um2Uy#Bo02C*lUWt#HSvIDjDVen* zL$5N0&l1jVJs0)G9uL;Kix92`bazjCr_rrLMtwtR`^0M+adW+YcP&D+7N2GM)+$7^h1|M3c^LldO#CTg1;9TfP6r()+|ziAh@ zWFBb22$u@st0EZdTr}f!ah$c3wx6<;)^SMM4AF^_hT>EJ1pq4L++eVfOLAs6LoRKJ z*K%m`$v|<-S8`T0HMPG4d}*|tGlqTuYwdZS)K_3TRWBdU&-YRx{c+;+u!Mw(u;+Jx zs)fA6&eSj=5K1V!83}1e%E|m1ei&JqjKPz>KW8WnQ7kRp3NYBg-X7KnJ3I5h89*~1SY9WuQ-D9zOu(VJ zYIlo_e^JaOK)@a+1mYmR&4uO741BulIEsu@W{bMX@Axcc3Oq<(0c@NcZUZmjQnoq* zscr;MSzB9MO^w@eQBjW5`{W=~B^{Zu3DH$TfE&1~#R|8enFghuU@r<>JznNoge&8;pmVn$XY6BBKx|C9DT)6$B0 zHy*+(!{LlSK)M@_&VkfG3GxtfR}7b{MIx=&zo~2J}ws+fu{v}s5JbJ*Ex**gNI?*_Z={R zqO28|b9R<_4d2o7;A0sLDnk0Pbl7B7^tT7$L*##C7e$$-LS+ORANt|({n#} z>n_ll9e}UkM;0&wD@~c^u%x6UNCGg+tZ3K@1twnkF|4F*;5~3*!?v3)GB8DX+%8*{ zvInGL+@H3nQt=QR&w<5UIN zf~qRrpC8yNTClr`B5V-)Nbil)HCdECe$NtxHUl)>esy+Y1CZMrilLjEn}X0`3d$gA zcYbU5ity3mT_T%`C5H7&@GZ&>|0l^s0owDsM41KPEdqQ2J(5S7-~X!vC?geg{e^HZ zTa=MKxHR1tZ&Me+#Y|661JQsK)8F)Vi{+LHnOFF{dMGFi7FEOIZ>=lWEYk51{tY)e zFs+jcKFIyEV|hMZ8iBov!}$pTa{8)>5BH$S@Wgpl6PEGRW*$Ocml@QCCoohI`#c~C zKs#K#@;K-?KVd9yaXd>cM!Xf&$NM>^IFH1@ruNof(^otqsh}bA%R10q2btg-F9Fm? zMn=YJcHj9&n#Ucdd`iV*y(pCHD5=LCWpXo`Kbk|qd}`)@@0mCwrR=NENBIeYf0}n~ z9!IK22>ZBbH)+Vo5fsB}KK|DSqi!MPzdYXT*N}E_6I1ovoja4;zE3ZDOFGj+ z*19}(oDm06A06eq#rrK>;!-|q`P944FEU||!Fz;8*=(pPUoODwR#H)M06KN=TKZSq zkdr-OVk^xoqv^gGrc}e$AFnZM5g@CtOkg&?D07%ev+%9t&_?S~EQQacL(52>v5`cU z1Rqv};+gXOi4BjH0Ab9YaJ&revrJ9-TV8R8BGHzS0`iz zcXy6ceoG^yi{;7gxu3OpV=j9!6VC#DIy`|?)87Xu-mdr;+T8vplxNz8iHDpS4s7Pu zIx@9Rj(P|mOAgjPj(pp^5rufFyIBdl!Lv1xL93bd7kwrc^McDWRMYo|^7p?w+cXH@ z2*^^(YyX7nyM_evFKJMa%5&`1)hYg7K^LL_y@NQ=nE{*ue}7C2$3PT_v%F6JKlpaq z-@|EN1D%@&^_N1!o6O`I!c?XHDxnQD1@x)?eSKYjPr(kbtrF$o-L6`sO*CC)^ zx1`TR83t@)zkq}o)M~MRDk>ES)&HYXlN8Xa_4W5NhJSf%Cu;oflZhm<%D$+XC=P=A z07%V@FI>6#8Jyt(#9huG`g9kaex%)yv8fXWQ&P-=pKDkG|W3mX^ zzW&kD|DII>=;bCR3^70q)k~56Sp?S@z*d;;xwEK5y%a0#bCx9W0i@8+(b0q18tnLC z#=oN4;{^u6odr0*9H#LgFC52jc$$LZ1MLEJt(T&$oKJB5fCQXC>CJMg_2A^AGi+T;ea|kI5>zkSOTm1!et)z)uUIA z*Zema^B11u(8JD-Q&Uk<(bJ1}q(S*6%WpP3VEW_4fBQp^6*4AG4JwS(owWI5rmB_B;Ebw#hqYj_tD zxc~csNGIc{6r$(l9n3mg{ZI~~Ms@{D=i99knvakKtpYu;-Jjx=%Ktr>s)mMz>rWzH zO*O2&dh4=i?T>cGO-- z*8m%-np!`YYPskB8OpK}o7lk;$l(9!UvG;k{6XNN+`X5#0y5#JU54W+uoBbDrpuLy z9xqJ#qX=n0mwgaA zZrk(RXuaW)hiA7TNbgdCKWZZDDXDb1raeqCQBqKhgX2zKP=@RY2t`padF&vW`oEY} zX?CqF^=ViNpzPPDEv-PnwzjunS4$I-%dc1O>tjp7vzt-UWIXG&m@`GT#Ns%|!;|mQ z@XIlLnYL7^q1KEk$`e0!v-%etW_6YfG) zh_=RidUv+@H26@KKe{y2xD4SvwZ9#O_t1T35q|8a zc3=^R4jVMU0oMQ`Eh(~1yEy#!Ckc|$f!8>PLEp2zut;+ct*k#V7-A5IizxOD3kx$2 zhAtj#x_{E$Z@^wVABlkx4wiWw2Tr)o1j8H-Hce7>XD=;5uR=}6j`n;QbRxfFu^i#i z33KWRh|8*O(BS9ca0GvgqfB?QW<3omcG@AN5SZKx>5<4vALrnQ|#$)=zzJMTx)Hb6lqr zh1AAX+~qqd%GW%$%5}C-_J5W4-eEcR|Nr>Kof{>QtkS?0qH&|7(ojNME^QSl8Y-oM zrqR}LwX~$YRN84Gq|%Z~TPn1ry}yq$_x=9f@89qE9mnU7-#_2u`=^G^>pWlQ^Ywf_ z)@zvAc6!$|-W8}x%+y|rP_Uzg(di94kG$F0_yEJB*w~vbV8*mz zH$GZfi_*kz57MjHR8PPuPz98yRMFyg#y9o4AB0OLgB3Y-sUbJ%Reuhs|`l zNlQ^^bza9*4rR47d0Ln6#FqfgdV^(=kTmhxhRD|9R`(^YLZdV!y@|we8!tGweEk18e2j)mQh{gbA8YDQ4-t^~5v%JDlkqhsGv3 
zJx}%3B>G0D1;fta^5o?opS0>3bcAou*Xf_+mSM#U6mS>{z|`M}I4ECMk@#KF-`Ox{bH#y4V-TC+ zNRZWxp12<|zj2`@DLcr^X`21EQ_Ii=>Z%MH67?*)+oj88VQ6@Pk?Nry>2+j;SN6B3 zse0}T^uAt{>2{>zM}17H@j`8oJC&#k6Q^9Ztf3Ya6`^E3lF`&ttPjm=mR=V86(+!A z(&5fqF>%EUjrZAYCL>k8Y8;RvkmCbJ-ng8~O)oGz-^~XDMXHxNBgh9*5|T=6NdO+sZThCY%9xdSFwM_8ehDXI5CvrXWo`9G=xk=7X zf=T-H@=Rm?MSM?!`|sY}&Bp%gr&auSP2@HSfU-k2(2axL^R?b~UQT01_rl5Dt<=ZH z-8r0?i`^m^N#xu`6CKYUydBi@%|Q!`UCYB~Li{CxMgVMC>5Vyr30#WZrzm6QTR~T8 z&au0^vU`KxT%f!1cVEtNS0&wkptf{TDS?XiJ1E>*)whb*YocPTq14G~g+Ve*!VEOj zQP^DLf68yvq~yf0c^4&PV_%`kHxk?)N!#90Q_^JP|2XK(G|||-Xs1)KoWeIPGD+iq zK12VdGXsW&A0Ol7PM>zA$l!)%pVijZwzkeW;;~w$_K9ejH|HVo&$F)w!x+)B>&eJl z+`oiz+TQ8lVWxJD9UdNXott0aGk_A}^0yjk7rH0^3qSkUs;u$yd0uRc7wrZu~Ob9p|cBP>Gp z$PZQEQL=x&p?-qy=`7fbq;P-#Eh92iB`4dT3F7nl$#b=eqTP$VMS;!M?ISDimb)js zIiPV*%=Y?^^gjZyfxfrHszCu|IqR?(2(A*t`e=M+OyaGNd5{MdE<|zRlnXkL^7(A` z){CQPLLWO^k!bQzYFnkURe{rkE?u0{5I5O%&0E&mZW$UHc;z5OD6 zs;#O7v{)a1t`(oTt0JwmTw_~8(bd&eRsHsGHHqI%{oYlARC!C@xw(S?^W6dTAGa7t zSY1Ix)eI<&NtK~B&}_VDS;5H+KCnEQwUAeCOQy>8}K;XPzAlR~!$DW`+d;o0{X{Cuiefl(w?W#YWd#w>(2Ou@Sq@;6X z^E1m}UhSbMXS3qTCafE5z=|P9;u7m0tlxr0sP_a$R{9K@1S3O3=!vBFty-ZTqpg&6 zbi~jMzJC20cbynQ9v_zKiQrn8Ks-uKxR5L#8uS5&bLYAivUxB&i_Xc?xgA=+Sw96XTq9y*^u;VWTHJEDK zYZG0TgO3T(guz3iIem=%7|C9lN&fpQ$%rXQPfsV3N=~mLNlJh=fkH{tZi?(rI%^=< zenSe^%p@6bfQbj%9yCEmZ+?>(jXj{(K#-YmnEUTXK2K(Rb3mw0ZRin$O7LalNhwJt}VVU$eX|Xu`Zs%gW99!@221nRotwz0OW^l)~vQ+sf{N&l%0tg3Fyr z9NiYVPMQBw#n&tpsQ+`a z83}>iw-W9(y=|dB{(4S)T~1R5>`G zd*>VKv8v(x#uqoQVWCKDs%W$rOi72x?kPR~(n+Z4ej^c^&VedyEsd04z48qKYin!3 zF3#sZv*u9``0qMS2;NM$WyC5ZB`3G$nX}{apK3H&9MU-AVcjll^C4-qM4Nv>hCW_v zqNWO(h11p?@EsTco~Rcl!FId%5!6n{eFbvuE7bhWhbJL;UBukri^mcF3`mB7{|>ymb*Ep|I(key#BfNG^$c zbby-C6V#=4jZ}9m?jm;!?wo-^AI$MgY#$pMA~F(C02^+A%vTNEy!Z5hGDx`o92zqK za81`=d?08=8o2K3%vc9;|4s#2X7nOrfR~$_FtkJzuuHq6CGYF)1xq0u!#vsFaHb3j za`nbDxB?H9T>~mwheSfoxK6M3$!pWLU;QY;9^8F;9(qzMWC({%5&==C4&Vbd{KTND zItij?&o|6-@~>qWNrtq<+Tx1+!8v+zs5M8f47X3C(HfcskvIulrvB7OhaB<$zZ!`U z;Tx#CJpJN(uAHZnB)-OlRxNqOapST#^V*tb#5O=eO5By3V)-*)JA$>Fcr})C%$4?e z)S>?VT12TRzkS>fX;jyQ#igaCYfh0gH)oJ9tjr-R3;jhc;3t}D>t%;YM^-dHXG?m!HsA*b2?WW+~g~~B8b1*F`xCGq)DlK zOC^>+W$65-VON4Iz!E}C;nOcfsHNgGBeNjR8>_0Ca~K!hL#ri`2>$7ep>iBp~4x?sBv$z}Hk|LB7Yk)-atk z%EUgCxZ<(cK}sF{S21>XX$(t=R3C^fiTx|7>`?QQ*$j%zpWi#@TLcT1-X@WY;Ae&s zk>c}`&{qUFUoHBS)^RK0@do5|X&b~*HZ~gsSC8lCU4Fy{r2LhmqnnQY&*| zMS87+Vd$^9!E}?e47?mAAfO>(N9AM%gNa9lt@`HaRqksElf6LVm>`rn3kwTy-yV;- z2NB=nqt|>yrQ_v87+(zs`eow5rCvs3qH_N^g|qeJGg=Ov{9GDksI;epOKD1qE)VCT zW7r=%xN!?jGcgXV;QD@d3^?lLBZMazoM!IR5?Q}W*k@Q;0?-NYYGgw1NLZ@GK+s;O zs~7k?f<)l zrRma00TWfod7!+%4)5o5ms4rHJroKN%H8wtw5D_Bdye6LogZkWMo9?@1@>NMF{Ooo zTNvrZO5w-5FD%$+j?bhRoIzvD@vkw)?*x|DIBp)pj=wH}8iQnjkV%Y7aLDparJP%y zGGA`OIK#5@F0XcHW#!y*xGX_LRMae4*i7jsQ1v>2O+ZIsQaDf-bw3$wb#n4hhhwf; z#~e7SlrLN491#m_6t?S#{(E1?iM?RR3(`Uw9G#8~2mklbL|xQ5zc=V<+Lb{ge!ofW2lM zX}<9Z33g6vHAWzgqGz}Nd2;kk_rSS1jtsJm<_@`%@d#a!KyrP?l`}O#7@<6VD+Lev zXX8>pUoZmGt@?wE5Bbc(OH>6P>F30u5nj>z|W48{^YdnYay0ArG-DuM*}^M;*vLGL|k|6K%L zKUvg?XZ1h-(cx@khAt&&yG~cpuluaB`xJ|FPITVv>+9d&W0m=I%XY>s5Vk$}?F^7` zM{Gv&aSOct(c z=^~Px$!9RmVZ6frXTq9iX=zF9=W9t%*8`txtaE%>1F?zX6@=|P{RP~SIpJ4ae+1*E zd-z}{l70qb2Wutitq�n$5@WO!WFDrd@nt*}`R0s|(_vCwhkEuS*hl?%aV`1Q~z@ z=5a&m4|hKmk~TrNg+7dtrUwxDZ!bd>iuMyrceTrxquwF0JL&rNS5?2E&f$U0;~+=Q z5cb=asbdYGnNa}AdY-Zj6@hvP0(buL&{KeaJwY5!EUZBB% zAPp-n`Wf`yu~C)Ge2ccjuX_`Z@8i!ouJ=2 zom&n^flwU#_XdQ#Q#_qzZnfw| zk+}t~+HR01Vb|R@v-wu8-M23+=ykInE9$?XS+z!LU~Z|$9HngLM~qEiAgJqL?PDtq zdf>5B%H%O6>hGni0^HE*(84D!&imqvNgeZ}B7HZkhdvK}3v(L1;P(^Ko-Q9P)YV0#P*3>5MeP-ysv{PT1@ zIA|SS9-aJkJDqr_tq7REO&Dfq@f0chEJVyK%6T~Y56HFKjlxKfc0q`bk55$80YMyD 
zSF35C$U_$uXC#JR1C|j0mbqZ{ps#5h1a|r|WREQ16(KOglBc<4wnD#Va~LnTbFYe; zS~DsJhIhcJfJ}(J6k9s8uBN8(7Mj@FB?ZBbSdRo4(EwA6s@1|;@jf=lxu?l<9J zilrGuKTLh}=ut=ryc*O`&a3_i;05z_RYf`gz*F*bi!0CVwslyPv(l17KPTmx2;Ta6 zi5`X(fg4a`_s1*|35uNs$ZUvE8f2+ zAk44&)RoSW*}u!o>yZqr&WouJzUV}bPj}Xq)KBt3lEM0qfN(`y67V|U5FsYEn>I6e zNa!)INP!nTX41sH^$7^ox zykQP42K$?hSbqLWpDM`|4yk!tEpI109aehtV8v6d?gT}yoeTX+!<|lDTy7~>%5?<0 z7VHx4Jt%LXl0Y*4&v?k9%VmFF^He|249?we50KN#uOF;C;*p=XBP}^OQ_@|SwJoxx z|AYQ)b6lBV#O$9W@(Dc|e^3hFUL;g~_+Z6G7n_}ZVeJga6ywVO$1E!HTgC~Oj~{LO zio|b-le5}$Xz4YkA!r1QG2HM^INlV}d*cqjz&~LudJ=cv6!)kCASTN56v7ZR_7R=^ z(E%%{A)yZB2Ks|ABd&MPG;=DG`0t=Q5x_(?j~9cGCXE5V*to{`_jEcFt_)J^?|>33 zNF(Mo_}4HJ_4~=!2OY901qF91`w7*418hV_d0%eEq=K=xhe+SX_mV=9egze)|6E!i zc3107a-rE!NPRsG;bUT=;Wx`+0^j=l-zv8lLf4?V12D#Ui5Ki^w%Q$l&Q#<9Hgqi& zSdxxGPqcN2?UgJyq{m94lS~P+B$CH{bw=m^iEl~;FVXo1xMVI*&rS-WVuZz`U5ks0 zYqii&VN#y>LFJUYE-T6p(lKZuXb(h`XSmm2f|L#ZVyD#l1|@OrbLT({yTM1iw#ekh z4Z<@`@njo}ijw?~1mw?6a;pn*zDOz}o00;4m9a0K>%)b&ysu*=G9RRs@OFY`KwcBz zOu{Tek~q@yok>c-to=AWk&~Y?r`|C`_Yf0hxUhBd(d0*`2cm#5e6GGn*(RQ_!(<^` zFZp)&WHt8kjP!zBq1j`|h`lY#41;?zEdpmU4?_?zfJaN@{}Tlip-=RPS57@hJn7Wb>gwt~A3>7l zh)c7e(!m8zY#c;V$7*Coe??BE<*)s6@E?16B41J)t?H-lVljOn9L#s@Oud-}l0j&B z9-6wy8APQrdS^t^K{J_O_;Wi^5DCDXg!%=1_wbqx8l{d7hL>sw zJr7G3oL{IO10IBt>j-mM5+jM{GHYJ0z0I$bO_(7Mf~RVF8IiB&En<3c+>M=+hQ!E(ev^pjSWQ}w4UaP8fMSJe(4F|IzG6KA9g)s&3NG4< z4wI0Fbze}->&}X)Exq|NiTnreZQrDAI=_~tow|+{9#e6;J=85-s@N;>nsYRxi7MAQ zKk;JuHQlK7L75S6`n78I;qn#Xa-CO_IfIKL!7r_hcI0|=x)1|l_<~7kb=)KD0mlMm zdi^8Hb$LVv2p)nQk0IsHjSJ=ppIo1HX-5eXvpI$P{~51d3}v2;+$9d>rL8QNtNOhlbQ~#ZY0%Y|C?UA$-4nC2slB!7$-rHm zLFa|@jE&96v73IL+n+#v>CTA{p>EmjvW(o}enrlT#a}%IeJmS=1N8~M>w<_DHsYv1 z1OKM86!vZf1{WR@((-AoM;E#o$2_vka7vn7bV{2CXU4nHpdI*1(@X~=c^`t12&IBP zdl5SspTV6|X5fqqyQ&Ji#liJ_JuGV@2j}uvJ_PwU{>r$;_Nycr9TQ3?Seq06B*V&` z!y@+MLy8r@Mn7cLT6|+6IUDW`;<%1F`;Wpp>xfL{lrvPI64{vQp8Sgz3j^7Rm4~0d z08F&=2455&nw zM^V<+{V43xww&$Dnl&Wp&<^Z6Lfwd7+F~>x{!;WKWFI-c2uA)+#Y1A#vL8+%WUBM>TVA&=>vkR4g|_0E4F81O6mu@u~s5i}pYKELsf0)s)OY zOF#tFh-922>lq)c3~B86fz{FJ=51gcw9)nuhH%My&-v~zH!l}7CH-OeMy1HPbiB3{ zU9#z)8D!~Mh=sA**hpVJ(XmKO89b+~syplUFePnIN**izLaYQ}N-~F|e#?$8%j(Z;@gOx3)u*#UMzCgOCM!NR@BJcb0hjzHcXqul%RE zXaMt9+~vFR3v}z;aaW!22{!-hL@7f#^k+k4894uO;!}?uRe$-_6-6!?efx1giY#N- ziD^DXE}&Bma5D>Hl*jDU$4;3$(ZXzlV z$?qZ>R0RcvYE*>)8N5&*et((i$D6I{SgUZ2V&{a?Uj>DjIJ}7nB=kMT2?XAYK;ZYX z5Hdw_zB%YL>E#I6a8=z7XrGw+d<8BtoaRstjuNU3jk7_J}_cJ)}vD zBgx6!{&0kcBM{{n5fO3L9u--~Y0;X~6Z<3%(^y10%3Fkn#HlVJ(4_z^KadFFEwdg& z8p_bVWY4>l1lYdA^|3l+X2}x2H2FSGgXc4jjk~7N07~BEM z^vJU{R#8JVoGVVF$s=IkM#fGNGp-d{8){%L^{=?-|56gIVFb<|bKvW6{?_S8b0Yad z)BV5j@&pNd4-_Qo2;P|o1DK-~xpv{6}zpqqt|O-)VB%*?Q}ntu4-Ht7`E1GTsxq%Wgklz91iCn9US zare%hI}yt^hKNhtzcD=abd9WU0D}ghYJ#C=m^TqEkQy@l;^OGx+a)C8#GhPp=UioI zM6k%nGoWSj+zAlB0J*J8le)`|b{~77nvZmxGr~z}SP=$SR!5*O9GDnP>xw-QVd(85**s_FDU6z#wr^78Uh zfeYpfLd8w>)v*sEDJkT)Xs-i`x|+|FWAO5#?*Z2u=3i8FLZJz?)c5Xp3Bf@@X06#t zps*UznIr2LzM`=yL-(PGfft})ZBIu;yYVC8YkMkl&=>P+IO?PEb1(__0d4u#+dIR3 zdXy%#4D_uxn`kSWrL;-t7f`JUErjZaV%}^9dJV(yr)WDUo={FAGIkqw%LXStn{Hxi zP-;M=ovF|a%VE7!KfE{#*dLs0ig{3CU>t{;Kx_0u`nRL|m0Q@N^IhbAdSCSA)h)S5 zSAO-lpx`v29ug9VHf5x~u{_>AJzSS2D&!`*cL|)U7 z#r}Llhx4Hk-J80Ut9;GQ>JYSP?%ukeC?OW7XX{5^(kkS5-fP%w2+ct2<`nyd84H&F zk1<}``CyF78Wt4Te-M6!nAS5yg2eRaE51gVps12&SKL4`#_(5<-!Pourq7|=?DCOg zXo=~jU26J3NWl;d(cJt&4m8nJY0VXHgoA!jd}^EE@+_G(OS?XF-$hM2qB+%}xV?L> zd7VeMpD}QK+#d*6X8HgG6M;dh+j^ba8IWW_#i`ol{MG^Yw=-`Pa%pcjBXNt4teVVzKCU*`E9y z?$0XT!VgYyCh24F%$qnxvEKDbFLPtCWn1UXOkcXt$5$)#eOK{48-8XTi?>_xsi6n` zjFQj3JX4<6c_&4u2>-`Qyz+GYdn~yNTYe-y-TA({IzqrC=t)X(8v*8}YE9#1W?(t? 
z4D;QTaP>!#Mjxg0XECpdbHW^=**^XZ(lX01b`Nzhi8c~joS@{Kb)0L698eyc^|=Nj z<<5P?5a5u#Y_`V2!d|M}grl{)rP`V7`1LN^Tx@xk%bga=ky9;Fy$n|Hy8Hpnl8>C4 zgT(@GCmO5>qW=y%5Mx9dycb8Vlu@HdgTQ`IVtrmw*SZQtDK0y9*c25i`ObH;W}DQ7 z+#5zVd-K#(4;e#Wcjs)%D4c5Txu$JPi}S~Gn< zM6yUXSYM2`mw7%PaFmzNVtKf&8QWm?VUzpyS~rni(-5+!cV!hbDbL38iqfX27 zs#|$z?XpG&+?)8jHNikXNx6SAznGi2ZJvY0fp13|dZo?`qAAHS&n87EM|m`@-5oHw zqbX<>35zDjjXxbox${=f&ixi$@Kr&}c}vaVzfwF~U0PWZLsBND%KK4F%%@eYZ!g>q~X9Y+tYTCNkjawc`O>nZ06IsB{_RzR#N_Q+K5l zSk1H}3ISWj8{M<=NJ|tsGOCk=5DFv3q4K#Xg0lcUOBf=9Vhl2 z+oxAulEWara8Alxsxq=T9n-6NV`5ZN_xDIAvqnROEkTfPCVMAdY5RlW#FvM(8dYiq zr)-P!;y*it+JCMW&itm$B)(2(Y4Gp_4<#&|c_;pV>$U?r%3W-lnJ#i!EH6T{nCIj# znJ-M2E*Gmf%JI~_^JDf<&!l}X3mYQs+sa5IE-i+=jrU-G0ifT)lG^y`C*ziD+ z?l!y%nnOgKXRQT)fJ?)#K;C-o(Nr#*$bEb!oknqIY&x#}=d+F@J?s-RJA`Nyac!$S zw{2$KjIU1ff}m=sD@dFj*S%AbVlB4jloh(Bk~Ik9gN>JQ7k|#+ zLr@L7HOe5v6-9Or6brkc_;5W~xRyoal9t@tsy1IC&4vekI1-_LL}yrgS;~mJsc;!P zRr++(C)&t(u$a1KLH7HZCmHT7-`U_$^(nwu5ektH3H?2uBU5#>TKQL1%C;ux{$;9= z=DgaGma2Kko;Fm|ur)T-q3F`9dr*q(;o(u3*D=`ez553L{NYE5*NsblfxmRJ47$hu>Y;=*Wg2s!9PZvwP$2NE5#lKC=bzh41 zd4oj4=Y?Z5La#lRSH0BDjvrIg)iw7{a4z1fm(lr0nw7`M9dm_aZP?N1K_Y*%VHx*is~gZk;k=c7pYsSopN! zIAL?Ioy&9noR6FXzc+)};GFxml<(Ulncgx;`c3|*y;g`cUTH)TsapKZ+mbpQWu1GY zI!f#pC{4{G&Esfp51It_?=Dn)cj#GM@F~4Hu63Dp#2_==L*zu-T{1mb>DKBP?pF56 zxSYqxIM6Du^wGQRLV@8~e1O zB5XGN*)@GcOe1s1S{6&DTLNR(VNrAi>>UinnBll^ohSVb0 z_)YXA@zh=O4F-l2c^DV$`hJ)(q-MDcDjAxkD&#^U3h=)*uSRVD?R)mLRP7bYXl`rPqU(;?a+6=VWkCUiMOdu2S;@1E%W>bptmfwXCkL)v=TomvV* zVhXK#Icbw$jUz;`-}SLS;21m{z(e7STYo|I=M4cx>m#1rrbREcqf11yUvC>RC#EV* zxhk4vCUW5p2in)#h9mnqzvf=|^Yy?PO6zNuQw8j>Jz0r!t!kF0o8C=Ogh^F%J8-Vj zvG*<>-ym9Oj#;IRX`|*;l!dJfL^4a|JG`6grWAPR558yfe2m(myEvexYArpOI*Ax0 zDHHenXIiO+M|1ef>`O*+OqQa>6!J$PUu5k@P}Zm@;~A4~8nlarlbyzT6GoXlHtJWL zrW5OQ>;KARkjxs)XyIuH-469xGk==@76GwA;!qGv?p(LWX5C8@YM*kes;jNLq_gDG z>D}iNo^sNqayHAD9OIu@etdQ)Hw4v1QmT+y?D+KWotImTYh**7J^Q_5qw?RKVxNE~ z8o`^FE7x!@&o0_pDi$-Vnyy2mjm%U>0>d!~5)bbLR|pec)!D84 zQQIG(ddG47D{M#ox1PBq-5-OTA$51Opvk=I^t6e9_f?5PyWvXHhg0>f-2>W%a(hvh z3*Gr!pjRn_)t6ALK%SPHw<*x2VMsQ3Q?uP3ueS*mcV6F8OYv=5;CPV97S?fOQf*E?)d>+P#$hb?8GagX7OXShiGtH2EWXo@<>!6tN|sL$=Hl%-~Ev z*PDAMX6Ept9i~U=wsF^qsKm5KRGjP7JW$SR6rm)Wb#zyt!@Aa@-yp>7gu}^fiK-`5!_RQ^0 z14#+QY1qTM$nD6SvTD7FDcL9#G8Kq~-toshna6p7iBH}5-9`j?!F=gDOa@LlPcb=( zjeZJ8{^A{jLtU0tJgV!Iv&~B)#LIAt zAz5bv2R|6GJG7bRwWys^4v%k@#tNUpN8= z*E5=&lJ@bA)#->d0r7QF&&to`$~%S1&^vr1Z=1jSex?0YyMG6&S4*KnTi*Po}ESh#AE=HXJ5z(tn%HCPV9N7i#L;L zXDL$6`R&j-+!P!=*BPpl+S22bU9&%~Wv5RE3gdd@8qSQ`myUUx8`h>M6;9h$BLFT@ zRMN)ACSUsuPNZHeZ51yN(W6%hLWqj)JhOMbnOvdbF#DvR@88F4hstk(6LU~w>O4+! zuEAI#ik$xWn$r`sh#5+o(j$(fF}||UA2C;3s-s(|nKPa}KM!@&=tFu1-0bXzx+wMj*aIbbS? 
z-~LIDVQfd;ufQ}TNsUpDeNB=3ZezTvIWhjhX*SqteZG@fAA{t9L1oF7I}QOsK?A0D zN@=~nIzH{w%I7b9WYRb&(8$#A9_=PJoW&n!#4V3{JjkD*Dy~c!*`@P^rSnKEOpbq{ znzF6rong0$KL#vQdv6+1ogWHD_n{_a$SgFON)5T{E0Q`TV`SrcC6D2xmZhuM*1z*U z882E`j1lk)4(AZn*ub|<`QX{HoDZjc&OMKKPUCkmzn4`e9zxaq``tvZZ{m)=;P=%e z#CuoNC6UYPIbOj1B0r?_fH=`qrQ26JfcCsSPu8QoOxO2wuOhPwTL3a5xgY(%JYIEg z5CI#xABEr#K;QBIBmTJ(39Go7xNuF~LYrlAk*ED4H4N<~#8+gb6;Gu}Ub^?c06<1S ADgXcg literal 0 HcmV?d00001 diff --git a/bench/test_sgemm_zh1.png b/bench/test_sgemm_zh1.png new file mode 100644 index 0000000000000000000000000000000000000000..fa83e7bf14acf322982a54b23808420c271a72ff GIT binary patch literal 40646 zcmd43XEa=2_%3Xz0pAqawmAfg4)Ylv=?(GtB4(L0G=L!w2G&M@kXHcBG8=w{R* zh~C5K<=KA!=Uwal__dr9Ke#Z(5J_e|;;+z9{97KNuXh)F&8mSE{-+W_ zG{;U-Qse(gd99y(5U+h}3Tr}Atnt0Zi>N7H?pZNKYvQ>T~5Jc>M*0r z!|T@uiNVLmcaQ(S`{TZ`I_HA(wZWXwg-`i*7&ye*+FI0me|@AVZkVIqyeHB6bbn*g z(!#>R(o)1X&T_iJb9LIOhttpE1{kWVmWGCtg^J2&4LV_$aw7?oYS`rdpghAfl@I?0 zId#kbq`ObAj8&Lk&D)#}WdMJ>o5F9)RrmKR)Za)?ug)G<|10#~Jz9Yhr6ewunJ0t) zxQrS+wx%0t1?}x7t6_8n;>>Q~ z^78yo=NV>v4{$Xze%^!GvJW3VTv=J^D}VA(-0w7zQ(x@xPhe|zR>jM5@XT`LV~(E@ zur~$=2fH5Sqy$8yRm-+!+BP=sw9}>}b2pz>uEaAD5PV(!^D|I}Tj45X-l9JPvHIsT zoyhs-);X7)vJOSb#bL~a8p&Hyg&KR@93r9Vc(r(S+Gn>{$H+4eA1>)Ho27zw+hBeB z?a!VfeecoHQf9O5D3`4KF~`ljkEma~`^fv;kC2cM?Y?#1V|+eTtW%adI`W{}dN}`J z#=ki?_wTT(INfWH@?k-Gc4~)4eB$a_rq93dfHyGGq=*O;C6ccP)81PL%@>U}`x8y< z^z5|Iykqpq{)UL-Tr0!*_D$F(^cNci=S4?Fb~aP5bg?$VzHWiCKs+ptvK{{3vn5GQ@B7!!}=kZ1CRAFo-;;Sx+FulWZuUrP{y)`e-UJ+9iJOd|Meg& zheVH;a~zt_s@Vr-wM%pd>ztRi3s-stXS7ZSz@&xJdv&f*uqp43TV~>nq$gXHBqQ19 z^e_DLf|)tTVWw&3WcmR4)ij9`v%b5#8yOjCdb7V2zc#woAN@$^hwfBwn4u8o?R(oH4q#@x~yMb%!z%7Pm+&HkHHb+ee4gjy*N&x8nH zsFZ2mr4t@$@;w2M=he}Rd*Y|%HW(GwH>@gY9(at7$isX3oDs{5zmwiZ;N|_%6kj;g^Q5c41U!IrpfXMl zw#r_q`I1+-Y)#`r+Rd>gdivFN>Qba_q6br&Ych5xa+6Hwp2TLiP997jRVcr3Ssu8a z(K6JBUo*ad-lp(CyXHtdJ^fU?ee{fz)3EkK7KH_D`6h$Uiwgdzj~Z7z)+;)Y%xIRI zRtB4cC7bf^a`V4_C7Tus$yKKl6c;zK8P#3cjL4o$w=evLf#>E?xE~g;m|iN6l*7&=>A06$TFs^svNw7 zOhhcEac@W|j*PQmf5LWke-k&!{ymb4JL9E!_c4CW_J&;kwj8>Dq(~FO9*_Af=aL}0Ha=72 z@2TZ_U3?X>nnboqL31!vX~GEUiT~_qDd8%DX3L4F9PW|U3{yuSiI+&1w3SA)q(iN? 
zM2GKKnwjP1<&ED#Yh|8qx;N9FKR_gBUYsr|{#{sT`^JC{B!%fU@crx;J9+MtF4M4` znVMQ0oh=69fhJ$!)0T&7nn&7E1G9Ot$&XwD-vQOYeQVUjGIXD#*D`nNg$nfqie5_ZKEn zQNuP}^?DQQwFh5Jq%ipOs8GLB!bh(@yo(w#@jsZ45qHyx%rwRb_9kQ-AmoC%BDT|; z`1urq+2&jy!Q^4R)aLED!E9Xr;H~UtTEEVr8C>~$X-CHj^rwban59hr zO{4FFt>%Z@v!wUhH4`QK&Z(11Ty-*QZ0$4A%5lEO zD+P$c5wCK^Z|LC_)x(CBlpl&2V0qTNuBE+w!icV}2+SgLZK#PEX%*>E(}Bx2g22Dk ze9^RfOO_-(@%*quTsT>CDkm$h?0s|ezCpy@6cM-eflq#;rG~;@@EB2x54*jB#b2|e zn{5Pg z;|#t=@2(tgIQXySW)HAdVXXKrwi*vI$Y2x59pfKbEXU0k2N!TSJi&r+tnfQLM$rQ~ z&I=F&o-0X7yk^^oxR9TriP^UI^mw!xJA*%;I>WGK`Z%>w=)Cf!O}xvU!oi{?bXU)Y zV$dohwja_J*(3A1 z&?eyG#Q(zKOdX+$fD}0N`q&8YWllsg)9-0jU>UV;>P8nen~37wVd#-#I0XeBTzO~_ zdsf$r$S(@WZFwg;pNG$>G_zK|#2#;i2)tHeOaIwfpWUV-G3kf3aOj!8oqv_AAms+j zL6lgt?-*a;jiXWSk83?P30)VAB;S1NE8^jrUPHbgy}J>CGKBiA78R5f98A>RSQ$%6 zOtjFeSTfQI+{?&bPn43AlUt|y(pG!$(t7BLvg8@F5oeXqp1=A`{-POE#oBSj7V{CQ z->flzIEzx^7z&oYI?~bjs(^>zuhrJVXhN8DYuAcpj6#u(Aw5&%HP_^KRH=S?h$X;3 zu6YZUd3VbY6TxG#S54Q;WE#>8jsp z?u5xog%9S)qp1)uKcqo~zKLR?T>utUH!G6uPYLs@BfO`_?st5 z>Mn@hWq?&m++ks1k>M_k8>4tBF3p@#+F-{g(yqtMq5R@BbklP%26|CQ_l3EyjwC`m zw0d04XJP5XI#&vYvqdL~U5=7=EV&jt{+ZEdZa>$%H6@KCqI%YbVzfFeDYH{;t#@K& zTtmcno;0W4TFDY6?Cz}J;onjQeMZv$`-6#X&D*f|W#q*7kVwCEtoFLRqsfQBQfM0&s=wQmlgRK?N^XAo4#9{CPsdBA6|#)-N=Yb&AevoV26$;V_)wd}7H zWcnQ~Bqd^*Y8QHmh^RuqA>&)_;F~K7MJd% zQ0H29AaBrEF+Qts`$GPjV-B|VbM*Ap2aXG4uq!M~cM-(y4wYSU2GOod_69H!S5Ad6 z58B?CF%F^GW!X@@$kH6c^efrwdo3az10yTsuPpl>IYi5#P{K%u>Gh*P{l7 zQ2B|=<(egZFqLzc{ReEC%ur5jO2`_YL(E<+SN$>*8zj?Zalm-gnz9eqEFMiVWRkNMBHkGpBqQ2aR&M&`Mb(~2VoL5Lw4NiQfd_BV89*(p(J$3D!1VA@ORK*$r zNpN1jwqlS>!b%|;l=RjCBS*^NnTf1C#KVRvejH>k921#UK22>?q5+r;7!GmBV^1{C-^dd zL@%hNVuX(-tdv#Kg&}xvOjWuOzNo$)ii?W6(^FsC@K4c|!of;_h?qFxGnVLe+SROB zT?MV5m6~XFVRk3&FE2sOti8hvYM$z=)_4To}%Zr?|2-$fEt^MnJxF8>Ug> zz7yl)G;EdrA}(K1%=2IvlyiD5Ct;qy;w3Vl>p}qZ6RjxkrnmX`W;PAeBP%QT$)n4) zPiW~?k;hoyS8{vIj8RL5jPP4MJp_NtXt-*qWVi=HvUnmyPK0&RoE+vQQmuQ!v(f2j zSa&O_HNkFuEbn3Q*6ynmhHR=QKNovb(^q#_)=gB`vnp0}9q@*xqUpbjO770VQ6dlU z9%uMbem=QqWAPIF_af=$IC$W8cp=lb^mIg(DFV;QO}CKF5+j9A*zFgec__k~KJCIz zBpz-PJmz~1^A(}mpCQVrzUF!Wtvl$*#yxc|))Soe$YD`bQliL|A-0%0&hG05PuupR z=6Ay)l4j~6p&Is<2nvo8=p#Q779$}cAtK+U-^si=Aw*Gw+QUSAa8`B!?bq|Z<%~X} zR%f~km0)!`V$5;ueg^N|q%|sPWLjy%XQ!6i&fXi=j>assQ5hp;G~dBm+D>921}=Y* zGKkqAw_mc6lzQ>rpn}9e`R^6rrf`oo8?TB_#VNEB^Y{fBhksaV6}`b4jMY$A|3c=* z(&8mIZ*($XPfJE&YdGFi5+G>Sd4FvsFCAAMjAq3J$@1^OPZU45i^lo^G@MA_!{jAN z7Su}3h3+Hwwz?yJ)TO#U9qf4Tr%F?PS2P9zv2|x%eV;^cI&x^sihZTkJ9OKwCFr&z z=9cS(bK+|<+78uPbLCVh%7((pC>~`y3SVnYgcGEEe5EIB?DQbVGo1dpnP_D*l_)MYS>rKkvu+W=*uactZ1iwg zENl(g(u_6%=p#{~WISfIqk1I=@})7ubMYaqUg+}mQb()kI_?pAKC9RZI(gek6{d-^ zd7|Wo*)ssWec2$=yz?r<$4?>y^oiM99NI#A`ueIL4*Vu3C)ZW2a#_(x?Zb~6Gwl3X zzd=iyljPA;(4Sxc2ouz5#9+#34+$M^{?4BVgT!AYC>#rJ_zI(2E~l3fC3sN(ML&AL zQzg^^@-Y2bdl!k{oC#(aAZ=VxOlDtuc(L}#Hqr!9UuF|JTpXezv|l1?;2vZ$Ak=kL zHi`GU05p!Wrtn{`!aKlB@&ptXmo*8Ci_b(1gBr%A9(9iYNy%tZF?i>Zh}&;_Z36(# zu~nb7V#&RA2P3S8a^FmU>}|Sj$`hpn_$#fz@B>MkgX6}-J9#u2-jR<@6dv}WZ%o~I z=htwwn3_01v#%m^B};!8gUx(tt{8|ijrw_4Gh5l=cBa-}t?a4L3CR0sLkncN+rWV0 zE&nP#85I)uvtt1KRvfIO>wc<6b#(Ka_Vr`SMj0SoHNMLJ2cK@zmpl46O@>%61YK9M zPgYe@8kW-{KL8P|extTMhM}wkcN`XJF};ZO_t>9jxELc7&aY!5NAT!RvImh~lkKfS z3r&gV22rR7i&iikD0;Bi5@V6vZp+I6!%91@zi9V;7*440(O-u9){O)E+RLJ0KSP|X zR7Y=n-pr?J-p-?OYV4IGCc?lzIOoc;R<3aAK}Lv*w;Q3BYuI&drP?s{Jl+xkD1~sZ zrKP3##p&)n(Y;u>8p_^0`nBVnO!~;iet=4&u&=P|=?m?JXvzSZGlK_3$<@EG2{x}u zM`N_VH%2J#e*f6*h5f>b(yV4ux84)W{2m}^+J2`uq^e#unb-2OP2!Hdws)sIQYLX* zgTD{%!sz7eoX*2E$R6E^$ZYEoeSSzqkHaWBTO~+66tMfo5`~5Dvli1uEXptZT@t_W zL_li17PN2Lo~acneyLipSzcZh^I@1Lv|r4L5vxp>Vm`UU=(6l)Q|r8BgIEQR!+OK| 
zBAWEnO4+78Efdq=DMf5WVgOjPdG@iCcuXftI|Uyd6S?-Yh+#K(oqP+39$srUhC{-4 z12sB=fz~f1>X)b@T<$nBG*o2j^O^hl{fo=gWeTEAt3A+0>iNq#gcTHnN zDVfKdxv$lYJ|@WYFnzfyQ@}ts@8U9C*RirNYgx!H2X_!^%pM+ZL5O~fJ@8m!gV^ZxGw=4^3=u~*Ce&W-^NNq6oOvj0h(tPlA~OiX-9;_`TfH;TwSFs%wv9gxEc zqzk~H4XEf3o5E4L+jb0&FAIn>^;0XT0SYE+AVbkJOg*y|+xNToO8jd6jN|gND22ofY311sy8B6c3wVZJ)tZ z(~G#-wdNGFZ*J12Nf#p5wvMOv=1P;pWI4?CU0yT*7HgwA<8wRw82v>x>g)ZM&q&3f z5+-+kIgTgOdD>6v9A?CbpPQ1i9~kyCZlOo@y3S-QIv?wpaOi1HH~Tlw-LF9|zo<3s zxQlq>ZX8zOP=DgkD*fm_sl!{j7t4o*5h(Ar5iaISwww_bcYKc|PTb^%V$Rb@!AinnmmNBp$l_I-7ASs1-nwn=$l|Cr#xP`XOB{-G}_xvor2M_{v%h58TT4WFJDk z*vte`)v7Dl3HbyppHp8MMshr^d3ay*#Z3H{32BB>@nDf#nEY!I7Sqk8f=f)F_%I~{;qbZ)NiFoOe4KNgItV;sXLdMtF8vaCi+qYN`69oq^&V?%xQHP>TS z(`I2TdCo`f%_oK%1;U=KMmiOv60L+a8JogdP=DcuIa*kEir`j>m-RJYP_aNCp?&}E ziMFwtr(%w4BQ5-U&}1%(et-~1d>e;l3p;T)txi2czkiD`DRCwtdUizWeiQTZ<~{mH zIeZhD1IHC5Ny-CgW!T?s+u{k0NJ`Fnkjj0HI0ZIctswXkbpyNoQ5K)zR%-GuCk3{m zVzrw4%{6i-;rcHckn8XqW2Y{b31um)%^mnsQLD|GZZ-Cr$`Ga=nvHnhTX!-Lxv1xC zZ_O=Y|MY#&!e%G?WH@e5lVav9*2(}~cR)?Hkr<$~TOw5Bg7lqH{pq~ra47V#c8%Rf zI;cc$?IBmLZu`@vO>3Sx-xuzM0YtLq*x{I3?aHLPBF&{o^m##PqXs+l4+&QDGauvLo%pR<$@Qi^`9I7SnZ=ns8k1(DS^J&UQJ z6wnH2=EVIqEMnI)e4>_tcku0T^BU&I)EZjMxPH|z)(Ry7xerNtNi?YyiMi6@pQo&a zY#7bhfYg2ZF67QknVGi$+CEyHSwTc*oskkgL+?9$6$~F#oZ;@BS&Nu-u40NgNxyt< z-*neOeRLf^0Fum{NtxX8Oi$=k6m;*#Elh1*N{b2SqO@-XzRfYPKK82mCwXiOJpTiFjoXa#s zRt?__T0;tW@dNW4n-B?+XA{wcguHOeV(r;Y z{HkhkVL?dtG(HR+D7U+7PHnk9+-AfG7Xl5;cir*NC|PU}HmA@veIxcC_J}rG8-Zt= z(a$s64gpnNE#twX#6_31S+_*5#Bmx~+@$MXt;!-NQU-HRpcKCnmAF@m-mKgMJyevt zzAzVj3a$U-^yFZ0Srx5pWO}N|;lIB=?w0o*V1K{Oz`C-M!brfkkDgnBjvbOWWWc&b zPP66sX?wXC7hOt2s)Q!7=ch`gp5;xPQsVPu+V;jg)_s&orcdeLuqhnxG{e;j(#G#9 ztA3V`-6)LZoxS5<{}mONW6O$d-qiY=v%4GqHpv_H8xaomRZp<)38_*HtoKuiH3BSk zc@QojOrfAOrSs`aTDH^E(Wgkmf4jRB@4d=iu_GXSUVBzGm>JL%O&D0!P-W4Q#RhsCV@a9urQ7CgBw(?D(E52&3lv*OS->dd57{*->6v*F5O@^Mj) zgohM52qRE$cX>qS>DyW}EwCGaM(T6Yd2>ahO{>jp48vBU^15hHw3JBMNgG?Hdhy-) z%|^!!TzIezDJ}IV9Bce6>L7$`lURz$Y0K`(Lds`vU98PWA9V{IJP2Q5%V87&dCoeR zrl0AWLG{$TjUx7NO3xr1MkD01Dw{%TY&C7RwVb`e>?JvzN0evSuDO9-g1M*PFxhq9 z7-zTHgjhP-GJ@W;!)R%`iXCZlL}48!Q_XzGQv4)&xTgk(cGs6$sk9k1_f8 zw~J?jB4}V0<)8^|#k`o(#xr8VzA($zUx}g=pVe(qxd8o-yD<^CITuidqG|W4`o%O0 zb2LadA}sEo0SH>HSfKr{HH9tafj!&2_8U|LWl*C_*0T?`uJ8=3SB?Zm3{As?@(bdr zo3ijA`6cQDy2Arxj*dE??^=l!e=}{58uYVVS<3L{`eZl;YoQtCcr<0~88gP!+wS0h zqMw-+`P9kT;fieJ*d@iO6y#qZF4GW-jXn9qUb-1~>^|*T;9celdgqDTPS?I}qnk=W zXS2I~aP!e9J`7MpGzpImg1partva3Wr`*h@;xp`jy&FY$!Zhs>-YKLY zPlc|CuDG#B8dD%#gL54<6b=X#m-@a=>(G!T=|1Hs-$|DRT_LwkV8{I752n8I*3Qp9 z__FZ|njL+;y=O=L{`USOhzpD$#Gg5YtDW}8>8eh%;-AVi98}$8q);8((Nk(-Q|09_ zZxyZgW#9%Tr6NN)7AJIvbVDwCRT%mdqlNtH8*(=I^tW>=&h~7ChMw6=+OHt<^&9%^#P0j|^20x`PO}VsV3X(h#M)@r~Hc6VsH<05k(?Z%DGekyCR`ptrBjP385MS~g~u z64lN}^=*(2Il8fpiR8+ZPqB-h6M`^lTOLz#JLT5s7tCCSG8H;bcd)Da*}VyqulplN z{xA#@&pDrL)TpNLbrp$X+HG_I)AeoLyM13rhK>H@PG%pAYssg!9{WHUk+*A6<~o%S zQg>T6#izy6A5>ISu42f&$u&Yfls!)mcNEzk{TNZS?3NC_TgUbeql}lQ&}sGH{1m-n zrcz*hRQIF|QPftk{kWY%jrbjshTl6zbYHh&HVEf2cr_&R8DvS#C;J&1)@}fQuj{+8 zkYtSYf%fMw{`LL!Vvs!_tMGjKzHo}ZeVr`J@DTgo7_nok)*mlRmFl_{#2_WBTkGRh z%#^gf(@5*#k}+E|OclbP&KhRL9kx$}n-4_A+9_YSu_}l7J2#NTFuc8_G z?dYzYmT+uDO-mOZuZ?`SKWr*;vSJvcfDLb_TC0kmAz_Xb?=l&N<4a1OD=j}>?@Lg( zTW|7FmG3j#Fd?teci2ZdpL@31+epo#j2BhgDLyPnB?(uL#nq^c@mB{rK*t zkKm%Cao7D8UuxFB!z0f%c@SMCZ@kVO6}u;3yuZY z9t-mn`|bW>I3KVW3z_(~6H)OpT3{|)Hgf%~k7n)`D)_DBJbGgU)j25kj!J5Ha0E#K z^b-#?iU$NBy%5`1f6LH(0lxVN_pRy6I&*k5l$!9RL2EDq5+zP5_Oo`eLma^XxlX|? 
zbd<9C2XI7oqf}br+MKPlioyLKd%qo~336;JB5g!X2z$wdFyIcHMiXv`0^QB@8BT=%Oz}D5 z&G^XM&PNdkp%Hu~C2}qph)(dM_gcGH(xIUVpZ%{q3C=ab_OohqIZ4#T>Zw~59rRTR z3{I?0Iyv6u|Llv#3OI9Of62M^zq(y!=H7f)=<%}7W?x|f!-nj}?RCeG@$vEZ8y}0o zSq{s-PxA20J><%UveD-_Ez--Kh4Cl}^!u?tin!AHd5{M#gQ{$%*66w=RKX(X<>=+P zY<85s<-<17^{U-wG<+BIrotSX3>rLk03(Y)Sa2AU$}hAZ4x#s+m{{+4dxF1%yt>6X^VpfXimoa{j?8YG^YIZybNNU(MoMZa207c zR8POPQj4H%3&;3fgbM51R@0^=TKS{b&j~`d8Xwz-Lk4`|C#|mcqc~3w*YW~u8#9`* ziS#s@mQ)9COM+}vgd=sa;*h%a>y%DmqYrzyK3W%^C*n)Q;ZslCUSi9#F@X@#?%rMq zPK850fm+s`N5zzpPdPmV$HwJSin*+MqP z)WS&|gYlCw7+jLEUr|C)N<>^t$2T6+4F7Y_wooEgh^o_ZM*UzG#VgN$W-wG3`dhoY zj{eU|)arOu!P%3LSWC3)o*JhCUWkHr(mm?DuyVCXv0>GQr@QWi>P-B7nQL|zJB`-t zPG31T<~&ePksqZREbjWsU4ugVfbHB6a*g_Zq87 zXbMy~q&+dm%V7mA5B!J*vVK6NLL9Ua&dSQeQM!L^d~>H4a3ADS-_oi)7|r}J8q-@q z^esY9qq+#aFQuK6ZV*2JCAuhS(hloR&D%h6Yz{MvG2F*kpn_;|EDc{#iCUe^n~o`O z&e+>|jOCp~K23b03L^&fUq$Tl>ZBZ?hc#OtZB4+zS)CdNQqKCpL;uQcuIi zc-xG0J)2lqf-Wf%mu~;^R#&44i|w#L%C1NDrG&AXi$;4}PtoPx#@-QnL)(W)^wcl6 z3fcH-hM0r$hQk7={h53MP<=6mF*-2j(wI%=Ff9QssrXwn<{#yj=+h|Zj+)B}Z*g|0 zHLq%zets&q@Y_=K<;3&4tjHnK$>_p}20%YRx?zuGp9O+KQp~3o(v9B*Z$#*Y9a~m}r_HZ|A_5K6R1@~WM=ozPl1Rc58f8ZI01_qC& z$#t8ZCjBF|BPXvLIA`C6URc>F+f1J}q_){~F&1b4%%&2&yN*L26sqT#jDCfc`b-m* zAGyCQsD;)3nkR`aTM}Uf0rI(3D&H!V^;A{2 zLaQa)eCuALDy{slguapYCrM;}cV9{b4&ENkn4WEaOeV1xa12SJahB1J+imJ-r!bAh zQL9T~yox8=zOh8FHyUjCy69qdu3WouSajWriB`Z)jZ>Rkp)jVVyYSipkWnB$uQ@N& zqV8YNaCnh&o&!DTY01;n;-IZ(EB6{f;Xjc-NUww0qX-2uvY*~mOaiu{i)`PW^h-QP zqrChM&@=d>8UuE+Rk^6W79Br@usP1%YN;s=DycQSo&`(vch3)NVdu2cS4n<|<{UNj zka&P{$4`%+Bb-X7gq3Q?gUU2EXyp8ifd_S~BZgsXxIh(|%8Akl*6`Ll%+xp7-Dk_R zl|#i+Z(S!Y5Oz@@x3qoyNimAHs2)gu5 zP;@B%gvT_Jh$-Eq#Q-bel4x7Hm{SEk(0n$=MzDs<7Y5}}6%$qil_xXvucfm%i*_Se z@{j5PxHlk3ZFC z5Gb$pGjj*b1Yr|e1_pMM^{Sa-6I=D}Tbp;p%*QLuWo2cR{kaSA@8&+=N4G96E>5%n zLZi-BTGo(IY~s_?eKi7!2B1x^JqTc3y^KOS3AZu78^8^kGZ)$;ydZ!x;+%~20}92W z(zwLL{G-7~aH%S@uCJ_qXZoZ`QdBHj3=r3DuaJSq2#9`vwit~jaq-N=Efb8M>x|E? 
za)Ayt#D>%HaQnSd>-TSp_$|=NtFgf-;+-G?6zx{{Qr(J{hs=-Z{SMy;yQ$>+fvqoT z2xIQSK7hHvNE0Z}7-6I^+dKyEbsh(v`>} z`c+LWEwjLsd=grL!b!BdgDX(<84Pm=08(bOWHAn;?wsyo8#^GvcDpfJ3xvv$GDyQL ztmUr;PKQG8Y@n+nx4RT~cj$N|bC?}t;S0EFt*VkOVJ}}b$n+b`JwzhNs$KmXCH1d7 zw!e43F*3@i4zBJ56fvg}o-z>VjctED8yV5oR8M~Kz9sOUSm7WN&zu;M|G52U>)y(+ zJg%ihswZUNZYkjilZYpQ1T&+Tq-ed^fuZ2ZmPyVx_LF4L3|o5 zCgQr9n_@of5I$-U{g8JtE0o?zU#=)Z{M`=%mWH->K&`TRmsa3+W%dpX8J5TNuy6Z- z+mXlt={}Z0W&#BHMDrBWK!_>?qE>X!GNB}X``10N(Kb{T;|Yl=-=^@rIcv1&Of+%t|IjTJ||0M6}<%qx!a?{T7jp!90JZ z$FPsRjxair#Tl znLqM9R!(wX+7)wg3B{rD+n+7wgZRRRW;9e?Ig7Kp6B81i`8tvD^S%pg71>oBfu@;+ zz2apxBhAJUROq=bO_>*Wb_H2u;0bZUZIswKwu%gHLt;i{`5QXuz27ES;MFfc`#^1Fdr5etps>&i#Du^b zB*K?fmJgg?IFR{LAB90pk}Jodjw@q7 z{EEx^IzFYy?0Rpk@2ll?i<~eO?+U`1YH^mP> z;VWqq(38kpPMVblR2o z>$sU|nz0|Bu3;c&n94~TXlb#@QsYN2P#71{{xj$Rs-*d*;?e9R(CyO0F7$oNFFRh| zneS*moFlf>eURhzjk6Z5R-K5vm5A49MC9)bZirtT^=~XR+!H%|*D6S5+iUfcpY(vH z89|aMXe8sjkegjqU=>0_yO!eXy)&MPRn!vqhs%+BqWX{c0085wFrDcZeX^*^R;jG4yh-jV{HMsu_eM^$3_J2A;wA zTY5BbZKzv}`o?$<#2bg5?IAUE9?&$q3pZxlrYy`X?7lNOu*2miMDU_mQGQJA+XNp& z(1;iMZu;-I*c)V{v%=3H>~lC93YpD->>7IF9l0knLDjU%zGmQs zS37(^|K#_?Gfam$UKs4cMH^hbtG2g*0N)&fEooAs2S!UNg@1>5j|s%TLrW{9O<&~>T(n)R)Esdga{iQ^)=?C-bf)|>5A-7><~tJ)fQ zM^rl;MpG&Ol%h&V3!`vPt);y|ODvNrxtDEH;_C@q6#I+UE~l0?_;PARb;}eXetu0h zOJJaTjI_DHM9y;4u5Eg*+o@{KTDa|YmUU-|2$K>;^v z^gmMpTv@w)tzH>N@XQf7^aEu>g`22B4wZw zBpNG2cdV`>G+?%}J&gY9^SWQuw8~A|B7O$mJXpmj@s@qfPaaj{j3qCDJ4^x+lTpq?~(o4vq5%^`zPEN|F~6VXf7>x zi+U)x1h5!0v%m10N1I0Gix-q8>UjSIrz!n-$qEjvkGp*3>_2Lz98S9sMOv6Jyp6}+ znjUh(>RDP3#t*=&@8hgkg5o(&HY_uJ^Kre{uU5fhbg5&~R|s}A_@lhJt+Lm7C@{0y z>0A@uDiH1p_}GBv%a<>;jMGHiOT7(^MS%yPB%*BffdD_oTt0?=!*|l|6=c4(Yh((g z)~E+T&dr%9OSr8EJzc{V(fB`V`f^zk>S%FBNkS4XO-pkkI4~ z?#ZKECTyU*)p;o?9`{(YaRy4cK`>h#kGjfmTeQg9GPyXIz1y#(UB%8ubL8*!T)UD0 zOj+zwpg5?x+h{jl2^<)Oa(VUX!^E#--y;I_BW@B9NUCdSv@}xjJqWd({5j^F zuMh)f@GA)k03edfZ@d5+(^NqRqh5Js%D;Rs3>mIjt=k$KV?CtG6CvKpw|Uf ztVYpyk5`32wFPCkWEmvXwx|B{7Zd!c`_F#-N%H@?{rLa1#J?Q>|EX)*a(${!^8&v- z`0Jm96EM2^g4nEb=~gD$oSZp#1LR)X+t`Ho_zzoz_=Urr`5-RvyXZ||gLTyS+0jTf z5(yks%<`X~x`3hpA}1Fzd^uauv+Gs;ZW@x>8X8i20e_tx>Ri^xDnL!zYw7^3-V*cK z3)ElVN=DMwn^{<}doj9Pepv}x64vY-90!}Y;sIIGNGYm0N^q_xR6zHcQ&RGFFj6WB z_=??DhJpt!>7O?MYSjO$rqrC0vgHBCBM-8vM&2mcRv?jlmf)Bm)(%*b)8;6lk>K>- ze}dDKVxc7f30wcqH}AnW_DITW;MzIBr0dh&(__sE?){@2c-GA&s4>7|#H*;P7NDrX z*=VQ-iG7OTaBxN2$!Y}eg$C$IX8gfo`KimEx7mep9RC6yq0|<->%)ftJSg+ zX#vm84f5s=W64G7%E{fx0?TsS`gm_;cd1{;D(ky>YZ&RJM;H*O#6(1CKfjb1)H$t= z6cKp{14;zge9p92K!VU|c5Q!^u55gJEqffmOs)U!`?h9$_77>&PGGtP_Ul-%=iae$ zaw_l0A=PBq+1UYGaQK_I-wHWiWhoA<ebdxjFTqd>$w*Q40%cgEFx>J-cb*cX+u7a+By3pwXcs=O7i+pJ%U zFE0xA01Ph}t`DI9ggtW!vZ;b9u_h5l@tb26%!oGdfUcH2)48NxE)DTyuHr|3AM_lP zZuB@2Q*pii@JGVUtvXKL=CRjLYbj{tGcU)>6+EH`B+(dD=%FvXuuyHZg+6I65yTO| zArxTU3BD&Ydi|UkfZhQsObLX~5yL8rXmjN#HY28?!^^a6^5@4VY)1p|EMhR2nrW}% zIQj0!UY`Q7Cg3Fm;Q)jK_D5S+_gh>XZ&^D?yO-ADJNrwZsi+nIH6a00W-MNy6G;gS z_rNEv^P1OV`&|K*#*IN@6|iKN|6z2DpDACi+gNgtWu!D?D~gg8AHWu1Rt7?2KM-T&J-nM258s;Kc}E^NgGQz_Kx@ zFp+)#KC52QePaT0hM=9qBJ}-tQ)iy`)m@yQY*;_k1W>QY+}vFD#E&IKFgq!J5ysgr zzjp;V)lYy*2ZP}EQeXbp$sD7lqXWn@+@!@@N^YYPdocS5{*_)W9M zg-yXwkIO(7OO%f&1h&J%!Vmh9q%~j@9p@960yv^r0tl^RnZF@iz`JoNd`)XAc$@sv z97JXn*59C_1MaaEvw`*hT9Uch{}WAgZF%hLz&UPwA7BBt=TD(mzg3uaK>LS=hQLxj z00uh4O0#hMpjCt_Py{{TaS1O31Oiy}zQ=pXqTbcR#J%K$O6*Py16#lh0=SslhF}f( z>H?D!Fd2A(T~?%7u)1o~#u@Rpva*ui^Dn7ol;)>UGVq=QY==Dp&^E4X*xuvX%Bm`9 z=$D7tNh4Xy!AnLOouH?ku#(U_pFXW%O#Unl7xWxjgMYI8Zf7_A)Sfck|O?khe*th2i=JE+A17G}X zYfQ3GA?=xN^u}RLS~)@73NM61*Kz}Llv8cf$Z4dBL{Z`n(+yyF_)Xl&B`M|QSpptC zpra;h9CYe|{Px<7JJe!69$-=1S@EIH*(fZpSB_8tIoB?Y6E<~@sj>H6B*0#*ax0m};S 
zheL`p{En?@X_OJoiguN6{=)ueQc}XsPK<9EEdRrga*ACv$i~;)_%50LA4V2-S$=n9 zA;;JXIv*5B1GjN=5HW0Ln$CdoW$8|_y1M#K>dfh4NmZX0z|Ns0wE2S}_ZXVq{|W)@ z7u*uaO44>zbk@qqQtc|Ant+fLg95RC2-JGK{jY~_fEdr0oi5Vh`>zXG)_mU5b@S(7 z4%=4!DZY$t7*}te>1{unGHrDo^vLWM4~)R828K!vMS|~xDC^>tZjWEZM8Lqr4FFo{ za^FS>j&O2uRom4<{S(H_;<76%hv;2|PS&e-L(|iD4 zvl3x6kjuBb+;%mDdi~9}OJ7Z8JAFxwBq`>&xRq6(Dngf>3ZkRKfv64UR=-N{k##Ok z^`G13@4MQ|PJ4w+T=sQ*vPI{OROkm}e$M%T{Ze5EZw>)tf8YkN@@2;RnBNYK1k=OE z#&91=Y1^$K>J`4&Ld@-=u~UfG30JPxHul*e{BJ~YWUW{eDccIx&&w>8(h;@u|1T)C$`WudDm{fz!N^RQp`6y#wYlfZd-89Gmxfz z`zf{ptQI8R7gE(@Fa5_`X4?$008wk@A~Jm!QfttYCufOl;d%e;)NMgF!^pLtQdLte z;_|JzH&f3sNF86RLcbh#{)Y!CT`$Lett9>;@;~`?^Eo!dG#zePPAeHXxr}8q1xiXv zqUJFEOiNEM9Df3ZM&ZiR7>Fw@Z| ziWyn=?ldvsYImuWVZHqI@ne~^aJF&TE?;qv?93t2^oVWL)jX&%^Dy6QWQA4Kg9NxO{BTS?N?Ttd}^VX`PoSd9JHVkQsn*b7- z#}mI)_Vnm~eyJA~o)nMl&Ecje4Tj_Ctol)Ik+!H788aP*G*~;LiU}Pql^#);y$OEM z_2Tmhfr}5l!h6u$dv!j-`VJNz5Ovl%P@#k6m6E!bMc*AH=f=s#rbM9^=n@Oa4R6+% zm%1yj^Hl!^erCC?r!DH@Ae0w;5z@khh?MY|dLLY-QfS6pjznZ;PL1Y{X%II^!<^p8 zta6%%r?NdJ56#=$pFU2S<$r=LQ$8GP&fpfHIsIrIY2%CS1FSqQ6c@F%eP-M3YPRk< z6*GRDD#hWH*$r41pr$*B{CW{nte|h=Q&KdB4|<;zuxjn6_K{5UHT2xsdGPLfYI!FB zl#IrzNDg)&S%;}pUd~!i~>Ju3DhT!~g9+->$yl>K;nBILHnl;0SGT&h&PwSwn1UzDE2bhMp} z3=?*U%F^1v#>Vy>4Mtoovxdw^D4_(dH}~XMoI=SBMs_wWU{eN?pMks6#8~TxjM=QuGsYep#K)F9A823j+c^B) zzUQJ(^aY(qd;j=x8JO2ti&*J* z2JDl{j+lh~@r%*DB%ZZ1xp<2n<0H|xaFUu~SA9**EOv&IQ%)~7pNBD}e#~RV#R2le z%TTSe_6o%zN+`vr-15mgOx#JWHP&_gVp6Ipn)#9 z%#Mmed_FupjGcJtr9ko;N9i+B69eAc8ZeirsN0WFp_xSZ*C-XLG@(};^ayoU(bhIk zm9naR?F&pKNIDGzvlMiPr629#w@?E@g9kB%e(i7T)O;sAVxo%(c-bvaEedhMqW!dh z>`y$Pd(r}I55?w=jD6XWp%A)7w^PZ%nBrj`jcVNzh)0u*HH)543y2l^CjNzu&${K!&iYQ>se3lkIppTW{ z4aRZ_^@T>KSS$twqI`hvEDPRC2%L`2_tGW+>sn|B5VawQ4Xz>6(4Ms<9OC!*2-2Ac0(xU*$z&8bwKNHkB$v*rk z!tUN_adG`pVrQB3RUWdjW!!eAyv>27wSNnO6H+d29lk zk}VnTOU@2x+N`yMXE*e?m&in+(#%ON23U+esEOjUzOBG5*Tfej%JZW1lTE%1xthD9 z(3B5y%$PxtYAn8@jWW*HZp z!qi0buV1PzpDmab?Fu>;g*}9gt5oz9tVzvLsPSJ}8NHpHoJ2EzI?qO%s$?i7_ec6c z+8TSL8ejwhy%-Q&2wf>$9x9Y}E$Hwk+DkJcc_1;@qfLlnGXWd7v@jcf(x?QWN*F8( zoFnrawL+gLGy*uf-htlbC^OxUwRo2o~>ko*ii3T{Az%|*jrUwdEH**VAg{)7Dl z=(FOnSn6Rvc?Em*MPOhS3V(UWxyfF{IH#_MtXA(kH<{iDbsXKFZD%U5+9)aE77YD& zwxU)DqO-jCx(LFHWJNCkfbo{8#LpVt%bj&OPjK?{&JO3Uxz;^Id>_P6IR z0FB_g9Uv|xTD+iuRVZxqLwE|czUF@Y$!RiHvV02OT=(R^*mO}dl z=~%Fh?o2#e|W0P^}h4~lL(ry+Sy+QJo8`C zK%lGc4!ULiq1C4Jv;rkb$pB4*R`-z&QAtU~E@MIi8>#oI-1%2!B4s zdt|5J5+{~M(0qZx!EfHY5g-4!!BO$*()=`>3)H*NT&XkyqHi`st2_Xdh48s!V-pc+ zS#XjS?`Cm$wru4%Fr+B*Li(wIMP>Gnh32x2ce{d$tUkadRALF;!<|FULNcGq@WPgV z`<4{{oMIn}#+xq#?sd34K^(b#`!*^vxw$o} zY)7(KIjg*{JoBZw#%2gi%v*Y>_D$u{U|rZxD1#_mc5-rRCrL;5_VrmeXO^W4_=$rK zR+8kAShAo@F8=Pzv+cW*8fXBs$|eUC#A8Q0Lsq3=|2~FFaLB0 zykMjbxAUBDMNx+cS6@e)sTGcbfJxMu4;9ja-l>SPL(c1-j{R{+<)6R9P4fI*RG?(K zt;AKuRYgy)4P|HX8HH5F!RS%Q$GAkG-9tTf!FAUrv)g7u_vcZ&Il$VIHGpw@s?K7c zyft4|)lmu^za{^bv`U$E662yvQ?+0l2~~;+P~JD1*NI#Q$v$)r;)@XG|DH;iU|yME zMvMaF^;dl5kGM#_!iS}qmip-eWBzx3OxB_c!#Be&Nz|qFYIPK(W?Fj$25j{M$C``e zrEu&koWrIQ^YRuidb=6i7|Y8oST-ySFqc(1;XKf>;*b4(d+Z_WD>hEfeW#4Qek9LC z)j`h)ht~u}0+4#5FpI>LqMuJb+=?b1-xb};Z!|qL?FK;9BSul`9M?(Y)^iUD`h8}I znbzTvzVnQ0XWpO1O^@YeGi`SrhbWfE0JIArzk2l?bh_SulRbDHZ>B6KRj@42OdcYlI;~3wFq2)wvv9LT_Xu#WGPS33y$JPtA4x@-; zQf*>MiSIr-s(Z7+I@bB>ZE##IPS3JvnF&`JX)o^$=CN!Xm1W(%zvA^N>8ab!%6K5k z%8^x43o?KTRSw#-FSQLT!YA$-vwdfn5oxE{OiGVTz_B)X+A5^da}nKvdF`<=&f>h` zdq%M6OBdlt!Fz_X_wH;kuowrYJ}v4zyCj>5A199(I62Lss-n%ooYOI>a||xodn)@k zkOpD%q1uere_&NoilmPSY$jmk5I9NJ- zF$zSYao~ElZvBIPiZXm(lN~v1hQJIE`;vJh<~Z<$XRo%)^skR$%>9uZVh~TGLK^K>gjJ?sk^XI6=ue|R z48NR5O_65u&t>Qocd4!RN)obrQLYcy-s2nCs!S%&1NM~0CcD-3jV{h% 
z4C>5{V>e}W!;eYwI>MhXE*DOLr)iZxA4p|}DeiRIzB^ECXD6{Gj>5OZLYtk9t!f7X z5&*}WZy0v6VPoyKcN*H35d(=l{NufbI zXb6Oi>Rjt;jE1&B{h7|Ht*BN4%-Cs zKQlu;0Z?4!h&@eI@TT)qw_lejza%?a>Dfdq5_?w0V? z6#cBXtK?Rb>l=+!-<5sC1T<`N@;8w5TV2k&s$^t6herV;^{jrC@%6Tx`U3_+2yx2F z)p+!YiHV2Z7b?v{Q9^QD8F8eIHCx2g=>@&9o16Hv07Nuh4@GegQz_2-%EIj$78m79 zIrnu+ok|vNFGIU^gzR@vjJ<{Y-KRg6;62%fuiV+QW2@Fw4d@CursQyaqFuk##Zahp zuF$*T%yDsX4^%nYZ^TPTcmKM!S&9T#SUqdJygc3^X5I1UeZbZ&QNa??y?eTT0nkbZ zTy{63^!3Qu!971jXLYTg)ub8}qJv}uN3(rs_mB3w<-yf7=Lg2Fdi-srZU5z82k5IA zv4!2wTqsOeM9;L0L>7?62(yTUu^D>%0@%x>cW}kLjHR@R(b%o z8l;SZbKv5X9m+-}_mF;ujV*n%d7kQ~yA6gHP0nQbfAwP)FMtKIntQeu-&0RC0p06e9GLd;35{P8;6_{>custaC8pcT#qqa35jA$c|e zoT62$bA4hmFxgv9lpCbOJz*egsmi&?*yy;6m!^R)M2BSYS zSS!mmUMLT0bRBc~sjx}@(qZ?9H9LsQO(eSsnsNPRu9mxlaQAk&W92hCJos$)i1=;j z`BSNb-w;Q8Mxk&(UZ(aM*Kde0Hc6MC3BpP;Ovp#;8oVoMwAB| z)R2>8qO{^1EeG_-VXOEF_(!-B!r9rzeBEGg2l6CGK!`I!c0>jLPR+}61))Ky2juMw%Cg6v2?<$wI@F|)^-&C^z#?FeWSx2v zVq)!#t3n&1{wyO(g{?aP07U%Yzx^kPML*p2X|~aYj|Gx*(~WpCRcC1OL#H5O=h05@ zDSEj3yk%UFfv)Z_wa>Xu--Q53>Y2n`wSH1iDYy()10r~U+g7>>?&qC{cqEnQ?}z=d z#VCis@XpVUiG%hXX2g(g6_~CuYCmH%0%%-i6cw|>FFile30elSpI(qO2=z<;svbb{ z3EIhF0@UU0Ci3vdbyyIyAK7GX$S?;-EGd^Ix zN|Bb;^l-NVJt-s>hylcT5h)aO%j{>AZuVD22Jyp(V{o)RzaZBv8UypW4e9s0-nZ{4eJ|<$X6*U0_dv#zX1Jb3XJx6;27#bJ_0N`pYs< zew{($wG*z%r>g#vO?*(BCFWvHi7pbu@{~e#>dpu%0_Oy!SO(u&ceJOcfJhTz_vk*N z%xlH<+xfp*4x9ZvczRs+%E$YryjKnBZ~h(s1%Wq-q_R$?9Gco~U<$B|kT7m{9R zE0DDso-u~M2xMPk_xUN?8VF(G$fa0dbzH;+at#z|Vpv=_vxI$G{`=5zaNxQT4q~{d zsHoXra~!qGdlTTl$PXVn@9zuM7sL1@02_0_`H*GYbS};`ET(k8=>i(%_=Je9M6;RY z!K&UJVws|R6K`A4w(9>6-0-xY5K&>VUYhhD0_tqx;NXyFQw-GoX;6?O0r$^6r)9xI zvv9S_!>aI^#2?6q(|C={+|?b*O&fUiXAUL7chX_$(FSoWWiXF{*o|Kwf!&~`qi0}1 zosom~8Gt)2OFTg<%>`#AvytApJ05<^ACi(B8 zx7BqXh;4Y2nMrI3!j&_rDHXfGR2$msmxlt-BOw&m%>UiHInBJHMed8MSgs)mD^$n7 z3S(dRcDzvxf4@u|a>CgD0yz0RPtPa+x=0Gm3J1I$1QOFai4Z^-bYa4SuoYbRTEx8a zx0E=^0kOv_9HilOyyF1L27`iwE&B-pC(Y>l?(Mm~U3UA61Q1^c)m6%01gz}3zl5^o z!X1l(>#BQCwtq#t^cxbkg|@L~MajY%4YBS=ykc`b0oHCqItOP9@HO8={`~n;Y_f@G7WpE4Asrr}vzEZz1#z~Kkx3Y1=z54gySKV&wQ!gX8d;l@nMn?I!( zmI4CK9-jMMWKiI`aukuhZ%TA3d2BHlbyS$Py}Jy;)z7|HE%l(=;7s!P&5RC0-lqbu z!USo+;ObAN-5bUi|5R#LI*v38%`p;s??d~g4$qgtmXP)_K%J94<@4?(XQ^i#7D37J zh{nDhH*Si!eBRu0l#UR9gX0k>NZaHx$dT{1;JB{>dpX_%3)}4+2_k{SD3#!0$^0qm zvJoUR2RoXxt$T1@gXrYN9>rgkFHbAl$Q4?$oCzw`Z|ASE2*fcU8X|85$T^ z`E7hVntN7X*#7SX+$ZH&??~JJPV?Sgxbgy>KW*M@O-3J8U)bp?)G%Z+8J+H$#BpbS zM1d>hkCb|K4wEv|%^~8+3lVvf<6BufJnz5bDw8)DGQN#@0gzr4zPC|W$6TE{l9W{?~rU^`}csk9t;|F8Q*@?Pci)&WC-3+ zNM4TFSZfzJ_GP4AYu8ln?b)|s?K3{BE?lHl9Eqku@j-kh{@>T<&CC;HAPnm0qwk@b zvsa#aT2$5hQ}Z34^C~kfEje{NCLUb(W?S%TL{vt2vxrnCOVPrwMdTOXqG4diIEOqsvhcry7aUs#$o06ch$mOyK3Uq(axNY9N zpS0z~zim`LTN`#+&pY}6?bP9=@)sSVW4GnjPmP5NFbLZ1VEMT#_oOX>+d0TW6j;B6 z*viEjk`B?F)19LviC<_q6aFoNWE_5LcWiGcv<`Ff;~gSB9my_SO3rE6^_{i&2}KwUkysuzWs zO3v#2SmGJp;|F5%km!4DfDQ!(1qt7YejEOEX|^MEr__`0*leL!_@T;&9%ur|xVN|0 zmyus~-FKXe(l=4YK-wZ0#jxm`o%Q@21H_JI7f}Jfge-Rbb99^VNEC6iwEME{biXF$ zWqeq-`+$n_H?)PQS7{lEiV4bf0{&qk$TWfNBo=f>zsBFu53TB1Jf&^Qjt0kBrk zFjHzwBFXQsmP<$kix(G1yd3v6ajuccH`=k<8QGS6ngvoU0cD|%1`lcRGbGqIZ04`` zCOQTNPDlzhwV0{g`P|AG7xon1{6_Nph<41pfqW^FkiGHcpS70ntOXg_B@58xkYzReoey+h@G)3Cx5O$R(&O&g!T3-fX2{jb@^5(+ z-KMg097pDfjZ^}Ho~ByP`(JSmUkX$sSo~>#I@m2U^WAtD>|31h`h}jfVCw-9WT$fv zG7ODwFyh+XLz?p=D3SS@tUpF)&c}IRCzQ}k@8`FugM-`okbrfh+r9rwfe;^&o<0fI zjZzP)&ieP^c5}t8T|3GX&;aV%HIzK$QgqK3(Q58<`xSG$JAan^ED#LI8g=4TNySAr z7AgCsmHAcXNQ6#yxV}^GdZeq#MxGg_x!F!6g62mp4jPToXl?DUR?i*1a{fN8{*6yr z5R;|E5UOzY(iZX~LJvq{k$6A*^c-y(tS=goCTP3pNAvmi00+1s>v4Lab~;t)#U}ge zdiA!IfgQN;``KOgtuE##T3xpCjpoNZ2L3{&N1Hirb&@3CDpL5B!)<8*#0dXlVfSXS 
zn^)J9s-|}4$85B=Od|k+9FkeOn89oav`aLEMh0CcWUD?4Rws+O&}d4iYM{FYigXN6 zkL3BO$jCdBSO8%G(^{TF~h+8az{K*!!y!ME`C`R?@6BH%NSef|$?(ecqYB zD3AA^kP3iTygU8y3brl!sOslv8kkTMy2lOQrg$Sd-H%_7et!@B=K6mKd3Ly${(iJv z2u?#7>JIQX|8H{{&TFdR=s0tC>+S;>&_b1pkr3J132_C{v<5>|O|lSsN$kFWU_p2u z`@yvQd>b)Hlb)8Wm#fhREvmK#^h08oF#}GUro}AYUAGzWY0FNsPoy$&EDxGUY_jr% zNXfHj|Ka@}4hcb%HQtTEsYx~t4%-W0Dc;viyLIlm$Rx?b3bbFIrWmpW+RK9Zh8&4= zTZ%!Sa#_-H#N~o}ET7-YIzCJKFK>G94Q`_~F#WL@1ehDBZ&JkSGn9a5lB_2V+hfoP8`Q}Y`D1;Cc9xUBIv zYHA3X>`uLD(%U}!Vy}4jktbR7BDOtVH8>oOX#SrCE?doPu9QD$f7S?+mREe{H6Sl> z+FL%n4*>N~>lV0v*p^&RNT>d=P1;Nzpu7g`2yC3D)CD~0e>2_`>!uwCh99kqu=3JZ z5Q~U%F5J!ueTl7eam+;hb&TPd=q2l9JE|;$4Tp%+!{5yAraeV_!XDBHyEX)d%owSv13lJp@4m`e^^k`~9R{kN+#=!1f#b zQzA$0e&B23n+G#%s#sovwkPpv#}>X^YX2^>Rj+E}2h%=s%T40kBg)Hw0sDFsXW`wK z;DXq!MK9rYak}4xP^UC!?Gv(%37Cx}=4KBF+kZ_@0inIxQ8U<3dA@$sU}*MDBUL|n z6oH9rpKn(Xs(ygftPtKQD_?0N?hw=4?Lt8T0T~z<*tbriE!KW29S91VcbtcHeEjWM zS$KIg)f^n=_H}ElDKAKkijN;M-A}qm0cGrNR7n<=LB2rhioQ=%{P;3R#h9h#a7kKF9`gdl0`?86BVy4AfM>sV9Zxhqc{_&WUeJ{}W zkwUVUR)lUYsQ&C&mUDU;WV)%hL&!9&^Sh`Dyp-nT>NjIW7#6P3QQ6Y^qDZM6pu1P1 z4O#!f=aZIKuPXl85SRPRPamE8AC*1(Vv2whS(`Jxg{!$Z&g&`nc6;gp;y)-5?-2Vb zgp=5oEftl5q^i6uj*fN7}lZgm5U4F z5544z?_uC}PE2&}cXzoI8cDhz({LJ0W8W1gV}EFOvP48rkaFya`tCX|ym zarbhf#|^6`pDj<-DiY9Eb`DC5v>a}i$QR3xP1e;_??nOnAM2LK?kBVVen}5_bC>kt zcaraJ9d7$mpQ_i`m$CfDsHE>QtiSw3FveweM*Yu9U!%)A!55x_ed*_uJ@OjMHkL+7 zDI8kYma{bVjWaB`A;xQs*JF!%E@XE#(?7qbua5`)Ep%oy;v+1G$l~z4B6Kc2&R$BUOdD^D)2ObUn!~pEfWCt16Zk@LMpHG>rKw?;EL==$+Z!8}Y45jp7OvQncg zdJ4HMimE7xtItAm;a2wEk^qh4zxxfsV={#MiJL_>cG+Rchm?8bKxz zg|rgue^uBaw-*-|QT`&43p#lKzdC{DWa-A=@v0r})(Al80tcYs(fQ`lZ^KME(j0g=C7m=AoaL;5|RJ-e46SHzP&};NNrM(Vi>oWH1(B@4$b~ z+&q5z2?FeXQRliclDv9FZgD#xhmgG(5BN;>`6=S1<`1EL)pyWj1~%o;Lk{;SHi8$Y z5Rh#}50AzX71K*X^=i|N?#k#eNq#q$>NiCGg!FxFz*8baNl7U}um`n;@2&1)#9}{W z!NbQ9_Rx}e%-4p~Ruc5!9e`ceyGiojK4;Hpgx8LUbmSkGSotA0Hu7E669B+Ybtfkw z;_4$PEtIt&p2!xQV*@CE0`2-|Mp-t?rQPl|n@FBT=$zO83xt5|;M2sZx0@~6@1Lne zl1;nOc~0&@Z7OlQ*#@BNF5*xQgb*pqlPq*if3BhMBM)A=^GpjVH1^BSn;g6U9y;U5uOgBQ;Nl>svZ5+5Rhl`5YGbTGK8MT6K{QyX5%4xXusLr0hG6FZEdfqsDe-tcCY|C zk6xI$`>Y3M<+h@U#$ZGhHJe$y`C?X`RV15iS`q~>WF09V4?EUH(qR654_wEf1h}q( zs%tMa7T9P``pl#%w*PhT@{`*fmTc8)Gq(IaT)_t4xt8A^zs%EJxSHjp-9kQ;S%lC2 z6F})V+it0>@jGVsjZ&r#L3X}Z33DrdhD3GHnT6W@jBF5Z2cBnCeTemmr&XIfSPQDq zOBrv+lk<5}=D(M1lzd~&G%w%$<-bjsr(3SZ|20uPtx3vyevcR?i9w_RUVaMmg1s1y z66WPqt@fGTTy=aT@vq#M6}x2tj4hulPou5(N-oY@OE^K_f>PV&hB;r(s`Gs=n7I|) zD}Isl=Baf^Lsvb6jJDkmGBUfIxnE8hRk%ti0Kc-A3o8P$K<*@JX!2u%F!geeAWl_^ zzIy0QUHWv46$k8&<0nK*H!JsSbv~5O=s)2N3fuEjPdW0Eiy0@*GLW|YB5=S4kc6Z< zPEdzDc_2rrsvc=uo`gVFB13WtGH?wI4K+1jbr&f-{@!9HV7Q8X(0xL)T!{h)1VDC& zdv1tP*5O)qykB{>zETOf+>uSNC8!ni8w zA`M8B{W$0guD_^2@NV;7{r>!nlP$1xOj+pXH}5_C6!*&c73|{w(Ey%ZQc8R5(oq7k zvdAr43oi1+U0Qo+)T>}w2^R93+Coz)(1>Sqo@Wy zjX?azQL%k&bvaBVc|0=kJuplI5y>cyKd^T${!;m}h#=Cu^@){piC(#0KU2AqYtljm zxKPJ5toyqtiBu*r)z!3S$|Q5^jYA#TKNjC*;DtJ}p}Jg#QV-20aoSaAXefE#P3>!T*b?PApzZ4`zJ66| zMD3417(klLt}=n!ideW@+Uci+I1Y1^vo{a#7Vjqh!8nxCI2GKRI0>x(lx@$&H)8|H zsn}3NTY=FNDLA&*>t$QP7vh)gP+#i2;Oy*-G!6Y0+F&FY)0@^-xg!Fm)E;Y}?H^>s${wfmoTdvEi@+4Y@!Z-y zIZ(Fwz^%(w5ohNk^W%^6^4j4`9yDl}UHj1R{}ufHxj*Iep_n;{Vz`{HF*M7@u$*9~ z{%_z?f_9U~co9?K;vH}v7s#UZ3*UF4xa`w)jGOkK6@Kym!klW4A9uAJ^GOu5JGHKQ zK<9ybz|33xqCmG}Sp9U|e|W3^b7T<5d|*vHFS8J%PNbdG5TWAF?SK08l?tJ!S;i#1 z;ENN5q}+~YU?3Xv{W7ez z@PJzWl|H*+M(HRj$>+jivo+acoFgp;y~ek@;Vl7Q089g>6H_G&bYg6TLd-A3RU%uk{!2ZI?Swv;r5z z6K<$foa;4M7U?^ki!ZbjH=QHN!=QU&FunY@;ESuBvtycE+>05jQmRd=rTx8=<-u8& zH)FBNdjiu4abW)1{Jc_s)Dk8oxlhi}w?A*ae9OqTYNDiP)0ce#FW0U8#VN);IA3TI zVR`gFN!{=P;maWb&GKVDZ>_bmn85JC9TRg7q$6NWooZUL(3Il01$U 
zdHNBf;gM9?#5lSVGfH1gtFo?s z5Ffl7ida{Z2rkG-3_<>aVMcud9pIa1dd>rCtP~Lf9xJo8p|qaQe~7Py*j0`U8dqHe zdL5jdwP%d}yy}eN%G4X?MbG9vN+R`NG7FXVZCHGdaSBb{G|#2jYJ&Dyi~sBYN!ar% zJi!p4XnG;Zy~*L^Oz>-PQCV?krjN=JQq`|L)*nTPX|3;}?PS+JbM!RckL0 zSJipuspzQpC@j_-6Qyj$T=)SCuIHMbuqT^so%nn1h9V5|Stn&_0m-iM)*&D!AKVHv zaZTKe8|Dg8Y{~(E-b9k8#@`k)FW(Tue|Mz#P*AK@Ot)_nH?LmIT_D&}w!zD9$Z*}c zsi%X;-@qr#|IaH@P*uyxQkky~OqpT`#eQf;HNntRQLUI1{+^p&Qj#YgQs>YH!z~-t zi&OG!*QuY3(7+E@-b?KT*AoAgjJXb5N!X)AvPTn(!X-<5`Av=g3s?xr$EE;(WXaXg zpRG$@kTv@yodc02*fnvg!A?W)#V<2nt6)@L>FFtb;gvWyVddAu(%_BOhubGdPP>F_ zcNwqmN=ms%@(4eZ9_Zo z3mX+=WXcG|f-Esc?i?Ed1yw#!ZIHh2&iTMl8g>#e${Gj4Rc;cj_xlme!*GM60=>p? zj!~&Ij+DE4^*87%(=T-=H2b2bd z{_n_(iyUd}ta8x5VU`7dwUUbkAY8!KJZmCUu`4R`lm0hMopSN~LrffLh2L2FDwxoK zi_R_-sy(ZVVkUrurYvR921^HW1DM~0Rn+U&%&w{kJ8KuZ=jRa4wn>S^@GZwKakxZ$+p1dB z^{?3osp|Mde(;w0nUSHGuFgKV4}8$@*1@hO3@Y$LntVxk(hdIiXKPBGH7A~sY~X@K z?J$b_K0efpgV+MRi&k&sbHUPxb@ug^pw%Jn3LT)7*dr>lAgF>I;GU59^2w7cKXhYE0NtizxU-$7dg70l~B(HPl7uE`Wb z2L*gOoX`BJNpU5_G>Dm`kAcmN1Hi*S`&jjxB|%?J}9e}a%#YslQb#kA9(zI0Kpp8(i#OQq!1&R z^-a!h4+#wHrM9Z>g3qDr%F=8lI%xvr&Uamu{r6`4q5H92Gs-66s^j@YA|QQ2H;(`O z&oqDa|2>QMQ{Z1ooWO7m{`~)Nle#F630mK-58VcwQ=NEcq&G5u!v|}H3URO zm#}m=Mdn?k*w3ef*(m!JQdRjbw!!4SJ>BLZCOp^do)Ii$v!R26Z-p_x^M$JY+d-bL zj+X`|*%;>H-Q{;d!*FAgi~R2Z6xo6BsE4jMv_P!RCR$GAolA6Lquw=Oo$qWvZ(|a! zTgj0fZSOSg_Fm~qczObd&T+-Rra#3h!+u2O_{^@{9!oP|HX42vH;^s}3nc4L;~!>1 z{sF67h@${yi@0aNh-h3y-O@$gL57SwwLRTL?$N2=rlC!(u}bl^3Z|mI+q|`?7pt}( zlD#|ThHnjdKv;l~6l*Xc2z+siJ{fTHA9E=M?GY**l93CJ;ksF}{csJSIEnx zqvs;Wm_z3)oFc5-b~@*XN$zRk3df}pL$XU}rbgJO0`r+V`ETu=cHJfFy6SD&{JX_i zrV$zu?&l*Eassz2vl`Dd%~!Wan%Fl=Y@xfDnDC9T z!0rst$g)V|5PGL-CfsTAy@YU4sGzaIf+z1U7;B0cH&o)hP8g2VIqtNXcOFJ*)fV8g zFmNqy;)5B7Eyv~NjS0Oi3Tk?K`PfaSH_l~p@ST-n$ZoausV#p%Hf^fhq<(gi`v<&K zn;y+Z)JaaQf8c1%w8efG22dPua7xwH%~Y(+tm=pDZxCK-*cxH_Z$w~%y$M?sBVLuz zGU9A3je4DY=PcjLraZpUga{4K@43MBkP_u>EbZM-=vDRPPq1%JCo{;DUJ1AduKLRDYw$>y`w`ukM^6dnzBaFht^vOT`xEEV~4hO*E@eJIMj&lytvKqZ+Bo6Ozu z!#}@1tX{M%t<%Wu_7myawro`SHhf!xo=o%y)vD)?%I9-m@orl0*H7VLblO!t{B2Sr z+O2wDEr;l(iH0T5SG)$-7Lq-# zQ1AHN^&9H_>8PYEW9jA7XSR#)JDt)sFD;)Dx=J1EukBYA%d+f+6rgUH*pa9?h+O>r zvw*l|>WjUCHzF)@4&IFM9sI=cozfbE9ijRYN)(OzJYWkNHkfeNk~vK*DkLX6KVZF_ z1HYfiQU-IvF}+z!PG^Ax_ zKFhF9^65CrM~+8F@@905W<3tkpMXe$YPrNhq1`B_daDCoX8;SMRgrgUITakGSW~D@ zpm(;#ak@f%`yFt+NA<)6pcvs!Cx3e}Z6-GMzUO3y>xevq&Nwje+K0Rc(@#TE_EjayS&hTCB*w$&P(a^6&&iT<6ehrLgNy8FgOn8 z)cB{dZ+E!HBtZHsoJBB{@|gE-c%bU6FLznnsBg-$<4GQ$duqFOCHft)85^hUc)#p3 z1a|(l;m+h~S6eC>CZGHxj_Y!vh#qVXpzi$KsQ7k^^cQfWWKOH zb%X;V&Qpdi=PwS6($kA=OJmNhQ=y%9FyUyu&ma!==M zYAPNc@M_9lFqE5brzx;mV$k%(fV%*WnJ^3d&_n-p6WdIPy2E*kYxLogv^I@q?wOyn zu4X3Z$68@~Ht%nq#})_5{iE5xXfOLCJ@$U4nTt;a9@NJ`M`V0@zRS6xH8$?NNcwN> z*y;hgV`)M=OexCdv@7;e!KrT7fvrXQd@fYy(8Ec8t*EHqD+|erw3p8h`;gsIFB>At z%U4uNv$#H=j?+6yUFe@|6cTU_@ar!=VN=T;jT4{q??!*|9_*;LI7c`j%PH6O3fN<} z_#r3F&T6ncd-t<8O(r9zyBi~&J=8wo7H1r#A36HTj7FLBCpbnG#_euFN_n6Eo9=vd z!u?};leFC0-FIutz9*>CyiBSqb@YEBgOF6m%r}7UsmuzDY z`%EaK$>iZ;8I6PH&kd~)V|>FHLYjmq7-v;lm<#ZQu4FOTJI=K(*xrr`>6=c?o0Mf8 zi|nt{Ubyo6mj2%1*z+mJ;)*awkmN5nv|@ZT=c;L)!5KV7?DS4*pMY-(Bi+@0<6FNw zNqX-@O*8yJR;wZt{J7kK<3Q}>?iyH*7uGFj zOFi1g$lYvbw&f7v`e^xOCi8w$qEpzSx~87V&rK0VLCbN2oCg`KhvmhDN%EnB$``bl z8sSElDE8ym-w%DyH<@iA^nxz;eDYUf009eE6uI9~8g42%Vi|e^qho|y;x5!PwuY)+ za+$u|G97s5^lzTb)kqGp$`qj!J8CoLm1gth9_>35{;}P)j{fc)idkT`>bX<+o;~Vy9VqxZiD`7$onT1x1*!&;2q4 z<}BLXMX?xpOSn4F8o_(FHql+OrC)`aC=G_VuO^zn{W3^iD}Bb#Y+s|PO?_8f*MWr} z(uhW1XkC`JW_v?mzrUVw#7gliSM;7->5Q0siG|yGe>EE{I?J5(J%|mao)}tWr4qM( z9Cq*VHA$y-Mo(*Aqg+klg^z=ZMh+N7%ZXl}Em8N#eeRt`>3M8a>?4`Mp+m&*nyz@UdO*z3L-D 
(binary data omitted)
literal 0
HcmV?d00001

diff --git a/bench/test_sgemm_zh2.png b/bench/test_sgemm_zh2.png
new file mode 100644
index 0000000000000000000000000000000000000000..66aba94e2f77430b9b0accdfd8e2f1c0552a59eb
GIT binary patch
literal 38261
(binary data omitted)
literal 0
HcmV?d00001

diff --git a/doc/EMLL.png b/doc/EMLL.png
new file mode 100644
index 0000000000000000000000000000000000000000..a7ebffdb901e6bcbd1606baa7de0b55f88aef077
GIT binary patch
literal 3927
(binary data omitted)
literal 0
HcmV?d00001
diff --git a/doc/Usage_EN.md b/doc/Usage_EN.md
new file mode 100644
+## How to Build the Edge ML Library
+
+### Tested Compilers
+
+| Target OS | ARMv7A | ARMv8A |
+| --------- | ------ | ------ |
+| Linux | Linaro-GCC-gnueabihf 201912 | Linaro-GCC-aarch64 201912 |
+| Android | NDK-r20 clang | NDK-r20 clang |
+
+Currently, cross compilation on a Linux host is supported.
+
+### CMake Version
+
+CMake 3.7 or newer is required.
+
+### Building for Linux Targets
+
+A GCC cross-compilation toolchain (7.5.0 or later) is required.
+
+Build commands on a Linux host:
+
+```
+git clone https://github.com/netease-youdao/EMLL.git
+mkdir install
+mkdir build && cd build
+cmake .. -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_C_COMPILER=<path to the GCC compiler> [-DCMAKE_SYSROOT=<path to the sysroot of the GCC toolchain>] [-DEML_ARMV7A=ON # enable for 32-bit targets]
+make
+make install
+```
+
+### Building for Android Targets
+
+Android NDK r19 or later is required.
+
+Build commands on a Linux host:
+
+```
+git clone https://github.com/netease-youdao/EMLL.git
+mkdir install
+mkdir build && cd build
+cmake .. -DCMAKE_INSTALL_PREFIX=../install -DANDROID=ON -DANDROID_NDK=<path to the NDK> [-DANDROID_PLATFORM=<target Android SDK version>] [-DEML_ARMV7A=ON # enable for 32-bit targets]
+make
+make install
+```
+
+### Using the Built Library
+
+Building and installing produce the bin, lib and include directories under EMLL/install. The lib directory contains the generated static library libeml-armneon.a; the headers reside under EMLL/install/include (Gemm.h, Quant.h, Layer.h), which summarize the C interfaces provided by the library. An application only needs to include the corresponding headers in its sources and statically link against libeml-armneon.a.
+
+## Testing
+
+When the test option is enabled in cmake, additional executables for testing correctness and performance are generated under EMLL/install/bin. They can be run on the target device from the command line (terminal/adb).
+
+| Executable | Command-Line Usage | Notes |
+| ---------- | ------------------ | ----- |
+| test_gemm | test_gemm <M> <N> <K> <matrix_order> <num_threads> <gemm_type> | matrix_order: 0-3; gemm_type: sgemm, hgemm, u8u32, s8s32 |
+| test_bias | test_bias <major_dim> <minor_dim> <bias_type> | bias_type: 0-7 for bias, 8-9 for summing rows/columns |
+| test_quant | test_quant <array_size> <job_type> <other_args> | array_size: the number of elements; job_type: qs/qu/d/rs/ru |
+
+## API
+
+The library provides C functions for GEMM, bias and quantization.
+
+| Functions | Header |
+| --------- | ------ |
+| General Matrix Multiplication (GEMM) | include/Gemm.h |
+| Fully-Connected Layer (FC) with bias | include/Layer.h |
+| Quantization, Dequantization, Requantization | include/Quant.h |
+
+### GEMM
+
+For simplicity, the GEMM interface omits the leading-dimension parameters (LDA-LDC) and fixes alpha to 1.0.
+
+The storage order of the output matrix C is fixed to column-major. The storage orders of the input matrices are specified via function parameters. An element of a matrix is identified by its column_id (in [0, column_numbers)) and row_id (in [0, row_numbers)); the two can be combined into a 1D offset once the storage order is known:
+
+| Storage Order | Element Index |
+| ------------- | ------------- |
+| Column-Major | column_id * row_numbers + row_id |
+| Row-Major | row_id * column_numbers + column_id |
+
+The GEMM interface is summarized in [include/Gemm.h](../include/Gemm.h).
+
+#### Function Name
+
+| Data Types | Function Name |
+| ---------- | ------------- |
+| fp32 -> fp32 | sgemm |
+| fp16 -> fp16 | hgemm [1] |
+| int8 -> int32 | s8s32gemm [2] |
+| uint8 -> uint32 | u8u32gemm [2] |
+
+[1] Currently not implemented for Aarch32. Returns error code 2 when the processor does not support the ARMv8.2a-fp16 ISA.
+
+[2] Aarch64 version: uses dot-product instructions automatically on processors supporting ARMv8.2a-dotprod, and mla-long instructions otherwise.
+
+#### Function Parameters
+
+The operation of GEMM: C[MxN] = A[MxK] B[KxN] + beta * C[MxN]
+
+| Parameters | Description |
+| ---------- | ----------- |
+| a_rowmajor | The storage order of matrix A, row-major if not 0 |
+| b_rowmajor | The storage order of matrix B, row-major if not 0 |
+| A | The address of the first element in source matrix A |
+| B | The address of the first element in source matrix B |
+| C | The address of the first element in output matrix C [1] |
+| M | The number of rows in source matrix A |
+| N | The number of columns in source matrix B |
+| K | The number of columns in A, must be equal to the number of rows in B |
+| beta | The scaling factor applied to C prior to the addition of the AB product |
+| num_threads | The maximum number of threads to use in a parallel run (1 forces serial execution; 0 uses all threads provided by the OpenMP runtime) |
+
+[1] The output matrix C is fixed to column-major.
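+
+The two offset formulas above translate directly into code. The helper below is only an illustration of the indexing convention (elem_index is a hypothetical name, not part of the EMLL API):
+
+```c
+#include <stddef.h>
+#include <stdint.h>
+
+/* Offset of element (row_id, col_id) in a densely stored matrix.
+   rowmajor != 0 selects row-major storage, matching the a_rowmajor /
+   b_rowmajor convention of the GEMM interface. */
+static inline size_t elem_index(int rowmajor,
+                                uint32_t row_id, uint32_t col_id,
+                                uint32_t row_numbers, uint32_t col_numbers) {
+    return rowmajor ? (size_t)row_id * col_numbers + col_id
+                    : (size_t)col_id * row_numbers + row_id;
+}
+```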
+### Quantization
+
+Please refer to [include/Quant.h](../include/Quant.h) for details.
+
+#### Function Name
+
+| Name | Description |
+| ---- | ----------- |
+| bias_int32_t | Performs bias on a 32-bit integer matrix; can be used as a component of asymmetric quantized GEMM |
+| u8u32_sum | Performs a row-wise or column-wise sum over an 8-bit unsigned integer matrix; can be used as a component of asymmetric quantized GEMM |
+| quantize_asymmetric_fX_uY | Asymmetric quantization of X-bit float data to unsigned Y-bit values |
+| quantize_symmetric_fX_sY | Symmetric quantization of X-bit float data to signed Y-bit values |
+| dequantize_symmetric_fX_sY | Symmetric dequantization of Y-bit integer results to X-bit float ones |
+| requantize_asymmetric_XtoY | Asymmetric requantization of X-bit integer values to unsigned Y-bit values |
+| requantize_symmetric_XtoY | Symmetric requantization of X-bit integer values to signed Y-bit values |
+
diff --git a/doc/Usage_ZH.md b/doc/Usage_ZH.md
new file mode 100644
index 0000000..da846fa
--- /dev/null
+++ b/doc/Usage_ZH.md
@@ -0,0 +1,145 @@
+## 如何构建 Edge ML 库
+
+### 测试过的编译器
+
+| 端侧设备 | ARMv7A | ARMv8A |
+| -------- | ------ | ------ |
+| Linux | Linaro-GCC-gnueabihf 201912 | Linaro-GCC-aarch64 201912 |
+| Android | NDK-r20 clang | NDK-r20 clang |
+
+目前支持在 Linux 系统上交叉编译。
+
+### CMake 版本
+
+CMake 需要 3.7 或更新的版本。
+
+### 为运行 Linux 系统的端侧设备构建
+
+需要 7.5.0 及以后的 GCC 交叉编译工具链。
+
+以下为在 Linux 系统开发机上的构建命令:
+
+```
+git clone https://github.com/netease-youdao/EMLL.git
+mkdir install
+mkdir build && cd build
+cmake .. -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_C_COMPILER=GCC编译器的目录 [-DCMAKE_SYSROOT=GCC工具链中sysroot的目录] [-DEML_ARMV7A=ON #若端侧为32位请开此选项]
+make
+make install
+```
+
+### 为运行 Android 系统的端侧设备构建
+
+需要 r19 或更高版本的 Android NDK。
+
+以下为在 Linux 系统开发机上的构建命令:
+
+```
+git clone https://github.com/netease-youdao/EMLL.git
+mkdir install
+mkdir build && cd build
+cmake ..
-DCMAKE_INSTALL_PREFIX=../install -DANDROID=ON -DANDROID_NDK=NDK的安装目录 [-DANDROID_PLATFORM=目标安卓SDK版本] [-DEML_ARMV7A=ON #若端侧为32位请开此选项] +make +make install +``` + + +### 使用构建好的库 + +在 EMLL/install 下会生成 bin,lib 和 include 文件夹,其中 lib 下包含了生成的静态库 libeml-armneon.a,include 下包含了定义 EMLL 对外接口的头文件。应用程序只需在源码中包含对应的头文件,链接时静态链接 libeml-armneon.a 即可。 + +## 如何测试 Edge ML 库 + +构建过程中,默认会在 EMLL/install/bin 下生成三个用于测试的可执行文件:test_gemm,test_bias 和 test_quant。把它们拷贝到端侧设备上,命令行 (adb/ssh) 运行它们即可。 + + +| 测试程序 | 命令行参数 | 说明 | +| --------- | ------------------ | ----- | +| test_gemm | test_gemm < M > < N > < K > <源矩阵排列顺序> <并行线程数> <数据类型> | 源矩阵排列顺序:0-3;数据类型:sgemm、hgemm、u8u32、s8s32 | +| test_bias | test_bias <主维度长> <次维度长> <任务种类> | 任务种类:0-7 偏置,8-9 按行或列求和 | +| test_quant | test_quant <测试数组大小> <任务类型> <其他参数> | 任务类型:qs/qu/d/rs/ru | + +## 应用程序接口 + +Edge ML 库提供基于 C 的矩阵乘法和量化接口 + + +| 函数类别 | 头文件 | +| --------- | ------ | +| 矩阵乘法 | include/Gemm.h | +| 全连接层 | include/Layer.h | +| 量化、反量化、重量化 | include/Quant.h | + +### 矩阵乘法 + +为了简便,矩阵乘法接口去掉了 LDA-LDC 参数,固定 alpha = 1.0。 + +输出矩阵的排列顺序固定为列主序。输入矩阵的排列顺序由函数参数确定。矩阵中的每个元素位置可以通过行号 ([0,行数)) 和列号 ([0,列数)) 确定。当矩阵的排列顺序确定时,其元素地址的偏移量是确定的: + +| 排列顺序 | 元素偏移量(相对于首元素)| +| -------- | ------------------------- | +| 列主序 | 列号 * 行数 + 行号 | +| 行主序 | 行号 * 列数 + 列号 | + +具体接口定义详见[include/Gemm.h](../include/Gemm.h)。 + +#### 函数名称 + +| 数据类型 | 函数名称 | +| ---------- | ------------- | +| fp32 -> fp32 | sgemm | +| fp16 -> fp16 | hgemm [1] | +| int8 -> int32 | s8s32gemm [2] | +| uint8 -> uint32 | u8u32gemm [2] | + + +[1] 目前不支持 Aarch32 设备;当目标处理器不支持 ARMv8.2-a 半精扩展时,返回错误 2 。 + + +[2] Aarch64 版本:在支持 ARMv8.2a 点积扩展的处理器上自动使用点积指令运算,其他处理器上使用变长乘加指令运算。 + + +#### 函数参数 + + + +矩阵乘法通式:C[MxN] = A[MxK] B[KxN] + C[MxN] * beta + +| 参数 | 描述 | +| ---------- | ----------- | +| a_rowmajor | 源矩阵 A 的排列顺序,非零表示行主序 | +| b_rowmajor | 源矩阵 B 的排列顺序,非零表示行主序 | +| A | 源矩阵 A 的地址 | +| B | 源矩阵 B 的地址 | +| C | 输出矩阵 C 的地址 | +| M | 矩阵 A 的行数 | +| N | 矩阵 B 的列数 | +| K | A的列数,必须等于 B 的行数 | +| beta | 作用于矩阵 C 的预乘因子 | +| num_threads | 并行时能够使用的线程数 [2] | + + +[1] 输出矩阵 C 固定为列主序。 + + +[2] 等于 1 时运行串行版本;等于 0 时使用所有 OpenMP 运行时提供的线程。 + +### 量化相关函数 + +详见[include/Quant.h](../include/Quant.h)。 + +| 函数名 | 描述 | +| ---- | ----------- | +| bias_int32_t | 对32位整数的矩阵施加偏置;可用于非对称量化的整数乘法的后处理 | +| u8u32_sum | 对8位整数的矩阵按行或按列求和,结果存于32位向量 | +| quantize_asymmetric_fX_uY | 非对称量化,从X位浮点到Y位整数 | +| quantize_symmetric_fX_sY | 对称量化,从X位浮点到Y位整数 | +| dequantize_symmetric_fX_sY | 对称反量化,从Y位整数到X位浮点 | +| requantize_asymmetric_XtoY | 非对称重量化,从X位整数到Y位整数 | +| requantize_symmetric_XtoY | 对称重量化,从X位整数到Y位整数 | + + diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt new file mode 100644 index 0000000..6fdbd28 --- /dev/null +++ b/example/CMakeLists.txt @@ -0,0 +1,42 @@ +#Command line for Android NDK: +# cmake -DANDROID=ON -DANDROID_NDK=/path/to/ndk \ +# -DEMLL_DIR=emll/installation/path [-DEML_ARMV7A=ON] +# make + +#Command line for GCC: +# cmake [-DCMAKE_SYSROOT=/path/to/gcc/sysroot] \ +# -DCMAKE_C_COMPILER=/path/to/gcc \ +# -DEMLL_DIR=emll/installation/path [-DEML_ARMV7A=ON] +# make + +cmake_minimum_required(VERSION 3.7) +set(CMAKE_BUILD_TYPE Release) + +set(CMAKE_C_STANDARD 99) +set(CMAKE_C_FLAGS_RELEASE "-O2") + +if(ANDROID) #variable ANDROID_NDK must be provided prior to this section + set(ANDROID_PLATFORM 27) + if(EML_ARMV7A) + set(ANDROID_ABI "armeabi-v7a") + else() #armv8a + set(ANDROID_ABI "arm64-v8a") + endif() + include(${ANDROID_NDK}/build/cmake/android.toolchain.cmake) + set(RUNTIME_LIB dl log -fopenmp) +else() #Linux. 
Variable CMAKE_C_COMPILER must be provided; CMAKE_SYSROOT is optional
+  set(CMAKE_SYSTEM_NAME Linux)
+  if(EML_ARMV7A)
+    set(CMAKE_SYSTEM_PROCESSOR arm)
+  else()
+    set(CMAKE_SYSTEM_PROCESSOR aarch64)
+  endif()
+  set(RUNTIME_LIB pthread -fopenmp -lm)
+endif()
+
+# variable EMLL_DIR must be provided
+project(example_emll C)
+include_directories(${EMLL_DIR}/include)
+add_executable(example_emll_gemm Gemm.c)
+target_link_libraries(example_emll_gemm ${EMLL_DIR}/lib/libeml-armneon.a ${RUNTIME_LIB})
+
diff --git a/example/Gemm.c b/example/Gemm.c
new file mode 100644
index 0000000..7550b36
--- /dev/null
+++ b/example/Gemm.c
@@ -0,0 +1,195 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+/******************************************************************************
+ * File: example/Gemm.c
+ * Description: This file is an example of using EMLL in an application.
+ *              It computes the same fp32 matrix product in three ways:
+ *              (1) direct SGEMM;
+ *              (2) asymmetric quantization to uint8, GEMM to int32,
+ *                  then dequantization back to fp32;
+ *              (3) symmetric quantization to int8, GEMM to int32,
+ *                  then dequantization back to fp32.
+ *              Tell your compiler to search the "include" directory
+ *              of the library and link against the static
+ *              library of EMLL.
+ *****************************************************************************/
+
+#include "Gemm.h"
+#include "Quant.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <stdbool.h>
+
+int main(int argc, char **argv) {
+
+  if (argc == 1 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
+    printf("Usage: %s [M] [N] [K]\n", argv[0]);
+    return 0;
+  }
+
+  uint16_t M = 300, N = 400, K = 500;
+  if (argc > 1) M = atoi(argv[1]);
+  if (argc > 2) N = atoi(argv[2]);
+  if (argc > 3) K = atoi(argv[3]);
+
+  if (!M || !N || !K) {
+    fprintf(stderr, "Invalid (zero or negative) M, N or K.\n");
+    return -1;
+  }
+
+  printf("Test matmul C=AB with fp32, symmetric & asymmetric quantizations.\n");
+  printf("matrix A (column-major): %u x %u\n", M, K);
+  printf("matrix B (column-major): %u x %u\n", K, N);
+  printf("matrix C (column-major): %u x %u\n", M, N);
+  const uint32_t size_a = (uint32_t)M * (uint32_t)K;
+  const uint32_t size_b = (uint32_t)N * (uint32_t)K;
+  const uint32_t size_c = (uint32_t)M * (uint32_t)N;
+
+  /* allocate fp32 matrices */
+  float * const A_f = (float *)malloc(size_a * 4);
+  float * const B_f = (float *)malloc(size_b * 4);
+  float * const C_f = (float *)malloc(size_c * 4);
+
+  /* allocate quant-u8 matrices and arrays */
+  uint8_t * const A_u = (uint8_t *)malloc(size_a);
+  uint8_t * const B_u = (uint8_t *)malloc(size_b);
+  int32_t * const C_qu = (int32_t *)malloc(size_c * 4);
+  float * const C_fqu = (float *)malloc(size_c * 4);
+  uint32_t * const A_sum = (uint32_t *)malloc(M * 4);
+  uint32_t * const B_sum = (uint32_t *)malloc(N * 4);
+
+  /* allocate quant-s8 matrices and arrays */
+  int8_t * const A_s = (int8_t *)malloc(size_a);
+  int8_t * const B_s = (int8_t *)malloc(size_b);
+  int32_t * const C_qs = (int32_t *)malloc(size_c * 4);
+  float * const C_fqs = (float *)malloc(size_c * 4);
+
+  int ret_status = 0;
+  do {
+    if (!A_f || !B_f || !C_f || !A_u || !B_u || !C_qu || !C_fqu ||
+        !A_sum || !B_sum || !A_s || !B_s || !C_qs || !C_fqs) {
+      fprintf(stderr, "Memory allocation failed.\n");
+      ret_status = -1;
+      break;
+    }
+
+    /* prepare data: uniform random values in [-0.3, 0.7) */
+    srand(time(NULL));
+    for (uint32_t i = 0; i < size_a; ++i) {
+      A_f[i] = (float)rand() / (float)RAND_MAX - 0.3;
+    }
+    for (uint32_t i = 0; i < size_b; ++i) {
+      B_f[i] = (float)rand() / (float)RAND_MAX - 0.3;
+    }
+    printf("Matrix preparation done. rand [-0.3, 0.7)\n");
+
+    /* all matrices are column-major */
+    /* example 1: plain fp32 GEMM */
+    /* gemm(a_rowmajor, b_rowmajor, a_addr, b_addr, c_addr, m, n, k, beta, threads) */
+    int sgemm_status = sgemm(0, 0, A_f, B_f, C_f, M, N, K, 1, 0);
+    if (sgemm_status != 0) {
+      fprintf(stderr, "sgemm returns error code %d\n", sgemm_status);
+      ret_status = -1;
+      break;
+    }
+    printf("Normal SGEMM done.\n");
+
+    /* example 2: asymmetric quantized 8-bit GEMM */
+    float scale_a, scale_b;
+    uint8_t zero_point_a, zero_point_b;
+    /* quantize the source matrices (min > max requests a range scan) */
+    /* quant_asym(input_addr, output_addr, &zero_point, &scale, array_length, input_min, input_max) */
+    quantize_asymmetric_f32_u8(A_f, A_u, &zero_point_a, &scale_a, size_a, 0, -1);
+    quantize_asymmetric_f32_u8(B_f, B_u, &zero_point_b, &scale_b, size_b, 0, -1);
+    /* unsigned 8->32 bit GEMM */
+    /* gemm(a_rowmajor, b_rowmajor, a_addr, b_addr, c_addr, m, n, k, beta, threads) */
+    int u8u32_status = u8u32gemm(0, 0, A_u, B_u, (uint32_t *)C_qu, M, N, K, 1, 0);
+    if (u8u32_status != 0) {
+      fprintf(stderr, "u8u32gemm returns error code %d\n", u8u32_status);
+      ret_status = -1;
+      break;
+    }
+    /* sum rows/cols of the quantized sources along the K dimension */
+    u8u32_sum(A_u, A_sum, M, K, 0);
+    u8u32_sum(B_u, B_sum, K, N, 1);
+    /* correct the integer result for the zero points:
+       sum_k (Qa-Za)(Qb-Zb) = sum_k QaQb - Zb*sum(Qa) - Za*sum(Qb) + Za*Zb*K */
+    bias_int32_t(C_qu,
+                 (int32_t)zero_point_a * (int32_t)zero_point_b * (int32_t)K,
+                 (int32_t *)A_sum, -(int32_t)zero_point_b,
+                 (int32_t *)B_sum, -(int32_t)zero_point_a, M, N);
+    /* dequantize the result */
+    /* dequant(input_addr, output_addr, scale, array_length) */
+    dequantize_symmetric_f32_s32(C_qu, C_fqu, scale_a * scale_b, size_c);
+    printf("Asym quant GEMM done.\n");
+
+    /* example 3: symmetric quantized 8-bit GEMM */
+    /* quantize the source matrices */
+    /* quant_sym(input_addr, output_addr, &scale, array_length, input_min, input_max) */
+    quantize_symmetric_f32_s8(A_f, A_s, &scale_a, size_a, 0, -1);
+    quantize_symmetric_f32_s8(B_f, B_s, &scale_b, size_b, 0, -1);
+    /* signed 8->32 bit GEMM */
+    int s8s32_status = s8s32gemm(0, 0, A_s, B_s, C_qs, M, N, K, 1, 0);
+    if (s8s32_status != 0) {
+      fprintf(stderr, "s8s32gemm returns error code %d\n", s8s32_status);
+      ret_status = -1;
+      break;
+    }
+    /* dequantize the result */
+    dequantize_symmetric_f32_s32(C_qs, C_fqs, scale_a * scale_b, size_c);
+    printf("Sym quant GEMM done.\n");
+
+    /* evaluate both quantized results against the fp32 reference */
+    float max_diff_qu = 0, max_diff_qs = 0;
+    double sum_diff_sqr_qu = 0, sum_diff_sqr_qs = 0;
+    for (uint32_t i = 0; i < size_c; ++i) {
+      float tmp_diff_qu = fabsf(C_fqu[i] - C_f[i]);
+      float tmp_diff_qs = fabsf(C_fqs[i] - C_f[i]);
+      max_diff_qu = fmaxf(max_diff_qu, tmp_diff_qu);
+      max_diff_qs = fmaxf(max_diff_qs, tmp_diff_qs);
+      sum_diff_sqr_qu += (double)tmp_diff_qu * tmp_diff_qu;
+      sum_diff_sqr_qs += (double)tmp_diff_qs * tmp_diff_qs;
+    }
+    double std_dev_qu = size_c == 1 ? 0 : sqrt(sum_diff_sqr_qu / (size_c - 1));
+    double std_dev_qs = size_c == 1 ? 0 : sqrt(sum_diff_sqr_qs / (size_c - 1));
+    printf("The results of asym quant compared to std fp32: ");
+    printf("max_diff = %.2e, stdev = %.2e\n", max_diff_qu, std_dev_qu);
+    printf("The results of sym quant compared to std fp32: ");
+    printf("max_diff = %.2e, stdev = %.2e\n", max_diff_qs, std_dev_qs);
+  } while (false);
+
+  /* clean up */
+  free(A_f);
+  free(B_f);
+  free(C_f);
+  free(A_u);
+  free(B_u);
+  free(C_qu);
+  free(C_fqu);
+  free(A_sum);
+  free(B_sum);
+  free(A_s);
+  free(B_s);
+  free(C_qs);
+  free(C_fqs);
+  return ret_status;
+}
diff --git a/example/Usage_EN.md b/example/Usage_EN.md
new file mode 100644
index 0000000..5521372
--- /dev/null
+++ b/example/Usage_EN.md
@@ -0,0 +1,61 @@
+## How to link and use EMLL in your application with CMake
+
+### Build EMLL
+
+Please refer to doc/Usage_EN.md for the detailed procedure.
+
+### Include Headers in Your Source
+
+```
+#include "Gemm.h"  // for GEMM functions
+#include "Layer.h" // for FC functions
+#include "Quant.h" // for quantization/dequantization/requantization
+
+<other code>
+```
+
+### Write CMakeLists.txt
+
+You can use the default CMakeLists.txt in this folder or rewrite it as follows:
+
+```
+cmake_minimum_required(VERSION <minimum version>)
+set(CMAKE_BUILD_TYPE <build type>)
+
+set(CMAKE_C_COMPILER ndk/or/arm-gcc/compiler)
+# add your compile options
+
+project(<project name> C)
+
+add_executable(<app name> <source files>)
+target_include_directories(<app name> <EMLL install dir>/include)
+target_link_libraries(<app name> <EMLL install dir>/lib/libeml-armneon.a)
+
+if(ANDROID)
+  target_link_libraries(<app name> dl log -fopenmp)
+else()
+  target_link_libraries(<app name> pthread -fopenmp)
+endif()
+```
+
+### Build Your Application
+
+```
+cd <your project dir>
+mkdir build && cd build
+cmake .. [-DANDROID=ON # for android] [# other options of your project]
+make
+```
+
+### Example
+
+The source file "Gemm.c" in this folder gives an example of using the GEMM and quantization functions of the EMLL library. It can be built into an executable with the following commands.
+
+```
+cd <EMLL dir>/example
+mkdir build && cd build
+cmake .. [-DANDROID=ON -DANDROID_NDK=/path/to/ndk # options for Android] [-DCMAKE_C_COMPILER=/path/to/gcc [-DCMAKE_SYSROOT=/path/to/gnu/sysroot] # options for GNU-Linux] [-DEML_ARMV7A=ON # armv7 device]
+make
+# The executable "example_emll_gemm" will be generated under the build directory; copy it to the target device to run it.
+```
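+
+For quick reference, the sketch below is a minimal program calling sgemm (the sizes and the zero-filled inputs are placeholders; Gemm.c in this folder is the complete example):
+
+```c
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "Gemm.h"
+
+int main(void) {
+    const uint32_t M = 2, N = 3, K = 4;
+    /* column-major inputs and output */
+    float *A = (float *)calloc(M * K, sizeof(float));
+    float *B = (float *)calloc(K * N, sizeof(float));
+    float *C = (float *)calloc(M * N, sizeof(float));
+    if (!A || !B || !C) return 1;
+    /* ... fill A and B here ... */
+    /* flags 0,0 = column-major inputs; beta = 0 overwrites C; 1 = serial run */
+    int status = sgemm(0, 0, A, B, C, M, N, K, 0.0f, 1);
+    if (status != 0) fprintf(stderr, "sgemm failed with code %d\n", status);
+    free(A); free(B); free(C);
+    return status;
+}
+```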
diff --git a/example/Usage_ZH.md b/example/Usage_ZH.md
new file mode 100644
index 0000000..0bdb2c3
--- /dev/null
+++ b/example/Usage_ZH.md
@@ -0,0 +1,61 @@
+## 如何借助 CMake 链接和使用 EMLL
+
+### 构建 EMLL
+
+详细步骤请参阅 doc/Usage_ZH.md。
+
+### 在源码中包含 EMLL 的头文件
+
+```
+#include "Gemm.h"  // 矩阵乘法函数
+#include "Layer.h" // 全连接函数
+#include "Quant.h" // 量化、反量化、重量化
+
+<其他代码>
+```
+
+### 编写 CMakeLists.txt
+
+可以参照 example 文件夹中的 CMakeLists.txt,也可以按如下样式重写:
+
+```
+cmake_minimum_required(VERSION <用户指定的最低版本>)
+set(CMAKE_BUILD_TYPE <用户指定的构建类型>)
+
+set(CMAKE_C_COMPILER ndk/or/arm-gcc/compiler)
+# 添加其他编译选项
+
+project(<用户指定的工程名称> C)
+
+add_executable(<应用程序名> <源文件>)
+target_include_directories(<应用程序名> <EMLL安装目录>/include)
+target_link_libraries(<应用程序名> <EMLL安装目录>/lib/libeml-armneon.a)
+
+if(ANDROID)
+  target_link_libraries(<应用程序名> dl log -fopenmp)
+else()
+  target_link_libraries(<应用程序名> pthread -fopenmp)
+endif()
+```
+
+### 构建应用程序
+
+```
+cd <应用程序工程目录>
+mkdir build && cd build
+cmake .. [-DANDROID=ON #安卓平台] <其他您的工程需要的选项>
+make
+```
+
+### 示例代码
+
+本文件夹中的 Gemm.c 提供了 EMLL 函数的使用示例,可以通过以下命令编译它并用 adb 拷贝到端侧设备上运行。
+
+```
+cd <EMLL目录>/example
+mkdir build && cd build
+cmake .. [-DANDROID=ON -DANDROID_NDK=/path/to/ndk #安卓平台] [-DCMAKE_C_COMPILER=/path/to/gcc [-DCMAKE_SYSROOT=/path/to/gnu/sysroot] #GNU-Linux平台] [-DEML_ARMV7A=ON #armv7平台]
+make
+# 在 build 文件夹中生成 example_emll_gemm 程序,可到端侧设备上运行它
+```
+
diff --git a/include/Gemm.h b/include/Gemm.h
new file mode 100644
index 0000000..8480143
--- /dev/null
+++ b/include/Gemm.h
@@ -0,0 +1,120 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+#include <stdint.h>
+#include <arm_neon.h>
+
+#ifndef INCLUDE_ARM_GEMM_INTERFACE
+#define INCLUDE_ARM_GEMM_INTERFACE
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/********************************************************************
+Function: sgemm
+Description: fp32 general matrix multiplication, do C = AB + beta * C
+             with OpenMP parallelization.
+Input: int a_rowmajor: an integer indicating the storage order
+                       of input matrix A. A non-zero number selects
+                       row-major storage, 0 column-major storage.
+       int b_rowmajor: an integer indicating the storage order
+                       of input matrix B. A non-zero number selects
+                       row-major storage, 0 column-major storage.
+                       (matrix C is fixed to column-major)
+       const float *A, *B: the addresses of the input matrices
+       uint32_t M, N, K: the dimensions of the matrices
+                         A: M x K; B: K x N; C: M x N
+       float beta: the scale applied to matrix C prior to GEMM
+       uint32_t num_threads: the maximum number of threads
+                             in OpenMP parallelization.
+         0: the function determines the number of threads from
+            the problem size, using as many threads as possible
+            up to omp_get_max_threads() when M, N and K are large.
+         positive number: limits the maximum number of threads
+            the function can use in OpenMP parallelization;
+            1 forces serial execution.
+Output: float *C: the address of the output matrix
+Return: 0 on success, 1 on illegal parameters
+********************************************************************/
+int sgemm(int a_rowmajor, int b_rowmajor,
+          const float *A, const float *B, float *C,
+          uint32_t M, uint32_t N, uint32_t K,
+          float beta, uint32_t num_threads);
+
+/**************************************************************************
+Function: s8s32gemm
+Description: signed 8-bit -> 32-bit integer matrix multiplication,
+             do C = AB + beta * C with OpenMP parallelization;
+             uses *mlal NEON instructions on CPUs without the ARMv8.2a
+             dot-product feature, and *dot NEON instructions on CPUs
+             supporting ARMv8.2a-dotprod.
+Input: int a_rowmajor, b_rowmajor: the same as in function sgemm
+       const int8_t *A, *B: the addresses of the int8_t input matrices
+       M, N, K, beta, num_threads: the same meaning as in function sgemm
+Output: int32_t *C: the address of the int32_t output matrix C
+Return: 0 on success, 1 on illegal parameters
+**************************************************************************/
+int s8s32gemm(int a_rowmajor, int b_rowmajor,
+              const int8_t *A, const int8_t *B,
+              int32_t *C, uint32_t M, uint32_t N, uint32_t K,
+              int32_t beta, uint32_t num_threads);
+
+/**************************************************************************
+Function: u8u32gemm
+Description: unsigned 8-bit -> 32-bit integer matrix multiplication,
+             do C = AB + beta * C with OpenMP parallelization;
+             uses *mlal NEON instructions on CPUs without the ARMv8.2a
+             dot-product feature, and *dot NEON instructions on CPUs
+             supporting ARMv8.2a-dotprod.
+Input: int a_rowmajor, b_rowmajor: the same as in function sgemm
+       const uint8_t *A, *B: the addresses of the uint8_t input matrices
+       M, N, K, beta, num_threads: the same meaning as in function sgemm
+Output: uint32_t *C: the address of the uint32_t output matrix C
+Return: 0 on success, 1 on illegal parameters
+**************************************************************************/
+int u8u32gemm(int a_rowmajor, int b_rowmajor,
+              const uint8_t *A, const uint8_t *B,
+              uint32_t *C, uint32_t M, uint32_t N, uint32_t K,
+              uint32_t beta, uint32_t num_threads);
+
+#if __aarch64__
+/**************************************************************************
+Function: hgemm
+Description: fp16 (half precision) matrix multiplication,
+             do C = AB + beta * C with OpenMP parallelization.
+Input: int a_rowmajor, b_rowmajor: the same as in function sgemm
+       const float16_t *A, *B: the addresses of the input matrices
+       M, N, K, beta, num_threads: the same meaning as in function sgemm
+Output: float16_t *C: the address of the output matrix C
+Return: 0 on success, 1 on illegal parameters,
+        2 when the CPU doesn't support ARMv8.2a-fp16
+**************************************************************************/
+int hgemm(int a_rowmajor, int b_rowmajor,
+          const float16_t *A, const float16_t *B, float16_t *C,
+          uint32_t M, uint32_t N, uint32_t K,
+          float16_t beta, uint32_t num_threads);
+
+#endif //aarch64
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //INCLUDE_ARM_GEMM_INTERFACE
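All four GEMM functions share the beta convention: C is pre-scaled by beta before the product is added, so beta = 0 overwrites C and beta = 1 accumulates into it. A minimal sketch chaining two calls into a sum of products, assuming beta = 0 yields C = AB exactly as the formula states (sum_of_products is a hypothetical helper, not part of this header):

```c
#include <stdint.h>
#include "Gemm.h"

/* Sketch: C = A1*B1 + A2*B2 with column-major inputs, single-threaded.
   Returns the first non-zero EMLL status code, if any. */
int sum_of_products(const float *A1, const float *B1,
                    const float *A2, const float *B2, float *C,
                    uint32_t M, uint32_t N, uint32_t K) {
    int rc = sgemm(0, 0, A1, B1, C, M, N, K, 0.0f, 1); /* beta = 0: C = A1*B1  */
    if (rc != 0) return rc;
    return sgemm(0, 0, A2, B2, C, M, N, K, 1.0f, 1);   /* beta = 1: C += A2*B2 */
}
```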
diff --git a/include/Layer.h b/include/Layer.h
new file mode 100644
index 0000000..2dcd804
--- /dev/null
+++ b/include/Layer.h
@@ -0,0 +1,55 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+#include <stdint.h>
+#include <arm_neon.h>
+
+#ifndef INCLUDE_ARM_LAYER_INTERFACE
+#define INCLUDE_ARM_LAYER_INTERFACE
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+Function: fc
+Description: Function to perform the transformation of a fully-connected
+             layer, parallelized with OpenMP.
+             output = src * weight + bias
+Input: float *src: the address of the source data matrix.
+       float *weight: the address of the weight matrix.
+       float *bias: the address of the bias vector.
+Output: float *output: the address of the output matrix.
+Parameters: int M: the number of rows in the source data matrix.
+            int K: the number of columns in the source data matrix.
+            int N: the number of columns in the output matrix.
+            int trans_src: 1 for a column-major source data matrix,
+                           0 for a row-major source data matrix.
+            int trans_weight: 1 for a column-major weight matrix,
+                              0 for a row-major weight matrix.
+            int num_threads: the number of OpenMP threads to use.
+Return: 0 on success, a non-zero number on errors.
+******************************************************************************/
+int fc(const float *src, const float *weight, const float *bias,
+       float *output, int M, int K, int N, int trans_src, int trans_weight,
+       int num_threads);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
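A minimal sketch of calling fc, assuming row-major src and weight (both trans flags 0) and a bias vector with one entry per output column; the sizes are arbitrary illustrations:

```c
#include <stdio.h>
#include "Layer.h"

int main(void) {
    enum { M = 2, K = 3, N = 4 };
    float src[M * K]    = {0}; /* M x K source data, row-major     */
    float weight[K * N] = {0}; /* K x N weights, row-major         */
    float bias[N]       = {0}; /* assumed: one value per output col */
    float output[M * N];       /* M x N output matrix              */
    /* trans_src = 0 and trans_weight = 0 select row-major inputs */
    int rc = fc(src, weight, bias, output, M, K, N, 0, 0, 1);
    if (rc != 0) fprintf(stderr, "fc failed with code %d\n", rc);
    return rc;
}
```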
diff --git a/include/Quant.h b/include/Quant.h
new file mode 100644
index 0000000..7e6c17e
--- /dev/null
+++ b/include/Quant.h
@@ -0,0 +1,254 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#ifndef INCLUDE_ARM_QUANT_INTERFACE
+#define INCLUDE_ARM_QUANT_INTERFACE
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***********************************************************************
+Function: bias_int32_t
+Description: Perform a bias operation on a 32-bit signed int matrix.
+             This function can be used in asymmetric quantized GEMM.
+Parameters: dst: the address of the matrix to apply the bias on
+            bias_dim0: the bias value applied to every element
+            bias_dim1: the address of the input bias vector which
+                       will be applied to the matrix along its
+                       major dimension, i.e. when an element
+                       can be indexed by x * dim1 + y, it is
+                       biased by bias_dim1[y]. No bias is
+                       performed when a NULL pointer is passed.
+            bias_dim1_scale: the scale applied to the elements
+                             of bias_dim1[] prior to the bias
+                             operation
+            bias_dim2: the address of the input bias vector which
+                       will be applied to the matrix along its
+                       minor dimension, i.e. when an element
+                       can be indexed by x * dim1 + y, it is
+                       biased by bias_dim2[x]. No bias is
+                       performed when a NULL pointer is passed.
+            bias_dim2_scale: the scale applied to the elements
+                             of bias_dim2[] prior to the bias
+                             operation
+            dim1: the length of the major dimension of the input matrix
+            dim2: the length of the minor dimension of the input matrix
+***********************************************************************/
+void bias_int32_t(int32_t *dst, int32_t bias_dim0,
+                  const int32_t *bias_dim1, int32_t bias_dim1_scale,
+                  const int32_t *bias_dim2, int32_t bias_dim2_scale,
+                  uint32_t dim1, uint32_t dim2);
+
+/***********************************************************************
+Function: u8u32_sum
+Description: Perform a summing operation over the columns/rows of an
+             unsigned 8-bit int matrix. The sum of each column/row is
+             an unsigned 32-bit integer.
+Parameters: src: the address of the input matrix.
+            dst: the address of the output vector.
+            dim1: the length of the major dimension of the input matrix.
+            dim2: the length of the minor dimension of the input matrix.
+            (the major dimension is the vertical one for a column-major
+             matrix, or the horizontal one for a row-major matrix)
+            direction: the direction of summing
+              0: sum along the minor dimension,
+                 output_vector_size == dim1;
+              1: sum along the major dimension,
+                 output_vector_size == dim2.
+***********************************************************************/
+void u8u32_sum(const uint8_t *src, uint32_t *dst,
+               uint32_t dim1, uint32_t dim2, uint8_t direction);
+
+/***********************************************************************
+Function: quantize_asymmetric_f32_u8
+Description: Asymmetric quantization from fp32 to unsigned 8-bit int,
+             producing an 8-bit zero-point integer Z0, an fp32 scale S0
+             and quantized unsigned 8-bit data Q1-Qn on the fly.
+             For each quantized element Qi, S0 * (Qi - Z0)
+             approximates the original fp32 input Fi.
+Parameters: const float32_t *input: the address of the input fp32 array
+            uint8_t *output: the address of the output integer array
+            uint8_t *zero_point: the address to output Z0
+            float32_t *scale: the address to output S0
+            uint32_t size: the number of elements in the input
+            float32_t input_min, input_max:
+              the min and max of the input float32_t numbers.
+              When input_min > input_max, the min and max
+              of the input are reevaluated.
+***********************************************************************/
+void quantize_asymmetric_f32_u8(const float32_t *input, uint8_t *output,
+         uint8_t *zero_point, float32_t *scale, uint32_t size,
+         float32_t input_min, float32_t input_max);
+
+/***********************************************************************
+Function: quantize_symmetric_f32_s8
+Description: Symmetric quantization from fp32 to signed 8-bit int,
+             producing an fp32 scale S0 and quantized 8-bit data
+             Q1-Qn on the fly.
+             For each quantized element Qi, S0 * Qi approximates
+             the original fp32 input Fi.
+Parameters: const float32_t *input: the address of the input fp32 array
+            int8_t *output: the address of the output integer array
+            float32_t *scale: the address to output S0
+            uint32_t size: the number of elements in the input
+            float32_t input_min, input_max:
+              the min and max of the input float32_t numbers.
+              When input_min > input_max, the min and max
+              of the input are reevaluated.
+
+/***********************************************************************
+Function: quantize_symmetric_f32_s8
+Description: Symmetric quantization from fp32 to signed 8-bit int,
+             producing an fp32 scale S0 and quantized 8-bit data
+             Q1-Qn on the fly.
+             For each quantized element Qi, S0 * Qi
+             approximates the original fp32 input Fi.
+Parameters: const float32_t *input: the address of the input fp32 array
+            int8_t *output: the address of the output integer array
+            float32_t *scale: the address to output S0
+            uint32_t size: the number of elements in the input
+            float32_t input_min, input_max:
+              the min and max of the input float32_t numbers.
+              When input_min > input_max, the min and max
+              of the input are reevaluated.
+***********************************************************************/
+void quantize_symmetric_f32_s8(const float32_t *input, int8_t *output,
+  float32_t *scale, uint32_t size, float32_t input_min, float32_t input_max);
+
+/***********************************************************************
+Function: quantize_asymmetric_f32_u16
+Description: Asymmetric quantization from fp32 to unsigned 16-bit int,
+             producing a 16-bit zero-point integer Z0, an fp32 scale S0
+             and quantized unsigned 16-bit data Q1-Qn on the fly.
+             This function does the same thing as
+             quantize_asymmetric_f32_u8 except that the zero point and
+             outputs are 16-bit integers.
+***********************************************************************/
+void quantize_asymmetric_f32_u16(const float32_t *input, uint16_t *output,
+  uint16_t *zero_point, float32_t *scale, uint32_t size,
+  float32_t input_min, float32_t input_max);
+
+/***********************************************************************
+Function: quantize_symmetric_f32_s16
+Description: Symmetric quantization from fp32 to signed 16-bit int,
+             producing an fp32 scale S0 and quantized 16-bit data
+             Q1-Qn on the fly. This function does the same thing
+             as quantize_symmetric_f32_s8 except that the outputs are
+             16-bit integers.
+***********************************************************************/
+void quantize_symmetric_f32_s16(const float32_t *input, int16_t *output,
+  float32_t *scale, uint32_t size, float32_t input_min, float32_t input_max);
+
+/***********************************************************************
+Function: dequantize_symmetric_f32_s32
+Description: Convert 32-bit signed int values to fp32 ones with scaling.
+Parameters: const int32_t *src: the address of the input integer array
+            float32_t *dst: the address of the output fp32 array
+            float32_t scale: the scaling factor applied to the input
+            uint32_t size: the number of elements in the input
+***********************************************************************/
+void dequantize_symmetric_f32_s32(const int32_t *src, float32_t *dst,
+  float32_t scale, uint32_t size);
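+
+/* Sketch of how these helpers combine around an 8-bit GEMM (hedged; the
+ * names A, B, A8, B8, C32, C, sa, sb, M, K, N are illustrative, and the
+ * actual GEMM entry point lives in Gemm.h rather than here):
+ *
+ *   float32_t sa, sb;
+ *   quantize_symmetric_f32_s8(A, A8, &sa, M * K, 1.0f, 0.0f);
+ *   quantize_symmetric_f32_s8(B, B8, &sb, K * N, 1.0f, 0.0f);
+ *   // ... run the int8 GEMM, accumulating into the int32 array C32 ...
+ *   dequantize_symmetric_f32_s32(C32, C, sa * sb, M * N);
+ *
+ * In the asymmetric path, u8u32_sum() and bias_int32_t() supply the
+ * zero-point correction terms before dequantization.
+ */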
+
+/************************************************************************
+Function: requantize_asymmetric_32to8
+Description: Asymmetric requantization from signed 32-bit int to
+             unsigned 8-bit int, which produces an 8-bit zero-point
+             integer Z0, updates the fp32 scale S0 and outputs
+             requantized unsigned 8-bit data Q1-Qn on the fly.
+             For each requantized element Qi, S0 * (Qi - Z0)
+             approximates the original dequantized fp32 value Fi
+             of the corresponding 32-bit input.
+Parameters: const int32_t *input: the address of the input int array
+            uint8_t *output: the address of the output integer array
+            float *scale: the address of the scaling factor S0 to update
+            uint8_t *zero_point: the address to output Z0
+            uint32_t size: the number of elements in the input
+            int32_t input_min, input_max: the min and max value
+              of the input int32 numbers. If input_min > input_max,
+              the min and max of the input integers are recalculated.
+Note: This function is near-equivalent to the following sequence:
+        dequant_cvt_float_int32_t(input, temporal_array, *scale, size);
+        quant_unsym_float_uint8_t(temporal_array, output,
+          zero_point, scale, size);
+************************************************************************/
+void requantize_asymmetric_32to8(const int32_t *input, uint8_t *output,
+  float *scale, uint8_t *zero_point, uint32_t size,
+  int32_t input_min, int32_t input_max);
+
+/************************************************************************
+Function: requantize_symmetric_32to8
+Description: Symmetric requantization from signed 32-bit int to
+             signed 8-bit int, which updates the fp32 scale S0
+             and outputs requantized signed 8-bit data Q1-Qn
+             on the fly.
+             For each requantized element Qi, S0 * Qi
+             approximates the original dequantized fp32 value Fi
+             of the corresponding 32-bit input.
+Parameters: const int32_t *input: the address of the input int array
+            int8_t *output: the address of the output integer array
+            float *scale: the address of the scaling factor S0 to update
+            uint32_t size: the number of elements in the input
+            int32_t input_min, input_max: the min and max value
+              of the input int32 numbers. If input_min > input_max,
+              the min and max of the input integers are recalculated.
+Note: This function is near-equivalent to the following sequence:
+        dequant_cvt_float_int32_t(input, temporal_array, *scale, size);
+        quant_sym_float_int8_t(temporal_array, output, scale, size);
+************************************************************************/
+void requantize_symmetric_32to8(const int32_t *input, int8_t *output,
+  float *scale, uint32_t size,
+  int32_t input_min, int32_t input_max);
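+
+/* Sketch (illustrative names): requantization shrinks int32 GEMM
+ * accumulators back to 8 bits while keeping the represented real values
+ * approximately unchanged.
+ *
+ *   float s = sa * sb;  // scale of the int32 accumulators c32[]
+ *   requantize_symmetric_32to8(c32, c8, &s, n, 1, 0);  // min > max: rescan
+ *   // afterwards the updated s satisfies s * c8[i] ~= (sa * sb) * c32[i]
+ */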
+
+/************************************************************************
+ * Function: requantize_asymmetric_32to16
+ * Description: Asymmetric requantization from signed 32-bit int to
+ *              unsigned 16-bit int, which does the same thing as
+ *              requantize_asymmetric_32to8 except that the outputs
+ *              and zero point are 16-bit integers
+ ***********************************************************************/
+void requantize_asymmetric_32to16(const int32_t *input, uint16_t *output,
+  float *scale, uint16_t *zero_point, uint32_t size,
+  int32_t input_min, int32_t input_max);
+
+/************************************************************************
+ * Function: requantize_symmetric_32to16
+ * Description: Symmetric requantization from signed 32-bit int to
+ *              signed 16-bit int, which does the same thing as
+ *              requantize_symmetric_32to8 except that the outputs
+ *              are 16-bit integers
+ ***********************************************************************/
+void requantize_symmetric_32to16(const int32_t *input, int16_t *output,
+  float *scale, uint32_t size,
+  int32_t input_min, int32_t input_max);
+
+/************************************************************************
+ * Function: requantize_asymmetric_16to8
+ * Description: Asymmetric requantization from signed 16-bit int to
+ *              unsigned 8-bit int, which does the same thing as
+ *              requantize_asymmetric_32to8 except that the inputs
+ *              are 16-bit integers
+ ***********************************************************************/
+void requantize_asymmetric_16to8(const int16_t *input, uint8_t *output,
+  float *scale, uint8_t *zero_point, uint32_t size,
+  int16_t input_min, int16_t input_max);
+
+/************************************************************************
+ * Function: requantize_symmetric_16to8
+ * Description: Symmetric requantization from signed 16-bit int to
+ *              signed 8-bit int, which does the same thing as
+ *              requantize_symmetric_32to8 except that the inputs
+ *              are 16-bit integers
+ ***********************************************************************/
+void requantize_symmetric_16to8(const int16_t *input, int8_t *output,
+  float *scale, uint32_t size,
+  int16_t input_min, int16_t input_max);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/arm_neon/ARMCompareAndSwap.h b/include/arm_neon/ARMCompareAndSwap.h
new file mode 100644
index 0000000..6d11746
--- /dev/null
+++ b/include/arm_neon/ARMCompareAndSwap.h
@@ -0,0 +1,56 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: ARMCompareAndSwap.h
+ * Description: Atomic compare-and-swap functions on ARM processors
+ *****************************************************************************/
+
+#include <stdint.h>
+
+/******************************************************************************
+ * Function: atomicCAS_U32
+ * Description: Atomic "compare and swap" of a 32-bit integer in main memory.
+ * Parameters: comp: the value to compare
+ *             write: the value to write
+ *             dst: the memory location of the 32-bit integer
+ * Operation: # atomic operation
+ *   {
+ *     uint32_t ret = *dst;
+ *     if (*dst == comp) *dst = write;
+ *     return ret;
+ *   }
+ * Return: the original value of the 32-bit integer in memory
+ *****************************************************************************/
+uint32_t atomicCAS_U32(uint32_t comp, uint32_t write, uint32_t *dst);
+
+/******************************************************************************
+ * Function: atomicCAS_U64
+ * Description: Atomic "compare and swap" of a 64-bit integer in main memory.
+ * Parameters: comp: the value to compare
+ *             write: the value to write
+ *             dst: the memory location of the 64-bit integer
+ * Operation: # atomic operation
+ *   {
+ *     uint64_t ret = *dst;
+ *     if (*dst == comp) *dst = write;
+ *     return ret;
+ *   }
+ * Return: the original value of the 64-bit integer in memory
+ *****************************************************************************/
+uint64_t atomicCAS_U64(uint64_t comp, uint64_t write, uint64_t *dst);
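+
+/* Usage sketch: a lock-free increment built on the CAS primitive
+ * (atomic_inc_u32 is an illustrative helper, not part of this header):
+ *
+ *   static void atomic_inc_u32(uint32_t *p) {
+ *     uint32_t old = *p;
+ *     while (atomicCAS_U32(old, old + 1, p) != old) {
+ *       old = *p;  // lost the race; reload and retry
+ *     }
+ *   }
+ */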
+
diff --git a/include/arm_neon/ARMCpuType.h b/include/arm_neon/ARMCpuType.h
new file mode 100644
index 0000000..94aac99
--- /dev/null
+++ b/include/arm_neon/ARMCpuType.h
@@ -0,0 +1,85 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: ARMCpuType.h
+ * Description: Functions supporting runtime ARM CPU detection:
+ *              CPU pipeline type & ISA support.
+ *              On ARM, a user program is not allowed to access the system
+ *              registers holding CPUID information. As a result, CPU
+ *              recognition relies on a "healthy" Linux kernel which reads
+ *              those registers and stores their information into sysfs
+ *              during initialization.
+ *****************************************************************************/
+
+#include <stdint.h>
+
+/* currently these functions work only on Linux kernels released since 2015 */
+
+#ifndef INCLUDE_ARM_CPUTYPE
+#define INCLUDE_ARM_CPUTYPE
+
+/*****************************************************************************
+ * Function: blas_arm_get_cpu_type
+ * Description: Detect the NEON pipeline type of the CPU. There are 4 major
+ *              types of NEON pipelines:
+ *              (1) only one 64-bit NEON pipeline, shared by vector load &
+ *                  arith, with in-order execution,
+ *                  like that in cortex-A7 and cortex-A35.
+ *              (2) two 64-bit NEON pipelines that can be combined to execute
+ *                  128-bit wide operations, shared by vector load & arith,
+ *                  with in-order execution & dual-issue ability,
+ *                  like that in cortex-A53.
+ *              (3) has identical NEON pipelines as stated in (2), with an
+ *                  additional load unit capable of simple 64-bit NEON loads
+ *                  and element insertion, like that in cortex-A55.
+ *              (4) at least two 64-bit NEON pipelines, out-of-order
+ *                  execution, with additional load unit(s) supporting
+ *                  vector loads, like that in cortex-A57.
+ * Parameter: cpuid: the ID of the CPU core whose type needs to be
+ *            determined, e.g. the return value of sched_getcpu() when the
+ *            core where the calling thread runs needs to be determined.
+ * Return: an 8-bit integer representing the type, 35 for (1), 53 for (2),
+ *         55 for (3) and 0 for (4)
+ ****************************************************************************/
+uint8_t blas_arm_get_cpu_type(uint8_t cpuid);
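+
+/* Usage sketch (hedged): select a tuned kernel for the core the calling
+ * thread currently runs on.
+ *
+ *   #include <sched.h>
+ *   uint8_t t = blas_arm_get_cpu_type((uint8_t)sched_getcpu());
+ *   // t == 35, 53 or 55: use the kernel tuned for that in-order pipeline;
+ *   // t == 0: out-of-order core, use the generic wide kernel
+ */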
+
+/*****************************************************************************
+ * Function: blas_arm_get_fp16_support()
+ * Description: Determine the support level for half-precision arithmetic
+ *              operations on the current system. Relies on a "healthy"
+ *              Linux kernel which detects the CPU correctly.
+ * Return: 0 for no support, 1 for support of conversion from/to fp32,
+ *         2 for support of add/mul/fma operations
+ ****************************************************************************/
+uint8_t blas_arm_get_fp16_support();
+
+/*****************************************************************************
+ * Function: blas_arm_get_i8i32_support()
+ * Description: Determine the support level for int8->int32 accumulate
+ *              operations on the current system. Relies on a "healthy"
+ *              Linux kernel which detects the CPU correctly.
+ * Return: 0 for no support from SIMD, 1 for basic support with SIMD
+ *         multiply-add (*mlal) instructions, 2 for support with *dot
+ *         instructions (armv8.2a-dotprod)
+ ****************************************************************************/
+uint8_t blas_arm_get_i8i32_support();
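+
+/* Usage sketch (hedged; the dispatched kernels are placeholders for the
+ * corresponding implementations elsewhere in this library):
+ *
+ *   uint8_t level = blas_arm_get_i8i32_support();
+ *   if (level == 2) {
+ *     // use the *dot kernels (armv8.2a-dotprod)
+ *   } else if (level == 1) {
+ *     // use the *mlal-based kernels
+ *   } else {
+ *     // no SIMD support: fall back to scalar code
+ *   }
+ */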
+
+#endif
diff --git a/include/arm_neon/NeonBias.h b/include/arm_neon/NeonBias.h
new file mode 100644
index 0000000..22bd6c0
--- /dev/null
+++ b/include/arm_neon/NeonBias.h
@@ -0,0 +1,200 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+/*****************************************************************************
+ * File: NeonBias.h
+ * Description: Bias functions based on ARM NEON instructions.
+ ****************************************************************************/
+
+#include <arm_neon.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifndef INCLUDE_NEON_BIAS
+#define INCLUDE_NEON_BIAS
+
+/*****************************************************************************
+ * Template: NEON_BIAS
+ * Description: Function template for NEON-based bias
+ * Template Parameters: type_scalar: the type of scalar data,
+ *                        e.g. float for fp32 bias
+ *                      type_vector: the type of SIMD vector data,
+ *                        e.g. float32x4_t
+ *                      type_short: the short name of the data type in
+ *                        NEON intrinsics, e.g. f32 for fp32 bias
+ *                      vector_size: the length of the SIMD vector,
+ *                        e.g. 4 when type_vector == float32x4_t
+ *                      fma: the short name of the multiply-add operation
+ *                        in NEON intrinsic names. Use "fma" for fused
+ *                        multiply-add and "mla" for sequential
+ *                        multiply-add
+ * Function Parameters: C: the address of the matrix to apply the bias on
+ *                      bias_dim0: the bias value applied to every element
+ *                      bias_dim1: the address of the input bias vector
+ *                        which will be applied to the matrix along its
+ *                        major dimension, i.e. when an element can be
+ *                        indexed by x * dim1 + y, it is biased by
+ *                        bias_dim1[y]. No bias is performed when a NULL
+ *                        pointer is passed.
+ *                      bias_dim1_scale: the scale applied to the elements
+ *                        of bias_dim1[] prior to the bias operation
+ *                      bias_dim2: the address of the input bias vector
+ *                        which will be applied to the matrix along its
+ *                        minor dimension, i.e. when an element can be
+ *                        indexed by x * dim1 + y, it is biased by
+ *                        bias_dim2[x]. No bias is performed when a NULL
+ *                        pointer is passed.
+ *                      bias_dim2_scale: the scale applied to the elements
+ *                        of bias_dim2[] prior to the bias operation
+ *                      dim1: the length of the major dimension of the
+ *                        input matrix
+ *                      dim2: the length of the minor dimension of the
+ *                        input matrix
+ ****************************************************************************/
+#define NEON_BIAS(type_scalar, type_vector, type_short, vector_size, fma) \
+void bias_##type_scalar(type_scalar *C,\
+  type_scalar bias_dim0,\
+  const type_scalar *bias_dim1,\
+  type_scalar bias_dim1_scale,\
+  const type_scalar *bias_dim2,\
+  type_scalar bias_dim2_scale,\
+  uint32_t dim1, uint32_t dim2) {\
+\
+  bool do_bias_0 = (bias_dim0 != 0);\
+  bool do_bias_1 = bias_dim1 && (bias_dim1_scale != 0);\
+  bool do_bias_2 = bias_dim2 && (bias_dim2_scale != 0);\
+\
+  if (!do_bias_0 && !do_bias_1 && !do_bias_2) return;\
+\
+  if (!do_bias_1 && (do_bias_0 || do_bias_2)) {\
+    type_scalar *c_ptr = C;\
+    for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\
+      const type_scalar bs = bias_dim0 + \
+        (bias_dim2 ? bias_dim2[dim2_pos] * bias_dim2_scale : (type_scalar)0);\
+      const type_vector bv = vdupq_n_##type_short(bs);\
+      uint32_t dim1_left = dim1;\
+      for (; dim1_left >= vector_size * 4; dim1_left -= vector_size * 4) {\
+        type_vector c1 = vld1q_##type_short(c_ptr);\
+        type_vector c2 = vld1q_##type_short(c_ptr + vector_size);\
+        type_vector c3 = vld1q_##type_short(c_ptr + vector_size * 2);\
+        type_vector c4 = vld1q_##type_short(c_ptr + vector_size * 3);\
+        c1 = vaddq_##type_short(c1, bv);\
+        c2 = vaddq_##type_short(c2, bv);\
+        c3 = vaddq_##type_short(c3, bv);\
+        c4 = vaddq_##type_short(c4, bv);\
+        vst1q_##type_short(c_ptr, c1);\
+        vst1q_##type_short(c_ptr + vector_size, c2);\
+        vst1q_##type_short(c_ptr + vector_size * 2, c3);\
+        vst1q_##type_short(c_ptr + vector_size * 3, c4);\
+        c_ptr += vector_size * 4;\
+      }\
+      for (; dim1_left >= vector_size; dim1_left -= vector_size) {\
+        type_vector c1 = vld1q_##type_short(c_ptr);\
+        c1 = vaddq_##type_short(c1, bv);\
+        vst1q_##type_short(c_ptr, c1); c_ptr += vector_size;\
+      }\
+      for (; dim1_left > 0; dim1_left--) {\
+        *c_ptr += bs; c_ptr++;\
+      }\
+    }\
+  } else if (do_bias_1 && !do_bias_0 && !do_bias_2) {\
+    type_scalar *c_ptr = C;\
+    for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\
+      uint32_t dim1_left = dim1;\
+      const type_scalar *bias_ptr = bias_dim1;\
+      for (; dim1_left >= vector_size * 4; dim1_left -= vector_size * 4) {\
+        type_vector c1 = vld1q_##type_short(c_ptr);\
+        type_vector c2 = vld1q_##type_short(c_ptr + vector_size);\
+        type_vector c3 = vld1q_##type_short(c_ptr + vector_size * 2);\
+        type_vector c4 = vld1q_##type_short(c_ptr + vector_size * 3);\
+        type_vector b1 = vld1q_##type_short(bias_ptr);\
+        type_vector b2 = vld1q_##type_short(bias_ptr + vector_size);\
+        type_vector b3 = vld1q_##type_short(bias_ptr + vector_size * 2);\
+        type_vector b4 = vld1q_##type_short(bias_ptr + vector_size * 3);\
+        bias_ptr += vector_size * 4;\
+        c1 = v##fma##q_n_##type_short(c1, b1, bias_dim1_scale);\
+        c2 = v##fma##q_n_##type_short(c2, b2, bias_dim1_scale);\
+        c3 = v##fma##q_n_##type_short(c3, b3, bias_dim1_scale);\
+        c4 = v##fma##q_n_##type_short(c4, b4, bias_dim1_scale);\
+        vst1q_##type_short(c_ptr, c1);\
+        vst1q_##type_short(c_ptr + vector_size, c2);\
+        vst1q_##type_short(c_ptr + vector_size * 2, c3);\
+        vst1q_##type_short(c_ptr + vector_size * 3, c4);\
+        c_ptr += vector_size * 4;\
+      }\
+      for (; dim1_left >= vector_size; dim1_left -= vector_size) {\
+        type_vector c1 = vld1q_##type_short(c_ptr);\
+        type_vector b1 = vld1q_##type_short(bias_ptr);\
+        bias_ptr += vector_size;\
+        c1 = v##fma##q_n_##type_short(c1, b1, bias_dim1_scale);\
+        vst1q_##type_short(c_ptr, c1);\
+        c_ptr += vector_size;\
+      }\
+      for (; dim1_left > 0; dim1_left--) {\
+        *c_ptr += (*bias_ptr) * bias_dim1_scale; bias_ptr++; c_ptr++;\
+      }\
+    }\
+  } else {\
+    type_scalar *c_ptr = C;\
+    for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\
+      const type_scalar bs = bias_dim0 + \
+        (bias_dim2 ? bias_dim2[dim2_pos] * bias_dim2_scale : (type_scalar)0);\
+      const type_vector bv = vdupq_n_##type_short(bs);\
+      const type_scalar *bias_ptr = bias_dim1;\
+      uint32_t dim1_left = dim1;\
+      for (; dim1_left >= vector_size * 4; dim1_left -= vector_size * 4) {\
+        type_vector c1 = vld1q_##type_short(c_ptr);\
+        type_vector c2 = vld1q_##type_short(c_ptr + vector_size);\
+        type_vector c3 = vld1q_##type_short(c_ptr + vector_size * 2);\
+        type_vector c4 = vld1q_##type_short(c_ptr + vector_size * 3);\
+        c1 = vaddq_##type_short(c1, bv);\
+        c2 = vaddq_##type_short(c2, bv);\
+        c3 = vaddq_##type_short(c3, bv);\
+        c4 = vaddq_##type_short(c4, bv);\
+        type_vector b1 = vld1q_##type_short(bias_ptr);\
+        type_vector b2 = vld1q_##type_short(bias_ptr + vector_size);\
+        type_vector b3 = vld1q_##type_short(bias_ptr + vector_size * 2);\
+        type_vector b4 = vld1q_##type_short(bias_ptr + vector_size * 3);\
+        bias_ptr += vector_size * 4;\
+        c1 = v##fma##q_n_##type_short(c1, b1, bias_dim1_scale);\
+        c2 = v##fma##q_n_##type_short(c2, b2, bias_dim1_scale);\
+        c3 = v##fma##q_n_##type_short(c3, b3, bias_dim1_scale);\
+        c4 = v##fma##q_n_##type_short(c4, b4, bias_dim1_scale);\
+        vst1q_##type_short(c_ptr, c1);\
+        vst1q_##type_short(c_ptr + vector_size, c2);\
+        vst1q_##type_short(c_ptr + vector_size * 2, c3);\
+        vst1q_##type_short(c_ptr + vector_size * 3, c4);\
+        c_ptr += vector_size * 4;\
+      }\
+      for (; dim1_left >= vector_size; dim1_left -= vector_size) {\
+        type_vector c1 = vld1q_##type_short(c_ptr);\
+        c1 = vaddq_##type_short(c1, bv);\
+        type_vector b1 = vld1q_##type_short(bias_ptr);\
+        bias_ptr += vector_size;\
+        c1 = v##fma##q_n_##type_short(c1, b1, bias_dim1_scale);\
+        vst1q_##type_short(c_ptr, c1);\
+        c_ptr += vector_size;\
+      }\
+      for (; dim1_left > 0; dim1_left--) {\
+        *c_ptr += (*bias_ptr) * bias_dim1_scale + bs;\
+        bias_ptr++; c_ptr++;\
+      }\
+    }\
+  }\
+}
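+
+/* Instantiation sketch (assumed parameters; the int32 line would produce a
+ * bias_int32_t matching the declaration in Quant.h):
+ *
+ *   NEON_BIAS(float, float32x4_t, f32, 4, fma)    // defines bias_float()
+ *   NEON_BIAS(int32_t, int32x4_t, s32, 4, mla)    // defines bias_int32_t()
+ */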
+
+#endif
+
diff --git a/include/arm_neon/NeonExtreme.h b/include/arm_neon/NeonExtreme.h
new file mode 100644
index 0000000..3255e30
--- /dev/null
+++ b/include/arm_neon/NeonExtreme.h
@@ -0,0 +1,112 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: NeonExtreme.h
+ * Description: Source code template for NEON max/min functions.
+ *****************************************************************************/
+
+#include "common/ExpandMacro.h"
+#include <arm_neon.h>
+#include <stdint.h>
+
+#ifndef INCLUDE_NEON_EXTREME
+#define INCLUDE_NEON_EXTREME
+
+#define NEON_REDUC_S_ITEM(n, type, short) {\
+  type tmin = vget_lane_##short(vmin1d, n - 1);\
+  type tmax = vget_lane_##short(vmax1d, n - 1);\
+  smin = tmin < smin ? tmin : smin;\
+  smax = tmax > smax ? tmax : smax;\
+}
+
+#define NEON_REDUC_S_MIN_MAX(n, type, short) \
+  MACRO_EXPANSION_##n(VOID_BASE, NEON_REDUC_S_ITEM, type, short)
+
+#define NEON_FIND_EXTREME(type, short, dvec, qvec, dlen) \
+static inline void inline_find_extreme_##type(const type *dat, uint32_t size,\
+  type *min, type *max) {\
+\
+  qvec vmin1, vmin2, vmin3, vmin4;\
+  qvec vmax1, vmax2, vmax3, vmax4;\
+\
+  if (size == 0) return;\
+  vmin1 = vmin2 = vmin3 = vmin4 = \
+    vmax1 = vmax2 = vmax3 = vmax4 = vld1q_dup_##short(dat);\
+  uint32_t elem_left = size;\
+  for (; elem_left >= dlen * 8; elem_left -= dlen * 8) {\
+    qvec l1 = vld1q_##short(dat);\
+    qvec l2 = vld1q_##short(dat + dlen * 2);\
+    qvec l3 = vld1q_##short(dat + dlen * 4);\
+    qvec l4 = vld1q_##short(dat + dlen * 6);\
+    dat += dlen * 8;\
+    vmin1 = vminq_##short(vmin1, l1);\
+    vmax1 = vmaxq_##short(vmax1, l1);\
+    vmin2 = vminq_##short(vmin2, l2);\
+    vmax2 = vmaxq_##short(vmax2, l2);\
+    vmin3 = vminq_##short(vmin3, l3);\
+    vmax3 = vmaxq_##short(vmax3, l3);\
+    vmin4 = vminq_##short(vmin4, l4);\
+    vmax4 = vmaxq_##short(vmax4, l4);\
+  }\
+  vmin1 = vminq_##short(vmin1, vmin3);\
+  vmin2 = vminq_##short(vmin2, vmin4);\
+  vmax1 = vmaxq_##short(vmax1, vmax3);\
+  vmax2 = vmaxq_##short(vmax2, vmax4);\
+  if (elem_left >= dlen * 4) {\
+    qvec l1 = vld1q_##short(dat);\
+    qvec l2 = vld1q_##short(dat + dlen * 2);\
+    dat += dlen * 4;\
+    vmin1 = vminq_##short(vmin1, l1);\
+    vmax1 = vmaxq_##short(vmax1, l1);\
+    vmin2 = vminq_##short(vmin2, l2);\
+    vmax2 = vmaxq_##short(vmax2, l2);\
+    elem_left -= dlen * 4;\
+  }\
+  vmin1 = vminq_##short(vmin1, vmin2);\
+  vmax1 = vmaxq_##short(vmax1, vmax2);\
+  if (elem_left >= dlen * 2) {\
+    qvec l1 = vld1q_##short(dat);\
+    dat += dlen * 2;\
+    vmin1 = vminq_##short(vmin1, l1);\
+    vmax1 = vmaxq_##short(vmax1, l1);\
+    elem_left -= dlen * 2;\
+  }\
+  dvec vmin1d = vmin_##short(vget_low_##short(vmin1),\
+    vget_high_##short(vmin1));\
+  dvec vmax1d = vmax_##short(vget_low_##short(vmax1),\
+    vget_high_##short(vmax1));\
+  if (elem_left >= dlen) {\
+    dvec d1 = vld1_##short(dat);\
+    dat += dlen;\
+    vmin1d = vmin_##short(vmin1d, d1);\
+    vmax1d = vmax_##short(vmax1d, d1);\
+    elem_left -= dlen;\
+  }\
+  type smin = vget_lane_##short(vmin1d, 0);\
+  type smax = vget_lane_##short(vmax1d, 0);\
+  NEON_REDUC_S_MIN_MAX(dlen, type, short)\
+  for (; elem_left > 0; elem_left--) {\
+    type s1 = *dat++;\
+    smin = s1 < smin ? s1 : smin;\
+    smax = s1 > smax ? s1 : smax;\
+  }\
+  *min = smin;\
+  *max = smax;\
+}
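+
+/* Instantiation sketch (assumed parameters): for fp32 data,
+ *
+ *   NEON_FIND_EXTREME(float32_t, f32, float32x2_t, float32x4_t, 2)
+ *
+ * defines inline_find_extreme_float32_t(dat, size, min, max), the kind of
+ * min/max scan a quantization routine can run when the caller passes
+ * input_min > input_max.
+ */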
+
+#endif
diff --git a/include/arm_neon/NeonI8I32DotGemmSkinnyDot.h b/include/arm_neon/NeonI8I32DotGemmSkinnyDot.h
new file mode 100644
index 0000000..c3dfa07
--- /dev/null
+++ b/include/arm_neon/NeonI8I32DotGemmSkinnyDot.h
@@ -0,0 +1,153 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+/*****************************************************************************
+ * File: NeonI8I32DotGemmSkinnyDot.h
+ * Description: Source code template for NEON 8->32bit GEMM skinny dot kernel
+ ****************************************************************************/
+
+#include "common/CommonSkinnyDot.h"
+#include "arm_neon/NeonIntOpSign.h"
+
+#ifndef INCLUDE_I8I32_DOT_SKINNYDOT
+#define INCLUDE_I8I32_DOT_SKINNYDOT
+
+typedef I8 I8I32DOTGEMM_SKINNYDOT_ASCALAR;
+typedef I8 I8I32DOTGEMM_SKINNYDOT_BSCALAR;
+typedef I32 I8I32DOTGEMM_SKINNYDOT_CSCALAR;
+
+typedef I16 I8I32DOTGEMM_SKINNYDOT_AVEC1;
+typedef I16 I8I32DOTGEMM_SKINNYDOT_BVEC1;
+typedef I32 I8I32DOTGEMM_SKINNYDOT_CVEC1;
+
+typedef I8X8 I8I32DOTGEMM_SKINNYDOT_AVEC4;
+typedef I8X8 I8I32DOTGEMM_SKINNYDOT_BVEC4;
+typedef I32X2 I8I32DOTGEMM_SKINNYDOT_CVEC4;
+
+typedef I8X8 I8I32DOTGEMM_SKINNYDOT_AVEC8;
+typedef I8X8 I8I32DOTGEMM_SKINNYDOT_BVEC8;
+typedef I32X2 I8I32DOTGEMM_SKINNYDOT_CVEC8;
+
+typedef I8X16 I8I32DOTGEMM_SKINNYDOT_AVEC16;
+typedef I8X16 I8I32DOTGEMM_SKINNYDOT_BVEC16;
+typedef I32X4 I8I32DOTGEMM_SKINNYDOT_CVEC16;
+
+#define GEMM_SKINNY_DOT_UNIT_DEDUCE(TYPE, ...) \
+  GEMM_SKINNY_DOT_##TYPE##_UNIT(__VA_ARGS__)
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(CALC, I8I32DOTGEMM, 16) {
+  return VDOTQ_I32(c_vec, a_vec, b_vec);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(CALC, I8I32DOTGEMM, 8) {
+  return VDOT_I32(c_vec, a_vec, b_vec);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(CALC, I8I32DOTGEMM, 4) {
+  return VDOT_I32(c_vec, a_vec, b_vec);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(CALC, I8I32DOTGEMM, 1) {
+  return c_vec + a_vec * b_vec;
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADA, I8I32DOTGEMM, 16) {
+#if __aarch64__
+  __asm__("prfm pldl1keep,[%0,#80]"::"r"(a_ptr):);
+#else
+  __asm__("pld [%0,#80]"::"r"(a_ptr):);
+#endif
+  return VLD1Q_I8(a_ptr);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADA, I8I32DOTGEMM, 8) {
+#if __aarch64__
+  __asm__("prfm pldl1keep,[%0,#72]"::"r"(a_ptr):);
+#else
+  __asm__("pld [%0,#72]"::"r"(a_ptr):);
+#endif
+  return VLD1_I8(a_ptr);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADA, I8I32DOTGEMM, 4) {
+#if __aarch64__
+  I8X8 ret; /* higher 4 elements not used */
+  __asm__("ldr %s0,[%1]; prfm pldl1keep,[%1,#72]":"=w"(ret):"r"(a_ptr):"memory");
+#else
+  register I8X8 ret __asm("d0"); /* higher 4 elements not used */
+  __asm__("vld1.32 {%0[0]},[%1]; pld [%1,#72]":"=w"(ret):"r"(a_ptr):"memory");
+#endif
+  return ret;
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADA, I8I32DOTGEMM, 1) {
+  return *a_ptr;
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADB, I8I32DOTGEMM, 16) {
+  return VLD1Q_I8(b_ptr);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADB, I8I32DOTGEMM, 8) {
+  return VLD1_I8(b_ptr);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADB, I8I32DOTGEMM, 4) {
+#if __aarch64__
+  I8X8 ret; /* higher 4 elements not used */
+  __asm__("ldr %s0,[%1]":"=w"(ret):"r"(b_ptr):"memory");
+#else
+  register I8X8 ret __asm("d0"); /* higher 4 elements not used */
+  __asm__("vld1.32 {%0[0]},[%1]":"=w"(ret):"r"(b_ptr):"memory");
+#endif
+  return ret;
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADB,
I8I32DOTGEMM, 1) { + return *b_ptr; +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(REDUC, I8I32DOTGEMM, 16, 8) { + return VADD_I32(VGET_LOW_I32(c_vec), VGET_HIGH_I32(c_vec)); +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(REDUC, I8I32DOTGEMM, 8, 4) { + const static I32X2 z0 = {0, 0}; + return VPADD_I32(c_vec, z0); +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(REDUC, I8I32DOTGEMM, 4, 1) { + return VGET_LANE_I32(c_vec, 0); +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(INITC, I8I32DOTGEMM, 16) { + return VDUPQ_N_I32(0); +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(INITC, I8I32DOTGEMM, 8) { + return VDUP_N_I32(0); +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(INITC, I8I32DOTGEMM, 4) { + return VDUP_N_I32(0); +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(INITC, I8I32DOTGEMM, 1) { + return 0; +} + +#endif \ No newline at end of file diff --git a/include/arm_neon/NeonI8I32MlaGemmCopy.h b/include/arm_neon/NeonI8I32MlaGemmCopy.h new file mode 100644 index 0000000..1ac52c3 --- /dev/null +++ b/include/arm_neon/NeonI8I32MlaGemmCopy.h @@ -0,0 +1,181 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/***************************************************************************** + * File: NeonI8I32MlaGemmCopy.h + * Description: Source code template for NEON 8->32bit GEMM packing functions + ****************************************************************************/ + +#include "NeonIntOpSign.h" + +#ifndef INCLUDE_NEON_I8I32_COPY +#define INCLUDE_NEON_I8I32_COPY + +static inline void pref_ab(const I8 *dat) { +#if __aarch64__ + __asm__ ("prfm pldl1keep,[%0,#64]\n\t"::"r"(dat):); +#else + __asm__ ("pld [%0,#64]\n\t"::"r"(dat):); +#endif +} + +#define NCOPY_LOOP_K8_UNROLL4(inc, dst_ptr, src1, src2, src3, src4) \ + for (dim1_count = dim1_cache; dim1_count > 7; dim1_count -= 8) {\ + I8X8 d1 = VLD1_I8(src1); src1 += 8; pref_ab(src1);\ + I8X8 d2 = VLD1_I8(src2); src2 += 8; pref_ab(src2);\ + I8X8 d3 = VLD1_I8(src3); src3 += 8; pref_ab(src3);\ + I8X8 d4 = VLD1_I8(src4); src4 += 8; pref_ab(src4);\ + I16X8X4 tm1;\ + tm1.val[0] = VMOVL_I8(d1); tm1.val[1] = VMOVL_I8(d2);\ + tm1.val[2] = VMOVL_I8(d3); tm1.val[3] = VMOVL_I8(d4);\ + VST4Q_LANE_I16(dst_ptr, tm1, 0);\ + VST4Q_LANE_I16(dst_ptr + inc, tm1, 1);\ + VST4Q_LANE_I16(dst_ptr + inc * 2, tm1, 2);\ + VST4Q_LANE_I16(dst_ptr + inc * 3, tm1, 3);\ + VST4Q_LANE_I16(dst_ptr + inc * 4, tm1, 4);\ + VST4Q_LANE_I16(dst_ptr + inc * 5, tm1, 5);\ + VST4Q_LANE_I16(dst_ptr + inc * 6, tm1, 6);\ + VST4Q_LANE_I16(dst_ptr + inc * 7, tm1, 7);\ + dst_ptr += inc * 8;\ + } + +#define NCOPY_LOOP_K8_UNROLL3(inc, dst_ptr, src1, src2, src3) \ + for (dim1_count = dim1_cache; dim1_count > 7; dim1_count -= 8) {\ + I8X8 d1 = VLD1_I8(src1); src1 += 8; pref_ab(src1);\ + I8X8 d2 = VLD1_I8(src2); src2 += 8; pref_ab(src2);\ + I8X8 d3 = VLD1_I8(src3); src3 += 8; pref_ab(src3);\ + I16X8X3 tm1;\ + tm1.val[0] = VMOVL_I8(d1);\ + tm1.val[1] = 
VMOVL_I8(d2);\ + tm1.val[2] = VMOVL_I8(d3);\ + VST3Q_LANE_I16(dst_ptr, tm1, 0);\ + VST3Q_LANE_I16(dst_ptr + inc, tm1, 1);\ + VST3Q_LANE_I16(dst_ptr + inc * 2, tm1, 2);\ + VST3Q_LANE_I16(dst_ptr + inc * 3, tm1, 3);\ + VST3Q_LANE_I16(dst_ptr + inc * 4, tm1, 4);\ + VST3Q_LANE_I16(dst_ptr + inc * 5, tm1, 5);\ + VST3Q_LANE_I16(dst_ptr + inc * 6, tm1, 6);\ + VST3Q_LANE_I16(dst_ptr + inc * 7, tm1, 7);\ + dst_ptr += inc * 8;\ + } + +#define NCOPY_UNROLL_12 {\ + I16 *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\ + NCOPY_LOOP_K8_UNROLL4(12, dst_h1, src1, src2, src3, src4)\ + dst_h1 = dst1 + 4;\ + NCOPY_LOOP_K8_UNROLL4(12, dst_h1, src5, src6, src7, src8)\ + dst_h1 = dst1 + 8;\ + NCOPY_LOOP_K8_UNROLL4(12, dst_h1, src9, src10, src11, src12)\ + dst1 = dst_h1 - 8;\ + NCOPY_STD(12)\ +} + +#define NCOPY_UNROLL_8 {\ + I16 *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\ + NCOPY_LOOP_K8_UNROLL4(8, dst_h1, src1, src2, src3, src4)\ + dst_h1 = dst1 + 4;\ + NCOPY_LOOP_K8_UNROLL4(8, dst_h1, src5, src6, src7, src8)\ + dst1 = dst_h1 - 4;\ + NCOPY_STD(8)\ +} + +#define NCOPY_UNROLL_6 {\ + I16 *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\ + NCOPY_LOOP_K8_UNROLL3(6, dst_h1, src1, src2, src3)\ + dst_h1 = dst1 + 3;\ + NCOPY_LOOP_K8_UNROLL3(6, dst_h1, src4, src5, src6)\ + dst1 = dst_h1 - 3;\ + NCOPY_STD(6)\ +} + +#define NCOPY_UNROLL_4 {\ + uint32_t dim1_cache = dim1_count;\ + NCOPY_LOOP_K8_UNROLL4(4, dst1, src1, src2, src3, src4)\ + NCOPY_STD(4)\ +} + +#define NCOPY_UNROLL_2 NCOPY_STD(2) +#define NCOPY_UNROLL_1 NCOPY_STD(1) + +#ifdef GEMM_UNSIGNED_INT +#define NCOPY_uint8_t_uint16_t(unroll) NCOPY_UNROLL_##unroll +#else +#define NCOPY_int8_t_int16_t(unroll) NCOPY_UNROLL_##unroll +#endif + +#define TCOPY_UNIT_1(src_ptr, dst_ptr, dst_offset) \ + TCOPY_UNIT_STD(src_ptr, dst_ptr, dst_offset, 1) + +#define TCOPY_UNIT_2(src_ptr, dst_ptr, dst_offset) \ + TCOPY_UNIT_STD(src_ptr, dst_ptr, dst_offset, 2) + +static inline I16X4 vld1_i16_i8(const I8 *src) { +#if __aarch64__ + I16X4 ret; + __asm__("ldr %s0,[%1]; "ISHLL" %0.8h,%0.8b,#0\n\t" + :"=w"(ret):"r"(src):"memory","cc"); + return ret; +#else + I16X8 ret; + __asm__("vld1.32 {d0[0]},[%1]; "ASM_VMOVL_I8" %q0,d0\n\t" + :"=w"(ret):"r"(src):"memory","cc","d0"); + return VGET_LOW_I16(ret); +#endif +} + +static inline I16X8 vld1q_i16_i8(const I8 *src) { + return VMOVL_I8(VLD1_I8(src)); +} + +#define TCOPY_UNIT_4(src_ptr, dst_ptr, dst_offset) {\ + I16X4 tmp = vld1_i16_i8(src_ptr);\ + VST1_I16(dst_ptr + dst_offset, tmp);\ +} + +#define TCOPY_UNIT_6(src_ptr, dst_ptr, dst_offset) {\ + I16X4 tmp = vld1_i16_i8(src_ptr);\ + I16 t5 = src_ptr[4];\ + I16 t6 = src_ptr[5];\ + pref_ab(src_ptr + 6);\ + VST1_I16(dst_ptr + dst_offset, tmp);\ + dst_ptr[dst_offset + 4] = t5;\ + dst_ptr[dst_offset + 5] = t6;\ +} + +#define TCOPY_UNIT_8(src_ptr, dst_ptr, dst_offset) {\ + I16X8 tmp = vld1q_i16_i8(src_ptr);\ + pref_ab(src_ptr + 8);\ + VST1Q_I16(dst_ptr + dst_offset, tmp);\ +} + +#define TCOPY_UNIT_12(src_ptr, dst_ptr, dst_offset) {\ + I16X8 tmpq = vld1q_i16_i8(src_ptr);\ + I16X4 tmpd = vld1_i16_i8(src_ptr + 8);\ + pref_ab(src_ptr + 12);\ + VST1Q_I16(dst_ptr + dst_offset, tmpq);\ + VST1_I16(dst_ptr + dst_offset + 8, tmpd);\ +} + +#ifdef GEMM_UNSIGNED_INT +#define TCOPY_UNIT_uint8_t_uint16_t(src_ptr, dst_ptr, dst_offset, num_elements) \ + TCOPY_UNIT_##num_elements(src_ptr, dst_ptr, dst_offset) +#else +#define TCOPY_UNIT_int8_t_int16_t(src_ptr, dst_ptr, dst_offset, num_elements) \ + TCOPY_UNIT_##num_elements(src_ptr, dst_ptr, dst_offset) +#endif + +#endif diff --git 
a/include/arm_neon/NeonI8I32MlaGemmKernel.h b/include/arm_neon/NeonI8I32MlaGemmKernel.h
new file mode 100644
index 0000000..5c9fcde
--- /dev/null
+++ b/include/arm_neon/NeonI8I32MlaGemmKernel.h
@@ -0,0 +1,742 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: NeonI8I32MlaGemmKernel.h
+ * Description: Inline kernel function templates for NEON 8->32bit integer
+ *              GEMM. This template can be used to generate signed and
+ *              unsigned integer matmul functions.
+ * Names: "KERNEL_MxNy": code blocks that read panels from the source
+ *          matrices and multiply them.
+ *        "SAVE_MxNy": code blocks that store the multiply results
+ *          to a region of the output matrix.
+ *****************************************************************************/
+
+#include "arm_neon/NeonIntOpSign.h"
+
+#ifndef INCLUDE_NEON_I8I32_KERNEL
+#define INCLUDE_NEON_I8I32_KERNEL
+
+#define COMMON_KERNEL_HEADER(a_head, b_head) \
+  const I16 *a_ptr = a_head;\
+  const I16 *b_ptr = b_head;\
+  uint32_t k_left = K;
+
+#define KERNEL_M1N1 \
+  COMMON_KERNEL_HEADER(a_head, b_head) \
+  I16X4 ad1, bd1;\
+  I32X4 cq1 = VDUPQ_N_I32(0);\
+  for (; k_left > 3; k_left -= 4) {\
+    ad1 = VLD1_I16(a_ptr); a_ptr += 4;\
+    bd1 = VLD1_I16(b_ptr); b_ptr += 4;\
+    cq1 = VMLAL_I16(cq1, ad1, bd1);\
+  }\
+  I32X2 cd1 = VADD_I32(VGET_LOW_I32(cq1), VGET_HIGH_I32(cq1));\
+  I32 cs1 = VGET_LANE_I32(cd1, 0) + VGET_LANE_I32(cd1, 1);\
+  for (; k_left > 0; k_left--) {\
+    cs1 += (I32)(*a_ptr++) * (I32)(*b_ptr++);\
+  }
+
+#define SAVE_M1N1 \
+  cs1 += c_ptr[0] * beta; c_ptr[0] = cs1;
+
+#define KERNEL_M2N1_UNIT(a_head, b_head) \
+  COMMON_KERNEL_HEADER(a_head, b_head) \
+  I16X4 ad1, ad2, bd1;\
+  I16X4X2 add1;\
+  I32X4 cq1, cq2;\
+  cq1 = cq2 = VDUPQ_N_I32(0);\
+  for (; k_left > 3; k_left -= 4) {\
+    ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4); a_ptr += 8;\
+    bd1 = VLD1_I16(b_ptr); b_ptr += 4;\
+    add1 = VUZP_I16(ad1, ad2);\
+    cq1 = VMLAL_I16(cq1, add1.val[0], bd1);\
+    cq2 = VMLAL_I16(cq2, add1.val[1], bd1);\
+  }\
+  I32X2 cd1 = VADD_I32(VGET_LOW_I32(cq1), VGET_HIGH_I32(cq1));\
+  I32X2 cd2 = VADD_I32(VGET_LOW_I32(cq2), VGET_HIGH_I32(cq2));\
+  I32 cs1 = VGET_LANE_I32(cd1, 0) + VGET_LANE_I32(cd1, 1);\
+  I32 cs2 = VGET_LANE_I32(cd2, 0) + VGET_LANE_I32(cd2, 1);\
+  for (; k_left > 0; k_left--) {\
+    I32 bs1 = *b_ptr++;\
+    cs1 += (I32)a_ptr[0] * bs1;\
+    cs2 += (I32)a_ptr[1] * bs1;\
+    a_ptr += 2;\
+  }
+
+#define KERNEL_M2N1 KERNEL_M2N1_UNIT(a_head, b_head)
+#define KERNEL_M1N2 KERNEL_M2N1_UNIT(b_head, a_head)
+
+#define SAVE_M2N1 \
+  cs1 += c_ptr[0] * beta; cs2 += c_ptr[1] * beta;\
+  c_ptr[0] = cs1; c_ptr[1] = cs2;
+
+#define SAVE_M1N2 \
+  cs1 += c_ptr[0] * beta; cs2 += c_ptr[ldc] * beta;\
+  c_ptr[0] = cs1; c_ptr[ldc] = cs2;
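+
+/* Usage sketch (hedged): a driver that owns packed panels a_head/b_head of
+ * depth K expands a kernel/save pair for each output tile; C, m_pos, n_pos,
+ * ldc and beta below are illustrative names supplied by the driver code.
+ *
+ *   const I16 *a_head = ...;  // panels packed by the copy routines
+ *   const I16 *b_head = ...;
+ *   I32 *c_ptr = C + n_pos * ldc + m_pos;
+ *   KERNEL_M2N1  // accumulate cs1, cs2 over the K dimension
+ *   SAVE_M2N1    // c_ptr[i] = cs_i + beta * c_ptr[i]
+ */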
+
+#define KERNEL_M2N2 \
+  COMMON_KERNEL_HEADER(a_head, b_head) \
+  I16X4 ad1, ad2, bd1, bd2;\
+  I16X4X2 add1, bdd1;\
+  I32X4 cq1, cq2, cq3, cq4;\
+  cq1 = cq2 = cq3 = cq4 = VDUPQ_N_I32(0);\
+  for (; k_left > 3; k_left -= 4) {\
+    ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4); a_ptr += 8;\
+    bd1 = VLD1_I16(b_ptr); bd2 = VLD1_I16(b_ptr + 4); b_ptr += 8;\
+    add1 = VUZP_I16(ad1, ad2); bdd1 = VUZP_I16(bd1, bd2);\
+    cq1 = VMLAL_I16(cq1, add1.val[0], bdd1.val[0]);\
+    cq2 = VMLAL_I16(cq2, add1.val[1], bdd1.val[0]);\
+    cq3 = VMLAL_I16(cq3, add1.val[0], bdd1.val[1]);\
+    cq4 = VMLAL_I16(cq4, add1.val[1], bdd1.val[1]);\
+  }\
+  I32X2 cd1 = VADD_I32(VGET_LOW_I32(cq1), VGET_HIGH_I32(cq1));\
+  I32X2 cd2 = VADD_I32(VGET_LOW_I32(cq2), VGET_HIGH_I32(cq2));\
+  I32X2 cd3 = VADD_I32(VGET_LOW_I32(cq3), VGET_HIGH_I32(cq3));\
+  I32X2 cd4 = VADD_I32(VGET_LOW_I32(cq4), VGET_HIGH_I32(cq4));\
+  I32 cs1 = VGET_LANE_I32(cd1, 0) + VGET_LANE_I32(cd1, 1);\
+  I32 cs2 = VGET_LANE_I32(cd2, 0) + VGET_LANE_I32(cd2, 1);\
+  I32 cs3 = VGET_LANE_I32(cd3, 0) + VGET_LANE_I32(cd3, 1);\
+  I32 cs4 = VGET_LANE_I32(cd4, 0) + VGET_LANE_I32(cd4, 1);\
+  for (; k_left > 0; k_left--) {\
+    I32 as1 = a_ptr[0];\
+    I32 as2 = a_ptr[1]; a_ptr += 2;\
+    I32 bs1 = b_ptr[0];\
+    I32 bs2 = b_ptr[1]; b_ptr += 2;\
+    cs1 += as1 * bs1; cs2 += as2 * bs1;\
+    cs3 += as1 * bs2; cs4 += as2 * bs2;\
+  }
+
+#define SAVE_M2N2 \
+  I32 *c_l1 = c_ptr + ldc;\
+  cs1 += c_ptr[0] * beta; cs2 += c_ptr[1] * beta;\
+  cs3 += c_l1[0] * beta; cs4 += c_l1[1] * beta;\
+  c_ptr[0] = cs1; c_ptr[1] = cs2;\
+  c_l1[0] = cs3; c_l1[1] = cs4;
+
+#define KERNEL_M4N1_UNIT(a_head, b_head) \
+  COMMON_KERNEL_HEADER(a_head, b_head) \
+  I16X4 ad1, ad2, ad3, ad4, bd1;\
+  I32X4 cq1, cq2, cq3, cq4;\
+  cq1 = cq2 = cq3 = cq4 = VDUPQ_N_I32(0);\
+  for (; k_left > 3; k_left -= 4) {\
+    ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\
+    ad3 = VLD1_I16(a_ptr + 8); ad4 = VLD1_I16(a_ptr + 12); a_ptr += 16;\
+    bd1 = VLD1_I16(b_ptr); b_ptr += 4;\
+    cq1 = VMLAL_LANE_I16(cq1, ad1, bd1, 0);\
+    cq2 = VMLAL_LANE_I16(cq2, ad2, bd1, 1);\
+    cq3 = VMLAL_LANE_I16(cq3, ad3, bd1, 2);\
+    cq4 = VMLAL_LANE_I16(cq4, ad4, bd1, 3);\
+  }\
+  cq1 = VADDQ_I32(cq1, cq3); cq2 = VADDQ_I32(cq2, cq4);\
+  cq1 = VADDQ_I32(cq1, cq2);\
+  for (; k_left > 0; k_left--) {\
+    ad1 = VLD1_I16(a_ptr); a_ptr += 4;\
+    I16 bs1 = *b_ptr++;\
+    cq1 = VMLAL_N_I16(cq1, ad1, bs1);\
+  }
+
+#define KERNEL_M4N1 KERNEL_M4N1_UNIT(a_head, b_head)
+#define KERNEL_M1N4 KERNEL_M4N1_UNIT(b_head, a_head)
+
+#define SAVE_M4N1 \
+  cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_ptr), beta);\
+  VST1Q_I32(c_ptr, cq1);
+
+#define UNIT_SAVE_M1N4(cq1) \
+  c_tmp[0] = c_tmp[0] * beta + VGETQ_LANE_I32(cq1, 0);\
+  c_tmp[ldc] = c_tmp[ldc] * beta + VGETQ_LANE_I32(cq1, 1);\
+  c_tmp += ldc * 2;\
+  c_tmp[0] = c_tmp[0] * beta + VGETQ_LANE_I32(cq1, 2);\
+  c_tmp[ldc] = c_tmp[ldc] * beta + VGETQ_LANE_I32(cq1, 3);\
+  c_tmp += ldc * 2;
+
+#define SAVE_M1N4 \
+  I32 *c_tmp = c_ptr; UNIT_SAVE_M1N4(cq1)
+
+#define KERNEL_M4N2_UNIT(a_head, b_head) \
+  COMMON_KERNEL_HEADER(a_head, b_head) \
+  I16X4 ad1, ad2, bd1;\
+  I32X4 cq1, cq2, cq3, cq4;\
+  cq1 = cq2 = cq3 = cq4 = VDUPQ_N_I32(0);\
+  for (; k_left > 1; k_left -= 2) {\
+    ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4); a_ptr += 8;\
+    bd1 = VLD1_I16(b_ptr); b_ptr += 4;\
+    cq1 = VMLAL_LANE_I16(cq1, ad1, bd1, 0);\
+    cq2 = VMLAL_LANE_I16(cq2, ad1, bd1, 1);\
+    cq3 = VMLAL_LANE_I16(cq3, ad2, bd1, 2);\
+    cq4 = VMLAL_LANE_I16(cq4, ad2, bd1, 3);\
+  }\
+  cq1 = VADDQ_I32(cq1, cq3); cq2 = VADDQ_I32(cq2, cq4);\
+  for (; k_left > 0; k_left--) {\
+    ad1 = VLD1_I16(a_ptr); a_ptr += 4;\
+    I16 bs1 =
b_ptr[0];\ + I16 bs2 = b_ptr[1]; b_ptr += 2;\ + cq1 = VMLAL_N_I16(cq1, ad1, bs1); cq2 = VMLAL_N_I16(cq2, ad1, bs2);\ + } + +#define KERNEL_M4N2 KERNEL_M4N2_UNIT(a_head, b_head) +#define KERNEL_M2N4 KERNEL_M4N2_UNIT(b_head, a_head) + +#define UNIT_SAVE_M4N2(cq1, cq2) \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_tmp), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_tmp + ldc), beta);\ + VST1Q_I32(c_tmp, cq1); VST1Q_I32(c_tmp + ldc, cq2);\ + c_tmp += ldc * 2; + +#define SAVE_M4N2 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M4N2(cq1, cq2) + +#define UNIT_SAVE_M2N4(cq1, cq2) {\ + I32X4X2 tm1 = VZIPQ_I32(cq1, cq2);\ + I32X2 l1 = VMLA_N_I32(VGET_LOW_I32(tm1.val[0]),\ + VLD1_I32(c_tmp), beta);\ + I32X2 l2 = VMLA_N_I32(VGET_HIGH_I32(tm1.val[0]),\ + VLD1_I32(c_tmp + ldc), beta);\ + VST1_I32(c_tmp, l1); VST1_I32(c_tmp + ldc, l2); c_tmp += ldc * 2;\ + I32X2 l3 = VMLA_N_I32(VGET_LOW_I32(tm1.val[1]),\ + VLD1_I32(c_tmp), beta);\ + I32X2 l4 = VMLA_N_I32(VGET_HIGH_I32(tm1.val[1]),\ + VLD1_I32(c_tmp + ldc), beta);\ + VST1_I32(c_tmp, l3); VST1_I32(c_tmp + ldc, l4); c_tmp += ldc * 2;\ +} + +#define SAVE_M2N4 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M2N4(cq1, cq2) + +#define KERNEL_M4N4 \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, bd1;\ + I32X4 cq1, cq2, cq3, cq4;\ + cq1 = cq2 = cq3 = cq4 = VDUPQ_N_I32(0);\ + for (; k_left > 0; k_left--) {\ + ad1 = VLD1_I16(a_ptr); a_ptr += 4;\ + bd1 = VLD1_I16(b_ptr); b_ptr += 4;\ + cq1 = VMLAL_LANE_I16(cq1, ad1, bd1, 0);\ + cq2 = VMLAL_LANE_I16(cq2, ad1, bd1, 1);\ + cq3 = VMLAL_LANE_I16(cq3, ad1, bd1, 2);\ + cq4 = VMLAL_LANE_I16(cq4, ad1, bd1, 3);\ + } + +#define SAVE_M4N4 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M4N2(cq1, cq2) UNIT_SAVE_M4N2(cq3, cq4) + +#define KERNEL_M8N1_UNIT(a_head, b_head) \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, ad3, ad4, ad5, ad6, ad7, ad8, bd1;\ + I32X4 cq1, cq2, cq3, cq4;\ + cq1 = cq2 = cq3 = cq4 = VDUPQ_N_I32(0);\ + for (; k_left > 3; k_left -= 4) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\ + ad3 = VLD1_I16(a_ptr + 8); ad4 = VLD1_I16(a_ptr + 12);\ + ad5 = VLD1_I16(a_ptr + 16); ad6 = VLD1_I16(a_ptr + 20);\ + ad7 = VLD1_I16(a_ptr + 24); ad8 = VLD1_I16(a_ptr + 28); a_ptr += 32;\ + bd1 = VLD1_I16(b_ptr); b_ptr += 4;\ + cq1 = VMLAL_LANE_I16(cq1, ad1, bd1, 0);\ + cq2 = VMLAL_LANE_I16(cq2, ad2, bd1, 0);\ + cq3 = VMLAL_LANE_I16(cq3, ad3, bd1, 1);\ + cq4 = VMLAL_LANE_I16(cq4, ad4, bd1, 1);\ + cq1 = VMLAL_LANE_I16(cq1, ad5, bd1, 2);\ + cq2 = VMLAL_LANE_I16(cq2, ad6, bd1, 2);\ + cq3 = VMLAL_LANE_I16(cq3, ad7, bd1, 3);\ + cq4 = VMLAL_LANE_I16(cq4, ad8, bd1, 3);\ + }\ + cq1 = VADDQ_I32(cq1, cq3); cq2 = VADDQ_I32(cq2, cq4);\ + for (; k_left > 0; k_left--) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4); a_ptr += 8;\ + I16 bs1 = *b_ptr++;\ + cq1 = VMLAL_N_I16(cq1, ad1, bs1); cq2 = VMLAL_N_I16(cq2, ad2, bs1);\ + } + +#define KERNEL_M8N1 KERNEL_M8N1_UNIT(a_head, b_head) +#define KERNEL_M1N8 KERNEL_M8N1_UNIT(b_head, a_head) + +#define SAVE_M8N1 \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_ptr), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_ptr + 4), beta);\ + VST1Q_I32(c_ptr, cq1); VST1Q_I32(c_ptr + 4, cq2); + +#define SAVE_M1N8 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M1N4(cq1) UNIT_SAVE_M1N4(cq2) + +#define KERNEL_M8N2_UNIT(a_head, b_head) \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, ad3, ad4, bd1;\ + I32X4 cq1, cq2, cq3, cq4;\ + cq1 = cq2 = cq3 = cq4 = VDUPQ_N_I32(0);\ + for (; k_left > 1; k_left -= 2) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\ + ad3 = VLD1_I16(a_ptr + 8); ad4 = VLD1_I16(a_ptr + 12); a_ptr += 16;\ + bd1 = 
VLD1_I16(b_ptr); b_ptr += 4;\ + cq1 = VMLAL_LANE_I16(cq1, ad1, bd1, 0);\ + cq2 = VMLAL_LANE_I16(cq2, ad2, bd1, 0);\ + cq3 = VMLAL_LANE_I16(cq3, ad1, bd1, 1);\ + cq4 = VMLAL_LANE_I16(cq4, ad2, bd1, 1);\ + cq1 = VMLAL_LANE_I16(cq1, ad3, bd1, 2);\ + cq2 = VMLAL_LANE_I16(cq2, ad4, bd1, 2);\ + cq3 = VMLAL_LANE_I16(cq3, ad3, bd1, 3);\ + cq4 = VMLAL_LANE_I16(cq4, ad4, bd1, 3);\ + }\ + if (k_left > 0) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4); a_ptr += 8;\ + I16 bs1 = b_ptr[0];\ + I16 bs2 = b_ptr[1]; b_ptr += 2;\ + cq1 = VMLAL_N_I16(cq1, ad1, bs1); cq2 = VMLAL_N_I16(cq2, ad2, bs1);\ + cq3 = VMLAL_N_I16(cq3, ad1, bs2); cq4 = VMLAL_N_I16(cq4, ad2, bs2);\ + } + +#define KERNEL_M8N2 KERNEL_M8N2_UNIT(a_head, b_head) +#define KERNEL_M2N8 KERNEL_M8N2_UNIT(b_head, a_head) + +#define UNIT_SAVE_M8N2(cq1, cq2, cq3, cq4) \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_tmp), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_tmp + 4), beta);\ + cq3 = VMLAQ_N_I32(cq3, VLD1Q_I32(c_tmp + ldc), beta);\ + cq4 = VMLAQ_N_I32(cq4, VLD1Q_I32(c_tmp + ldc + 4), beta);\ + VST1Q_I32(c_tmp, cq1); VST1Q_I32(c_tmp + 4, cq2);\ + VST1Q_I32(c_tmp + ldc, cq3); VST1Q_I32(c_tmp + ldc + 4, cq4);\ + c_tmp += ldc * 2; + +#define SAVE_M8N2 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M8N2(cq1, cq2, cq3, cq4) + +#define SAVE_M2N8 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M2N4(cq1, cq3) UNIT_SAVE_M2N4(cq2, cq4) + +#define KERNEL_M8N4_UNIT(a_head, b_head) \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, bd1;\ + I32X4 cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = cq7 = cq8 = VDUPQ_N_I32(0);\ + for (; k_left > 0; k_left--) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4); a_ptr += 8;\ + bd1 = VLD1_I16(b_ptr); b_ptr += 4;\ + cq1 = VMLAL_LANE_I16(cq1, ad1, bd1, 0);\ + cq2 = VMLAL_LANE_I16(cq2, ad2, bd1, 0);\ + cq3 = VMLAL_LANE_I16(cq3, ad1, bd1, 1);\ + cq4 = VMLAL_LANE_I16(cq4, ad2, bd1, 1);\ + cq5 = VMLAL_LANE_I16(cq5, ad1, bd1, 2);\ + cq6 = VMLAL_LANE_I16(cq6, ad2, bd1, 2);\ + cq7 = VMLAL_LANE_I16(cq7, ad1, bd1, 3);\ + cq8 = VMLAL_LANE_I16(cq8, ad2, bd1, 3);\ + } + +#define KERNEL_M8N4 KERNEL_M8N4_UNIT(a_head, b_head) +#define KERNEL_M4N8 KERNEL_M8N4_UNIT(b_head, a_head) + +#define SAVE_M8N4 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M8N2(cq1, cq2, cq3, cq4) UNIT_SAVE_M8N2(cq5, cq6, cq7, cq8) + +#define UNIT_SAVE_M4N4_TRANS(cq1, cq2, cq3, cq4) {\ + I32X4 l1 = VLD1Q_I32(c_tmp);\ + I32X4 l2 = VLD1Q_I32(c_tmp + ldc);\ + I32X4 l3 = VLD1Q_I32(c_tmp + ldc * 2);\ + I32X4 l4 = VLD1Q_I32(c_tmp + ldc * 3);\ + I32X4X2 tm1 = VZIPQ_I32(cq1, cq2);\ + I32X4X2 tm2 = VZIPQ_I32(cq3, cq4);\ + cq1 = VCOMBINE_I32(VGET_LOW_I32(tm1.val[0]), VGET_LOW_I32(tm2.val[0]));\ + cq2 = VCOMBINE_I32(VGET_HIGH_I32(tm1.val[0]), VGET_HIGH_I32(tm2.val[0]));\ + cq3 = VCOMBINE_I32(VGET_LOW_I32(tm1.val[1]), VGET_LOW_I32(tm2.val[1]));\ + cq4 = VCOMBINE_I32(VGET_HIGH_I32(tm1.val[1]), VGET_HIGH_I32(tm2.val[1]));\ + cq1 = VMLAQ_N_I32(cq1, l1, beta); cq2 = VMLAQ_N_I32(cq2, l2, beta);\ + cq3 = VMLAQ_N_I32(cq3, l3, beta); cq4 = VMLAQ_N_I32(cq4, l4, beta);\ + VST1Q_I32(c_tmp, cq1); VST1Q_I32(c_tmp + ldc, cq2);\ + VST1Q_I32(c_tmp + ldc * 2, cq3); VST1Q_I32(c_tmp + ldc * 3, cq4);\ + c_tmp += ldc * 4;\ +} + +#define SAVE_M4N8 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M4N4_TRANS(cq1, cq3, cq5, cq7)\ + UNIT_SAVE_M4N4_TRANS(cq2, cq4, cq6, cq8) + +#define KERNEL_M8N8 \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, bd1, bd2;\ + I32X4 cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + I32X4 cq09, cq10, cq11, cq12, cq13, cq14, cq15, cq16;\ + cq01 = cq02 = cq03 = cq04 = 
cq05 = cq06 = cq07 = cq08 = VDUPQ_N_I32(0);\ + cq09 = cq10 = cq11 = cq12 = cq13 = cq14 = cq15 = cq16 = VDUPQ_N_I32(0);\ + for (; k_left > 0; k_left--) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4); a_ptr += 8;\ + bd1 = VLD1_I16(b_ptr); bd2 = VLD1_I16(b_ptr + 4); b_ptr += 8;\ + cq01 = VMLAL_LANE_I16(cq01, ad1, bd1, 0);\ + cq02 = VMLAL_LANE_I16(cq02, ad2, bd1, 0);\ + cq03 = VMLAL_LANE_I16(cq03, ad1, bd1, 1);\ + cq04 = VMLAL_LANE_I16(cq04, ad2, bd1, 1);\ + cq05 = VMLAL_LANE_I16(cq05, ad1, bd1, 2);\ + cq06 = VMLAL_LANE_I16(cq06, ad2, bd1, 2);\ + cq07 = VMLAL_LANE_I16(cq07, ad1, bd1, 3);\ + cq08 = VMLAL_LANE_I16(cq08, ad2, bd1, 3);\ + cq09 = VMLAL_LANE_I16(cq09, ad1, bd2, 0);\ + cq10 = VMLAL_LANE_I16(cq10, ad2, bd2, 0);\ + cq11 = VMLAL_LANE_I16(cq11, ad1, bd2, 1);\ + cq12 = VMLAL_LANE_I16(cq12, ad2, bd2, 1);\ + cq13 = VMLAL_LANE_I16(cq13, ad1, bd2, 2);\ + cq14 = VMLAL_LANE_I16(cq14, ad2, bd2, 2);\ + cq15 = VMLAL_LANE_I16(cq15, ad1, bd2, 3);\ + cq16 = VMLAL_LANE_I16(cq16, ad2, bd2, 3);\ + } + +#define SAVE_M8N8 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M8N2(cq01, cq02, cq03, cq04)\ + UNIT_SAVE_M8N2(cq05, cq06, cq07, cq08)\ + UNIT_SAVE_M8N2(cq09, cq10, cq11, cq12)\ + UNIT_SAVE_M8N2(cq13, cq14, cq15, cq16) + +#define KERNEL_M12N1_UNIT(a_head, b_head) \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, ad3;\ + I16 bs1;\ + I32X4 cq1, cq2, cq3;\ + cq1 = cq2 = cq3 = VDUPQ_N_I32(0);\ + for (; k_left > 0; k_left--) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\ + ad3 = VLD1_I16(a_ptr + 8); a_ptr += 12;\ + bs1 = *b_ptr++;\ + cq1 = VMLAL_N_I16(cq1, ad1, bs1);\ + cq2 = VMLAL_N_I16(cq2, ad2, bs1);\ + cq3 = VMLAL_N_I16(cq3, ad3, bs1);\ + } + +#define KERNEL_M12N1 KERNEL_M12N1_UNIT(a_head, b_head) +#define KERNEL_M1N12 KERNEL_M12N1_UNIT(b_head, a_head) + +#define SAVE_M12N1 \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_ptr), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_ptr + 4), beta);\ + cq3 = VMLAQ_N_I32(cq3, VLD1Q_I32(c_ptr + 8), beta);\ + VST1Q_I32(c_ptr, cq1); VST1Q_I32(c_ptr + 4, cq2); VST1Q_I32(c_ptr + 8, cq3); + +#define SAVE_M1N12 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M1N4(cq1)\ + UNIT_SAVE_M1N4(cq2) UNIT_SAVE_M1N4(cq3) + +#define KERNEL_M12N2_UNIT(a_head, b_head) \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, ad3, ad4, ad5, ad6, bd1;\ + I32X4 cq1, cq2, cq3, cq4, cq5, cq6;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = VDUPQ_N_I32(0);\ + for (; k_left > 1; k_left -= 2) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\ + ad3 = VLD1_I16(a_ptr + 8); ad4 = VLD1_I16(a_ptr + 12);\ + ad5 = VLD1_I16(a_ptr + 16); ad6 = VLD1_I16(a_ptr + 20); a_ptr += 24;\ + bd1 = VLD1_I16(b_ptr); b_ptr += 4;\ + cq1 = VMLAL_LANE_I16(cq1, ad1, bd1, 0);\ + cq2 = VMLAL_LANE_I16(cq2, ad2, bd1, 0);\ + cq3 = VMLAL_LANE_I16(cq3, ad3, bd1, 0);\ + cq4 = VMLAL_LANE_I16(cq4, ad1, bd1, 1);\ + cq5 = VMLAL_LANE_I16(cq5, ad2, bd1, 1);\ + cq6 = VMLAL_LANE_I16(cq6, ad3, bd1, 1);\ + cq1 = VMLAL_LANE_I16(cq1, ad4, bd1, 2);\ + cq2 = VMLAL_LANE_I16(cq2, ad5, bd1, 2);\ + cq3 = VMLAL_LANE_I16(cq3, ad6, bd1, 2);\ + cq4 = VMLAL_LANE_I16(cq4, ad4, bd1, 3);\ + cq5 = VMLAL_LANE_I16(cq5, ad5, bd1, 3);\ + cq6 = VMLAL_LANE_I16(cq6, ad6, bd1, 3);\ + }\ + if (k_left > 0) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\ + ad3 = VLD1_I16(a_ptr + 8); a_ptr += 12;\ + I16 bs1 = b_ptr[0];\ + I16 bs2 = b_ptr[1]; b_ptr += 2;\ + cq1 = VMLAL_N_I16(cq1, ad1, bs1);\ + cq2 = VMLAL_N_I16(cq2, ad2, bs1);\ + cq3 = VMLAL_N_I16(cq3, ad3, bs1);\ + cq4 = VMLAL_N_I16(cq4, ad1, bs2);\ + cq5 = VMLAL_N_I16(cq5, ad2, bs2);\ + cq6 = VMLAL_N_I16(cq6, ad3, 
bs2);\ + } + +#define KERNEL_M12N2 KERNEL_M12N2_UNIT(a_head, b_head) +#define KERNEL_M2N12 KERNEL_M12N2_UNIT(b_head, a_head) + +#define UNIT_SAVE_M12N2(cq1, cq2, cq3, cq4, cq5, cq6) \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_tmp), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_tmp + 4), beta);\ + cq3 = VMLAQ_N_I32(cq3, VLD1Q_I32(c_tmp + 8), beta);\ + cq4 = VMLAQ_N_I32(cq4, VLD1Q_I32(c_tmp + ldc), beta);\ + cq5 = VMLAQ_N_I32(cq5, VLD1Q_I32(c_tmp + ldc + 4), beta);\ + cq6 = VMLAQ_N_I32(cq6, VLD1Q_I32(c_tmp + ldc + 8), beta);\ + VST1Q_I32(c_tmp, cq1); VST1Q_I32(c_tmp + 4, cq2);\ + VST1Q_I32(c_tmp + 8, cq3); VST1Q_I32(c_tmp + ldc, cq4);\ + VST1Q_I32(c_tmp + ldc + 4, cq5); VST1Q_I32(c_tmp + ldc + 8, cq6);\ + c_tmp += ldc * 2; + +#define SAVE_M12N2 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M12N2(cq1, cq2, cq3, cq4, cq5, cq6) + +#define SAVE_M2N12 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M2N4(cq1, cq4) UNIT_SAVE_M2N4(cq2, cq5)\ + UNIT_SAVE_M2N4(cq3, cq6) + +#define KERNEL_M12N4_UNIT(a_head, b_head) \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, ad3, bd1;\ + I32X4 cq01, cq02, cq03, cq04, cq05, cq06;\ + I32X4 cq07, cq08, cq09, cq10, cq11, cq12;\ + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = VDUPQ_N_I32(0);\ + cq07 = cq08 = cq09 = cq10 = cq11 = cq12 = VDUPQ_N_I32(0);\ + for (; k_left > 0; k_left--) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\ + ad3 = VLD1_I16(a_ptr + 8); a_ptr += 12;\ + bd1 = VLD1_I16(b_ptr); b_ptr += 4;\ + cq01 = VMLAL_LANE_I16(cq01, ad1, bd1, 0);\ + cq02 = VMLAL_LANE_I16(cq02, ad2, bd1, 0);\ + cq03 = VMLAL_LANE_I16(cq03, ad3, bd1, 0);\ + cq04 = VMLAL_LANE_I16(cq04, ad1, bd1, 1);\ + cq05 = VMLAL_LANE_I16(cq05, ad2, bd1, 1);\ + cq06 = VMLAL_LANE_I16(cq06, ad3, bd1, 1);\ + cq07 = VMLAL_LANE_I16(cq07, ad1, bd1, 2);\ + cq08 = VMLAL_LANE_I16(cq08, ad2, bd1, 2);\ + cq09 = VMLAL_LANE_I16(cq09, ad3, bd1, 2);\ + cq10 = VMLAL_LANE_I16(cq10, ad1, bd1, 3);\ + cq11 = VMLAL_LANE_I16(cq11, ad2, bd1, 3);\ + cq12 = VMLAL_LANE_I16(cq12, ad3, bd1, 3);\ + } + +#define KERNEL_M12N4 KERNEL_M12N4_UNIT(a_head, b_head) +#define KERNEL_M4N12 KERNEL_M12N4_UNIT(b_head, a_head) + +#define SAVE_M12N4 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M12N2(cq01, cq02, cq03, cq04, cq05, cq06)\ + UNIT_SAVE_M12N2(cq07, cq08, cq09, cq10, cq11, cq12) + +#define SAVE_M4N12 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M4N4_TRANS(cq01, cq04, cq07, cq10)\ + UNIT_SAVE_M4N4_TRANS(cq02, cq05, cq08, cq11)\ + UNIT_SAVE_M4N4_TRANS(cq03, cq06, cq09, cq12) + +#define KERNEL_M6N1_UNIT(a_head, b_head) \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, ad3, ad4, ad5, ad6, bd1;\ + I16X4X2 add1;\ + I32X4 cq1, cq2, cq3, cq4, cq5, cq6;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = VDUPQ_N_I32(0);\ + for (; k_left > 3; k_left -= 4) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\ + ad3 = VLD1_I16(a_ptr + 8); ad4 = VLD1_I16(a_ptr + 12);\ + ad5 = VLD1_I16(a_ptr + 16); ad6 = VLD1_I16(a_ptr + 20); a_ptr += 24;\ + bd1 = VLD1_I16(b_ptr); b_ptr += 4;\ + add1 = VUZP_I16(ad2, ad5);\ + cq1 = VMLAL_LANE_I16(cq1, ad1, bd1, 0);\ + cq2 = VMLAL_I16(cq2, add1.val[0], bd1);\ + cq3 = VMLAL_LANE_I16(cq3, ad3, bd1, 1);\ + cq4 = VMLAL_LANE_I16(cq4, ad4, bd1, 2);\ + cq5 = VMLAL_I16(cq5, add1.val[1], bd1);\ + cq6 = VMLAL_LANE_I16(cq6, ad6, bd1, 3);\ + }\ + cq1 = VADDQ_I32(cq1, cq4); cq3 = VADDQ_I32(cq3, cq6);\ + cq4 = VCOMBINE_I32(VGET_LOW_I32(cq2), VGET_LOW_I32(cq5));\ + cq6 = VCOMBINE_I32(VGET_HIGH_I32(cq2), VGET_HIGH_I32(cq5));\ + cq2 = VADDQ_I32(cq4, cq6);\ + I32 cs1 = VGETQ_LANE_I32(cq1, 0) + VGETQ_LANE_I32(cq2, 1);\ + I32 cs2 = VGETQ_LANE_I32(cq1, 1) + 
VGETQ_LANE_I32(cq2, 3);\ + I32 cs3 = VGETQ_LANE_I32(cq1, 2) + VGETQ_LANE_I32(cq3, 0);\ + I32 cs4 = VGETQ_LANE_I32(cq1, 3) + VGETQ_LANE_I32(cq3, 1);\ + I32 cs5 = VGETQ_LANE_I32(cq2, 0) + VGETQ_LANE_I32(cq3, 2);\ + I32 cs6 = VGETQ_LANE_I32(cq2, 2) + VGETQ_LANE_I32(cq3, 3);\ + for (; k_left > 0; k_left--) {\ + I32 bs1 = *b_ptr++;\ + cs1 += bs1 * (I32)a_ptr[0];\ + cs2 += bs1 * (I32)a_ptr[1];\ + cs3 += bs1 * (I32)a_ptr[2];\ + cs4 += bs1 * (I32)a_ptr[3];\ + cs5 += bs1 * (I32)a_ptr[4];\ + cs6 += bs1 * (I32)a_ptr[5];\ + a_ptr += 6;\ + } + +#define KERNEL_M6N1 KERNEL_M6N1_UNIT(a_head, b_head) +#define KERNEL_M1N6 KERNEL_M6N1_UNIT(b_head, a_head) + +#define SAVE_M6N1 \ + cs1 += c_ptr[0] * beta; cs2 += c_ptr[1] * beta;\ + cs3 += c_ptr[2] * beta; cs4 += c_ptr[3] * beta;\ + cs5 += c_ptr[4] * beta; cs6 += c_ptr[5] * beta;\ + c_ptr[0] = cs1; c_ptr[1] = cs2; c_ptr[2] = cs3;\ + c_ptr[3] = cs4; c_ptr[4] = cs5; c_ptr[5] = cs6; + +#define SAVE_M1N6 \ + I32 *c_tmp = c_ptr;\ + cs1 += c_tmp[0] * beta; cs2 += c_tmp[ldc] * beta;\ + c_tmp[0] = cs1; c_tmp[ldc] = cs2; c_tmp += ldc * 2;\ + cs3 += c_tmp[0] * beta; cs4 += c_tmp[ldc] * beta;\ + c_tmp[0] = cs3; c_tmp[ldc] = cs4; c_tmp += ldc * 2;\ + cs5 += c_tmp[0] * beta; cs6 += c_tmp[ldc] * beta;\ + c_tmp[0] = cs5; c_tmp[ldc] = cs6; + +#define KERNEL_M6N2_UNIT(a_head, b_head) \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, ad3, bd1;\ + I16X4X2 bdd1;\ + I32X4 cq1, cq2, cq3, cq4, cq5, cq6;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = VDUPQ_N_I32(0);\ + for (; k_left > 1; k_left -= 2) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\ + ad3 = VLD1_I16(a_ptr + 8); a_ptr += 12;\ + bd1 = VLD1_I16(b_ptr); b_ptr += 4;\ + bdd1 = VTRN_I16(bd1, bd1);\ + cq1 = VMLAL_LANE_I16(cq1, ad1, bd1, 0);\ + cq2 = VMLAL_I16(cq2, ad2, bdd1.val[0]);\ + cq3 = VMLAL_LANE_I16(cq3, ad3, bd1, 2);\ + cq4 = VMLAL_LANE_I16(cq4, ad1, bd1, 1);\ + cq5 = VMLAL_I16(cq5, ad2, bdd1.val[1]);\ + cq6 = VMLAL_LANE_I16(cq6, ad3, bd1, 3);\ + }\ + I32X2 cd1 = VADD_I32(VGET_LOW_I32(cq1), VGET_HIGH_I32(cq2));\ + I32X2 cd2 = VADD_I32(VGET_HIGH_I32(cq1), VGET_LOW_I32(cq3));\ + I32X2 cd3 = VADD_I32(VGET_LOW_I32(cq2), VGET_HIGH_I32(cq3));\ + I32X2 cd4 = VADD_I32(VGET_LOW_I32(cq4), VGET_HIGH_I32(cq5));\ + I32X2 cd5 = VADD_I32(VGET_HIGH_I32(cq4), VGET_LOW_I32(cq6));\ + I32X2 cd6 = VADD_I32(VGET_LOW_I32(cq5), VGET_HIGH_I32(cq6));\ + cq1 = VCOMBINE_I32(cd1, cd2); cq2 = VCOMBINE_I32(cd4, cd5);\ + I32 cs1 = VGET_LANE_I32(cd3, 0);\ + I32 cs2 = VGET_LANE_I32(cd3, 1);\ + I32 cs3 = VGET_LANE_I32(cd6, 0);\ + I32 cs4 = VGET_LANE_I32(cd6, 1);\ + if (k_left > 0) {\ + ad1 = VLD1_I16(a_ptr);\ + I32 as1 = a_ptr[4];\ + I32 as2 = a_ptr[5]; a_ptr += 6;\ + I32 bs1 = b_ptr[0];\ + I32 bs2 = b_ptr[1]; b_ptr += 2;\ + cq1 = VMLAL_N_I16(cq1, ad1, bs1);\ + cq2 = VMLAL_N_I16(cq2, ad1, bs2);\ + cs1 += as1 * bs1; cs2 += as2 * bs1;\ + cs3 += as1 * bs2; cs4 += as2 * bs2;\ + } + +#define KERNEL_M6N2 KERNEL_M6N2_UNIT(a_head, b_head) +#define KERNEL_M2N6 KERNEL_M6N2_UNIT(b_head, a_head) + +#define SAVE_M6N2 \ + I32 *c_l1 = c_ptr + ldc;\ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_ptr), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_l1), beta);\ + cs1 += c_ptr[4] * beta; cs2 += c_ptr[5] * beta;\ + cs3 += c_l1[4] * beta; cs4 += c_l1[5] * beta;\ + VST1Q_I32(c_ptr, cq1); VST1Q_I32(c_l1, cq2);\ + c_ptr[4] = cs1; c_ptr[5] = cs2;\ + c_l1[4] = cs3; c_l1[5] = cs4; + +#define SAVE_M2N6 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M2N4(cq1, cq2)\ + cs1 += c_tmp[0] * beta; cs3 += c_tmp[1] * beta;\ + c_tmp[0] = cs1; c_tmp[1] = cs3; c_tmp += ldc;\ + cs2 += c_tmp[0] * beta; cs4 += 
c_tmp[1] * beta;\ + c_tmp[0] = cs2; c_tmp[1] = cs4; + +#define KERNEL_M6N4_UNIT(a_head, b_head) \ + COMMON_KERNEL_HEADER(a_head, b_head) \ + I16X4 ad1, ad2, ad3, bd1, bd2;\ + I32X4 cq1, cq2, cq3, cq4, cq5, cq6;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = VDUPQ_N_I32(0);\ + for (; k_left > 1; k_left -= 2) {\ + ad1 = VLD1_I16(a_ptr); ad2 = VLD1_I16(a_ptr + 4);\ + ad3 = VLD1_I16(a_ptr + 8); a_ptr += 12;\ + bd1 = VLD1_I16(b_ptr); bd2 = VLD1_I16(b_ptr + 4); b_ptr += 8;\ + cq1 = VMLAL_LANE_I16(cq1, bd1, ad1, 0);\ + cq2 = VMLAL_LANE_I16(cq2, bd1, ad1, 1);\ + cq3 = VMLAL_LANE_I16(cq3, bd1, ad1, 2);\ + cq4 = VMLAL_LANE_I16(cq4, bd1, ad1, 3);\ + cq5 = VMLAL_LANE_I16(cq5, bd1, ad2, 0);\ + cq6 = VMLAL_LANE_I16(cq6, bd1, ad2, 1);\ + cq1 = VMLAL_LANE_I16(cq1, bd2, ad2, 2);\ + cq2 = VMLAL_LANE_I16(cq2, bd2, ad2, 3);\ + cq3 = VMLAL_LANE_I16(cq3, bd2, ad3, 0);\ + cq4 = VMLAL_LANE_I16(cq4, bd2, ad3, 1);\ + cq5 = VMLAL_LANE_I16(cq5, bd2, ad3, 2);\ + cq6 = VMLAL_LANE_I16(cq6, bd2, ad3, 3);\ + }\ + if (k_left > 0) {\ + ad1 = VLD1_I16(a_ptr);\ + I32 as1 = a_ptr[4];\ + I32 as2 = a_ptr[5]; a_ptr += 6;\ + bd1 = VLD1_I16(b_ptr); b_ptr += 4;\ + cq1 = VMLAL_LANE_I16(cq1, bd1, ad1, 0);\ + cq2 = VMLAL_LANE_I16(cq2, bd1, ad1, 1);\ + cq3 = VMLAL_LANE_I16(cq3, bd1, ad1, 2);\ + cq4 = VMLAL_LANE_I16(cq4, bd1, ad1, 3);\ + cq5 = VMLAL_N_I16(cq5, bd1, as1);\ + cq6 = VMLAL_N_I16(cq6, bd1, as2);\ + } + +#define KERNEL_M6N4 KERNEL_M6N4_UNIT(a_head, b_head) +#define KERNEL_M4N6 KERNEL_M6N4_UNIT(b_head, a_head) + +#define UNIT_SAVE_M6N4(cq1, cq2, cq3, cq4, cq5, cq6) \ + UNIT_SAVE_M4N4_TRANS(cq1, cq2, cq3, cq4)\ + c_tmp -= 4 * ldc;\ + c_tmp += 4;\ + UNIT_SAVE_M2N4(cq5, cq6)\ + c_tmp -= 4; + +#define SAVE_M6N4 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M6N4(cq1, cq2, cq3, cq4, cq5, cq6) + +#define SAVE_M4N6 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M4N2(cq1, cq2) UNIT_SAVE_M4N2(cq3, cq4) UNIT_SAVE_M4N2(cq5, cq6) + +#define NEON_IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(mdim, ndim, srcint, dstint) \ +static inline void\ + inline_dualpack_gemm_a##srcint##_b##srcint##_c##dstint##_m##mdim##_n##ndim(\ + const srcint *a_head, const srcint *b_head, dstint *c_ptr,\ + uint32_t K, dstint beta, uint32_t ldc) {\ + KERNEL_M##mdim##N##ndim\ + SAVE_M##mdim##N##ndim\ +} + +#define IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(mdim, ndim, srcint, dstint)\ + NEON_IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(mdim, ndim, srcint, dstint) + +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 1, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 2, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 1, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 2, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 4, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 4, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 1, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 2, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 4, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 8, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 8, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 8, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 1, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 2, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 4, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 8, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 6, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 6, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 6, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(6, 1, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(6, 2, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(6, 4, I16, I32) 
+IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 12, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 12, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 12, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 1, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 2, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 4, I16, I32) + +#endif diff --git a/include/arm_neon/NeonI8I32MlaGemmSkinnyDot.h b/include/arm_neon/NeonI8I32MlaGemmSkinnyDot.h new file mode 100644 index 0000000..d35a1fa --- /dev/null +++ b/include/arm_neon/NeonI8I32MlaGemmSkinnyDot.h @@ -0,0 +1,200 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/****************************************************************************** + * File: NeonI8I32MlaGemmSkinnyDot.h + * Description: Source code template for NEON mlal 8->32bit GEMM skinny dot + * kernels. + *****************************************************************************/ + +#include "common/CommonSkinnyDot.h" +#include "arm_neon/NeonIntOpSign.h" + +#ifndef INCLUDE_I8I32_MLA_SKINNYDOT +#define INCLUDE_I8I32_MLA_SKINNYDOT + +typedef I8 I8I32MLAGEMM_SKINNYDOT_ASCALAR; +typedef I8 I8I32MLAGEMM_SKINNYDOT_BSCALAR; +typedef I32 I8I32MLAGEMM_SKINNYDOT_CSCALAR; + +typedef I16 I8I32MLAGEMM_SKINNYDOT_AVEC1; +typedef I16 I8I32MLAGEMM_SKINNYDOT_BVEC1; +typedef I32 I8I32MLAGEMM_SKINNYDOT_CVEC1; + +typedef I32X2 I8I32MLAGEMM_SKINNYDOT_AVEC2; +typedef I32X2 I8I32MLAGEMM_SKINNYDOT_BVEC2; +typedef I32X2 I8I32MLAGEMM_SKINNYDOT_CVEC2; + +typedef I8X8 I8I32MLAGEMM_SKINNYDOT_AVEC4; +typedef I8X8 I8I32MLAGEMM_SKINNYDOT_BVEC4; +typedef I32X2 I8I32MLAGEMM_SKINNYDOT_CVEC4; + +typedef I8X8 I8I32MLAGEMM_SKINNYDOT_AVEC8; +typedef I8X8 I8I32MLAGEMM_SKINNYDOT_BVEC8; +typedef I32X4 I8I32MLAGEMM_SKINNYDOT_CVEC8; + +typedef I8X16 I8I32MLAGEMM_SKINNYDOT_AVEC16; +typedef I8X16 I8I32MLAGEMM_SKINNYDOT_BVEC16; +typedef I32X4X2 I8I32MLAGEMM_SKINNYDOT_CVEC16; + +#define GEMM_SKINNY_DOT_UNIT_DEDUCE(type, ...)\ + GEMM_SKINNY_DOT_##type##_UNIT(__VA_ARGS__) + +GEMM_SKINNY_DOT_UNIT_DEDUCE(CALC, I8I32MLAGEMM, 16) { + I16X8 low_product = VMULL_I8(VGET_LOW_I8(a_vec), VGET_LOW_I8(b_vec)); +#if __aarch64__ + I16X8 high_product = VMULL_HIGH_I8(a_vec, b_vec); +#else + I16X8 high_product = VMULL_I8(VGET_HIGH_I8(a_vec), VGET_HIGH_I8(b_vec)); +#endif + I32X4X2 ret; + ret.val[0] = VPADALQ_I16(c_vec.val[0], low_product); + ret.val[1] = VPADALQ_I16(c_vec.val[1], high_product); + return ret; +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(CALC, I8I32MLAGEMM, 8) { + I16X8 product = VMULL_I8(a_vec, b_vec); + return VPADALQ_I16(c_vec, product); +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(CALC, I8I32MLAGEMM, 4) { + I16X8 product = VMULL_I8(a_vec, b_vec); + return VPADAL_I16(c_vec, VGET_LOW_I16(product)); +} + +GEMM_SKINNY_DOT_UNIT_DEDUCE(CALC, I8I32MLAGEMM, 2) { + return VMLA_I32(c_vec, a_vec, b_vec); +} + 
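/* Illustrative sketch (editorial, not from the EMLL sources): the CALC units
 * in this file are widening multiply-accumulate steps. As a rough scalar
 * reference, the 8-wide unit above behaves like
 *
 *   for (int i = 0; i < 4; ++i)
 *     c[i] += (I32)a[2 * i] * (I32)b[2 * i]
 *           + (I32)a[2 * i + 1] * (I32)b[2 * i + 1];
 *
 * VMULL_I8 widens the 8-bit products to 16 bits and VPADALQ_I16 folds
 * adjacent 16-bit pairs into the 32-bit accumulator, which is why CVEC8 is
 * a 4-lane I32X4 for 8 loaded elements. The REDUC units later in this file
 * keep halving the accumulator width until a single I32 dot product is left. */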
+GEMM_SKINNY_DOT_UNIT_DEDUCE(CALC, I8I32MLAGEMM, 1) {
+  return c_vec + a_vec * b_vec;
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADA, I8I32MLAGEMM, 16) {
+#if __aarch64__
+  __asm__("prfm pldl1keep,[%0,#80]"::"r"(a_ptr):);
+#else
+  __asm__("pld [%0,#80]"::"r"(a_ptr):);
+#endif
+  return VLD1Q_I8(a_ptr);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADA, I8I32MLAGEMM, 8) {
+#if __aarch64__
+  __asm__("prfm pldl1keep,[%0,#72]"::"r"(a_ptr):);
+#else
+  __asm__("pld [%0,#72]"::"r"(a_ptr):);
+#endif
+  return VLD1_I8(a_ptr);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADA, I8I32MLAGEMM, 4) {
+#if __aarch64__
+  I8X8 ret; /* higher 4 elements not used */
+  __asm__("ldr %s0,[%1]; prfm pldl1keep,[%1,#72]":"=w"(ret):"r"(a_ptr):"memory");
+  return ret;
+#else
+  register I8X16 ret __asm("q0"); /* higher 12 elements not used */
+  __asm__("vld1.32 {%e0[0]},[%1]; pld [%1,#72]":"=w"(ret):"r"(a_ptr):"memory");
+  return VGET_LOW_I8(ret);
+#endif
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADA, I8I32MLAGEMM, 2) {
+  I32 lo = a_ptr[0];
+  I32 hi = a_ptr[1];
+  return (I32X2){lo, hi};
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADA, I8I32MLAGEMM, 1) {
+  return *a_ptr;
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADB, I8I32MLAGEMM, 16) {
+  return VLD1Q_I8(b_ptr);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADB, I8I32MLAGEMM, 8) {
+  return VLD1_I8(b_ptr);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADB, I8I32MLAGEMM, 4) {
+#if __aarch64__
+  I8X8 ret; /* higher 4 elements not used */
+  __asm__("ldr %s0,[%1]":"=w"(ret):"r"(b_ptr):"memory");
+  return ret;
+#else
+/* armeabi gcc is buggy here: it may place a 64-bit wide NEON variable
+ * in an s* register instead of a d* register. Use a 128-bit wide NEON
+ * variable to avoid this bug. */
+  register I8X16 ret __asm("q0"); /* higher 12 elements not used */
+  __asm__("vld1.32 {%e0[0]},[%1]":"=w"(ret):"r"(b_ptr):"memory");
+  return VGET_LOW_I8(ret);
+#endif
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADB, I8I32MLAGEMM, 2) {
+  I32 lo = b_ptr[0];
+  I32 hi = b_ptr[1];
+  return (I32X2){lo, hi};
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(LOADB, I8I32MLAGEMM, 1) {
+  return *b_ptr;
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(REDUC, I8I32MLAGEMM, 16, 8) {
+  return VADDQ_I32(c_vec.val[0], c_vec.val[1]);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(REDUC, I8I32MLAGEMM, 8, 4) {
+  return VADD_I32(VGET_LOW_I32(c_vec), VGET_HIGH_I32(c_vec));
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(REDUC, I8I32MLAGEMM, 4, 2) {
+  return c_vec;
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(REDUC, I8I32MLAGEMM, 2, 1) {
+  return VGET_LANE_I32(c_vec, 0) + VGET_LANE_I32(c_vec, 1);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(INITC, I8I32MLAGEMM, 16) {
+  I32X4X2 ret;
+  ret.val[0] = VDUPQ_N_I32(0);
+  ret.val[1] = VDUPQ_N_I32(0);
+  return ret;
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(INITC, I8I32MLAGEMM, 8) {
+  return VDUPQ_N_I32(0);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(INITC, I8I32MLAGEMM, 4) {
+  return VDUP_N_I32(0);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(INITC, I8I32MLAGEMM, 2) {
+  return VDUP_N_I32(0);
+}
+
+GEMM_SKINNY_DOT_UNIT_DEDUCE(INITC, I8I32MLAGEMM, 1) {
+  return 0;
+}
+
+#endif
\ No newline at end of file
diff --git a/include/arm_neon/NeonI8I32MlaGemmSkinnyGer.h b/include/arm_neon/NeonI8I32MlaGemmSkinnyGer.h
new file mode 100644
index 0000000..dd21ed6
--- /dev/null
+++ b/include/arm_neon/NeonI8I32MlaGemmSkinnyGer.h
@@ -0,0 +1,317 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License.
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/****************************************************************************** + * File: NeonI8I32MlaGemmSkinnyGer.h + * Description: Source code template for NEON mlal 8->32bit GEMM skinny ger + * kernels. + *****************************************************************************/ + +#include "common/CommonSkinnyGer.h" +#include "arm_neon/NeonIntOpSign.h" + +#ifndef INCLUDE_I8I32_MLA_SKINNYGER +#define INCLUDE_I8I32_MLA_SKINNYGER + +typedef I8 I8I32MLAGEMM_SKINNYGER_ASCALAR; +typedef I8 I8I32MLAGEMM_SKINNYGER_BSCALAR; +typedef I32 I8I32MLAGEMM_SKINNYGER_CSCALAR; + +typedef I16 I8I32MLAGEMM_SKINNYGER_AVEC1; +typedef I16 I8I32MLAGEMM_SKINNYGER_BVEC1; +typedef I32 I8I32MLAGEMM_SKINNYGER_CVEC1; + +typedef I16X4 I8I32MLAGEMM_SKINNYGER_AVEC4; +typedef I16X4 I8I32MLAGEMM_SKINNYGER_BVEC4; +typedef I32X4 I8I32MLAGEMM_SKINNYGER_CVEC4; + +typedef I16X8 I8I32MLAGEMM_SKINNYGER_AVEC8; +typedef I32X4X2 I8I32MLAGEMM_SKINNYGER_CVEC8; + +typedef I16X8X2 I8I32MLAGEMM_SKINNYGER_AVEC16; +typedef I32X4X4 I8I32MLAGEMM_SKINNYGER_CVEC16; + +#if !__aarch64__ +#ifdef VMLAL_HIGH_LANE_I16 +#undef VMLAL_HIGH_LANE_I16 +#endif +#ifdef VMLAL_HIGH_N_I16 +#undef VMLAL_HIGH_N_I16 +#endif +#define VMLAL_HIGH_LANE_I16(c, a, b, v) VMLAL_LANE_I16(c, VGET_HIGH_I16(a), b, v) +#define VMLAL_HIGH_N_I16(c, a, b) VMLAL_N_I16(c, VGET_HIGH_I16(a), b) +#endif +#define VMLAL_LOW_LANE_I16(c, a, b, v) VMLAL_LANE_I16(c, VGET_LOW_I16(a), b, v) +#define VMLAL_LOW_N_I16(c, a, b) VMLAL_N_I16(c, VGET_LOW_I16(a), b) + +#define GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(type, a, b, c)\ + GEMM_SKINNY_GER_CALC_UNIT(type, a, b, c) + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 16, 4, 1) { + I32X4X4 ret; + ret.val[0] = VMLAL_LOW_LANE_I16(c_vec.val[0], a_vec.val[0], b_vec, 0); + ret.val[1] = VMLAL_HIGH_LANE_I16(c_vec.val[1], a_vec.val[0], b_vec, 0); + ret.val[2] = VMLAL_LOW_LANE_I16(c_vec.val[2], a_vec.val[1], b_vec, 0); + ret.val[3] = VMLAL_HIGH_LANE_I16(c_vec.val[3], a_vec.val[1], b_vec, 0); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 16, 4, 2) { + I32X4X4 ret; + ret.val[0] = VMLAL_LOW_LANE_I16(c_vec.val[0], a_vec.val[0], b_vec, 1); + ret.val[1] = VMLAL_HIGH_LANE_I16(c_vec.val[1], a_vec.val[0], b_vec, 1); + ret.val[2] = VMLAL_LOW_LANE_I16(c_vec.val[2], a_vec.val[1], b_vec, 1); + ret.val[3] = VMLAL_HIGH_LANE_I16(c_vec.val[3], a_vec.val[1], b_vec, 1); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 16, 4, 3) { + I32X4X4 ret; + ret.val[0] = VMLAL_LOW_LANE_I16(c_vec.val[0], a_vec.val[0], b_vec, 2); + ret.val[1] = VMLAL_HIGH_LANE_I16(c_vec.val[1], a_vec.val[0], b_vec, 2); + ret.val[2] = VMLAL_LOW_LANE_I16(c_vec.val[2], a_vec.val[1], b_vec, 2); + ret.val[3] = VMLAL_HIGH_LANE_I16(c_vec.val[3], a_vec.val[1], b_vec, 2); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 16, 4, 4) { + I32X4X4 ret; + ret.val[0] = VMLAL_LOW_LANE_I16(c_vec.val[0], a_vec.val[0], b_vec, 3); + ret.val[1] = VMLAL_HIGH_LANE_I16(c_vec.val[1], a_vec.val[0], b_vec, 3); + ret.val[2] = 
VMLAL_LOW_LANE_I16(c_vec.val[2], a_vec.val[1], b_vec, 3); + ret.val[3] = VMLAL_HIGH_LANE_I16(c_vec.val[3], a_vec.val[1], b_vec, 3); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 16, 1, 1) { + I32X4X4 ret; + ret.val[0] = VMLAL_LOW_N_I16(c_vec.val[0], a_vec.val[0], b_vec); + ret.val[1] = VMLAL_HIGH_N_I16(c_vec.val[1], a_vec.val[0], b_vec); + ret.val[2] = VMLAL_LOW_N_I16(c_vec.val[2], a_vec.val[1], b_vec); + ret.val[3] = VMLAL_HIGH_N_I16(c_vec.val[3], a_vec.val[1], b_vec); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 8, 4, 1) { + I32X4X2 ret; + ret.val[0] = VMLAL_LOW_LANE_I16(c_vec.val[0], a_vec, b_vec, 0); + ret.val[1] = VMLAL_HIGH_LANE_I16(c_vec.val[1], a_vec, b_vec, 0); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 8, 4, 2) { + I32X4X2 ret; + ret.val[0] = VMLAL_LOW_LANE_I16(c_vec.val[0], a_vec, b_vec, 1); + ret.val[1] = VMLAL_HIGH_LANE_I16(c_vec.val[1], a_vec, b_vec, 1); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 8, 4, 3) { + I32X4X2 ret; + ret.val[0] = VMLAL_LOW_LANE_I16(c_vec.val[0], a_vec, b_vec, 2); + ret.val[1] = VMLAL_HIGH_LANE_I16(c_vec.val[1], a_vec, b_vec, 2); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 8, 4, 4) { + I32X4X2 ret; + ret.val[0] = VMLAL_LOW_LANE_I16(c_vec.val[0], a_vec, b_vec, 3); + ret.val[1] = VMLAL_HIGH_LANE_I16(c_vec.val[1], a_vec, b_vec, 3); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 8, 1, 1) { + I32X4X2 ret; + ret.val[0] = VMLAL_LOW_N_I16(c_vec.val[0], a_vec, b_vec); + ret.val[1] = VMLAL_HIGH_N_I16(c_vec.val[1], a_vec, b_vec); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 4, 4, 1) { + return VMLAL_LANE_I16(c_vec, a_vec, b_vec, 0); +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 4, 4, 2) { + return VMLAL_LANE_I16(c_vec, a_vec, b_vec, 1); +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 4, 4, 3) { + return VMLAL_LANE_I16(c_vec, a_vec, b_vec, 2); +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 4, 4, 4) { + return VMLAL_LANE_I16(c_vec, a_vec, b_vec, 3); +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 4, 1, 1) { + return VMLAL_N_I16(c_vec, a_vec, b_vec); +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 1, 4, 1) { + return c_vec + a_vec * VGET_LANE_I16(b_vec, 0); +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 1, 4, 2) { + return c_vec + a_vec * VGET_LANE_I16(b_vec, 1); +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 1, 4, 3) { + return c_vec + a_vec * VGET_LANE_I16(b_vec, 2); +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 1, 4, 4) { + return c_vec + a_vec * VGET_LANE_I16(b_vec, 3); +} + +GEMM_SKINNY_GER_CALC_UNIT_DEDUCE(I8I32MLAGEMM, 1, 1, 1) { + return c_vec + a_vec * b_vec; +} + +#define GEMM_SKINNY_GER_LOADA_UNIT_DEDUCE(type, a)\ + GEMM_SKINNY_GER_LOADA_UNIT(type, a) + +GEMM_SKINNY_GER_LOADA_UNIT_DEDUCE(I8I32MLAGEMM, 16) { + I8X16 ld = VLD1Q_I8(a_ptr); + I16X8X2 ret; + ret.val[0] = VMOVL_I8(VGET_LOW_I8(ld)); +#if __aarch64__ + ret.val[1] = VMOVL_HIGH_I8(ld); + __asm__("prfm pldl1keep,[%0,#80]"::"r"(a_ptr):); +#else + ret.val[1] = VMOVL_I8(VGET_HIGH_I8(ld)); + __asm__("pld [%0,#80]"::"r"(a_ptr):); +#endif + return ret; +} + +GEMM_SKINNY_GER_LOADA_UNIT_DEDUCE(I8I32MLAGEMM, 8) { + I8X8 t1 = VLD1_I8(a_ptr); +#if __aarch64__ + __asm__("prfm pldl1keep,[%0,#72]"::"r"(a_ptr):); +#else + __asm__("pld [%0,#72]"::"r"(a_ptr):); +#endif + return VMOVL_I8(t1); +} + +GEMM_SKINNY_GER_LOADA_UNIT_DEDUCE(I8I32MLAGEMM, 4) { +#if __aarch64__ + I16X4 ret; + __asm__("ldr %s0,[%1]; "ISHLL" 
%0.8h,%0.8b,#0; prfm pldl1keep,[%1,#72]\n\t" + :"=w"(ret):"r"(a_ptr):"memory","cc"); + return ret; +#else + I16X8 ret; + __asm__("vld1.32 {d0[0]},[%1]; "ASM_VMOVL_I8" %q0,d0; pld [%1,#68]\n\t" + :"=w"(ret):"r"(a_ptr):"memory","cc","d0"); + return VGET_LOW_I16(ret); +#endif +} + +GEMM_SKINNY_GER_LOADA_UNIT_DEDUCE(I8I32MLAGEMM, 1) { + return *a_ptr; +} + +#define GEMM_SKINNY_GER_LOADC_UNIT_DEDUCE(type, a)\ + GEMM_SKINNY_GER_LOADC_UNIT(type, a) + +GEMM_SKINNY_GER_LOADC_UNIT_DEDUCE(I8I32MLAGEMM, 16) { + I32X4X4 ret; + ret.val[0] = VLD1Q_I32(c_ptr); + ret.val[1] = VLD1Q_I32(c_ptr + 4); + ret.val[2] = VLD1Q_I32(c_ptr + 8); + ret.val[3] = VLD1Q_I32(c_ptr + 12); + return ret; +} + +GEMM_SKINNY_GER_LOADC_UNIT_DEDUCE(I8I32MLAGEMM, 8) { + I32X4X2 ret; + ret.val[0] = VLD1Q_I32(c_ptr); + ret.val[1] = VLD1Q_I32(c_ptr + 4); + return ret; +} + +GEMM_SKINNY_GER_LOADC_UNIT_DEDUCE(I8I32MLAGEMM, 4) { + return VLD1Q_I32(c_ptr); +} + +GEMM_SKINNY_GER_LOADC_UNIT_DEDUCE(I8I32MLAGEMM, 1) { + return *c_ptr; +} + +#define GEMM_SKINNY_GER_STOREC_UNIT_DEDUCE(type, c)\ + GEMM_SKINNY_GER_STOREC_UNIT(type, c) + +GEMM_SKINNY_GER_STOREC_UNIT_DEDUCE(I8I32MLAGEMM, 16) { + VST1Q_I32(c_ptr, c_vec.val[0]); + VST1Q_I32(c_ptr + 4, c_vec.val[1]); + VST1Q_I32(c_ptr + 8, c_vec.val[2]); + VST1Q_I32(c_ptr + 12, c_vec.val[3]); +} + +GEMM_SKINNY_GER_STOREC_UNIT_DEDUCE(I8I32MLAGEMM, 8) { + VST1Q_I32(c_ptr, c_vec.val[0]); + VST1Q_I32(c_ptr + 4, c_vec.val[1]); +} + +GEMM_SKINNY_GER_STOREC_UNIT_DEDUCE(I8I32MLAGEMM, 4) { + VST1Q_I32(c_ptr, c_vec); +} + +GEMM_SKINNY_GER_STOREC_UNIT_DEDUCE(I8I32MLAGEMM, 1) { + *c_ptr = c_vec; +} + +#define GEMM_SKINNY_GER_LOADB_UNIT_DEDUCE(mode, type, b)\ + GEMM_SKINNY_GER_LOADB_UNIT_##mode(type, b) + +GEMM_SKINNY_GER_LOADB_UNIT_DEDUCE(BROWMAJOR, I8I32MLAGEMM, 4) { + I16X4 ret = VDUP_N_I16(0); + I16 r1 = *b_ptr; b_ptr += ldb; + I16 r2 = *b_ptr; b_ptr += ldb; + I16 r3 = *b_ptr; b_ptr += ldb; + I16 r4 = *b_ptr; + ret = VSET_LANE_I16(r1, ret, 0); + ret = VSET_LANE_I16(r2, ret, 1); + ret = VSET_LANE_I16(r3, ret, 2); + ret = VSET_LANE_I16(r4, ret, 3); + return ret; +} + +GEMM_SKINNY_GER_LOADB_UNIT_DEDUCE(BROWMAJOR, I8I32MLAGEMM, 1) { + return *b_ptr; +} + +GEMM_SKINNY_GER_LOADB_UNIT_DEDUCE(BCOLMAJOR, I8I32MLAGEMM, 4) { +#if __aarch64__ + I16X4 ret; + __asm__("ldr %s0,[%1]; "ISHLL" %0.8h,%0.8b,#0\n\t" + :"=w"(ret):"r"(b_ptr):"memory","cc"); + return ret; +#else + I16X8 ret; + __asm__("vld1.32 {d0[0]},[%1]; "ASM_VMOVL_I8" %q0,d0\n\t" + :"=w"(ret):"r"(b_ptr):"memory","cc","d0"); + return VGET_LOW_I16(ret); +#endif +} + +GEMM_SKINNY_GER_LOADB_UNIT_DEDUCE(BCOLMAJOR, I8I32MLAGEMM, 1) { + return *b_ptr; +} + +#endif diff --git a/include/arm_neon/NeonIntOpSign.h b/include/arm_neon/NeonIntOpSign.h new file mode 100644 index 0000000..62339a6 --- /dev/null +++ b/include/arm_neon/NeonIntOpSign.h @@ -0,0 +1,441 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: NeonIntOpSign.h
+ * Description: Sign-agnostic wrappers for the NEON intrinsics and assembly
+ *              mnemonics used in the integer operations of 8->32bit GEMM.
+ *              A single switch macro, GEMM_UNSIGNED_INT, selects whether
+ *              each wrapper expands to the signed or the unsigned variant:
+ *              e.g. VMULL_I8 maps to vmull_s8 by default, or to vmull_u8
+ *              when GEMM_UNSIGNED_INT is defined before inclusion.
+ *****************************************************************************/
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#ifndef INCLUDE_NEON_INTEGER_SIGN
+#define INCLUDE_NEON_INTEGER_SIGN
+
+#ifdef GEMM_UNSIGNED_INT
+#define I8 uint8_t
+#define I16 uint16_t
+#define I32 uint32_t
+#define I8X8 uint8x8_t
+#define I8X16 uint8x16_t
+#define I16X4 uint16x4_t
+#define I16X8 uint16x8_t
+#define I32X2 uint32x2_t
+#define I32X4 uint32x4_t
+#define I64X2 uint64x2_t
+#define I8X8X2 uint8x8x2_t
+#define I8X16X2 uint8x16x2_t
+#define I16X4X2 uint16x4x2_t
+#define I16X8X2 uint16x8x2_t
+#define I32X2X2 uint32x2x2_t
+#define I32X4X2 uint32x4x2_t
+#define I8X8X3 uint8x8x3_t
+#define I8X16X3 uint8x16x3_t
+#define I16X4X3 uint16x4x3_t
+#define I16X8X3 uint16x8x3_t
+#define I32X2X3 uint32x2x3_t
+#define I32X4X3 uint32x4x3_t
+#define I8X8X4 uint8x8x4_t
+#define I8X16X4 uint8x16x4_t
+#define I16X4X4 uint16x4x4_t
+#define I16X8X4 uint16x8x4_t
+#define I32X2X4 uint32x2x4_t
+#define I32X4X4 uint32x4x4_t
+#else
+#define I8 int8_t
+#define I16 int16_t
+#define I32 int32_t
+#define I8X8 int8x8_t
+#define I8X16 int8x16_t
+#define I16X4 int16x4_t
+#define I16X8 int16x8_t
+#define I32X2 int32x2_t
+#define I32X4 int32x4_t
+#define I64X2 int64x2_t
+#define I8X8X2 int8x8x2_t
+#define I8X16X2 int8x16x2_t
+#define I16X4X2 int16x4x2_t
+#define I16X8X2 int16x8x2_t
+#define I32X2X2 int32x2x2_t
+#define I32X4X2 int32x4x2_t
+#define I8X8X3 int8x8x3_t
+#define I8X16X3 int8x16x3_t
+#define I16X4X3 int16x4x3_t
+#define I16X8X3 int16x8x3_t
+#define I32X2X3 int32x2x3_t
+#define I32X4X3 int32x4x3_t
+#define I8X8X4 int8x8x4_t
+#define I8X16X4 int8x16x4_t
+#define I16X4X4 int16x4x4_t
+#define I16X8X4 int16x8x4_t
+#define I32X2X4 int32x2x4_t
+#define I32X4X4 int32x4x4_t
+#endif
+
+/* asm instruction switch */
+#if __aarch64__
+#ifdef GEMM_UNSIGNED_INT
+#define IMLAL "umlal"
+#define IMLAL2 "umlal2"
+#define ISHLL "ushll"
+#define ISHLL2 "ushll2"
+#define IXTL "uxtl"
+#define IADALP "uadalp"
+#define IMULL "umull"
+#define IMULL2 "umull2"
+#define IDOT "udot"
+#else
+#define IMLAL "smlal"
+#define IMLAL2 "smlal2"
+#define ISHLL "sshll"
+#define ISHLL2 "sshll2"
+#define IXTL "sxtl"
+#define IADALP "sadalp"
+#define IMULL "smull"
+#define IMULL2 "smull2"
+#define IDOT "sdot"
+#endif
+#else //armv7a
+#ifdef GEMM_UNSIGNED_INT
+#define ASM_VMLAL_I16 "vmlal.u16"
+#define ASM_VMOVL_I8 "vmovl.u8"
+#define ASM_VPADAL_I16 "vpadal.u16"
+#define ASM_VMULL_I8 "vmull.u8"
+#else
+#define ASM_VMLAL_I16 "vmlal.s16"
+#define ASM_VMOVL_I8 "vmovl.s8"
+#define ASM_VPADAL_I16 "vpadal.s16"
+#define ASM_VMULL_I8 "vmull.s8"
+#endif
+#endif
+
+/* intrinsic function switch */
+#ifdef GEMM_UNSIGNED_INT
+#define VMLAQ_N_I32(a, b, c) vmlaq_n_u32(a, b, c)
+#define VMLA_N_I32(a, b, c) vmla_n_u32(a, b, c)
+#define VMLA_I32(a, b, c) vmla_u32(a, b, c)
+#define VMLA_LANE_I32(a, b, c, d) vmla_lane_u32(a, b, c, d)
+#define VLD1_I8(a) vld1_u8(a)
+#define VLD1Q_I8(a) vld1q_u8(a)
+#define VLD1_I16(a) vld1_u16(a)
+#define VLD1Q_I16(a) vld1q_u16(a)
+#define VLD1_I32(a) vld1_u32(a)
+#define VLD1Q_I32(a) vld1q_u32(a)
+#define VLD3Q_I32(a) 
vld3q_u32(a) +#define VMOVL_I8(a) vmovl_u8(a) +#define VMOVL_HIGH_I8(a) vmovl_high_u8(a) +#define VST1_I32(a, b) vst1_u32(a, b) +#define VST2_I32(a, b) vst2_u32(a, b) +#define VST3_I32(a, b) vst3_u32(a, b) +#define VST4_I32(a, b) vst4_u32(a, b) +#define VST1Q_I32(a, b) vst1q_u32(a, b) +#define VST2Q_I32(a, b) vst2q_u32(a, b) +#define VST3Q_I32(a, b) vst3q_u32(a, b) +#define VST4Q_I32(a, b) vst4q_u32(a, b) +#define VST1_LANE_I32(a, b, c) vst1_lane_u32(a, b, c) +#define VST1Q_LANE_I32(a, b, c) vst1q_lane_u32(a, b, c) +#define VST2_LANE_I32(a, b, c) vst2_lane_u32(a, b, c) +#define VST2Q_LANE_I32(a, b, c) vst2q_lane_u32(a, b, c) +#define VST3_LANE_I32(a, b, c) vst3_lane_u32(a, b, c) +#define VST3Q_LANE_I32(a, b, c) vst3q_lane_u32(a, b, c) +#define VST4_LANE_I32(a, b, c) vst4_lane_u32(a, b, c) +#define VST4Q_LANE_I32(a, b, c) vst4q_lane_u32(a, b, c) +#define VST1_I16(a, b) vst1_u16(a, b) +#define VST1Q_I16(a, b) vst1q_u16(a, b) +#define VST1_LANE_I16(a, b, c) vst1_lane_u16(a, b, c) +#define VST1Q_LANE_I16(a, b, c) vst1q_lane_u16(a, b, c) +#define VST2_LANE_I16(a, b, c) vst2_lane_u16(a, b, c) +#define VST2Q_LANE_I16(a, b, c) vst2q_lane_u16(a, b, c) +#define VST3_LANE_I16(a, b, c) vst3_lane_u16(a, b, c) +#define VST3Q_LANE_I16(a, b, c) vst3q_lane_u16(a, b, c) +#define VST4_LANE_I16(a, b, c) vst4_lane_u16(a, b, c) +#define VST4Q_LANE_I16(a, b, c) vst4q_lane_u16(a, b, c) +#define VMLAL_LANE_I16(a, b, c, d) vmlal_lane_u16(a, b, c, d) +#define VMLAL_HIGH_LANE_I16(a, b, c, d) vmlal_high_lane_u16(a, b, c, d) +#define VMLAL_N_I16(a, b, c) vmlal_n_u16(a, b, c) +#define VMLAL_HIGH_N_I16(a, b, c) vmlal_high_n_u16(a, b, c) +#define VMLAL_I16(a, b, c) vmlal_u16(a, b, c) +#define VPADAL_I16(a, b) vpadal_u16(a, b) +#define VPADALQ_I16(a, b) vpadalq_u16(a, b) +#define VPADD_I32(a, b) vpadd_u32(a, b) +#define VMULL_I8(a, b) vmull_u8(a, b) +#define VMULL_HIGH_I8(a, b) vmull_high_u8(a, b) +#define VGET_LOW_I8(a) vget_low_u8(a) +#define VGET_HIGH_I8(a) vget_high_u8(a) +#define VGET_LOW_I16(a) vget_low_u16(a) +#define VGET_HIGH_I16(a) vget_high_u16(a) +#define VGET_LANE_I16(a, b) vget_lane_u16(a, b) +#define VGETQ_LANE_I16(a, b) vgetq_lane_u16(a, b) +#define VGET_LOW_I32(a) vget_low_u32(a) +#define VGET_HIGH_I32(a) vget_high_u32(a) +#define VGET_LANE_I32(a, b) vget_lane_u32(a, b) +#define VGETQ_LANE_I32(a, b) vgetq_lane_u32(a, b) +#define VDUP_N_I32(a) vdup_n_u32(a) +#define VDUPQ_N_I32(a) vdupq_n_u32(a) +#define VDUP_N_I16(a) vdup_n_u16(a) +#define VDUPQ_N_I16(a) vdupq_n_u16(a) +#define VDUP_N_I8(a) vdup_n_u8(a) +#define VDUPQ_N_I8(a) vdupq_n_u8(a) +#define VSET_LANE_I32(a, b, c) vset_lane_u32(a, b, c) +#define VSETQ_LANE_I32(a, b, c) vsetq_lane_u32(a, b, c) +#define VSET_LANE_I16(a, b, c) vset_lane_u16(a, b, c) +#define VSETQ_LANE_I16(a, b, c) vsetq_lane_u16(a, b, c) +#define VZIP_I16(a, b) vzip_u16(a, b) +#define VUZP_I16(a, b) vuzp_u16(a, b) +#define VZIPQ_I32(a, b) vzipq_u32(a, b) +#define VZIP1_I32(a, b) vzip1_u32(a, b) +#define VZIP2_I32(a, b) vzip2_u32(a, b) +#define VZIP1Q_I32(a, b) vzip1q_u32(a, b) +#define VZIP2Q_I32(a, b) vzip2q_u32(a, b) +#define VZIP1Q_I64(a, b) vzip1q_u64(a, b) +#define VZIP2Q_I64(a, b) vzip2q_u64(a, b) +#define VTRN_I16(a, b) vtrn_u16(a, b) +#define VADD_I32(a, b) vadd_u32(a, b) +#define VADDQ_I32(a, b) vaddq_u32(a, b) +#define VCOMBINE_I32(a, b) vcombine_u32(a, b) +#define VDOT_I32(a, b, c) vdot_u32(a, b, c) +#define VDOT_LANE_I32(a, b, c, d) vdot_lane_u32(a, b, c, d) +#define VDOTQ_I32(a, b, c) vdotq_u32(a, b, c) +#define VDOTQ_LANE_I32(a, b, c, d) vdotq_lane_u32(a, b, c, d) 
+#define VDOTQ_LANEQ_I32(a, b, c, d) vdotq_laneq_u32(a, b, c, d) +#define VREINTERPRETQ_I32_I64(a) vreinterpretq_u32_u64(a) +#define VREINTERPRETQ_I64_I32(a) vreinterpretq_u64_u32(a) +#define VREINTERPRET_I8_I32(a) vreinterpret_u8_u32(a) +#define VREINTERPRETQ_I8_I32(a) vreinterpretq_u8_u32(a) +#define VREINTERPRET_I32_I8(a) vreinterpret_u32_u8(a) +#define VREINTERPRETQ_I32_I8(a) vreinterpretq_u32_u8(a) +#else +#define VMLAQ_N_I32(a, b, c) vmlaq_n_s32(a, b, c) +#define VMLA_N_I32(a, b, c) vmla_n_s32(a, b, c) +#define VMLA_I32(a, b, c) vmla_s32(a, b, c) +#define VMLA_LANE_I32(a, b, c, d) vmla_lane_s32(a, b, c, d) +#define VLD1_I8(a) vld1_s8(a) +#define VLD1Q_I8(a) vld1q_s8(a) +#define VLD1_I16(a) vld1_s16(a) +#define VLD1Q_I16(a) vld1q_s16(a) +#define VLD1_I32(a) vld1_s32(a) +#define VLD1Q_I32(a) vld1q_s32(a) +#define VLD3Q_I32(a) vld3q_s32(a) +#define VMOVL_I8(a) vmovl_s8(a) +#define VMOVL_HIGH_I8(a) vmovl_high_s8(a) +#define VST1_I32(a, b) vst1_s32(a, b) +#define VST2_I32(a, b) vst2_s32(a, b) +#define VST3_I32(a, b) vst3_s32(a, b) +#define VST4_I32(a, b) vst4_s32(a, b) +#define VST1Q_I32(a, b) vst1q_s32(a, b) +#define VST2Q_I32(a, b) vst2q_s32(a, b) +#define VST3Q_I32(a, b) vst3q_s32(a, b) +#define VST4Q_I32(a, b) vst4q_s32(a, b) +#define VST1_LANE_I32(a, b, c) vst1_lane_s32(a, b, c) +#define VST1Q_LANE_I32(a, b, c) vst1q_lane_s32(a, b, c) +#define VST2_LANE_I32(a, b, c) vst2_lane_s32(a, b, c) +#define VST2Q_LANE_I32(a, b, c) vst2q_lane_s32(a, b, c) +#define VST3_LANE_I32(a, b, c) vst3_lane_s32(a, b, c) +#define VST3Q_LANE_I32(a, b, c) vst3q_lane_s32(a, b, c) +#define VST4_LANE_I32(a, b, c) vst4_lane_s32(a, b, c) +#define VST4Q_LANE_I32(a, b, c) vst4q_lane_s32(a, b, c) +#define VST1_I16(a, b) vst1_s16(a, b) +#define VST1Q_I16(a, b) vst1q_s16(a, b) +#define VST1_LANE_I16(a, b, c) vst1_lane_s16(a, b, c) +#define VST1Q_LANE_I16(a, b, c) vst1q_lane_s16(a, b, c) +#define VST2_LANE_I16(a, b, c) vst2_lane_s16(a, b, c) +#define VST2Q_LANE_I16(a, b, c) vst2q_lane_s16(a, b, c) +#define VST3_LANE_I16(a, b, c) vst3_lane_s16(a, b, c) +#define VST3Q_LANE_I16(a, b, c) vst3q_lane_s16(a, b, c) +#define VST4_LANE_I16(a, b, c) vst4_lane_s16(a, b, c) +#define VST4Q_LANE_I16(a, b, c) vst4q_lane_s16(a, b, c) +#define VMLAL_LANE_I16(a, b, c, d) vmlal_lane_s16(a, b, c, d) +#define VMLAL_HIGH_LANE_I16(a, b, c, d) vmlal_high_lane_s16(a, b, c, d) +#define VMLAL_N_I16(a, b, c) vmlal_n_s16(a, b, c) +#define VMLAL_HIGH_N_I16(a, b, c) vmlal_high_n_s16(a, b, c) +#define VMLAL_I16(a, b, c) vmlal_s16(a, b, c) +#define VPADAL_I16(a, b) vpadal_s16(a, b) +#define VPADALQ_I16(a, b) vpadalq_s16(a, b) +#define VPADD_I32(a, b) vpadd_s32(a, b) +#define VMULL_I8(a, b) vmull_s8(a, b) +#define VMULL_HIGH_I8(a, b) vmull_high_s8(a, b) +#define VGET_LOW_I8(a) vget_low_s8(a) +#define VGET_HIGH_I8(a) vget_high_s8(a) +#define VGET_LOW_I16(a) vget_low_s16(a) +#define VGET_HIGH_I16(a) vget_high_s16(a) +#define VGET_LANE_I16(a, b) vget_lane_s16(a, b) +#define VGETQ_LANE_I16(a, b) vgetq_lane_s16(a, b) +#define VGET_LOW_I32(a) vget_low_s32(a) +#define VGET_HIGH_I32(a) vget_high_s32(a) +#define VGET_LANE_I32(a, b) vget_lane_s32(a, b) +#define VGETQ_LANE_I32(a, b) vgetq_lane_s32(a, b) +#define VDUP_N_I32(a) vdup_n_s32(a) +#define VDUPQ_N_I32(a) vdupq_n_s32(a) +#define VDUP_N_I16(a) vdup_n_s16(a) +#define VDUPQ_N_I16(a) vdupq_n_s16(a) +#define VDUP_N_I8(a) vdup_n_s8(a) +#define VDUPQ_N_I8(a) vdupq_n_s8(a) +#define VSET_LANE_I32(a, b, c) vset_lane_s32(a, b, c) +#define VSETQ_LANE_I32(a, b, c) vsetq_lane_s32(a, b, c) +#define VSET_LANE_I16(a, b, c) 
vset_lane_s16(a, b, c) +#define VSETQ_LANE_I16(a, b, c) vsetq_lane_s16(a, b, c) +#define VZIP_I16(a, b) vzip_s16(a, b) +#define VUZP_I16(a, b) vuzp_s16(a, b) +#define VZIPQ_I32(a, b) vzipq_s32(a, b) +#define VZIP1_I32(a, b) vzip1_s32(a, b) +#define VZIP2_I32(a, b) vzip2_s32(a, b) +#define VZIP1Q_I32(a, b) vzip1q_s32(a, b) +#define VZIP2Q_I32(a, b) vzip2q_s32(a, b) +#define VZIP1Q_I64(a, b) vzip1q_s64(a, b) +#define VZIP2Q_I64(a, b) vzip2q_s64(a, b) +#define VTRN_I16(a, b) vtrn_s16(a, b) +#define VADD_I32(a, b) vadd_s32(a, b) +#define VADDQ_I32(a, b) vaddq_s32(a, b) +#define VCOMBINE_I32(a, b) vcombine_s32(a, b) +#define VDOT_I32(a, b, c) vdot_s32(a, b, c) +#define VDOT_LANE_I32(a, b, c, d) vdot_lane_s32(a, b, c, d) +#define VDOTQ_I32(a, b, c) vdotq_s32(a, b, c) +#define VDOTQ_LANE_I32(a, b, c, d) vdotq_lane_s32(a, b, c, d) +#define VDOTQ_LANEQ_I32(a, b, c, d) vdotq_laneq_s32(a, b, c, d) +#define VREINTERPRETQ_I32_I64(a) vreinterpretq_s32_s64(a) +#define VREINTERPRETQ_I64_I32(a) vreinterpretq_s64_s32(a) +#define VREINTERPRET_I8_I32(a) vreinterpret_s8_s32(a) +#define VREINTERPRETQ_I8_I32(a) vreinterpretq_s8_s32(a) +#define VREINTERPRET_I32_I8(a) vreinterpret_s32_s8(a) +#define VREINTERPRETQ_I32_I8(a) vreinterpretq_s32_s8(a) +#endif + +#ifndef GEMM_UNSIGNED_INT +#define I8I32MLAGEMM s8s32mlagemm +#define I8I32MLAGEMM_SKINNYGER_ASCALAR s8s32mlagemm_skinnyger_ascalar +#define I8I32MLAGEMM_SKINNYGER_BSCALAR s8s32mlagemm_skinnyger_bscalar +#define I8I32MLAGEMM_SKINNYGER_CSCALAR s8s32mlagemm_skinnyger_cscalar +#define I8I32MLAGEMM_SKINNYGER_AVEC1 s8s32mlagemm_skinnyger_avec1 +#define I8I32MLAGEMM_SKINNYGER_BVEC1 s8s32mlagemm_skinnyger_bvec1 +#define I8I32MLAGEMM_SKINNYGER_CVEC1 s8s32mlagemm_skinnyger_cvec1 +#define I8I32MLAGEMM_SKINNYGER_AVEC2 s8s32mlagemm_skinnyger_avec2 +#define I8I32MLAGEMM_SKINNYGER_BVEC2 s8s32mlagemm_skinnyger_bvec2 +#define I8I32MLAGEMM_SKINNYGER_CVEC2 s8s32mlagemm_skinnyger_cvec2 +#define I8I32MLAGEMM_SKINNYGER_AVEC4 s8s32mlagemm_skinnyger_avec4 +#define I8I32MLAGEMM_SKINNYGER_BVEC4 s8s32mlagemm_skinnyger_bvec4 +#define I8I32MLAGEMM_SKINNYGER_CVEC4 s8s32mlagemm_skinnyger_cvec4 +#define I8I32MLAGEMM_SKINNYGER_AVEC8 s8s32mlagemm_skinnyger_avec8 +#define I8I32MLAGEMM_SKINNYGER_BVEC8 s8s32mlagemm_skinnyger_bvec8 +#define I8I32MLAGEMM_SKINNYGER_CVEC8 s8s32mlagemm_skinnyger_cvec8 +#define I8I32MLAGEMM_SKINNYGER_AVEC16 s8s32mlagemm_skinnyger_avec16 +#define I8I32MLAGEMM_SKINNYGER_BVEC16 s8s32mlagemm_skinnyger_bvec16 +#define I8I32MLAGEMM_SKINNYGER_CVEC16 s8s32mlagemm_skinnyger_cvec16 +#define I8I32MLAGEMM_SKINNYDOT_ASCALAR s8s32mlagemm_skinnydot_ascalar +#define I8I32MLAGEMM_SKINNYDOT_BSCALAR s8s32mlagemm_skinnydot_bscalar +#define I8I32MLAGEMM_SKINNYDOT_CSCALAR s8s32mlagemm_skinnydot_cscalar +#define I8I32MLAGEMM_SKINNYDOT_AVEC1 s8s32mlagemm_skinnydot_avec1 +#define I8I32MLAGEMM_SKINNYDOT_BVEC1 s8s32mlagemm_skinnydot_bvec1 +#define I8I32MLAGEMM_SKINNYDOT_CVEC1 s8s32mlagemm_skinnydot_cvec1 +#define I8I32MLAGEMM_SKINNYDOT_AVEC2 s8s32mlagemm_skinnydot_avec2 +#define I8I32MLAGEMM_SKINNYDOT_BVEC2 s8s32mlagemm_skinnydot_bvec2 +#define I8I32MLAGEMM_SKINNYDOT_CVEC2 s8s32mlagemm_skinnydot_cvec2 +#define I8I32MLAGEMM_SKINNYDOT_AVEC4 s8s32mlagemm_skinnydot_avec4 +#define I8I32MLAGEMM_SKINNYDOT_BVEC4 s8s32mlagemm_skinnydot_bvec4 +#define I8I32MLAGEMM_SKINNYDOT_CVEC4 s8s32mlagemm_skinnydot_cvec4 +#define I8I32MLAGEMM_SKINNYDOT_AVEC8 s8s32mlagemm_skinnydot_avec8 +#define I8I32MLAGEMM_SKINNYDOT_BVEC8 s8s32mlagemm_skinnydot_bvec8 +#define I8I32MLAGEMM_SKINNYDOT_CVEC8 s8s32mlagemm_skinnydot_cvec8 
+#define I8I32MLAGEMM_SKINNYDOT_AVEC16 s8s32mlagemm_skinnydot_avec16 +#define I8I32MLAGEMM_SKINNYDOT_BVEC16 s8s32mlagemm_skinnydot_bvec16 +#define I8I32MLAGEMM_SKINNYDOT_CVEC16 s8s32mlagemm_skinnydot_cvec16 +#else +#define I8I32MLAGEMM u8u32mlagemm +#define I8I32MLAGEMM_SKINNYGER_ASCALAR u8u32mlagemm_skinnyger_ascalar +#define I8I32MLAGEMM_SKINNYGER_BSCALAR u8u32mlagemm_skinnyger_bscalar +#define I8I32MLAGEMM_SKINNYGER_CSCALAR u8u32mlagemm_skinnyger_cscalar +#define I8I32MLAGEMM_SKINNYGER_AVEC1 u8u32mlagemm_skinnyger_avec1 +#define I8I32MLAGEMM_SKINNYGER_BVEC1 u8u32mlagemm_skinnyger_bvec1 +#define I8I32MLAGEMM_SKINNYGER_CVEC1 u8u32mlagemm_skinnyger_cvec1 +#define I8I32MLAGEMM_SKINNYGER_AVEC2 u8u32mlagemm_skinnyger_avec2 +#define I8I32MLAGEMM_SKINNYGER_BVEC2 u8u32mlagemm_skinnyger_bvec2 +#define I8I32MLAGEMM_SKINNYGER_CVEC2 u8u32mlagemm_skinnyger_cvec2 +#define I8I32MLAGEMM_SKINNYGER_AVEC4 u8u32mlagemm_skinnyger_avec4 +#define I8I32MLAGEMM_SKINNYGER_BVEC4 u8u32mlagemm_skinnyger_bvec4 +#define I8I32MLAGEMM_SKINNYGER_CVEC4 u8u32mlagemm_skinnyger_cvec4 +#define I8I32MLAGEMM_SKINNYGER_AVEC8 u8u32mlagemm_skinnyger_avec8 +#define I8I32MLAGEMM_SKINNYGER_BVEC8 u8u32mlagemm_skinnyger_bvec8 +#define I8I32MLAGEMM_SKINNYGER_CVEC8 u8u32mlagemm_skinnyger_cvec8 +#define I8I32MLAGEMM_SKINNYGER_AVEC16 u8u32mlagemm_skinnyger_avec16 +#define I8I32MLAGEMM_SKINNYGER_BVEC16 u8u32mlagemm_skinnyger_bvec16 +#define I8I32MLAGEMM_SKINNYGER_CVEC16 u8u32mlagemm_skinnyger_cvec16 +#define I8I32MLAGEMM_SKINNYDOT_ASCALAR u8u32mlagemm_skinnydot_ascalar +#define I8I32MLAGEMM_SKINNYDOT_BSCALAR u8u32mlagemm_skinnydot_bscalar +#define I8I32MLAGEMM_SKINNYDOT_CSCALAR u8u32mlagemm_skinnydot_cscalar +#define I8I32MLAGEMM_SKINNYDOT_AVEC1 u8u32mlagemm_skinnydot_avec1 +#define I8I32MLAGEMM_SKINNYDOT_BVEC1 u8u32mlagemm_skinnydot_bvec1 +#define I8I32MLAGEMM_SKINNYDOT_CVEC1 u8u32mlagemm_skinnydot_cvec1 +#define I8I32MLAGEMM_SKINNYDOT_AVEC2 u8u32mlagemm_skinnydot_avec2 +#define I8I32MLAGEMM_SKINNYDOT_BVEC2 u8u32mlagemm_skinnydot_bvec2 +#define I8I32MLAGEMM_SKINNYDOT_CVEC2 u8u32mlagemm_skinnydot_cvec2 +#define I8I32MLAGEMM_SKINNYDOT_AVEC4 u8u32mlagemm_skinnydot_avec4 +#define I8I32MLAGEMM_SKINNYDOT_BVEC4 u8u32mlagemm_skinnydot_bvec4 +#define I8I32MLAGEMM_SKINNYDOT_CVEC4 u8u32mlagemm_skinnydot_cvec4 +#define I8I32MLAGEMM_SKINNYDOT_AVEC8 u8u32mlagemm_skinnydot_avec8 +#define I8I32MLAGEMM_SKINNYDOT_BVEC8 u8u32mlagemm_skinnydot_bvec8 +#define I8I32MLAGEMM_SKINNYDOT_CVEC8 u8u32mlagemm_skinnydot_cvec8 +#define I8I32MLAGEMM_SKINNYDOT_AVEC16 u8u32mlagemm_skinnydot_avec16 +#define I8I32MLAGEMM_SKINNYDOT_BVEC16 u8u32mlagemm_skinnydot_bvec16 +#define I8I32MLAGEMM_SKINNYDOT_CVEC16 u8u32mlagemm_skinnydot_cvec16 +#endif + +#ifndef GEMM_UNSIGNED_INT +#define I8I32DOTGEMM s8s32dotgemm +#define I8I32DOTGEMM_SKINNYDOT_ASCALAR s8s32dotgemm_skinnydot_ascalar +#define I8I32DOTGEMM_SKINNYDOT_BSCALAR s8s32dotgemm_skinnydot_bscalar +#define I8I32DOTGEMM_SKINNYDOT_CSCALAR s8s32dotgemm_skinnydot_cscalar +#define I8I32DOTGEMM_SKINNYDOT_AVEC1 s8s32dotgemm_skinnydot_avec1 +#define I8I32DOTGEMM_SKINNYDOT_BVEC1 s8s32dotgemm_skinnydot_bvec1 +#define I8I32DOTGEMM_SKINNYDOT_CVEC1 s8s32dotgemm_skinnydot_cvec1 +#define I8I32DOTGEMM_SKINNYDOT_AVEC2 s8s32dotgemm_skinnydot_avec2 +#define I8I32DOTGEMM_SKINNYDOT_BVEC2 s8s32dotgemm_skinnydot_bvec2 +#define I8I32DOTGEMM_SKINNYDOT_CVEC2 s8s32dotgemm_skinnydot_cvec2 +#define I8I32DOTGEMM_SKINNYDOT_AVEC4 s8s32dotgemm_skinnydot_avec4 +#define I8I32DOTGEMM_SKINNYDOT_BVEC4 s8s32dotgemm_skinnydot_bvec4 +#define 
I8I32DOTGEMM_SKINNYDOT_CVEC4 s8s32dotgemm_skinnydot_cvec4 +#define I8I32DOTGEMM_SKINNYDOT_AVEC8 s8s32dotgemm_skinnydot_avec8 +#define I8I32DOTGEMM_SKINNYDOT_BVEC8 s8s32dotgemm_skinnydot_bvec8 +#define I8I32DOTGEMM_SKINNYDOT_CVEC8 s8s32dotgemm_skinnydot_cvec8 +#define I8I32DOTGEMM_SKINNYDOT_AVEC16 s8s32dotgemm_skinnydot_avec16 +#define I8I32DOTGEMM_SKINNYDOT_BVEC16 s8s32dotgemm_skinnydot_bvec16 +#define I8I32DOTGEMM_SKINNYDOT_CVEC16 s8s32dotgemm_skinnydot_cvec16 +#else +#define I8I32DOTGEMM u8u32dotgemm +#define I8I32DOTGEMM_SKINNYDOT_ASCALAR u8u32dotgemm_skinnydot_ascalar +#define I8I32DOTGEMM_SKINNYDOT_BSCALAR u8u32dotgemm_skinnydot_bscalar +#define I8I32DOTGEMM_SKINNYDOT_CSCALAR u8u32dotgemm_skinnydot_cscalar +#define I8I32DOTGEMM_SKINNYDOT_AVEC1 u8u32dotgemm_skinnydot_avec1 +#define I8I32DOTGEMM_SKINNYDOT_BVEC1 u8u32dotgemm_skinnydot_bvec1 +#define I8I32DOTGEMM_SKINNYDOT_CVEC1 u8u32dotgemm_skinnydot_cvec1 +#define I8I32DOTGEMM_SKINNYDOT_AVEC2 u8u32dotgemm_skinnydot_avec2 +#define I8I32DOTGEMM_SKINNYDOT_BVEC2 u8u32dotgemm_skinnydot_bvec2 +#define I8I32DOTGEMM_SKINNYDOT_CVEC2 u8u32dotgemm_skinnydot_cvec2 +#define I8I32DOTGEMM_SKINNYDOT_AVEC4 u8u32dotgemm_skinnydot_avec4 +#define I8I32DOTGEMM_SKINNYDOT_BVEC4 u8u32dotgemm_skinnydot_bvec4 +#define I8I32DOTGEMM_SKINNYDOT_CVEC4 u8u32dotgemm_skinnydot_cvec4 +#define I8I32DOTGEMM_SKINNYDOT_AVEC8 u8u32dotgemm_skinnydot_avec8 +#define I8I32DOTGEMM_SKINNYDOT_BVEC8 u8u32dotgemm_skinnydot_bvec8 +#define I8I32DOTGEMM_SKINNYDOT_CVEC8 u8u32dotgemm_skinnydot_cvec8 +#define I8I32DOTGEMM_SKINNYDOT_AVEC16 u8u32dotgemm_skinnydot_avec16 +#define I8I32DOTGEMM_SKINNYDOT_BVEC16 u8u32dotgemm_skinnydot_bvec16 +#define I8I32DOTGEMM_SKINNYDOT_CVEC16 u8u32dotgemm_skinnydot_cvec16 +#endif + +#endif diff --git a/include/arm_neon/NeonQuant.h b/include/arm_neon/NeonQuant.h new file mode 100644 index 0000000..dfb84b8 --- /dev/null +++ b/include/arm_neon/NeonQuant.h @@ -0,0 +1,814 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/****************************************************************************** + * File: NeonQuant.h + * Description: Source code template for NEON quantization kernels. 
+ *****************************************************************************/ + +#include "arm_neon/NeonExtreme.h" + +#ifndef INCLUDE_NEON_QUANT +#define INCLUDE_NEON_QUANT + +static inline void inline_dequant_cvt_f32_s32( + float *dst, const int32_t *src, float scale, uint32_t size) { + + const float32x4_t sc4 = vdupq_n_f32(scale); + const float32x2_t sc2 = vdup_n_f32(scale); + for (; size >= 16; size -= 16) { + int32x4_t v1 = vld1q_s32(src); + int32x4_t v2 = vld1q_s32(src + 4); + int32x4_t v3 = vld1q_s32(src + 8); + int32x4_t v4 = vld1q_s32(src + 12); src += 16; + float32x4_t q1 = vcvtq_f32_s32(v1); + float32x4_t q2 = vcvtq_f32_s32(v2); + float32x4_t q3 = vcvtq_f32_s32(v3); + float32x4_t q4 = vcvtq_f32_s32(v4); + q1 = vmulq_f32(q1, sc4); + q2 = vmulq_f32(q2, sc4); + q3 = vmulq_f32(q3, sc4); + q4 = vmulq_f32(q4, sc4); + vst1q_f32(dst, q1); + vst1q_f32(dst + 4, q2); + vst1q_f32(dst + 8, q3); + vst1q_f32(dst + 12, q4); dst += 16; + } + if (size >= 8) { + int32x4_t v1 = vld1q_s32(src); + int32x4_t v2 = vld1q_s32(src + 4); src += 8; + float32x4_t q1 = vcvtq_f32_s32(v1); + float32x4_t q2 = vcvtq_f32_s32(v2); + q1 = vmulq_f32(q1, sc4); + q2 = vmulq_f32(q2, sc4); + vst1q_f32(dst, q1); + vst1q_f32(dst + 4, q2); dst += 8; + size -= 8; + } + if (size >= 4) { + int32x4_t v1 = vld1q_s32(src); src += 4; + float32x4_t q1 = vcvtq_f32_s32(v1); + q1 = vmulq_f32(q1, sc4); + vst1q_f32(dst, q1); dst += 4; + size -= 4; + } + if (size >= 2) { + int32x2_t v1 = vld1_s32(src); src += 2; + float32x2_t d1 = vcvt_f32_s32(v1); + d1 = vmul_f32(d1, sc2); + vst1_f32(dst, d1); dst += 2; + size -= 2; + } + if (size >= 1) { + *dst = (float)(*src) * scale; + } +} + +static inline void inline_quant_asym_u8_from_f32( + const float32_t *src, uint8_t *dst, + uint32_t size, uint8_t zero_point, float32_t scale) { + + if (scale <= 0) return; + if (size == 0) return; + const float32_t add_zero_s = (float32_t)zero_point + 0.5f; + const float32x4_t add_zero_q = vdupq_n_f32(add_zero_s); + const float32_t mult_s = 1.0f / scale; + const float32x4_t mult_q = vdupq_n_f32(mult_s); + + for (; size >= 16; size -= 16) { + float32x4_t f1 = vld1q_f32(src); + float32x4_t f2 = vld1q_f32(src + 4); + float32x4_t f3 = vld1q_f32(src + 8); + float32x4_t f4 = vld1q_f32(src + 12); src += 16; + f1 = vmlaq_f32(add_zero_q, f1, mult_q); + f2 = vmlaq_f32(add_zero_q, f2, mult_q); + f3 = vmlaq_f32(add_zero_q, f3, mult_q); + f4 = vmlaq_f32(add_zero_q, f4, mult_q); + uint32x4_t u1 = vcvtq_u32_f32(f1); + uint32x4_t u2 = vcvtq_u32_f32(f2); + uint32x4_t u3 = vcvtq_u32_f32(f3); + uint32x4_t u4 = vcvtq_u32_f32(f4); + uint16x4_t t1 = vqmovn_u32(u1); + uint16x4_t t2 = vqmovn_u32(u2); + uint16x4_t t3 = vqmovn_u32(u3); + uint16x4_t t4 = vqmovn_u32(u4); + uint8x8_t d1 = vqmovn_u16(vcombine_u16(t1, t2)); + uint8x8_t d2 = vqmovn_u16(vcombine_u16(t3, t4)); + vst1_u8(dst, d1); + vst1_u8(dst + 8, d2); dst += 16; + } + if (size >= 8) { + float32x4_t f1 = vld1q_f32(src); + float32x4_t f2 = vld1q_f32(src + 4); src += 8; + f1 = vmlaq_f32(add_zero_q, f1, mult_q); + f2 = vmlaq_f32(add_zero_q, f2, mult_q); + uint32x4_t u1 = vcvtq_u32_f32(f1); + uint32x4_t u2 = vcvtq_u32_f32(f2); + uint16x4_t t1 = vqmovn_u32(u1); + uint16x4_t t2 = vqmovn_u32(u2); + uint8x8_t d1 = vqmovn_u16(vcombine_u16(t1, t2)); + vst1_u8(dst, d1); dst += 8; + size -= 8; + } + if (size >= 4) { + float32x4_t f1 = vld1q_f32(src); src += 4; + f1 = vmlaq_f32(add_zero_q, f1, mult_q); + uint32x4_t u1 = vcvtq_u32_f32(f1); + uint16x4_t t1 = vqmovn_u32(u1); + uint16x4_t z1 = vdup_n_u16(0); + uint8x8_t d1 = 
vqmovn_u16(vcombine_u16(t1, z1));
+    vst1_lane_u8(dst, d1, 0);
+    vst1_lane_u8(dst + 1, d1, 1);
+    vst1_lane_u8(dst + 2, d1, 2);
+    vst1_lane_u8(dst + 3, d1, 3);
+    dst += 4;
+    size -= 4;
+  }
+  for (; size > 0; size--) {
+    float32_t f1 = *src++;
+    f1 = f1 * mult_s + add_zero_s;
+    f1 = f1 < 0 ? 0.0 : f1;
+    f1 = f1 > 255 ? 255.0 : f1;
+    uint32_t u1 = (uint32_t)f1;
+    uint8_t s1 = u1 >= 256 ? 255 : u1;
+    *dst = s1; dst++;
+  }
+}
+
+static inline void inline_quant_asym_u16_from_f32(
+    const float32_t *src, uint16_t *dst,
+    uint32_t size, uint16_t zero_point, float32_t scale) {
+
+  if (scale <= 0) return;
+  if (size == 0) return;
+  const float32_t add_zero_s = (float32_t)zero_point + 0.5f;
+  const float32x4_t add_zero_q = vdupq_n_f32(add_zero_s);
+  const float32_t mult_s = 1.0f / scale;
+  const float32x4_t mult_q = vdupq_n_f32(mult_s);
+
+  for (; size >= 16; size -= 16) {
+    float32x4_t f1 = vld1q_f32(src);
+    float32x4_t f2 = vld1q_f32(src + 4);
+    float32x4_t f3 = vld1q_f32(src + 8);
+    float32x4_t f4 = vld1q_f32(src + 12); src += 16;
+    f1 = vmlaq_f32(add_zero_q, f1, mult_q);
+    f2 = vmlaq_f32(add_zero_q, f2, mult_q);
+    f3 = vmlaq_f32(add_zero_q, f3, mult_q);
+    f4 = vmlaq_f32(add_zero_q, f4, mult_q);
+    uint32x4_t u1 = vcvtq_u32_f32(f1);
+    uint32x4_t u2 = vcvtq_u32_f32(f2);
+    uint32x4_t u3 = vcvtq_u32_f32(f3);
+    uint32x4_t u4 = vcvtq_u32_f32(f4);
+    uint16x4_t t1 = vqmovn_u32(u1);
+    uint16x4_t t2 = vqmovn_u32(u2);
+    uint16x4_t t3 = vqmovn_u32(u3);
+    uint16x4_t t4 = vqmovn_u32(u4);
+    vst1_u16(dst, t1);
+    vst1_u16(dst + 4, t2);
+    vst1_u16(dst + 8, t3);
+    vst1_u16(dst + 12, t4); dst += 16;
+  }
+  if (size >= 8) {
+    float32x4_t f1 = vld1q_f32(src);
+    float32x4_t f2 = vld1q_f32(src + 4); src += 8;
+    f1 = vmlaq_f32(add_zero_q, f1, mult_q);
+    f2 = vmlaq_f32(add_zero_q, f2, mult_q);
+    uint32x4_t u1 = vcvtq_u32_f32(f1);
+    uint32x4_t u2 = vcvtq_u32_f32(f2);
+    uint16x4_t t1 = vqmovn_u32(u1);
+    uint16x4_t t2 = vqmovn_u32(u2);
+    vst1_u16(dst, t1);
+    vst1_u16(dst + 4, t2); dst += 8;
+    size -= 8;
+  }
+  if (size >= 4) {
+    float32x4_t f1 = vld1q_f32(src); src += 4;
+    f1 = vmlaq_f32(add_zero_q, f1, mult_q);
+    uint32x4_t u1 = vcvtq_u32_f32(f1);
+    uint16x4_t t1 = vqmovn_u32(u1);
+    vst1_u16(dst, t1); dst += 4;
+    size -= 4;
+  }
+  if (size > 0) {
+    float32x4_t f1 = vdupq_n_f32(0);
+    f1 = vsetq_lane_f32(src[0], f1, 0);
+    if (size > 1) f1 = vsetq_lane_f32(src[1], f1, 1);
+    if (size > 2) f1 = vsetq_lane_f32(src[2], f1, 2);
+    f1 = vmlaq_f32(add_zero_q, f1, mult_q);
+    uint32x4_t u1 = vcvtq_u32_f32(f1);
+    uint16x4_t t1 = vqmovn_u32(u1);
+    vst1_lane_u16(dst, t1, 0);
+    if (size > 1) vst1_lane_u16(dst + 1, t1, 1);
+    if (size > 2) vst1_lane_u16(dst + 2, t1, 2);
+  }
+}
+
+#if !__aarch64__
+/* Fallback for armv7a: emulate the AArch64 vcvtaq_s32_f32 intrinsic
+ * (float -> int32 conversion, rounding to nearest with ties away from
+ * zero) by adding +/-0.5 before the truncating conversion. */
+static inline int32x4_t vcvtaq_s32_f32(float32x4_t src) {
+  static const float32x4_t cvt_positive_offset = {0.5f, 0.5f, 0.5f, 0.5f};
+  static const float32x4_t cvt_negative_offset = {-0.5f, -0.5f, -0.5f, -0.5f};
+  static const float32x4_t cmp_ref = {0.0f, 0.0f, 0.0f, 0.0f};
+  uint32x4_t mask = vcgtq_f32(src, cmp_ref); /* all-ones in lanes where src > 0 */
+  float32x4_t offset = vbslq_f32(mask, cvt_positive_offset, cvt_negative_offset);
+  src = vaddq_f32(src, offset);
+  return vcvtq_s32_f32(src);
+}
+#endif
+
+static inline void inline_quant_sym_s8_from_f32(
+    const float32_t *src, int8_t *dst,
+    uint32_t size, float32_t scale) {
+
+  if (scale <= 0) return;
+  if (size == 0) return;
+  const float32_t mult_s = 1.0f / scale;
+  const float32x4_t mult_q = vdupq_n_f32(mult_s);
+
+  for (; size >= 16; size -= 16) {
+    float32x4_t f1 = vld1q_f32(src);
+    float32x4_t f2 = vld1q_f32(src + 
4); + float32x4_t f3 = vld1q_f32(src + 8); + float32x4_t f4 = vld1q_f32(src + 12); src += 16; + f1 = vmulq_f32(f1, mult_q); + f2 = vmulq_f32(f2, mult_q); + f3 = vmulq_f32(f3, mult_q); + f4 = vmulq_f32(f4, mult_q); + int32x4_t i1 = vcvtaq_s32_f32(f1); + int32x4_t i2 = vcvtaq_s32_f32(f2); + int32x4_t i3 = vcvtaq_s32_f32(f3); + int32x4_t i4 = vcvtaq_s32_f32(f4); + int16x4_t v1 = vqmovn_s32(i1); + int16x4_t v2 = vqmovn_s32(i2); + int16x4_t v3 = vqmovn_s32(i3); + int16x4_t v4 = vqmovn_s32(i4); + int8x8_t w1 = vqmovn_s16(vcombine_s16(v1, v2)); + int8x8_t w2 = vqmovn_s16(vcombine_s16(v3, v4)); + vst1_s8(dst, w1); + vst1_s8(dst + 8, w2); dst += 16; + } + if (size >= 8) { + float32x4_t f1 = vld1q_f32(src); + float32x4_t f2 = vld1q_f32(src + 4); src += 8; + f1 = vmulq_f32(f1, mult_q); + f2 = vmulq_f32(f2, mult_q); + int32x4_t i1 = vcvtaq_s32_f32(f1); + int32x4_t i2 = vcvtaq_s32_f32(f2); + int16x4_t v1 = vqmovn_s32(i1); + int16x4_t v2 = vqmovn_s32(i2); + int8x8_t w1 = vqmovn_s16(vcombine_s16(v1, v2)); + vst1_s8(dst, w1); dst += 8; + size -= 8; + } + if (size >= 4) { + float32x4_t f1 = vld1q_f32(src); src += 4; + f1 = vmulq_f32(f1, mult_q); + int32x4_t i1 = vcvtaq_s32_f32(f1); + int16x4_t v1 = vqmovn_s32(i1); + int16x4_t z1 = vdup_n_s16(0); + int8x8_t w1 = vqmovn_s16(vcombine_s16(v1, z1)); + vst1_lane_s8(dst, w1, 0); + vst1_lane_s8(dst + 1, w1, 1); + vst1_lane_s8(dst + 2, w1, 2); + vst1_lane_s8(dst + 3, w1, 3); dst += 4; + size -= 4; + } + for (; size > 0; size--) { + float32_t f1 = *src++; + f1 *= mult_s; + f1 += f1 > 0 ? 0.5f : -0.5f; + f1 = f1 < -128 ? -128.0 : f1; + f1 = f1 > 127 ? 127.0 : f1; + int8_t s1 = f1; + *dst = s1; dst++; + } +} + +static inline void inline_quant_sym_s16_from_f32( + const float32_t *src, int16_t *dst, + uint32_t size, float32_t scale) { + + if (scale <= 0) return; + if (size == 0) return; + const float32_t mult_s = 1.0f / scale; + const float32x4_t mult_q = vdupq_n_f32(mult_s); + + for (; size >= 16; size -= 16) { + float32x4_t f1 = vld1q_f32(src); + float32x4_t f2 = vld1q_f32(src + 4); + float32x4_t f3 = vld1q_f32(src + 8); + float32x4_t f4 = vld1q_f32(src + 12); src += 16; + f1 = vmulq_f32(f1, mult_q); + f2 = vmulq_f32(f2, mult_q); + f3 = vmulq_f32(f3, mult_q); + f4 = vmulq_f32(f4, mult_q); + int32x4_t i1 = vcvtaq_s32_f32(f1); + int32x4_t i2 = vcvtaq_s32_f32(f2); + int32x4_t i3 = vcvtaq_s32_f32(f3); + int32x4_t i4 = vcvtaq_s32_f32(f4); + int16x4_t v1 = vqmovn_s32(i1); + int16x4_t v2 = vqmovn_s32(i2); + int16x4_t v3 = vqmovn_s32(i3); + int16x4_t v4 = vqmovn_s32(i4); + vst1_s16(dst, v1); + vst1_s16(dst + 4, v2); + vst1_s16(dst + 8, v3); + vst1_s16(dst + 12, v4); dst += 16; + } + if (size >= 8) { + float32x4_t f1 = vld1q_f32(src); + float32x4_t f2 = vld1q_f32(src + 4); src += 8; + f1 = vmulq_f32(f1, mult_q); + f2 = vmulq_f32(f2, mult_q); + int32x4_t i1 = vcvtaq_s32_f32(f1); + int32x4_t i2 = vcvtaq_s32_f32(f2); + int16x4_t v1 = vqmovn_s32(i1); + int16x4_t v2 = vqmovn_s32(i2); + vst1_s16(dst, v1); + vst1_s16(dst + 4, v2); dst += 8; + size -= 8; + } + if (size >= 4) { + float32x4_t f1 = vld1q_f32(src); src += 4; + f1 = vmulq_f32(f1, mult_q); + int32x4_t i1 = vcvtaq_s32_f32(f1); + int16x4_t v1 = vqmovn_s32(i1); + vst1_s16(dst, v1); dst += 4; + size -= 4; + } + if (size > 0) { + float32x4_t f1 = vdupq_n_f32(0); + f1 = vsetq_lane_f32(src[0], f1, 0); + if (size > 1) f1 = vsetq_lane_f32(src[1], f1, 1); + if (size > 2) f1 = vsetq_lane_f32(src[2], f1, 2); + f1 = vmulq_f32(f1, mult_q); + int32x4_t i1 = vcvtaq_s32_f32(f1); + int16x4_t v1 = vqmovn_s32(i1); + vst1_lane_s16(dst, v1, 0); 
+ if (size > 1) vst1_lane_s16(dst + 1, v1, 1); + if (size > 2) vst1_lane_s16(dst + 2, v1, 2); + } +} + +static inline void inline_requant_asym_u8_from_s32_mulhi(const int32_t *src, + uint8_t *dst, uint32_t size, uint8_t src_lshift, + int32_t mult_factor_22redun, uint8_t zero_point) { + + if (size == 0) return; + const int32x4_t src_sh4 = vdupq_n_s32(src_lshift); + const int32x4_t mult_v4 = vdupq_n_s32(mult_factor_22redun); + const int16x4_t add_z4 = vdup_n_s16((int16_t)zero_point << 6); + + for (; size > 15; size -= 16) { + int32x4_t l1 = vld1q_s32(src); + int32x4_t l2 = vld1q_s32(src + 4); + int32x4_t l3 = vld1q_s32(src + 8); + int32x4_t l4 = vld1q_s32(src + 12); src += 16; + l1 = vqrshlq_s32(l1, src_sh4); + l2 = vqrshlq_s32(l2, src_sh4); + l3 = vqrshlq_s32(l3, src_sh4); + l4 = vqrshlq_s32(l4, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l2 = vqrdmulhq_s32(l2, mult_v4); + l3 = vqrdmulhq_s32(l3, mult_v4); + l4 = vqrdmulhq_s32(l4, mult_v4); + int16x4_t m1 = vrshrn_n_s32(l1, 16); + int16x4_t m2 = vrshrn_n_s32(l2, 16); + int16x4_t m3 = vrshrn_n_s32(l3, 16); + int16x4_t m4 = vrshrn_n_s32(l4, 16); + m1 = vadd_s16(m1, add_z4); + m2 = vadd_s16(m2, add_z4); + m3 = vadd_s16(m3, add_z4); + m4 = vadd_s16(m4, add_z4); + uint8x8_t u1 = vqrshrun_n_s16(vcombine_s16(m1, m2), 6); + uint8x8_t u2 = vqrshrun_n_s16(vcombine_s16(m3, m4), 6); + vst1_u8(dst, u1); + vst1_u8(dst + 8, u2); dst += 16; + } + if (size > 7) { + int32x4_t l1 = vld1q_s32(src); + int32x4_t l2 = vld1q_s32(src + 4); src += 8; + l1 = vqrshlq_s32(l1, src_sh4); + l2 = vqrshlq_s32(l2, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l2 = vqrdmulhq_s32(l2, mult_v4); + int16x4_t m1 = vrshrn_n_s32(l1, 16); + int16x4_t m2 = vrshrn_n_s32(l2, 16); + m1 = vadd_s16(m1, add_z4); + m2 = vadd_s16(m2, add_z4); + uint8x8_t u1 = vqrshrun_n_s16(vcombine_s16(m1, m2), 6); + vst1_u8(dst, u1); dst += 8; + size -= 8; + } + if (size > 3) { + int32x4_t l1 = vld1q_s32(src); src += 4; + l1 = vqrshlq_s32(l1, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + int16x4_t m1 = vrshrn_n_s32(l1, 16); + m1 = vadd_s16(m1, add_z4); + uint8x8_t u1 = vqrshrun_n_s16(vcombine_s16(m1, m1), 6); + vst1_lane_u8(dst, u1, 0); + vst1_lane_u8(dst + 1, u1, 1); + vst1_lane_u8(dst + 2, u1, 2); + vst1_lane_u8(dst + 3, u1, 3); dst += 4; + size -= 4; + } + if (size > 0) { + int32x4_t l1 = vdupq_n_s32(0); + l1 = vsetq_lane_s32(src[0], l1, 0); + if (size > 1) l1 = vsetq_lane_s32(src[1], l1, 1); + if (size > 2) l1 = vsetq_lane_s32(src[2], l1, 2); + l1 = vqrshlq_s32(l1, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + int16x4_t m1 = vrshrn_n_s32(l1, 16); + m1 = vadd_s16(m1, add_z4); + uint8x8_t u1 = vqrshrun_n_s16(vcombine_s16(m1, m1), 6); + vst1_lane_u8(dst, u1, 0); + if (size > 1) vst1_lane_u8(dst + 1, u1, 1); + if (size > 2) vst1_lane_u8(dst + 2, u1, 2); + } +} + +static inline void inline_requant_sym_s8_from_s32_mulhi(const int32_t *src, + int8_t *dst, uint32_t size, + uint8_t src_lshift, int32_t mult_factor_22redun) { + + if (size == 0) return; + const int32x4_t src_sh4 = vdupq_n_s32(src_lshift); + const int32x4_t mult_v4 = vdupq_n_s32(mult_factor_22redun); + + for (; size > 15; size -= 16) { + int32x4_t l1 = vld1q_s32(src); + int32x4_t l2 = vld1q_s32(src + 4); + int32x4_t l3 = vld1q_s32(src + 8); + int32x4_t l4 = vld1q_s32(src + 12); src += 16; + l1 = vqrshlq_s32(l1, src_sh4); + l2 = vqrshlq_s32(l2, src_sh4); + l3 = vqrshlq_s32(l3, src_sh4); + l4 = vqrshlq_s32(l4, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l2 = vqrdmulhq_s32(l2, mult_v4); + l3 = vqrdmulhq_s32(l3, mult_v4); + l4 = 
vqrdmulhq_s32(l4, mult_v4); + int16x4_t m1 = vrshrn_n_s32(l1, 16); + int16x4_t m2 = vrshrn_n_s32(l2, 16); + int16x4_t m3 = vrshrn_n_s32(l3, 16); + int16x4_t m4 = vrshrn_n_s32(l4, 16); + int8x8_t s1 = vqrshrn_n_s16(vcombine_s16(m1, m2), 7); + int8x8_t s2 = vqrshrn_n_s16(vcombine_s16(m3, m4), 7); + vst1_s8(dst, s1); + vst1_s8(dst + 8, s2); dst += 16; + } + if (size > 7) { + int32x4_t l1 = vld1q_s32(src); + int32x4_t l2 = vld1q_s32(src + 4); src += 8; + l1 = vqrshlq_s32(l1, src_sh4); + l2 = vqrshlq_s32(l2, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l2 = vqrdmulhq_s32(l2, mult_v4); + int16x4_t m1 = vrshrn_n_s32(l1, 16); + int16x4_t m2 = vrshrn_n_s32(l2, 16); + int8x8_t s1 = vqrshrn_n_s16(vcombine_s16(m1, m2), 7); + vst1_s8(dst, s1); dst += 8; + size -= 8; + } + if (size > 3) { + int32x4_t l1 = vld1q_s32(src); src += 4; + l1 = vqrshlq_s32(l1, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + int16x4_t m1 = vrshrn_n_s32(l1, 16); + int8x8_t s1 = vqrshrn_n_s16(vcombine_s16(m1, m1), 7); + vst1_lane_s8(dst, s1, 0); + vst1_lane_s8(dst + 1, s1, 1); + vst1_lane_s8(dst + 2, s1, 2); + vst1_lane_s8(dst + 3, s1, 3); dst += 4; + size -= 4; + } + if (size > 0) { + int32x4_t l1 = vdupq_n_s32(0); + l1 = vsetq_lane_s32(src[0], l1, 0); + if (size > 1) l1 = vsetq_lane_s32(src[1], l1, 1); + if (size > 2) l1 = vsetq_lane_s32(src[2], l1, 2); + l1 = vqrshlq_s32(l1, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + int16x4_t m1 = vrshrn_n_s32(l1, 16); + int8x8_t s1 = vqrshrn_n_s16(vcombine_s16(m1, m1), 7); + vst1_lane_s8(dst, s1, 0); + if (size > 1) vst1_lane_s8(dst + 1, s1, 1); + if (size > 2) vst1_lane_s8(dst + 2, s1, 2); + } +} + +static inline void inline_requant_asym_u16_from_s32_mulhi(const int32_t *src, + uint16_t *dst, uint32_t size, uint8_t src_lshift, + int32_t mult_factor, uint16_t zero_point) { + + if (size == 0) return; + const int32x4_t src_sh4 = vdupq_n_s32(src_lshift); + const int32x4_t mult_v4 = vdupq_n_s32(mult_factor); + const int32x4_t add_z4 = vdupq_n_s32((int32_t)zero_point << 14); + + for (; size > 15; size -= 16) { + int32x4_t l1 = vld1q_s32(src); + int32x4_t l2 = vld1q_s32(src + 4); + int32x4_t l3 = vld1q_s32(src + 8); + int32x4_t l4 = vld1q_s32(src + 12); src += 16; + l1 = vqrshlq_s32(l1, src_sh4); + l2 = vqrshlq_s32(l2, src_sh4); + l3 = vqrshlq_s32(l3, src_sh4); + l4 = vqrshlq_s32(l4, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l2 = vqrdmulhq_s32(l2, mult_v4); + l3 = vqrdmulhq_s32(l3, mult_v4); + l4 = vqrdmulhq_s32(l4, mult_v4); + l1 = vqaddq_s32(l1, add_z4); + l2 = vqaddq_s32(l2, add_z4); + l3 = vqaddq_s32(l3, add_z4); + l4 = vqaddq_s32(l4, add_z4); + uint16x4_t m1 = vqrshrun_n_s32(l1, 14); + uint16x4_t m2 = vqrshrun_n_s32(l2, 14); + uint16x4_t m3 = vqrshrun_n_s32(l3, 14); + uint16x4_t m4 = vqrshrun_n_s32(l4, 14); + vst1_u16(dst, m1); + vst1_u16(dst + 4, m2); + vst1_u16(dst + 8, m3); + vst1_u16(dst + 12, m4); dst += 16; + } + for (; size > 7; size -= 8) { + int32x4_t l1 = vld1q_s32(src); + int32x4_t l2 = vld1q_s32(src + 4); src += 8; + l1 = vqrshlq_s32(l1, src_sh4); + l2 = vqrshlq_s32(l2, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l2 = vqrdmulhq_s32(l2, mult_v4); + l1 = vqaddq_s32(l1, add_z4); + l2 = vqaddq_s32(l2, add_z4); + uint16x4_t m1 = vqrshrun_n_s32(l1, 14); + uint16x4_t m2 = vqrshrun_n_s32(l2, 14); + vst1_u16(dst, m1); + vst1_u16(dst + 4, m2); dst += 8; + } + for (; size > 3; size -= 4) { + int32x4_t l1 = vld1q_s32(src); src += 4; + l1 = vqrshlq_s32(l1, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l1 = vqaddq_s32(l1, add_z4); + uint16x4_t m1 = vqrshrun_n_s32(l1, 14); + 
vst1_u16(dst, m1); dst += 4; + } + if (size > 0) { + int32x4_t l1 = vdupq_n_s32(0); + l1 = vsetq_lane_s32(src[0], l1, 0); + if (size > 1) l1 = vsetq_lane_s32(src[1], l1, 1); + if (size > 2) l1 = vsetq_lane_s32(src[2], l1, 2); + l1 = vqrshlq_s32(l1, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l1 = vqaddq_s32(l1, add_z4); + uint16x4_t m1 = vqrshrun_n_s32(l1, 14); + vst1_lane_u16(dst, m1, 0); + if (size > 1) vst1_lane_u16(dst + 1, m1, 1); + if (size > 2) vst1_lane_u16(dst + 2, m1, 2); + } +} + +static inline void inline_requant_sym_s16_from_s32_mulhi(const int32_t *src, + int16_t *dst, uint32_t size, + uint8_t src_lshift, int32_t mult_factor) { + + if (size == 0) return; + const int32x4_t src_sh4 = vdupq_n_s32(src_lshift); + const int32x4_t mult_v4 = vdupq_n_s32(mult_factor); + + for (; size > 15; size -= 16) { + int32x4_t l1 = vld1q_s32(src); + int32x4_t l2 = vld1q_s32(src + 4); + int32x4_t l3 = vld1q_s32(src + 8); + int32x4_t l4 = vld1q_s32(src + 12); src += 16; + l1 = vqrshlq_s32(l1, src_sh4); + l2 = vqrshlq_s32(l2, src_sh4); + l3 = vqrshlq_s32(l3, src_sh4); + l4 = vqrshlq_s32(l4, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l2 = vqrdmulhq_s32(l2, mult_v4); + l3 = vqrdmulhq_s32(l3, mult_v4); + l4 = vqrdmulhq_s32(l4, mult_v4); + int16x4_t m1 = vqrshrn_n_s32(l1, 15); + int16x4_t m2 = vqrshrn_n_s32(l2, 15); + int16x4_t m3 = vqrshrn_n_s32(l3, 15); + int16x4_t m4 = vqrshrn_n_s32(l4, 15); + vst1_s16(dst, m1); + vst1_s16(dst + 4, m2); + vst1_s16(dst + 8, m3); + vst1_s16(dst + 12, m4); dst += 16; + } + if (size > 7) { + int32x4_t l1 = vld1q_s32(src); + int32x4_t l2 = vld1q_s32(src + 4); src += 8; + l1 = vqrshlq_s32(l1, src_sh4); + l2 = vqrshlq_s32(l2, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + l2 = vqrdmulhq_s32(l2, mult_v4); + int16x4_t m1 = vqrshrn_n_s32(l1, 15); + int16x4_t m2 = vqrshrn_n_s32(l2, 15); + vst1_s16(dst, m1); + vst1_s16(dst + 4, m2); dst += 8; + size -= 8; + } + if (size > 3) { + int32x4_t l1 = vld1q_s32(src); src += 4; + l1 = vqrshlq_s32(l1, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + int16x4_t m1 = vqrshrn_n_s32(l1, 15); + vst1_s16(dst, m1); dst += 4; + size -= 4; + } + if (size > 0) { + int32x4_t l1 = vdupq_n_s32(0); + l1 = vsetq_lane_s32(src[0], l1, 0); + if (size > 1) l1 = vsetq_lane_s32(src[1], l1, 1); + if (size > 2) l1 = vsetq_lane_s32(src[2], l1, 2); + l1 = vqrshlq_s32(l1, src_sh4); + l1 = vqrdmulhq_s32(l1, mult_v4); + int16x4_t m1 = vqrshrn_n_s32(l1, 15); + vst1_lane_s16(dst, m1, 0); + if (size > 1) vst1_lane_s16(dst + 1, m1, 1); + if (size > 2) vst1_lane_s16(dst + 2, m1, 2); + } +} + +static inline void inline_requant_asym_u8_from_s16_mulhi(const int16_t *src, + uint8_t *dst, uint32_t size, uint8_t src_lshift, + int16_t mult_factor, uint8_t zero_point) { + + if (size == 0) return; + const int16x8_t src_sh8 = vdupq_n_s16(src_lshift); + const int16x8_t mult_v8 = vdupq_n_s16(mult_factor); + const int16x8_t add_z8 = vdupq_n_s16((int16_t)zero_point << 6); + + for (; size > 31; size -= 32) { + int16x8_t l1 = vld1q_s16(src); + int16x8_t l2 = vld1q_s16(src + 8); + int16x8_t l3 = vld1q_s16(src + 16); + int16x8_t l4 = vld1q_s16(src + 24); src += 32; + l1 = vqrshlq_s16(l1, src_sh8); + l2 = vqrshlq_s16(l2, src_sh8); + l3 = vqrshlq_s16(l3, src_sh8); + l4 = vqrshlq_s16(l4, src_sh8); + l1 = vqrdmulhq_s16(l1, mult_v8); + l2 = vqrdmulhq_s16(l2, mult_v8); + l3 = vqrdmulhq_s16(l3, mult_v8); + l4 = vqrdmulhq_s16(l4, mult_v8); + l1 = vqaddq_s16(l1, add_z8); + l2 = vqaddq_s16(l2, add_z8); + l3 = vqaddq_s16(l3, add_z8); + l4 = vqaddq_s16(l4, add_z8); + uint8x8_t m1 = 
vqrshrun_n_s16(l1, 6); + uint8x8_t m2 = vqrshrun_n_s16(l2, 6); + uint8x8_t m3 = vqrshrun_n_s16(l3, 6); + uint8x8_t m4 = vqrshrun_n_s16(l4, 6); + vst1_u8(dst, m1); + vst1_u8(dst + 8, m2); + vst1_u8(dst + 16, m3); + vst1_u8(dst + 24, m4); dst += 32; + } + if (size > 15) { + int16x8_t l1 = vld1q_s16(src); + int16x8_t l2 = vld1q_s16(src + 8); src += 16; + l1 = vqrshlq_s16(l1, src_sh8); + l2 = vqrshlq_s16(l2, src_sh8); + l1 = vqrdmulhq_s16(l1, mult_v8); + l2 = vqrdmulhq_s16(l2, mult_v8); + l1 = vqaddq_s16(l1, add_z8); + l2 = vqaddq_s16(l2, add_z8); + uint8x8_t m1 = vqrshrun_n_s16(l1, 6); + uint8x8_t m2 = vqrshrun_n_s16(l2, 6); + vst1_u8(dst, m1); + vst1_u8(dst + 8, m2); dst += 16; + size -= 16; + } + if (size > 7) { + int16x8_t l1 = vld1q_s16(src); src += 8; + l1 = vqrshlq_s16(l1, src_sh8); + l1 = vqrdmulhq_s16(l1, mult_v8); + l1 = vqaddq_s16(l1, add_z8); + uint8x8_t m1 = vqrshrun_n_s16(l1, 6); + vst1_u8(dst, m1); dst += 8; + size -= 8; + } + if (size > 3) { + int16x4_t l1 = vld1_s16(src); src += 4; + l1 = vqrshl_s16(l1, vget_low_s16(src_sh8)); + l1 = vqrdmulh_s16(l1, vget_low_s16(mult_v8)); + l1 = vqadd_s16(l1, vget_low_s16(add_z8)); + uint8x8_t m1 = vqrshrun_n_s16(vcombine_s16(l1, vdup_n_s16(0)), 6); + vst1_lane_u8(dst, m1, 0); + vst1_lane_u8(dst + 1, m1, 1); + vst1_lane_u8(dst + 2, m1, 2); + vst1_lane_u8(dst + 3, m1, 3); dst += 4; + size -= 4; + } + if (size > 0) { + int16x4_t l1 = vdup_n_s16(0); + l1 = vset_lane_s16(src[0], l1, 0); + if (size > 1) l1 = vset_lane_s16(src[1], l1, 1); + if (size > 2) l1 = vset_lane_s16(src[2], l1, 2); + l1 = vqrshl_s16(l1, vget_low_s16(src_sh8)); + l1 = vqrdmulh_s16(l1, vget_low_s16(mult_v8)); + l1 = vqadd_s16(l1, vget_low_s16(add_z8)); + uint8x8_t m1 = vqrshrun_n_s16(vcombine_s16(l1, vdup_n_s16(0)), 6); + vst1_lane_u8(dst, m1, 0); + if (size > 1) vst1_lane_u8(dst + 1, m1, 1); + if (size > 2) vst1_lane_u8(dst + 2, m1, 2); + } +} + +static inline void inline_requant_sym_s8_from_s16_mulhi(const int16_t *src, + int8_t *dst, uint32_t size, + uint8_t src_lshift, int16_t mult_factor) { + + if (size == 0) return; + const int16x8_t src_sh8 = vdupq_n_s16(src_lshift); + const int16x8_t mult_v8 = vdupq_n_s16(mult_factor); + + for (; size > 31; size -= 32) { + int16x8_t l1 = vld1q_s16(src); + int16x8_t l2 = vld1q_s16(src + 8); + int16x8_t l3 = vld1q_s16(src + 16); + int16x8_t l4 = vld1q_s16(src + 24); src += 32; + l1 = vqrshlq_s16(l1, src_sh8); + l2 = vqrshlq_s16(l2, src_sh8); + l3 = vqrshlq_s16(l3, src_sh8); + l4 = vqrshlq_s16(l4, src_sh8); + l1 = vqrdmulhq_s16(l1, mult_v8); + l2 = vqrdmulhq_s16(l2, mult_v8); + l3 = vqrdmulhq_s16(l3, mult_v8); + l4 = vqrdmulhq_s16(l4, mult_v8); + int8x8_t m1 = vqrshrn_n_s16(l1, 7); + int8x8_t m2 = vqrshrn_n_s16(l2, 7); + int8x8_t m3 = vqrshrn_n_s16(l3, 7); + int8x8_t m4 = vqrshrn_n_s16(l4, 7); + vst1_s8(dst, m1); + vst1_s8(dst + 8, m2); + vst1_s8(dst + 16, m3); + vst1_s8(dst + 24, m4); dst += 32; + } + if (size > 15) { + int16x8_t l1 = vld1q_s16(src); + int16x8_t l2 = vld1q_s16(src + 8); src += 16; + l1 = vqrshlq_s16(l1, src_sh8); + l2 = vqrshlq_s16(l2, src_sh8); + l1 = vqrdmulhq_s16(l1, mult_v8); + l2 = vqrdmulhq_s16(l2, mult_v8); + int8x8_t m1 = vqrshrn_n_s16(l1, 7); + int8x8_t m2 = vqrshrn_n_s16(l2, 7); + vst1_s8(dst, m1); + vst1_s8(dst + 8, m2); dst += 16; + size -= 16; + } + if (size > 7) { + int16x8_t l1 = vld1q_s16(src); src += 8; + l1 = vqrshlq_s16(l1, src_sh8); + l1 = vqrdmulhq_s16(l1, mult_v8); + int8x8_t m1 = vqrshrn_n_s16(l1, 7); + vst1_s8(dst, m1); dst += 8; + size -= 8; + } + if (size > 3) { + int16x4_t l1 = vld1_s16(src); 
src += 4; + l1 = vqrshl_s16(l1, vget_low_s16(src_sh8)); + l1 = vqrdmulh_s16(l1, vget_low_s16(mult_v8)); + int8x8_t m1 = vqrshrn_n_s16(vcombine_s16(l1, vdup_n_s16(0)), 7); + vst1_lane_s8(dst, m1, 0); + vst1_lane_s8(dst + 1, m1, 1); + vst1_lane_s8(dst + 2, m1, 2); + vst1_lane_s8(dst + 3, m1, 3); dst += 4; + size -= 4; + } + if (size > 0) { + int16x4_t l1 = vdup_n_s16(0); + l1 = vset_lane_s16(src[0], l1, 0); + if (size > 1) l1 = vset_lane_s16(src[1], l1, 1); + if (size > 2) l1 = vset_lane_s16(src[2], l1, 2); + l1 = vqrshl_s16(l1, vget_low_s16(src_sh8)); + l1 = vqrdmulh_s16(l1, vget_low_s16(mult_v8)); + int8x8_t m1 = vqrshrn_n_s16(vcombine_s16(l1, vdup_n_s16(0)), 7); + vst1_lane_s8(dst, m1, 0); + if (size > 1) vst1_lane_s8(dst + 1, m1, 1); + if (size > 2) vst1_lane_s8(dst + 2, m1, 2); + } +} + +#endif diff --git a/include/arm_neon/NeonSgemmCopy.h b/include/arm_neon/NeonSgemmCopy.h new file mode 100644 index 0000000..736ba52 --- /dev/null +++ b/include/arm_neon/NeonSgemmCopy.h @@ -0,0 +1,217 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/****************************************************************************** + * File: NeonSgemmCopy.h + * Description: Code templates for NEON SGEMM packing functions. 
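+ *
+ * Note on the templates below (an explanatory sketch; NCOPY_STD(x), the
+ * scalar remainder handler defined elsewhere in this library, is assumed
+ * to follow the same layout): the NCOPY_UNROLL_x macros interleave x
+ * source streams so that dst1[k * x + i] = src_i[k] for every depth
+ * index k, roughly
+ *
+ *     for (k = 0; k < K; ++k)
+ *         for (i = 0; i < x; ++i) *dst++ = src_i[k];
+ *
+ * which lets the kernels read one packed x-float group per update. The
+ * TCOPY_UNIT_x macros copy one contiguous x-float fragment from src_ptr
+ * to dst_ptr + dst_offset.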
+ *****************************************************************************/ + +#include <arm_neon.h> + +#ifndef INCLUDE_NEON_SGEMM_COPY +#define INCLUDE_NEON_SGEMM_COPY + +#if __aarch64__ +static inline void pref_ab(const float *dat) { + __asm__ ("prfm pldl1keep,[%0,#64]\n\t"::"r"(dat):); +} +#else +static inline void pref_ab(const float *dat) { + __asm__ ("pld [%0,#64]\n\t"::"r"(dat):); +} +#endif + +#define NCOPY_NEON_LOOP_K8_UNROLL4(inc, dst_ptr, src1, src2, src3, src4) \ + for (dim1_count = dim1_cache; dim1_count > 7; dim1_count -= 8) {\ + t1.val[0] = vld1q_f32(src1); t2.val[0] = vld1q_f32(src1 + 4);\ + src1 += 8; pref_ab(src1);\ + t1.val[1] = vld1q_f32(src2); t2.val[1] = vld1q_f32(src2 + 4);\ + src2 += 8; pref_ab(src2);\ + t1.val[2] = vld1q_f32(src3); t2.val[2] = vld1q_f32(src3 + 4);\ + src3 += 8; pref_ab(src3);\ + t1.val[3] = vld1q_f32(src4); t2.val[3] = vld1q_f32(src4 + 4);\ + src4 += 8; pref_ab(src4);\ + vst4q_lane_f32(dst_ptr, t1, 0);\ + vst4q_lane_f32(dst_ptr + inc, t1, 1);\ + vst4q_lane_f32(dst_ptr + inc * 2, t1, 2);\ + vst4q_lane_f32(dst_ptr + inc * 3, t1, 3);\ + vst4q_lane_f32(dst_ptr + inc * 4, t2, 0);\ + vst4q_lane_f32(dst_ptr + inc * 5, t2, 1);\ + vst4q_lane_f32(dst_ptr + inc * 6, t2, 2);\ + vst4q_lane_f32(dst_ptr + inc * 7, t2, 3);\ + dst_ptr += inc * 8;\ + }\ + + +#define NCOPY_UNROLL_24 {\ + float32x4x4_t t1, t2;\ + float *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\ + NCOPY_NEON_LOOP_K8_UNROLL4(24, dst_h1, src1, src2, src3, src4)\ + dst_h1 = dst1 + 4;\ + NCOPY_NEON_LOOP_K8_UNROLL4(24, dst_h1, src5, src6, src7, src8)\ + dst_h1 = dst1 + 8;\ + NCOPY_NEON_LOOP_K8_UNROLL4(24, dst_h1, src9, src10, src11, src12)\ + dst_h1 = dst1 + 12;\ + NCOPY_NEON_LOOP_K8_UNROLL4(24, dst_h1, src13, src14, src15, src16)\ + dst_h1 = dst1 + 16;\ + NCOPY_NEON_LOOP_K8_UNROLL4(24, dst_h1, src17, src18, src19, src20)\ + dst_h1 = dst1 + 20;\ + NCOPY_NEON_LOOP_K8_UNROLL4(24, dst_h1, src21, src22, src23, src24)\ + dst1 = dst_h1 - 20;\ + NCOPY_STD(24)\ +} + +#define NCOPY_UNROLL_12 {\ + float32x4x4_t t1, t2;\ + float *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\ + NCOPY_NEON_LOOP_K8_UNROLL4(12, dst_h1, src1, src2, src3, src4)\ + dst_h1 = dst1 + 4;\ + NCOPY_NEON_LOOP_K8_UNROLL4(12, dst_h1, src5, src6, src7, src8)\ + dst_h1 = dst1 + 8;\ + NCOPY_NEON_LOOP_K8_UNROLL4(12, dst_h1, src9, src10, src11, src12)\ + dst1 = dst_h1 - 8;\ + NCOPY_STD(12)\ +} + +#define NCOPY_UNROLL_8 {\ + float32x4x4_t t1, t2;\ + float *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\ + NCOPY_NEON_LOOP_K8_UNROLL4(8, dst_h1, src1, src2, src3, src4)\ + dst_h1 = dst1 + 4;\ + NCOPY_NEON_LOOP_K8_UNROLL4(8, dst_h1, src5, src6, src7, src8)\ + dst1 = dst_h1 - 4;\ + NCOPY_STD(8)\ +} + +#define NCOPY_UNROLL_6 {\ + float32x4x3_t t1, t2;\ + float *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\ + for (; dim1_count > 7; dim1_count -= 8) {\ + t1.val[0] = vld1q_f32(src1); t2.val[0] = vld1q_f32(src1 + 4);\ + src1 += 8; pref_ab(src1);\ + t1.val[1] = vld1q_f32(src2); t2.val[1] = vld1q_f32(src2 + 4);\ + src2 += 8; pref_ab(src2);\ + t1.val[2] = vld1q_f32(src3); t2.val[2] = vld1q_f32(src3 + 4);\ + src3 += 8; pref_ab(src3);\ + vst3q_lane_f32(dst_h1, t1, 0);\ + vst3q_lane_f32(dst_h1 + 6, t1, 1);\ + vst3q_lane_f32(dst_h1 + 12, t1, 2);\ + vst3q_lane_f32(dst_h1 + 18, t1, 3);\ + vst3q_lane_f32(dst_h1 + 24, t2, 0);\ + vst3q_lane_f32(dst_h1 + 30, t2, 1);\ + vst3q_lane_f32(dst_h1 + 36, t2, 2);\ + vst3q_lane_f32(dst_h1 + 42, t2, 3);\ + dst_h1 += 48;\ + }\ + float *dst_h2 = dst1 + 3;\ + for (dim1_count = dim1_cache; dim1_count > 7; dim1_count -= 8) {\ +
t1.val[0] = vld1q_f32(src4); t2.val[0] = vld1q_f32(src4 + 4);\ + src4 += 8; pref_ab(src4);\ + t1.val[1] = vld1q_f32(src5); t2.val[1] = vld1q_f32(src5 + 4);\ + src5 += 8; pref_ab(src5);\ + t1.val[2] = vld1q_f32(src6); t2.val[2] = vld1q_f32(src6 + 4);\ + src6 += 8; pref_ab(src6);\ + vst3q_lane_f32(dst_h2, t1, 0);\ + vst3q_lane_f32(dst_h2 + 6, t1, 1);\ + vst3q_lane_f32(dst_h2 + 12, t1, 2);\ + vst3q_lane_f32(dst_h2 + 18, t1, 3);\ + vst3q_lane_f32(dst_h2 + 24, t2, 0);\ + vst3q_lane_f32(dst_h2 + 30, t2, 1);\ + vst3q_lane_f32(dst_h2 + 36, t2, 2);\ + vst3q_lane_f32(dst_h2 + 42, t2, 3);\ + dst_h2 += 48;\ + }\ + dst1 = dst_h1;\ + NCOPY_STD(6)\ +} + +#define NCOPY_UNROLL_4 {\ + float32x4x4_t t1;\ + for (; dim1_count > 3; dim1_count -= 4) {\ + t1.val[0] = vld1q_f32(src1); src1 += 4; pref_ab(src1);\ + t1.val[1] = vld1q_f32(src2); src2 += 4; pref_ab(src2);\ + t1.val[2] = vld1q_f32(src3); src3 += 4; pref_ab(src3);\ + t1.val[3] = vld1q_f32(src4); src4 += 4; pref_ab(src4);\ + vst4q_f32(dst1,t1); dst1 += 16;\ + }\ + NCOPY_STD(4)\ +} + +#define NCOPY_UNROLL_2 NCOPY_STD(2) +#define NCOPY_UNROLL_1 NCOPY_STD(1) + +//#define NCOPY_a(unroll) NCOPY_UNROLL_##unroll +//#define NCOPY_b(unroll) NCOPY_UNROLL_##unroll + +#define TCOPY_UNIT_1(src_ptr, dst_ptr, dst_offset) \ + dst_ptr[dst_offset] = *src_ptr; + +#define TCOPY_UNIT_2(src_ptr, dst_ptr, dst_offset) {\ + float32x2_t tmp = vld1_f32(src_ptr);\ + vst1_f32(dst_ptr + dst_offset, tmp);\ +} + +#define TCOPY_UNIT_4(src_ptr, dst_ptr, dst_offset) {\ + float32x4_t tmp = vld1q_f32(src_ptr); pref_ab(src_ptr + 4);\ + vst1q_f32(dst_ptr + dst_offset, tmp);\ +} + +#define TCOPY_UNIT_6(src_ptr, dst_ptr, dst_offset) {\ + float32x4_t tmpq = vld1q_f32(src_ptr);\ + float32x2_t tmpd = vld1_f32(src_ptr + 4); pref_ab(src_ptr + 6);\ + vst1q_f32(dst_ptr + dst_offset, tmpq);\ + vst1_f32(dst_ptr + dst_offset + 4, tmpd);\ +} + +#define TCOPY_UNIT_8(src_ptr, dst_ptr, dst_offset) {\ + float32x4_t tmp1 = vld1q_f32(src_ptr);\ + float32x4_t tmp2 = vld1q_f32(src_ptr + 4); pref_ab(src_ptr + 8);\ + vst1q_f32(dst_ptr + dst_offset, tmp1);\ + vst1q_f32(dst_ptr + dst_offset + 4, tmp2);\ +} + +#define TCOPY_UNIT_12(src_ptr, dst_ptr, dst_offset) {\ + float32x4_t tmp1 = vld1q_f32(src_ptr);\ + float32x4_t tmp2 = vld1q_f32(src_ptr + 4);\ + float32x4_t tmp3 = vld1q_f32(src_ptr + 8); pref_ab(src_ptr + 12);\ + vst1q_f32(dst_ptr + dst_offset, tmp1);\ + vst1q_f32(dst_ptr + dst_offset + 4, tmp2);\ + vst1q_f32(dst_ptr + dst_offset + 8, tmp3);\ +} + +#define TCOPY_UNIT_24(src_ptr, dst_ptr, dst_offset) {\ + float32x4_t tmp1 = vld1q_f32(src_ptr);\ + float32x4_t tmp2 = vld1q_f32(src_ptr + 4);\ + float32x4_t tmp3 = vld1q_f32(src_ptr + 8);\ + float32x4_t tmp4 = vld1q_f32(src_ptr + 12);\ + float32x4_t tmp5 = vld1q_f32(src_ptr + 16); pref_ab(src_ptr + 24);\ + float32x4_t tmp6 = vld1q_f32(src_ptr + 20); pref_ab(src_ptr + 40);\ + vst1q_f32(dst_ptr + dst_offset, tmp1);\ + vst1q_f32(dst_ptr + dst_offset + 4, tmp2);\ + vst1q_f32(dst_ptr + dst_offset + 8, tmp3);\ + vst1q_f32(dst_ptr + dst_offset + 12, tmp4);\ + vst1q_f32(dst_ptr + dst_offset + 16, tmp5);\ + vst1q_f32(dst_ptr + dst_offset + 20, tmp6);\ +} + +//#define TCOPY_UNIT_a(src_ptr, dst_ptr, dst_offset, num_elements) \ + TCOPY_UNIT_##num_elements(src_ptr, dst_ptr, dst_offset) + +//#define TCOPY_UNIT_b(src_ptr, dst_ptr, dst_offset, num_elements) \ + TCOPY_UNIT_##num_elements(src_ptr, dst_ptr, dst_offset) + +#endif diff --git a/include/arm_neon/NeonSgemmKernel.h b/include/arm_neon/NeonSgemmKernel.h new file mode 100644 index 0000000..3f00c3c --- /dev/null +++ 
b/include/arm_neon/NeonSgemmKernel.h @@ -0,0 +1,973 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/****************************************************************************** + * File: NeonSgemmKernel.h + * Description: Common building blocks for NEON SGEMM kernel functions. + *****************************************************************************/ + +#include <arm_neon.h> +#include <stdint.h> + +#ifndef INCLUDE_NEON_SGEMM_KERNEL +#define INCLUDE_NEON_SGEMM_KERNEL + +#if __aarch64__ + +static inline void pref_c(float *dat) { + __asm__ ("prfm pstl1keep,[%0]\n\t"::"r"(dat):); +} + +#else + +static inline void pref_c(float *dat) { + __asm__ ("pld [%0]\n\t"::"r"(dat):); +} + +#define vfmaq_lane_f32(c1,a1,b1,id) vmlaq_lane_f32(c1,a1,b1,id) +#define vfma_lane_f32(c1,a1,b1,id) vmla_lane_f32(c1,a1,b1,id) +#define vmlaq_laneq0_f32(c1,a1,b1) vmlaq_lane_f32(c1,a1,vget_low_f32(b1),0) +#define vmlaq_laneq1_f32(c1,a1,b1) vmlaq_lane_f32(c1,a1,vget_low_f32(b1),1) +#define vmlaq_laneq2_f32(c1,a1,b1) vmlaq_lane_f32(c1,a1,vget_high_f32(b1),0) +#define vmlaq_laneq3_f32(c1,a1,b1) vmlaq_lane_f32(c1,a1,vget_high_f32(b1),1) +#define vfmaq_laneq_f32(c1,a1,b1,laneid) vmlaq_laneq##laneid##_f32(c1,a1,b1) +#define vmla_laneq0_f32(c1,a1,b1) vmla_lane_f32(c1,a1,vget_low_f32(b1),0) +#define vmla_laneq1_f32(c1,a1,b1) vmla_lane_f32(c1,a1,vget_low_f32(b1),1) +#define vmla_laneq2_f32(c1,a1,b1) vmla_lane_f32(c1,a1,vget_high_f32(b1),0) +#define vmla_laneq3_f32(c1,a1,b1) vmla_lane_f32(c1,a1,vget_high_f32(b1),1) +#define vfma_laneq_f32(c1,a1,b1,laneid) vmla_laneq##laneid##_f32(c1,a1,b1) +#define vfma_n_f32(c1,a1,b1) vmla_n_f32(c1,a1,b1) +#define vfmaq_n_f32(c1,a1,b1) vmlaq_n_f32(c1,a1,b1) +#define vfma_f32(c1,a1,b1) vmla_f32(c1,a1,b1) +#define vfmaq_f32(c1,a1,b1) vmlaq_f32(c1,a1,b1) + +#endif + +#define NEON_SGEMM_KERNEL_M1N1 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + float32x2_t ad01, bd01;\ + float32x2_t cd01 = vdup_n_f32(0.0f);\ + uint32_t k_left = K;\ + if (k_left > 1) {\ + ad01 = vld1_f32(a_ptr); a_ptr += 2;\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + for (; k_left > 3; k_left-=2) {\ + cd01 = vfma_f32(cd01, ad01, bd01);\ + ad01 = vld1_f32(a_ptr); a_ptr += 2;\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + if (k_left > 1) {\ + cd01 = vfma_f32(cd01, ad01, bd01); k_left -= 2;\ + }\ + float cs01 = vget_lane_f32(cd01, 0) + vget_lane_f32(cd01, 1);\ + if (k_left > 0) {\ + cs01 += (*a_ptr) * (*b_ptr1); a_ptr++;\ + } + +#define NEON_SGEMM_SAVE_M1N1 \ + cs01 += beta * (*c_ptr);\ + *c_ptr = cs01; + +#define NEON_SGEMM_KERNEL_M2N1_UNIT(a_ptr1, b_ptr1) \ + float32x2_t ad01, ad02, bd01, cd01, cd02;\ + cd01 = cd02 = vdup_n_f32(0.0f);\ + uint32_t k_left = K;\ + if (k_left > 1) {\ + ad01 = vld1_f32(a_ptr1); ad02 = vld1_f32(a_ptr1 + 2); a_ptr1 += 4;\ + bd01 = vld1_f32(b_ptr1); b_ptr1
+= 2;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cd01 = vfma_lane_f32(cd01, ad01, bd01, 0); ad01 = vld1_f32(a_ptr1);\ + cd02 = vfma_lane_f32(cd02, ad02, bd01, 1); ad02 = vld1_f32(a_ptr1 + 2);\ + a_ptr1 += 4; bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + if(k_left > 1) {\ + cd01 = vfma_lane_f32(cd01, ad01, bd01, 0);\ + cd02 = vfma_lane_f32(cd02, ad02, bd01, 1); k_left -= 2;\ + }\ + cd01 = vadd_f32(cd01, cd02);\ + if(k_left > 0) {\ + ad01 = vld1_f32(a_ptr1); a_ptr1 += 2;\ + cd01 = vfma_n_f32(cd01, ad01, *b_ptr1); b_ptr1++;\ + } + +#define NEON_SGEMM_KERNEL_M2N1 \ + const float *b_ptr1 = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M2N1_UNIT(a_ptr, b_ptr1) + +#define NEON_SGEMM_KERNEL_M1N2 \ + const float *b_ptr1 = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M2N1_UNIT(b_ptr1, a_ptr) + +#define NEON_SGEMM_SAVE_M2N1 \ + float32x2_t ct1 = vld1_f32(c_ptr);\ + cd01 = vfma_n_f32(cd01, ct1, beta);\ + vst1_f32(c_ptr, cd01); + +#define NEON_SGEMM_SAVE_M1N2_UNIT(cd01) \ + c_tmp[0] = c_tmp[0] * beta + vget_lane_f32(cd01, 0);\ + c_tmp[ldc] = c_tmp[ldc] * beta + vget_lane_f32(cd01, 1);\ + c_tmp += ldc * 2; + +#define NEON_SGEMM_SAVE_M1N2 float *c_tmp = c_ptr; NEON_SGEMM_SAVE_M1N2_UNIT(cd01) + +#define NEON_SGEMM_KERNEL_M2N2 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + float32x2_t ad01, ad02, bd01, bd02;\ + float32x2_t cd01, cd02, cd03, cd04;\ + cd01 = cd02 = cd03 = cd04 = vdup_n_f32(0.0f);\ + uint32_t k_left = K;\ + if (k_left > 1) {\ + ad01 = vld1_f32(a_ptr); ad02 = vld1_f32(a_ptr + 2); a_ptr += 4;\ + bd01 = vld1_f32(b_ptr1); bd02 = vld1_f32(b_ptr1 + 2); b_ptr1 += 4;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cd01 = vfma_lane_f32(cd01, ad01, bd01, 0);\ + cd02 = vfma_lane_f32(cd02, ad01, bd01, 1);\ + ad01 = vld1_f32(a_ptr); bd01 = vld1_f32(b_ptr1);\ + cd03 = vfma_lane_f32(cd03, ad02, bd02, 0);\ + cd04 = vfma_lane_f32(cd04, ad02, bd02, 1);\ + ad02 = vld1_f32(a_ptr + 2); a_ptr += 4;\ + bd02 = vld1_f32(b_ptr1 + 2); b_ptr1 += 4;\ + }\ + if (k_left > 1) {\ + cd01 = vfma_lane_f32(cd01, ad01, bd01, 0);\ + cd02 = vfma_lane_f32(cd02, ad01, bd01, 1);\ + cd03 = vfma_lane_f32(cd03, ad02, bd02, 0);\ + cd04 = vfma_lane_f32(cd04, ad02, bd02, 1); k_left -= 2;\ + }\ + cd01 = vadd_f32(cd01, cd03);\ + cd02 = vadd_f32(cd02, cd04);\ + if (k_left > 0) {\ + ad01 = vld1_f32(a_ptr); a_ptr += 2;\ + bd01 = vld1_f32(b_ptr1);\ + cd01 = vfma_lane_f32(cd01, ad01, bd01, 0);\ + cd02 = vfma_lane_f32(cd02, ad01, bd01, 1);\ + } + +#define NEON_SGEMM_SAVE_M2N2_UNIT(cd01, cd02) \ + ct1 = vld1_f32(c_tmp);\ + ct2 = vld1_f32(c_tmp + ldc);\ + cd01 = vfma_n_f32(cd01, ct1, beta);\ + cd02 = vfma_n_f32(cd02, ct2, beta);\ + vst1_f32(c_tmp, cd01);\ + vst1_f32(c_tmp + ldc, cd02); c_tmp += ldc * 2; + +#define NEON_SGEMM_SAVE_M2N2 \ + float *c_tmp = c_ptr;\ + float32x2_t ct1, ct2; NEON_SGEMM_SAVE_M2N2_UNIT(cd01, cd02) + +#define NEON_SGEMM_KERNEL_M4N1_UNIT(a_ptr1, b_ptr1) \ + uint32_t k_left = K;\ + float32x4_t aq01, aq02, cq01, cq02;\ + float32x2_t bd01;\ + cq01 = cq02 = vdupq_n_f32(0.0f);\ + if (k_left > 1) {\ + aq01 = vld1q_f32(a_ptr1); aq02 = vld1q_f32(a_ptr1 + 4); a_ptr1 += 8;\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0); aq01 = vld1q_f32(a_ptr1);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 1); aq02 = vld1q_f32(a_ptr1 + 4);\ + a_ptr1 += 8; bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + if (k_left > 1) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 1);\ + k_left -= 
2;\ + }\ + cq01 = vaddq_f32(cq01, cq02);\ + if (k_left > 0) {\ + aq01 = vld1q_f32(a_ptr1); a_ptr1 += 4;\ + cq01 = vfmaq_n_f32(cq01, aq01, *b_ptr1); b_ptr1++;\ + } + +#define NEON_SGEMM_KERNEL_M4N1 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M4N1_UNIT(a_ptr, b_ptr1) + +#define NEON_SGEMM_KERNEL_M1N4 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M4N1_UNIT(b_ptr1, a_ptr) + +#define NEON_SGEMM_SAVE_M4N1 \ + float32x4_t ct1 = vld1q_f32(c_ptr);\ + cq01 = vfmaq_n_f32(cq01, ct1, beta);\ + vst1q_f32(c_ptr, cq01); + +#define NEON_SGEMM_SAVE_M1N4_UNIT(cq01) \ + c_tmp[0] = c_tmp[0] * beta + vgetq_lane_f32(cq01, 0);\ + c_tmp[ldc] = c_tmp[ldc] * beta + vgetq_lane_f32(cq01, 1);\ + c_tmp += ldc * 2;\ + c_tmp[0] = c_tmp[0] * beta + vgetq_lane_f32(cq01, 2);\ + c_tmp[ldc] = c_tmp[ldc] * beta + vgetq_lane_f32(cq01, 3);\ + c_tmp += ldc * 2; + +#define NEON_SGEMM_SAVE_M1N4 \ + float *c_tmp = c_ptr; NEON_SGEMM_SAVE_M1N4_UNIT(cq01) + +#define NEON_SGEMM_KERNEL_M4N2_UNIT(a_ptr1, b_ptr1) \ + float32x4_t aq01, aq02, cq01, cq02, cq03, cq04;\ + float32x2_t bd01, bd02;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f32(0.0f);\ + uint32_t k_left = K;\ + if (k_left > 1) {\ + aq01 = vld1q_f32(a_ptr1); aq02 = vld1q_f32(a_ptr1 + 4); a_ptr1 += 8;\ + bd01 = vld1_f32(b_ptr1); bd02 = vld1_f32(b_ptr1 + 2); b_ptr1 += 4;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f32(cq02, aq01, bd01, 1);\ + aq01 = vld1q_f32(a_ptr1); bd01 = vld1_f32(b_ptr1);\ + cq03 = vfmaq_lane_f32(cq03, aq02, bd02, 0);\ + cq04 = vfmaq_lane_f32(cq04, aq02, bd02, 1);\ + aq02 = vld1q_f32(a_ptr1 + 4); a_ptr1 += 8;\ + bd02 = vld1_f32(b_ptr1 + 2); b_ptr1 += 4;\ + }\ + if (k_left > 1) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f32(cq02, aq01, bd01, 1);\ + cq03 = vfmaq_lane_f32(cq03, aq02, bd02, 0);\ + cq04 = vfmaq_lane_f32(cq04, aq02, bd02, 1); k_left -= 2;\ + }\ + cq01 = vaddq_f32(cq01, cq03);\ + cq02 = vaddq_f32(cq02, cq04);\ + if (k_left > 0) {\ + aq01 = vld1q_f32(a_ptr1); a_ptr1 += 4;\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f32(cq02, aq01, bd01, 1);\ + } + +#define NEON_SGEMM_KERNEL_M4N2 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M4N2_UNIT(a_ptr, b_ptr1) + +#define NEON_SGEMM_KERNEL_M2N4 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M4N2_UNIT(b_ptr1, a_ptr) + +#define NEON_SGEMM_SAVE_M4N2_UNIT(cq01, cq02) \ + ct1 = vld1q_f32(c_tmp); ct2 = vld1q_f32(c_tmp + ldc);\ + cq01 = vfmaq_n_f32(cq01, ct1, beta);\ + cq02 = vfmaq_n_f32(cq02, ct2, beta);\ + vst1q_f32(c_tmp, cq01);\ + vst1q_f32(c_tmp + ldc, cq02);\ + c_tmp += ldc * 2; + +#define NEON_SGEMM_SAVE_M4N2 \ + float32x4_t ct1, ct2;\ + float *c_tmp = c_ptr; NEON_SGEMM_SAVE_M4N2_UNIT(cq01, cq02) + +#define NEON_SGEMM_SAVE_M2N4_UNIT(cq01, cq02) \ + ctd1 = vzipq_f32(cq01, cq02);\ + cd1 = vget_low_f32(ctd1.val[0]);\ + cd2 = vget_high_f32(ctd1.val[0]);\ + cd3 = vget_low_f32(ctd1.val[1]);\ + cd4 = vget_high_f32(ctd1.val[1]);\ + cd1 = vfma_n_f32(cd1, vld1_f32(c_tmp), beta);\ + cd2 = vfma_n_f32(cd2, vld1_f32(c_tmp + ldc), beta);\ + cd3 = vfma_n_f32(cd3, vld1_f32(c_tmp + ldc * 2), beta);\ + cd4 = vfma_n_f32(cd4, vld1_f32(c_tmp + ldc * 3), beta);\ + vst1_f32(c_tmp, cd1);\ + vst1_f32(c_tmp + ldc, cd2);\ + vst1_f32(c_tmp + ldc * 2, cd3);\ + vst1_f32(c_tmp + ldc * 3, cd4);\ + c_tmp += ldc * 4; + +#define 
NEON_SGEMM_SAVE_M2N4 \ + float32x4x2_t ctd1; float32x2_t cd1, cd2, cd3, cd4;\ + float *c_tmp = c_ptr; NEON_SGEMM_SAVE_M2N4_UNIT(cq01, cq02) + +#define NEON_SGEMM_KERNEL_M4N4 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + float32x4_t aq01, cq01, cq02, cq03, cq04;\ + float32x2_t bd01, bd02;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f32(0.0f);\ + uint32_t k_left = K;\ + if (k_left > 0) {\ + aq01 = vld1q_f32(a_ptr); a_ptr += 4;\ + bd01 = vld1_f32(b_ptr1); bd02 = vld1_f32(b_ptr1 + 2); b_ptr1 += 4;\ + }\ + for (; k_left > 1; k_left--) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f32(cq02, aq01, bd01, 1); bd01 = vld1_f32(b_ptr1);\ + cq03 = vfmaq_lane_f32(cq03, aq01, bd02, 0);\ + cq04 = vfmaq_lane_f32(cq04, aq01, bd02, 1); bd02 = vld1_f32(b_ptr1 + 2);\ + b_ptr1 += 4; aq01 = vld1q_f32(a_ptr); a_ptr += 4;\ + }\ + if (k_left > 0) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f32(cq02, aq01, bd01, 1);\ + cq03 = vfmaq_lane_f32(cq03, aq01, bd02, 0);\ + cq04 = vfmaq_lane_f32(cq04, aq01, bd02, 1);\ + } + +#define NEON_SGEMM_SAVE_M4N4_UNIT(cq01, cq02, cq03, cq04) \ + ct1 = vld1q_f32(c_tmp);\ + ct2 = vld1q_f32(c_tmp + ldc);\ + ct3 = vld1q_f32(c_tmp + ldc * 2);\ + ct4 = vld1q_f32(c_tmp + ldc * 3);\ + cq01 = vfmaq_n_f32(cq01, ct1, beta);\ + cq02 = vfmaq_n_f32(cq02, ct2, beta);\ + cq03 = vfmaq_n_f32(cq03, ct3, beta);\ + cq04 = vfmaq_n_f32(cq04, ct4, beta);\ + vst1q_f32(c_tmp, cq01);\ + vst1q_f32(c_tmp + ldc, cq02);\ + vst1q_f32(c_tmp + ldc * 2, cq03);\ + vst1q_f32(c_tmp + ldc * 3, cq04); c_tmp += ldc * 4; + +#define NEON_SGEMM_SAVE_M4N4 \ + float32x4_t ct1, ct2, ct3, ct4;\ + float *c_tmp = c_ptr; NEON_SGEMM_SAVE_M4N4_UNIT(cq01, cq02, cq03, cq04) + +#define NEON_SGEMM_KERNEL_M8N1_UNIT(a_ptr1, b_ptr1) \ + float32x4_t aq01, aq02, aq03, aq04, cq01, cq02, cq03, cq04;\ + float32x2_t bd01;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f32(0.0f);\ + uint32_t k_left = K;\ + if (k_left > 1) {\ + aq01 = vld1q_f32(a_ptr1); aq02 = vld1q_f32(a_ptr1 + 4);\ + aq03 = vld1q_f32(a_ptr1 + 8); aq04 = vld1q_f32(a_ptr1 + 12); a_ptr1 += 16;\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0); aq01 = vld1q_f32(a_ptr1);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 0); aq02 = vld1q_f32(a_ptr1 + 4);\ + cq03 = vfmaq_lane_f32(cq03, aq03, bd01, 1); aq03 = vld1q_f32(a_ptr1 + 8);\ + cq04 = vfmaq_lane_f32(cq04, aq04, bd01, 1); aq04 = vld1q_f32(a_ptr1 + 12);\ + a_ptr1 += 16; bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + if (k_left > 1) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 0);\ + cq03 = vfmaq_lane_f32(cq03, aq03, bd01, 1);\ + cq04 = vfmaq_lane_f32(cq04, aq04, bd01, 1); k_left -= 2;\ + }\ + cq01 = vaddq_f32(cq01, cq03);\ + cq02 = vaddq_f32(cq02, cq04);\ + if (k_left > 0) {\ + float bs1 = *b_ptr1; b_ptr1++;\ + aq01 = vld1q_f32(a_ptr1);\ + aq02 = vld1q_f32(a_ptr1 + 4); a_ptr1 += 8;\ + cq01 = vfmaq_n_f32(cq01, aq01, bs1);\ + cq02 = vfmaq_n_f32(cq02, aq02, bs1);\ + } + +#define NEON_SGEMM_KERNEL_M8N1 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M8N1_UNIT(a_ptr, b_ptr1) + +#define NEON_SGEMM_KERNEL_M1N8 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M8N1_UNIT(b_ptr1, a_ptr) + +#define NEON_SGEMM_SAVE_M8N1 \ + float32x4_t ct1, ct2;\ + ct1 = vld1q_f32(c_ptr); ct2 = vld1q_f32(c_ptr + 4);\ + cq01 = vfmaq_n_f32(cq01, ct1, beta);\ + cq02 = vfmaq_n_f32(cq02, ct2, beta);\ + vst1q_f32(c_ptr, 
cq01);\ + vst1q_f32(c_ptr + 4, cq02); + +#define NEON_SGEMM_SAVE_M1N8 \ + float *c_tmp = c_ptr; NEON_SGEMM_SAVE_M1N4_UNIT(cq01) NEON_SGEMM_SAVE_M1N4_UNIT(cq02) + +#define NEON_SGEMM_KERNEL_M8N2_UNIT(a_ptr1, b_ptr1) \ + float32x4_t aq01, aq02, cq01, cq02, cq03, cq04;\ + float32x2_t bd01;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f32(0.0f);\ + uint32_t k_left = K;\ + if (k_left > 0) {\ + aq01 = vld1q_f32(a_ptr1); aq02 = vld1q_f32(a_ptr1 + 4); a_ptr1 += 8;\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + for (; k_left > 1; k_left--) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq03 = vfmaq_lane_f32(cq03, aq01, bd01, 1); aq01 = vld1q_f32(a_ptr1);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 0);\ + cq04 = vfmaq_lane_f32(cq04, aq02, bd01, 1); aq02 = vld1q_f32(a_ptr1 + 4);\ + a_ptr1 += 8; bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + if (k_left > 0) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq03 = vfmaq_lane_f32(cq03, aq01, bd01, 1);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 0);\ + cq04 = vfmaq_lane_f32(cq04, aq02, bd01, 1);\ + } + +#define NEON_SGEMM_KERNEL_M8N2 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M8N2_UNIT(a_ptr, b_ptr1) + +#define NEON_SGEMM_KERNEL_M2N8 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M8N2_UNIT(b_ptr1, a_ptr) + +#define NEON_SGEMM_SAVE_M8N2_UNIT(cq01, cq02, cq03, cq04) \ + ct1 = vld1q_f32(c_tmp);\ + ct2 = vld1q_f32(c_tmp + 4);\ + ct3 = vld1q_f32(c_tmp + ldc);\ + ct4 = vld1q_f32(c_tmp + ldc + 4);\ + cq01 = vfmaq_n_f32(cq01, ct1, beta);\ + cq02 = vfmaq_n_f32(cq02, ct2, beta);\ + cq03 = vfmaq_n_f32(cq03, ct3, beta);\ + cq04 = vfmaq_n_f32(cq04, ct4, beta);\ + vst1q_f32(c_tmp, cq01);\ + vst1q_f32(c_tmp + 4, cq02);\ + vst1q_f32(c_tmp + ldc, cq03);\ + vst1q_f32(c_tmp + ldc + 4, cq04); c_tmp += 2 * ldc; + +#define NEON_SGEMM_SAVE_M8N2 \ + float *c_tmp = c_ptr;\ + float32x4_t ct1, ct2, ct3, ct4;\ + NEON_SGEMM_SAVE_M8N2_UNIT(cq01, cq02, cq03, cq04) + +#define NEON_SGEMM_SAVE_M2N8 \ + float32x4x2_t ctd1; float32x2_t cd1, cd2, cd3, cd4;\ + float *c_tmp = c_ptr; NEON_SGEMM_SAVE_M2N4_UNIT(cq01, cq03) NEON_SGEMM_SAVE_M2N4_UNIT(cq02, cq04) + +#define NEON_SGEMM_KERNEL_M8N4_UNIT(a_ptr1, b_ptr1) \ + float32x4_t aq01, aq02, bq01, cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = cq07 = cq08 = vdupq_n_f32(0.0f);\ + uint32_t k_left = K;\ + if (k_left > 0) {\ + aq01 = vld1q_f32(a_ptr1); aq02 = vld1q_f32(a_ptr1 + 4); a_ptr1 += 8;\ + bq01 = vld1q_f32(b_ptr1); b_ptr1 += 4;\ + }\ + for (; k_left > 1; k_left--) {\ + cq01 = vfmaq_laneq_f32(cq01, aq01, bq01, 0);\ + cq03 = vfmaq_laneq_f32(cq03, aq01, bq01, 1);\ + cq05 = vfmaq_laneq_f32(cq05, aq01, bq01, 2);\ + cq07 = vfmaq_laneq_f32(cq07, aq01, bq01, 3);\ + aq01 = vld1q_f32(a_ptr1);\ + cq02 = vfmaq_laneq_f32(cq02, aq02, bq01, 0);\ + cq04 = vfmaq_laneq_f32(cq04, aq02, bq01, 1);\ + cq06 = vfmaq_laneq_f32(cq06, aq02, bq01, 2);\ + cq08 = vfmaq_laneq_f32(cq08, aq02, bq01, 3);\ + aq02 = vld1q_f32(a_ptr1 + 4); a_ptr1 += 8;\ + bq01 = vld1q_f32(b_ptr1); b_ptr1 += 4;\ + }\ + if (k_left > 0) {\ + cq01 = vfmaq_laneq_f32(cq01, aq01, bq01, 0);\ + cq03 = vfmaq_laneq_f32(cq03, aq01, bq01, 1);\ + cq05 = vfmaq_laneq_f32(cq05, aq01, bq01, 2);\ + cq07 = vfmaq_laneq_f32(cq07, aq01, bq01, 3);\ + cq02 = vfmaq_laneq_f32(cq02, aq02, bq01, 0);\ + cq04 = vfmaq_laneq_f32(cq04, aq02, bq01, 1);\ + cq06 = vfmaq_laneq_f32(cq06, aq02, bq01, 2);\ + cq08 = vfmaq_laneq_f32(cq08, aq02, bq01, 3);\ + } + +#define NEON_SGEMM_KERNEL_M8N4 \ + const float 
*a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M8N4_UNIT(a_ptr, b_ptr1) + +#define NEON_SGEMM_KERNEL_M4N8 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + NEON_SGEMM_KERNEL_M8N4_UNIT(b_ptr1, a_ptr) + +#define NEON_SGEMM_SAVE_M8N4 \ + float *c_tmp = c_ptr;\ + float32x4_t ct1, ct2, ct3, ct4;\ + NEON_SGEMM_SAVE_M8N2_UNIT(cq01, cq02, cq03, cq04)\ + NEON_SGEMM_SAVE_M8N2_UNIT(cq05, cq06, cq07, cq08) + +#define TRANSPOSE_4x4(cq1, cq2, cq3, cq4) {\ + float32x4x2_t ctd1 = vzipq_f32(cq1, cq2);\ + float32x4x2_t ctd2 = vzipq_f32(cq3, cq4);\ + cq1 = vcombine_f32(vget_low_f32(ctd1.val[0]), vget_low_f32(ctd2.val[0]));\ + cq2 = vcombine_f32(vget_high_f32(ctd1.val[0]), vget_high_f32(ctd2.val[0]));\ + cq3 = vcombine_f32(vget_low_f32(ctd1.val[1]), vget_low_f32(ctd2.val[1]));\ + cq4 = vcombine_f32(vget_high_f32(ctd1.val[1]), vget_high_f32(ctd2.val[1]));\ +} + +#define NEON_SGEMM_SAVE_M4N8 \ + float *c_tmp = c_ptr;\ + float32x4_t ct1, ct2, ct3, ct4;\ + TRANSPOSE_4x4(cq01, cq03, cq05, cq07)\ + TRANSPOSE_4x4(cq02, cq04, cq06, cq08)\ + NEON_SGEMM_SAVE_M4N4_UNIT(cq01, cq03, cq05, cq07)\ + NEON_SGEMM_SAVE_M4N4_UNIT(cq02, cq04, cq06, cq08) + +#define NEON_SGEMM_KERNEL_M8N8 \ + const float *a_ptr = a_head;\ + const float *b_ptr1 = b_head;\ + float32x4_t aq01, aq02, bq01, bq02;\ + float32x4_t cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + float32x4_t cq09, cq10, cq11, cq12, cq13, cq14, cq15, cq16;\ + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = cq07 = cq08 = vdupq_n_f32(0.0f);\ + cq09 = cq10 = cq11 = cq12 = cq13 = cq14 = cq15 = cq16 = vdupq_n_f32(0.0f);\ + uint32_t k_left = K;\ + if (k_left > 0) {\ + aq01 = vld1q_f32(a_ptr); aq02 = vld1q_f32(a_ptr + 4); a_ptr += 8;\ + bq01 = vld1q_f32(b_ptr1); bq02 = vld1q_f32(b_ptr1 + 4); b_ptr1 += 8;\ + }\ + for (; k_left > 1; k_left--) {\ + cq01 = vfmaq_laneq_f32(cq01, aq01, bq01, 0);\ + cq03 = vfmaq_laneq_f32(cq03, aq01, bq01, 1);\ + cq05 = vfmaq_laneq_f32(cq05, aq01, bq01, 2);\ + cq07 = vfmaq_laneq_f32(cq07, aq01, bq01, 3);\ + cq02 = vfmaq_laneq_f32(cq02, aq02, bq01, 0);\ + cq04 = vfmaq_laneq_f32(cq04, aq02, bq01, 1);\ + cq06 = vfmaq_laneq_f32(cq06, aq02, bq01, 2);\ + cq08 = vfmaq_laneq_f32(cq08, aq02, bq01, 3);\ + bq01 = vld1q_f32(b_ptr1);\ + cq09 = vfmaq_laneq_f32(cq09, aq01, bq02, 0);\ + cq11 = vfmaq_laneq_f32(cq11, aq01, bq02, 1);\ + cq13 = vfmaq_laneq_f32(cq13, aq01, bq02, 2);\ + cq15 = vfmaq_laneq_f32(cq15, aq01, bq02, 3);\ + aq01 = vld1q_f32(a_ptr);\ + cq10 = vfmaq_laneq_f32(cq10, aq02, bq02, 0);\ + cq12 = vfmaq_laneq_f32(cq12, aq02, bq02, 1);\ + cq14 = vfmaq_laneq_f32(cq14, aq02, bq02, 2);\ + cq16 = vfmaq_laneq_f32(cq16, aq02, bq02, 3);\ + aq02 = vld1q_f32(a_ptr + 4); a_ptr += 8;\ + bq02 = vld1q_f32(b_ptr1 + 4); b_ptr1 += 8;\ + }\ + if (k_left > 0) {\ + cq01 = vfmaq_laneq_f32(cq01, aq01, bq01, 0);\ + cq03 = vfmaq_laneq_f32(cq03, aq01, bq01, 1);\ + cq05 = vfmaq_laneq_f32(cq05, aq01, bq01, 2);\ + cq07 = vfmaq_laneq_f32(cq07, aq01, bq01, 3);\ + cq02 = vfmaq_laneq_f32(cq02, aq02, bq01, 0);\ + cq04 = vfmaq_laneq_f32(cq04, aq02, bq01, 1);\ + cq06 = vfmaq_laneq_f32(cq06, aq02, bq01, 2);\ + cq08 = vfmaq_laneq_f32(cq08, aq02, bq01, 3);\ + cq09 = vfmaq_laneq_f32(cq09, aq01, bq02, 0);\ + cq11 = vfmaq_laneq_f32(cq11, aq01, bq02, 1);\ + cq13 = vfmaq_laneq_f32(cq13, aq01, bq02, 2);\ + cq15 = vfmaq_laneq_f32(cq15, aq01, bq02, 3);\ + cq10 = vfmaq_laneq_f32(cq10, aq02, bq02, 0);\ + cq12 = vfmaq_laneq_f32(cq12, aq02, bq02, 1);\ + cq14 = vfmaq_laneq_f32(cq14, aq02, bq02, 2);\ + cq16 = vfmaq_laneq_f32(cq16, aq02, bq02, 3);\ + } + +#define 
NEON_SGEMM_SAVE_M8N8 \ + float *c_tmp = c_ptr;\ + float32x4_t ct1, ct2, ct3, ct4;\ + NEON_SGEMM_SAVE_M8N2_UNIT(cq01, cq02, cq03, cq04)\ + NEON_SGEMM_SAVE_M8N2_UNIT(cq05, cq06, cq07, cq08)\ + NEON_SGEMM_SAVE_M8N2_UNIT(cq09, cq10, cq11, cq12)\ + NEON_SGEMM_SAVE_M8N2_UNIT(cq13, cq14, cq15, cq16) + +#define NEON_SGEMM_KERNEL_M6N1_UNIT(a_ptr1, b_ptr1) \ + uint32_t k_left = K;\ + float32x2_t cd01, cd02, cd03, cd04, cd05, cd06;\ + float32x2_t ad01, ad02, ad03, ad04, ad05, ad06, bd01;\ + cd01 = cd02 = cd03 = cd04 = cd05 = cd06 = vdup_n_f32(0.0f);\ + if (k_left > 1) {\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + ad01 = vld1_f32(a_ptr1); ad02 = vld1_f32(a_ptr1 + 2);\ + ad03 = vld1_f32(a_ptr1 + 4); ad04 = vld1_f32(a_ptr1 + 6);\ + ad05 = vld1_f32(a_ptr1 + 8); ad06 = vld1_f32(a_ptr1 + 10);\ + a_ptr1 += 12;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cd01 = vfma_lane_f32(cd01, ad01, bd01, 0); ad01 = vld1_f32(a_ptr1);\ + cd02 = vfma_lane_f32(cd02, ad02, bd01, 0); ad02 = vld1_f32(a_ptr1 + 2);\ + cd03 = vfma_lane_f32(cd03, ad03, bd01, 0); ad03 = vld1_f32(a_ptr1 + 4);\ + cd04 = vfma_lane_f32(cd04, ad04, bd01, 1); ad04 = vld1_f32(a_ptr1 + 6);\ + cd05 = vfma_lane_f32(cd05, ad05, bd01, 1); ad05 = vld1_f32(a_ptr1 + 8);\ + cd06 = vfma_lane_f32(cd06, ad06, bd01, 1); ad06 = vld1_f32(a_ptr1 + 10);\ + a_ptr1 += 12; bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + if (k_left > 1) {\ + cd01 = vfma_lane_f32(cd01, ad01, bd01, 0);\ + cd02 = vfma_lane_f32(cd02, ad02, bd01, 0);\ + cd03 = vfma_lane_f32(cd03, ad03, bd01, 0);\ + cd04 = vfma_lane_f32(cd04, ad04, bd01, 1);\ + cd05 = vfma_lane_f32(cd05, ad05, bd01, 1);\ + cd06 = vfma_lane_f32(cd06, ad06, bd01, 1); k_left -= 2;\ + }\ + cd01 = vadd_f32(cd01, cd04);\ + cd02 = vadd_f32(cd02, cd05);\ + cd03 = vadd_f32(cd03, cd06);\ + if (k_left > 0) {\ + float bs1 = *b_ptr1; b_ptr1++;\ + ad01 = vld1_f32(a_ptr1);\ + ad02 = vld1_f32(a_ptr1 + 2);\ + ad03 = vld1_f32(a_ptr1 + 4); a_ptr1 += 6;\ + cd01 = vfma_n_f32(cd01, ad01, bs1);\ + cd02 = vfma_n_f32(cd02, ad02, bs1);\ + cd03 = vfma_n_f32(cd03, ad03, bs1);\ + } + +#define NEON_SGEMM_KERNEL_M6N1 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M6N1_UNIT(a_ptr, b_ptr) + +#define NEON_SGEMM_KERNEL_M1N6 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M6N1_UNIT(b_ptr, a_ptr) + +#define NEON_SGEMM_SAVE_M6N1 \ + float32x2_t ct1, ct2, ct3;\ + ct1 = vld1_f32(c_ptr); ct2 = vld1_f32(c_ptr + 2); ct3 = vld1_f32(c_ptr + 4);\ + cd01 = vfma_n_f32(cd01, ct1, beta);\ + cd02 = vfma_n_f32(cd02, ct2, beta);\ + cd03 = vfma_n_f32(cd03, ct3, beta);\ + vst1_f32(c_ptr, cd01); vst1_f32(c_ptr + 2, cd02); vst1_f32(c_ptr + 4, cd03); + +#define NEON_SGEMM_SAVE_M1N6 \ + float *c_tmp = c_ptr;\ + NEON_SGEMM_SAVE_M1N2_UNIT(cd01) NEON_SGEMM_SAVE_M1N2_UNIT(cd02) NEON_SGEMM_SAVE_M1N2_UNIT(cd03) + +#define NEON_SGEMM_KERNEL_M6N2_UNIT(a_ptr1, b_ptr1) \ + uint32_t k_left = K;\ + float32x2_t cd01, cd02, cd03, cd04, cd05, cd06;\ + float32x2_t ad01, ad02, ad03, bd01;\ + cd01 = cd02 = cd03 = cd04 = cd05 = cd06 = vdup_n_f32(0.0f);\ + if (k_left > 0) {\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + ad01 = vld1_f32(a_ptr1); ad02 = vld1_f32(a_ptr1 + 2);\ + ad03 = vld1_f32(a_ptr1 + 4); a_ptr1 += 6;\ + }\ + for (; k_left > 1; k_left--) {\ + cd01 = vfma_lane_f32(cd01, ad01, bd01, 0);\ + cd04 = vfma_lane_f32(cd04, ad01, bd01, 1); ad01 = vld1_f32(a_ptr1);\ + cd02 = vfma_lane_f32(cd02, ad02, bd01, 0);\ + cd05 = vfma_lane_f32(cd05, ad02, bd01, 1); ad02 = vld1_f32(a_ptr1 + 2);\ + cd03 = vfma_lane_f32(cd03, ad03, bd01, 0);\ + 
cd06 = vfma_lane_f32(cd06, ad03, bd01, 1); ad03 = vld1_f32(a_ptr1 + 4);\ + a_ptr1 += 6; bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + if (k_left > 0) {\ + cd01 = vfma_lane_f32(cd01, ad01, bd01, 0);\ + cd04 = vfma_lane_f32(cd04, ad01, bd01, 1);\ + cd02 = vfma_lane_f32(cd02, ad02, bd01, 0);\ + cd05 = vfma_lane_f32(cd05, ad02, bd01, 1);\ + cd03 = vfma_lane_f32(cd03, ad03, bd01, 0);\ + cd06 = vfma_lane_f32(cd06, ad03, bd01, 1);\ + } + +#define NEON_SGEMM_KERNEL_M6N2 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M6N2_UNIT(a_ptr, b_ptr) + +#define NEON_SGEMM_KERNEL_M2N6 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M6N2_UNIT(b_ptr, a_ptr) + +#define TRANS_M2N2(cd01, cd02) \ + cdd1 = vzip_f32(cd01, cd02); cd01 = cdd1.val[0]; cd02 = cdd1.val[1]; + +#define NEON_SGEMM_SAVE_M6N2 \ + float32x2_t ct1, ct2; float *c_tmp = c_ptr;\ + NEON_SGEMM_SAVE_M2N2_UNIT(cd01, cd04) c_tmp = c_ptr + 2;\ + NEON_SGEMM_SAVE_M2N2_UNIT(cd02, cd05) c_tmp = c_ptr + 4;\ + NEON_SGEMM_SAVE_M2N2_UNIT(cd03, cd06) + +#define NEON_SGEMM_SAVE_M2N6 \ + float32x2x2_t cdd1; float32x2_t ct1, ct2; float *c_tmp = c_ptr;\ + TRANS_M2N2(cd01, cd04) TRANS_M2N2(cd02, cd05) TRANS_M2N2(cd03, cd06)\ + NEON_SGEMM_SAVE_M2N2_UNIT(cd01, cd04)\ + NEON_SGEMM_SAVE_M2N2_UNIT(cd02, cd05)\ + NEON_SGEMM_SAVE_M2N2_UNIT(cd03, cd06) + +#define NEON_SGEMM_KERNEL_M6N4_UNIT(a_ptr1, b_ptr1) \ + uint32_t k_left = K;\ + float32x4_t cq01, cq02, cq03, cq04, cq05, cq06;\ + float32x4_t bq01; float32x2_t ad01, ad02, ad03;\ + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = vdupq_n_f32(0.0f);\ + if (k_left > 0) {\ + bq01 = vld1q_f32(b_ptr1); b_ptr1 += 4;\ + ad01 = vld1_f32(a_ptr1); ad02 = vld1_f32(a_ptr1 + 2);\ + ad03 = vld1_f32(a_ptr1 + 4); a_ptr1 += 6;\ + }\ + for (; k_left > 1; k_left--) {\ + cq01 = vfmaq_lane_f32(cq01, bq01, ad01, 0);\ + cq02 = vfmaq_lane_f32(cq02, bq01, ad01, 1); ad01 = vld1_f32(a_ptr1);\ + cq03 = vfmaq_lane_f32(cq03, bq01, ad02, 0);\ + cq04 = vfmaq_lane_f32(cq04, bq01, ad02, 1); ad02 = vld1_f32(a_ptr1 + 2);\ + cq05 = vfmaq_lane_f32(cq05, bq01, ad03, 0);\ + cq06 = vfmaq_lane_f32(cq06, bq01, ad03, 1); ad03 = vld1_f32(a_ptr1 + 4);\ + a_ptr1 += 6; bq01 = vld1q_f32(b_ptr1); b_ptr1 += 4;\ + }\ + if (k_left > 0) {\ + cq01 = vfmaq_lane_f32(cq01, bq01, ad01, 0);\ + cq02 = vfmaq_lane_f32(cq02, bq01, ad01, 1);\ + cq03 = vfmaq_lane_f32(cq03, bq01, ad02, 0);\ + cq04 = vfmaq_lane_f32(cq04, bq01, ad02, 1);\ + cq05 = vfmaq_lane_f32(cq05, bq01, ad03, 0);\ + cq06 = vfmaq_lane_f32(cq06, bq01, ad03, 1);\ + } + +#define NEON_SGEMM_KERNEL_M6N4 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M6N4_UNIT(a_ptr, b_ptr) + +#define NEON_SGEMM_KERNEL_M4N6 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M6N4_UNIT(b_ptr, a_ptr) + +#define NEON_SGEMM_SAVE_M6N4 \ + float32x4x2_t ctd1; float32x2_t cd1, cd2, cd3, cd4; float *c_tmp = c_ptr;\ + NEON_SGEMM_SAVE_M2N4_UNIT(cq01, cq02) c_tmp = c_ptr + 2;\ + NEON_SGEMM_SAVE_M2N4_UNIT(cq03, cq04) c_tmp = c_ptr + 4;\ + NEON_SGEMM_SAVE_M2N4_UNIT(cq05, cq06) + +#define NEON_SGEMM_SAVE_M4N6 \ + float32x4_t ct1, ct2; float *c_tmp = c_ptr;\ + NEON_SGEMM_SAVE_M4N2_UNIT(cq01, cq02) NEON_SGEMM_SAVE_M4N2_UNIT(cq03, cq04)\ + NEON_SGEMM_SAVE_M4N2_UNIT(cq05, cq06) + +#define NEON_SGEMM_KERNEL_M12N1_UNIT(a_ptr1, b_ptr1) \ + uint32_t k_left = K;\ + float32x4_t cq01, cq02, cq03, cq04, cq05, cq06;\ + float32x4_t aq01, aq02, aq03, aq04, aq05, aq06;\ + float32x2_t bd01;\ + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = 
vdupq_n_f32(0.0f);\ + if (k_left > 1) {\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + aq01 = vld1q_f32(a_ptr1); aq02 = vld1q_f32(a_ptr1 + 4);\ + aq03 = vld1q_f32(a_ptr1 + 8); aq04 = vld1q_f32(a_ptr1 + 12);\ + aq05 = vld1q_f32(a_ptr1 + 16); aq06 = vld1q_f32(a_ptr1 + 20);\ + a_ptr1 += 24;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0); aq01 = vld1q_f32(a_ptr1);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 0); aq02 = vld1q_f32(a_ptr1 + 4);\ + cq03 = vfmaq_lane_f32(cq03, aq03, bd01, 0); aq03 = vld1q_f32(a_ptr1 + 8);\ + cq04 = vfmaq_lane_f32(cq04, aq04, bd01, 1); aq04 = vld1q_f32(a_ptr1 + 12);\ + cq05 = vfmaq_lane_f32(cq05, aq05, bd01, 1); aq05 = vld1q_f32(a_ptr1 + 16);\ + cq06 = vfmaq_lane_f32(cq06, aq06, bd01, 1); aq06 = vld1q_f32(a_ptr1 + 20);\ + a_ptr1 += 24; bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + if (k_left > 1) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 0);\ + cq03 = vfmaq_lane_f32(cq03, aq03, bd01, 0);\ + cq04 = vfmaq_lane_f32(cq04, aq04, bd01, 1);\ + cq05 = vfmaq_lane_f32(cq05, aq05, bd01, 1);\ + cq06 = vfmaq_lane_f32(cq06, aq06, bd01, 1); k_left -= 2;\ + }\ + cq01 = vaddq_f32(cq01, cq04);\ + cq02 = vaddq_f32(cq02, cq05);\ + cq03 = vaddq_f32(cq03, cq06);\ + if (k_left > 0) {\ + float bs1 = *b_ptr1; b_ptr1++;\ + aq01 = vld1q_f32(a_ptr1); aq02 = vld1q_f32(a_ptr1 + 4);\ + aq03 = vld1q_f32(a_ptr1 + 8); a_ptr1 += 12;\ + cq01 = vfmaq_n_f32(cq01, aq01, bs1);\ + cq02 = vfmaq_n_f32(cq02, aq02, bs1);\ + cq03 = vfmaq_n_f32(cq03, aq03, bs1);\ + } + +#define NEON_SGEMM_KERNEL_M12N1 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M12N1_UNIT(a_ptr, b_ptr) + +#define NEON_SGEMM_KERNEL_M1N12 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M12N1_UNIT(b_ptr, a_ptr) + +#define NEON_SGEMM_SAVE_M12N1 \ + float32x4_t ct1, ct2, ct3;\ + ct1 = vld1q_f32(c_ptr); ct2 = vld1q_f32(c_ptr + 4); ct3 = vld1q_f32(c_ptr + 8);\ + cq01 = vfmaq_n_f32(cq01, ct1, beta);\ + cq02 = vfmaq_n_f32(cq02, ct2, beta);\ + cq03 = vfmaq_n_f32(cq03, ct3, beta);\ + vst1q_f32(c_ptr, cq01); vst1q_f32(c_ptr + 4, cq02); vst1q_f32(c_ptr + 8, cq03); + +#define NEON_SGEMM_SAVE_M1N12 \ + float *c_tmp = c_ptr;\ + NEON_SGEMM_SAVE_M1N4_UNIT(cq01) NEON_SGEMM_SAVE_M1N4_UNIT(cq02) NEON_SGEMM_SAVE_M1N4_UNIT(cq03) + +#define NEON_SGEMM_KERNEL_M12N2_UNIT(a_ptr1, b_ptr1) \ + uint32_t k_left = K;\ + float32x4_t cq01, cq02, cq03, cq04, cq05, cq06;\ + float32x4_t aq01, aq02, aq03; float32x2_t bd01;\ + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = vdupq_n_f32(0.0f);\ + if (k_left > 0) {\ + bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + aq01 = vld1q_f32(a_ptr1); aq02 = vld1q_f32(a_ptr1 + 4);\ + aq03 = vld1q_f32(a_ptr1 + 8); a_ptr1 += 12;\ + }\ + for (; k_left > 1; k_left--) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq04 = vfmaq_lane_f32(cq04, aq01, bd01, 1); aq01 = vld1q_f32(a_ptr1);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 0);\ + cq05 = vfmaq_lane_f32(cq05, aq02, bd01, 1); aq02 = vld1q_f32(a_ptr1 + 4);\ + cq03 = vfmaq_lane_f32(cq03, aq03, bd01, 0);\ + cq06 = vfmaq_lane_f32(cq06, aq03, bd01, 1); aq03 = vld1q_f32(a_ptr1 + 8);\ + a_ptr1 += 12; bd01 = vld1_f32(b_ptr1); b_ptr1 += 2;\ + }\ + if (k_left > 0) {\ + cq01 = vfmaq_lane_f32(cq01, aq01, bd01, 0);\ + cq04 = vfmaq_lane_f32(cq04, aq01, bd01, 1);\ + cq02 = vfmaq_lane_f32(cq02, aq02, bd01, 0);\ + cq05 = vfmaq_lane_f32(cq05, aq02, bd01, 1);\ + cq03 = vfmaq_lane_f32(cq03, aq03, bd01, 0);\ + cq06 = vfmaq_lane_f32(cq06, aq03, bd01, 1);\ + } + 
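+/* Note: each NEON_SGEMM_KERNEL_MxNy_UNIT macro accumulates an x-by-y tile of
+ * C in NEON registers, consuming x packed floats of A and y packed floats of
+ * B per depth step. The mirrored shapes reuse one unit with the packed-A and
+ * packed-B pointers swapped: below, M12N2 expands the unit as
+ * (a_ptr, b_ptr) while M2N12 expands it as (b_ptr, a_ptr), and the matching
+ * SAVE macros transpose the register tile when storing to C. */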
+#define NEON_SGEMM_KERNEL_M12N2 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M12N2_UNIT(a_ptr, b_ptr) + +#define NEON_SGEMM_KERNEL_M2N12 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M12N2_UNIT(b_ptr, a_ptr) + +#define NEON_SGEMM_SAVE_M12N2 \ + float32x4_t ct1, ct2; float *c_tmp = c_ptr;\ + NEON_SGEMM_SAVE_M4N2_UNIT(cq01, cq04) c_tmp = c_ptr + 4;\ + NEON_SGEMM_SAVE_M4N2_UNIT(cq02, cq05) c_tmp = c_ptr + 8;\ + NEON_SGEMM_SAVE_M4N2_UNIT(cq03, cq06) + +#define NEON_SGEMM_SAVE_M2N12 \ + float32x4x2_t ctd1; float32x2_t cd1, cd2, cd3, cd4;\ + float *c_tmp = c_ptr; NEON_SGEMM_SAVE_M2N4_UNIT(cq01, cq04)\ + NEON_SGEMM_SAVE_M2N4_UNIT(cq02, cq05) NEON_SGEMM_SAVE_M2N4_UNIT(cq03, cq06) + +#define NEON_SGEMM_KERNEL_M12N4_UNIT(a_ptr1, b_ptr1) \ + uint32_t k_left = K;\ + float32x4_t cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + float32x4_t cq09, cq10, cq11, cq12, aq01, aq02, aq03, bq01;\ + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = vdupq_n_f32(0.0f);\ + cq07 = cq08 = cq09 = cq10 = cq11 = cq12 = vdupq_n_f32(0.0f);\ + if (k_left > 0) {\ + bq01 = vld1q_f32(b_ptr1); b_ptr1 += 4;\ + aq01 = vld1q_f32(a_ptr1); aq02 = vld1q_f32(a_ptr1 + 4);\ + aq03 = vld1q_f32(a_ptr1 + 8); a_ptr1 += 12;\ + }\ + for (; k_left > 1; k_left--) {\ + cq01 = vfmaq_laneq_f32(cq01, aq01, bq01, 0);\ + cq04 = vfmaq_laneq_f32(cq04, aq01, bq01, 1);\ + cq07 = vfmaq_laneq_f32(cq07, aq01, bq01, 2);\ + cq10 = vfmaq_laneq_f32(cq10, aq01, bq01, 3);\ + aq01 = vld1q_f32(a_ptr1);\ + cq02 = vfmaq_laneq_f32(cq02, aq02, bq01, 0);\ + cq05 = vfmaq_laneq_f32(cq05, aq02, bq01, 1);\ + cq08 = vfmaq_laneq_f32(cq08, aq02, bq01, 2);\ + cq11 = vfmaq_laneq_f32(cq11, aq02, bq01, 3);\ + aq02 = vld1q_f32(a_ptr1 + 4);\ + cq03 = vfmaq_laneq_f32(cq03, aq03, bq01, 0);\ + cq06 = vfmaq_laneq_f32(cq06, aq03, bq01, 1);\ + cq09 = vfmaq_laneq_f32(cq09, aq03, bq01, 2);\ + cq12 = vfmaq_laneq_f32(cq12, aq03, bq01, 3);\ + aq03 = vld1q_f32(a_ptr1 + 8); a_ptr1 += 12;\ + bq01 = vld1q_f32(b_ptr1); b_ptr1 += 4;\ + }\ + if (k_left > 0) {\ + cq01 = vfmaq_laneq_f32(cq01, aq01, bq01, 0);\ + cq04 = vfmaq_laneq_f32(cq04, aq01, bq01, 1);\ + cq07 = vfmaq_laneq_f32(cq07, aq01, bq01, 2);\ + cq10 = vfmaq_laneq_f32(cq10, aq01, bq01, 3);\ + cq02 = vfmaq_laneq_f32(cq02, aq02, bq01, 0);\ + cq05 = vfmaq_laneq_f32(cq05, aq02, bq01, 1);\ + cq08 = vfmaq_laneq_f32(cq08, aq02, bq01, 2);\ + cq11 = vfmaq_laneq_f32(cq11, aq02, bq01, 3);\ + cq03 = vfmaq_laneq_f32(cq03, aq03, bq01, 0);\ + cq06 = vfmaq_laneq_f32(cq06, aq03, bq01, 1);\ + cq09 = vfmaq_laneq_f32(cq09, aq03, bq01, 2);\ + cq12 = vfmaq_laneq_f32(cq12, aq03, bq01, 3);\ + } + +#define NEON_SGEMM_KERNEL_M12N4 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M12N4_UNIT(a_ptr, b_ptr) + +#define NEON_SGEMM_KERNEL_M4N12 \ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + NEON_SGEMM_KERNEL_M12N4_UNIT(b_ptr, a_ptr) + +#define NEON_SGEMM_SAVE_M12N4 \ + float32x4_t ct1, ct2, ct3, ct4;\ + float *c_tmp = c_ptr; NEON_SGEMM_SAVE_M4N4_UNIT(cq01, cq04, cq07, cq10)\ + c_tmp = c_ptr + 4; NEON_SGEMM_SAVE_M4N4_UNIT(cq02, cq05, cq08, cq11)\ + c_tmp = c_ptr + 8; NEON_SGEMM_SAVE_M4N4_UNIT(cq03, cq06, cq09, cq12) + +#define NEON_SGEMM_SAVE_M4N12 \ + float *c_tmp = c_ptr;\ + float32x4_t ct1, ct2, ct3, ct4;\ + TRANSPOSE_4x4(cq01, cq04, cq07, cq10)\ + TRANSPOSE_4x4(cq02, cq05, cq08, cq11)\ + TRANSPOSE_4x4(cq03, cq06, cq09, cq12)\ + NEON_SGEMM_SAVE_M4N4_UNIT(cq01, cq04, cq07, cq10)\ + NEON_SGEMM_SAVE_M4N4_UNIT(cq02, cq05, cq08, cq11)\ + 
NEON_SGEMM_SAVE_M4N4_UNIT(cq03, cq06, cq09, cq12) + +#define NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(mdim, ndim) \ +static inline void inline_dualpack_gemm_afloat_bfloat_cfloat_m##mdim##_n##ndim(\ + const float *a_head, const float *b_head, float *c_ptr,\ + uint32_t K, float beta, uint32_t ldc) {\ + NEON_SGEMM_KERNEL_M##mdim##N##ndim\ + NEON_SGEMM_SAVE_M##mdim##N##ndim\ +} + +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 1) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 2) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 1) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 2) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 4) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 4) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 1) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 2) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 4) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 8) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 8) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 8) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 1) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 2) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 4) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 8) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 6) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 6) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 6) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(6, 1) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(6, 2) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(6, 4) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 12) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 12) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 12) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 1) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 2) +NEON_SGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 4) + +#endif + diff --git a/include/arm_neon/NeonSum.h b/include/arm_neon/NeonSum.h new file mode 100644 index 0000000..1794109 --- /dev/null +++ b/include/arm_neon/NeonSum.h @@ -0,0 +1,394 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/***************************************************************************** + * File: NeonSum.h + * Description: Sum functions based on ARM NEON instructions. 
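+ *
+ * Note: in this library's quantized GEMM path, per-row/per-column sums
+ * of the integer source matrices are typically consumed by the driver
+ * code to apply zero-point corrections to the int32 accumulators (a
+ * usage note inferred from the surrounding library, not stated in this
+ * file).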
+ ****************************************************************************/
+
+#include <arm_neon.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifndef INCLUDE_NEON_SUM
+#define INCLUDE_NEON_SUM
+
+static inline int16x8_t vaddl_low_s8(int8x16_t v1, int8x16_t v2) {
+  return vaddl_s8(vget_low_s8(v1), vget_low_s8(v2));
+}
+
+static inline int32x4_t vaddl_low_s16(int16x8_t v1, int16x8_t v2) {
+  return vaddl_s16(vget_low_s16(v1), vget_low_s16(v2));
+}
+
+static inline uint16x8_t vaddl_low_u8(uint8x16_t v1, uint8x16_t v2) {
+  return vaddl_u8(vget_low_u8(v1), vget_low_u8(v2));
+}
+
+static inline uint32x4_t vaddl_low_u16(uint16x8_t v1, uint16x8_t v2) {
+  return vaddl_u16(vget_low_u16(v1), vget_low_u16(v2));
+}
+
+#if !__aarch64__
+static inline int16x8_t vaddl_high_s8(int8x16_t v1, int8x16_t v2) {
+  return vaddl_s8(vget_high_s8(v1), vget_high_s8(v2));
+}
+
+static inline int32x4_t vaddl_high_s16(int16x8_t v1, int16x8_t v2) {
+  return vaddl_s16(vget_high_s16(v1), vget_high_s16(v2));
+}
+
+static inline uint16x8_t vaddl_high_u8(uint8x16_t v1, uint8x16_t v2) {
+  return vaddl_u8(vget_high_u8(v1), vget_high_u8(v2));
+}
+
+static inline uint32x4_t vaddl_high_u16(uint16x8_t v1, uint16x8_t v2) {
+  return vaddl_u16(vget_high_u16(v1), vget_high_u16(v2));
+}
+
+static inline int32x4_t vaddw_high_s16(int32x4_t qv, int16x8_t dv) {
+  return vaddw_s16(qv, vget_high_s16(dv));
+}
+
+static inline uint32x4_t vaddw_high_u16(uint32x4_t qv, uint16x8_t dv) {
+  return vaddw_u16(qv, vget_high_u16(dv));
+}
+#endif
+
+static inline void pref_src(const void *dat) {
+#if __aarch64__
+  __asm__("prfm pldl1keep,[%0,#64]\n\t"::"r"(dat):);
+#else
+  __asm__("pld [%0,#64]\n\t"::"r"(dat):);
+#endif
+}
+
+/*****************************************************************************
+ * Template:    NEON_I8I32_SUM
+ * Description: Function template for the NEON-based summation of a matrix.
+ * Template Parameters: sign_short: the integer sign char in the names of
+ *                                  NEON intrinsics. Please use 's' for
+ *                                  signed int and 'u' for unsigned int.
+ *                      sign_scalar: the string indicating the integer sign
+ *                                   in the name of the integer type. Please
+ *                                   use "int" for signed int and "uint" for
+ *                                   unsigned int.
+ * Function Parameters: src: the address of the input matrix.
+ *                      dst: the address of the output vector.
+ *                      dim1: the length of the major dimension of the
+ *                            input matrix.
+ *                      dim2: the length of the minor dimension of the
+ *                            input matrix.
+ *                      (the major dimension is the vertical one for a
+ *                       column-major matrix, or the horizontal one for a
+ *                       row-major matrix)
+ *                      direction: the direction of summing
+ *                        0: sum along the minor dimension,
+ *                           output_vector_size == dim1;
+ *                        1: sum along the major dimension,
+ *                           output_vector_size == dim2.
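+ * Example (illustrative): NEON_I8I32_SUM(s, int) defines s8s32_sum();
+ *                      for a 3x2 column-major matrix stored as
+ *                      {1, 2, 3, 4, 5, 6} (dim1 = 3, dim2 = 2),
+ *                      direction == 0 gives dst = {5, 7, 9} and
+ *                      direction == 1 gives dst = {6, 15}.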
+ ****************************************************************************/ +#define NEON_I8I32_SUM(sign_short, sign_scalar) \ +void sign_short##8##sign_short##32##_sum(const sign_scalar##8_t *src,\ + sign_scalar##32_t *dst, uint32_t dim1, uint32_t dim2, uint8_t direction) {\ +\ + if (direction == 0) {/* output_size = dim1 */\ + /* first zero output */\ + const sign_scalar##32x4_t z1 = vdupq_n_##sign_short##32(0);\ + uint32_t dim1_left = dim1;\ + sign_scalar##32_t *dst1 = dst;\ + for (; dim1_left > 3; dim1_left -= 4) {\ + vst1q_##sign_short##32(dst1, z1); dst1 += 4;\ + }\ + for (; dim1_left > 0; dim1_left--) {\ + *dst1 = 0; dst1++;\ + }\ + /* then accumulate */\ + const sign_scalar##8_t *src1 = src;\ + uint32_t dim2_left = dim2;\ + for (; dim2_left > 3; dim2_left -= 4) {\ + const sign_scalar##8_t *src_l1 = src1;\ + const sign_scalar##8_t *src_l2 = src1 + dim1;\ + const sign_scalar##8_t *src_l3 = src1 + dim1 * 2;\ + const sign_scalar##8_t *src_l4 = src_l2 + dim1 * 2;\ + src1 = src_l3 + dim1 * 2;\ + sign_scalar##32_t *dst1 = dst;\ + dim1_left = dim1;\ + for (; dim1_left > 15; dim1_left -= 16) {\ + sign_scalar##8x16_t q1 = vld1q_##sign_short##8(src_l1);\ + src_l1 += 16; pref_src(src_l1);\ + sign_scalar##8x16_t q2 = vld1q_##sign_short##8(src_l2);\ + src_l2 += 16; pref_src(src_l2);\ + sign_scalar##8x16_t q3 = vld1q_##sign_short##8(src_l3);\ + src_l3 += 16; pref_src(src_l3);\ + sign_scalar##8x16_t q4 = vld1q_##sign_short##8(src_l4);\ + src_l4 += 16; pref_src(src_l4);\ + sign_scalar##16x8_t m1 = vaddl_low_##sign_short##8(q1, q2);\ + sign_scalar##16x8_t m2 = vaddl_high_##sign_short##8(q1, q2);\ + sign_scalar##16x8_t m3 = vaddl_low_##sign_short##8(q3, q4);\ + sign_scalar##16x8_t m4 = vaddl_high_##sign_short##8(q3, q4);\ + sign_scalar##32x4_t c1 = vld1q_##sign_short##32(dst1);\ + sign_scalar##32x4_t c2 = vld1q_##sign_short##32(dst1 + 4);\ + sign_scalar##32x4_t c3 = vld1q_##sign_short##32(dst1 + 8);\ + sign_scalar##32x4_t c4 = vld1q_##sign_short##32(dst1 + 12);\ + m1 = vaddq_##sign_short##16(m1, m3);\ + m2 = vaddq_##sign_short##16(m2, m4);\ + c1 = vaddw_##sign_short##16(c1, vget_low_##sign_short##16(m1));\ + c2 = vaddw_high_##sign_short##16(c2, m1);\ + c3 = vaddw_##sign_short##16(c3, vget_low_##sign_short##16(m2));\ + c4 = vaddw_high_##sign_short##16(c4, m2);\ + vst1q_##sign_short##32(dst1, c1);\ + vst1q_##sign_short##32(dst1 + 4, c2);\ + vst1q_##sign_short##32(dst1 + 8, c3);\ + vst1q_##sign_short##32(dst1 + 12, c4); dst1 += 16;\ + }\ + if (dim1_left > 7) {\ + sign_scalar##8x8_t d1 = vld1_##sign_short##8(src_l1); src_l1 += 8;\ + sign_scalar##8x8_t d2 = vld1_##sign_short##8(src_l2); src_l2 += 8;\ + sign_scalar##8x8_t d3 = vld1_##sign_short##8(src_l3); src_l3 += 8;\ + sign_scalar##8x8_t d4 = vld1_##sign_short##8(src_l4); src_l4 += 8;\ + sign_scalar##32x4_t c1 = vld1q_##sign_short##32(dst1);\ + sign_scalar##32x4_t c2 = vld1q_##sign_short##32(dst1 + 4);\ + sign_scalar##16x8_t m1 = vaddl_##sign_short##8(d1, d2);\ + sign_scalar##16x8_t m2 = vaddl_##sign_short##8(d3, d4);\ + m1 = vaddq_##sign_short##16(m1, m2);\ + c1 = vaddw_##sign_short##16(c1, vget_low_##sign_short##16(m1));\ + c2 = vaddw_high_##sign_short##16(c2, m1);\ + vst1q_##sign_short##32(dst1, c1);\ + vst1q_##sign_short##32(dst1 + 4, c2); dst1 += 8;\ + dim1_left -= 8;\ + }\ + for (; dim1_left > 0; dim1_left--) {\ + sign_scalar##16_t s1 = *src_l1++;\ + sign_scalar##16_t s2 = *src_l2++;\ + sign_scalar##16_t s3 = *src_l3++;\ + sign_scalar##16_t s4 = *src_l4++;\ + sign_scalar##32_t cs1 = *dst1;\ + s1 += s2; s3 += s4; s1 += s3; cs1 += s1;\ + *dst1 = 
cs1; dst1++;\ + }\ + }\ + for (; dim2_left > 0; dim2_left--) {\ + sign_scalar##32_t *dst1 = dst;\ + dim1_left = dim1;\ + for (; dim1_left > 15; dim1_left -= 16) {\ + sign_scalar##8x8_t d1 = vld1_##sign_short##8(src1);\ + sign_scalar##8x8_t d2 = vld1_##sign_short##8(src1 + 8); src1 += 16;\ + sign_scalar##16x8_t q1 = vmovl_##sign_short##8(d1);\ + sign_scalar##16x8_t q2 = vmovl_##sign_short##8(d2);\ + sign_scalar##32x4_t c1 = vld1q_##sign_short##32(dst1);\ + sign_scalar##32x4_t c2 = vld1q_##sign_short##32(dst1 + 4);\ + sign_scalar##32x4_t c3 = vld1q_##sign_short##32(dst1 + 8);\ + sign_scalar##32x4_t c4 = vld1q_##sign_short##32(dst1 + 12);\ + c1 = vaddw_##sign_short##16(c1, vget_low_##sign_short##16(q1));\ + c2 = vaddw_high_##sign_short##16(c2, q1);\ + c3 = vaddw_##sign_short##16(c3, vget_low_##sign_short##16(q2));\ + c4 = vaddw_high_##sign_short##16(c4, q2);\ + vst1q_##sign_short##32(dst1, c1);\ + vst1q_##sign_short##32(dst1 + 4, c2);\ + vst1q_##sign_short##32(dst1 + 8, c3);\ + vst1q_##sign_short##32(dst1 + 12, c4);\ + dst1 += 16;\ + }\ + if (dim1_left > 7) {\ + sign_scalar##8x8_t d1 = vld1_##sign_short##8(src1); src1 += 8;\ + sign_scalar##16x8_t q1 = vmovl_##sign_short##8(d1);\ + sign_scalar##32x4_t c1 = vld1q_##sign_short##32(dst1);\ + sign_scalar##32x4_t c2 = vld1q_##sign_short##32(dst1 + 4);\ + c1 = vaddw_##sign_short##16(c1, vget_low_##sign_short##16(q1));\ + c2 = vaddw_high_##sign_short##16(c2, q1);\ + vst1q_##sign_short##32(dst1, c1);\ + vst1q_##sign_short##32(dst1 + 4, c2);\ + dst1 += 8; dim1_left -= 8;\ + }\ + for (; dim1_left > 0; dim1_left--) {\ + *dst1 += *src1; src1++; dst1++;\ + }\ + }\ + } else {/* output size = dim2 */\ + const sign_scalar##8_t *src1 = src;\ + for (uint32_t dim2_pos = 0; dim2_pos < dim2; dim2_pos++) {\ + sign_scalar##32x4_t cq1 = vdupq_n_##sign_short##32(0);\ + uint32_t dim1_left = dim1;\ + for (; dim1_left > 15; dim1_left -= 16) {\ + sign_scalar##8x16_t aq1 = vld1q_##sign_short##8(src1); src1 += 16;\ + sign_scalar##16x8_t tq1 = vpaddlq_##sign_short##8(aq1);\ + cq1 = vpadalq_##sign_short##16(cq1, tq1);\ + }\ + sign_scalar##32x2_t cd1 = vadd_##sign_short##32(\ + vget_low_##sign_short##32(cq1), vget_high_##sign_short##32(cq1));\ + if (dim1_left > 7) {\ + sign_scalar##8x8_t ad1 = vld1_##sign_short##8(src1); src1 += 8;\ + sign_scalar##16x4_t td1 = vpaddl_##sign_short##8(ad1);\ + cd1 = vpadal_##sign_short##16(cd1, td1);\ + dim1_left -= 8;\ + }\ + sign_scalar##32_t cs1 = vget_lane_##sign_short##32(\ + vpadd_##sign_short##32(cd1, cd1), 0);\ + for (; dim1_left > 0; dim1_left--) {\ + cs1 += *src1; src1++;\ + }\ + dst[dim2_pos] = cs1;\ + }\ + }\ +} + +static inline int32x4_t vmull_low_s16(int16x8_t a, int16x8_t b) { + return vmull_s16(vget_low_s16(a), vget_low_s16(b)); +} + +static inline uint32x4_t vmull_low_u16(uint16x8_t a, uint16x8_t b) { + return vmull_u16(vget_low_u16(a), vget_low_u16(b)); +} + +#if !__aarch64__ +static inline int32x4_t vmull_high_s16(int16x8_t a, int16x8_t b) { + return vmull_s16(vget_high_s16(a), vget_high_s16(b)); +} + +static inline uint32x4_t vmull_high_u16(uint16x8_t a, uint16x8_t b) { + return vmull_u16(vget_high_u16(a), vget_high_u16(b)); +} +#endif + +#define NEON_I16_SUMSQUARE(sign_short, sign_scalar) \ +void sign_short##16_sumsquare(const sign_scalar##16_t *dat,\ + sign_scalar##32_t *sum, sign_scalar##64_t *sumsquare, uint32_t size) {\ +\ + sign_scalar##32x4_t sum1 = vdupq_n_##sign_short##32(0);\ + sign_scalar##32x4_t sum2 = vdupq_n_##sign_short##32(0);\ + sign_scalar##64x2_t sumsq1 = vdupq_n_##sign_short##64(0);\ + sign_scalar##64x2_t 
sumsq2 = vdupq_n_##sign_short##64(0);\ + sign_scalar##64x2_t sumsq3 = vdupq_n_##sign_short##64(0);\ + sign_scalar##64x2_t sumsq4 = vdupq_n_##sign_short##64(0);\ +\ + if (!sumsquare) {\ + if (sum) {\ + for (; size > 15; size -= 16) {\ + sign_scalar##16x8_t l1 = vld1q_##sign_short##16(dat);\ + sign_scalar##16x8_t l2 = vld1q_##sign_short##16(dat + 8); dat += 16;\ + sum1 = vpadalq_##sign_short##16(sum1, l1);\ + sum2 = vpadalq_##sign_short##16(sum2, l2);\ + }\ + sum1 = vaddq_##sign_short##32(sum1, sum2);\ + if (size > 7) {\ + sign_scalar##16x8_t l1 = vld1q_##sign_short##16(dat); dat += 8;\ + sum1 = vpadalq_##sign_short##16(sum1, l1);\ + size -= 8;\ + }\ + if (size > 3) {\ + sign_scalar##16x4_t l1 = vld1_##sign_short##16(dat); dat += 4;\ + sum1 = vaddw_##sign_short##16(sum1, l1);\ + size -= 4;\ + }\ + sign_scalar##32x2_t sumd = vadd_##sign_short##32(\ + vget_low_##sign_short##32(sum1), vget_high_##sign_short##32(sum1));\ + sign_scalar##32_t sums = vget_lane_##sign_short##32(sumd, 0) + \ + vget_lane_##sign_short##32(sumd, 1);\ + for (; size > 0; size--) {\ + sign_scalar##32_t l1 = *dat++;\ + sums += l1;\ + }\ + *sum = sums;\ + }\ + } else if (!sum) {\ + for (; size > 15; size -= 16) {\ + sign_scalar##16x8_t l1 = vld1q_##sign_short##16(dat);\ + sign_scalar##16x8_t l2 = vld1q_##sign_short##16(dat + 8); dat += 16;\ + sign_scalar##32x4_t sq1 = vmull_low_##sign_short##16(l1, l1);\ + sign_scalar##32x4_t sq2 = vmull_high_##sign_short##16(l1, l1);\ + sign_scalar##32x4_t sq3 = vmull_low_##sign_short##16(l2, l2);\ + sign_scalar##32x4_t sq4 = vmull_high_##sign_short##16(l2, l2);\ + sumsq1 = vpadalq_##sign_short##32(sumsq1, sq1);\ + sumsq2 = vpadalq_##sign_short##32(sumsq2, sq2);\ + sumsq3 = vpadalq_##sign_short##32(sumsq3, sq3);\ + sumsq4 = vpadalq_##sign_short##32(sumsq4, sq4);\ + }\ + sumsq1 = vaddq_##sign_short##64(sumsq1, sumsq3);\ + sumsq2 = vaddq_##sign_short##64(sumsq2, sumsq4);\ + if (size > 7) {\ + sign_scalar##16x8_t l1 = vld1q_##sign_short##16(dat); dat += 8;\ + sign_scalar##32x4_t sq1 = vmull_low_##sign_short##16(l1, l1);\ + sign_scalar##32x4_t sq2 = vmull_high_##sign_short##16(l1, l1);\ + sumsq1 = vpadalq_##sign_short##32(sumsq1, sq1);\ + sumsq2 = vpadalq_##sign_short##32(sumsq2, sq2);\ + size -= 8;\ + }\ + sumsq1 = vaddq_##sign_short##64(sumsq1, sumsq2);\ + if (size > 3) {\ + sign_scalar##16x4_t l1 = vld1_##sign_short##16(dat); dat += 4;\ + sign_scalar##32x4_t sq1 = vmull_##sign_short##16(l1, l1);\ + sumsq1 = vpadalq_##sign_short##32(sumsq1, sq1);\ + size -= 4;\ + }\ + sign_scalar##64_t sumsqs = vgetq_lane_##sign_short##64(sumsq1, 0) + \ + vgetq_lane_##sign_short##64(sumsq1, 1);\ + for (; size > 0; size--) {\ + sign_scalar##32_t l1 = *dat++;\ + sumsqs += l1 * l1;\ + }\ + *sumsquare = sumsqs;\ + } else {\ + for (; size > 15; size -= 16) {\ + sign_scalar##16x8_t l1 = vld1q_##sign_short##16(dat);\ + sign_scalar##16x8_t l2 = vld1q_##sign_short##16(dat + 8); dat += 16;\ + sum1 = vpadalq_##sign_short##16(sum1, l1);\ + sum2 = vpadalq_##sign_short##16(sum2, l2);\ + sign_scalar##32x4_t sq1 = vmull_low_##sign_short##16(l1, l1);\ + sign_scalar##32x4_t sq2 = vmull_high_##sign_short##16(l1, l1);\ + sign_scalar##32x4_t sq3 = vmull_low_##sign_short##16(l2, l2);\ + sign_scalar##32x4_t sq4 = vmull_high_##sign_short##16(l2, l2);\ + sumsq1 = vpadalq_##sign_short##32(sumsq1, sq1);\ + sumsq2 = vpadalq_##sign_short##32(sumsq2, sq2);\ + sumsq3 = vpadalq_##sign_short##32(sumsq3, sq3);\ + sumsq4 = vpadalq_##sign_short##32(sumsq4, sq4);\ + }\ + sum1 = vaddq_##sign_short##32(sum1, sum2);\ + sumsq1 = 
vaddq_##sign_short##64(sumsq1, sumsq3);\ + sumsq2 = vaddq_##sign_short##64(sumsq2, sumsq4);\ + if (size > 7) {\ + sign_scalar##16x8_t l1 = vld1q_##sign_short##16(dat); dat += 8;\ + sum1 = vpadalq_##sign_short##16(sum1, l1);\ + sign_scalar##32x4_t sq1 = vmull_low_##sign_short##16(l1, l1);\ + sign_scalar##32x4_t sq2 = vmull_high_##sign_short##16(l1, l1);\ + sumsq1 = vpadalq_##sign_short##32(sumsq1, sq1);\ + sumsq2 = vpadalq_##sign_short##32(sumsq2, sq2);\ + size -= 8;\ + }\ + sumsq1 = vaddq_##sign_short##64(sumsq1, sumsq2);\ + if (size > 3) {\ + sign_scalar##16x4_t l1 = vld1_##sign_short##16(dat); dat += 4;\ + sum1 = vaddw_##sign_short##16(sum1, l1);\ + sign_scalar##32x4_t sq1 = vmull_##sign_short##16(l1, l1);\ + sumsq1 = vpadalq_##sign_short##32(sumsq1, sq1);\ + size -= 4;\ + }\ + sign_scalar##32x2_t sumd = vadd_##sign_short##32(\ + vget_low_##sign_short##32(sum1), vget_high_##sign_short##32(sum1));\ + sign_scalar##32_t sums = vget_lane_##sign_short##32(sumd, 0) + \ + vget_lane_##sign_short##32(sumd, 1);\ + sign_scalar##64_t sumsqs = vgetq_lane_##sign_short##64(sumsq1, 0) + \ + vgetq_lane_##sign_short##64(sumsq1, 1);\ + for (; size > 0; size--) {\ + sign_scalar##32_t l1 = *dat++;\ + sums += l1;\ + sumsqs += l1 * l1;\ + }\ + *sum = sums;\ + *sumsquare = sumsqs;\ + }\ +} + +#endif + diff --git a/include/common/CommonCopy.h b/include/common/CommonCopy.h new file mode 100644 index 0000000..779dee9 --- /dev/null +++ b/include/common/CommonCopy.h @@ -0,0 +1,121 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: CommonCopy.h
+ * Description: Common building blocks for packing functions in GEMM operation
+ * Terms: "ncopy": pack from a K-major source matrix
+ *        "tcopy": pack from a K-minor source matrix
+ *****************************************************************************/
+
+#include "ExpandMacro.h"
+#include <stdint.h>
+
+#ifndef INCLUDE_COMMON_COPY
+#define INCLUDE_COMMON_COPY
+
+#define NCOPY_INIT_SRC_PTR_ITEM(n, type) \
+  const type *src##n = src0 + (n - 1) * ld_dim;
+#define NCOPY_INIT_SRC_PTR(n, type) \
+  MACRO_EXPANSION_##n(VOID_BASE, NCOPY_INIT_SRC_PTR_ITEM, type)
+
+#define NCOPY_COPY_1(n) \
+  dst1[n - 1] = *src##n; src##n ++;
+#define NCOPY_COPY(n) \
+  MACRO_EXPANSION_##n(VOID_BASE, NCOPY_COPY_1) dst1 += n;
+
+/* a standard-C fallback for NCOPY_<type>_<stype> */
+#define NCOPY_STD(unroll) \
+  for (; dim1_count > 0; dim1_count--) {\
+    NCOPY_COPY(unroll)\
+  }
+
+/* the macro NCOPY_<type>_<stype>(unroll) is architecture dependent,
+ * which should be defined in the source file including this header */
+#define NCOPY_LOOP(unroll, type, stype) \
+  for (; dim2_count >= unroll; dim2_count -= unroll) {\
+    uint32_t dim1_count = dim1;\
+    NCOPY_INIT_SRC_PTR(unroll, type)\
+    NCOPY_##type##_##stype(unroll)\
+    src0 += ld_dim * unroll;\
+  }
+#define NCOPY(max_unroll, side, type) \
+  MACRO_EXP_E_##max_unroll(NCOPY_LOOP, side, type)
+
+#define GENERIC_NCOPY_FUNC(gemmtype, type, stype, max_unroll) \
+void gemmtype##_##type##_##stype##_ncopy_unroll##max_unroll(\
+  const type * __restrict__ src, stype * __restrict__ dst,\
+  uint32_t ld_dim, uint32_t dim1, uint32_t dim2) {\
+  const type *src0 = src;\
+  stype *dst1 = dst;\
+  uint32_t dim2_count = dim2;\
+  NCOPY(max_unroll, type, stype)\
+}
+
+
+/* this macro is the fallback for TCOPY_UNIT_<type>_<stype> */
+#define TCOPY_UNIT_STD(src_ptr, dst_ptr, dst_offset, num_elements) \
+  _Pragma("omp simd")\
+  for (int i = 0; i < num_elements; ++i) \
+    dst_ptr[dst_offset + i] = src_ptr[i];
+
+/* the macro
+ * TCOPY_UNIT_<type>_<stype>(src_ptr, dst_ptr, dst_offset, num_elements)
+ * is architecture dependent,
+ * which should be defined in the source file including this header */
+
+#define TCOPY_LINE_1(n, unroll, type, stype) \
+  TCOPY_UNIT_##type##_##stype(src##n, dst1, ((n-1)*unroll), unroll)\
+  src##n += unroll;
+#define TCOPY_LINES(n, unroll, type, stype) \
+  MACRO_EXPANSION_##n(VOID_BASE, TCOPY_LINE_1, unroll, type, stype)
+
+#define TCOPY_LOOP(unroll, type, stype, read_width) \
+  dst1 = dst + (dim1 - dim1_count) * dim2 + (dim2 - dim2_count) * unroll;\
+  for (; dim1_count >= unroll; dim1_count -= unroll) {\
+    TCOPY_LINES(read_width, unroll, type, stype)\
+    dst1 += dim2 * unroll;\
+  }
+#define TCOPY(max_unroll, type, stype, read_width) \
+  MACRO_EXPANSION_E_##max_unroll(TCOPY_LOOP, type, stype, read_width)
+
+#define GENERIC_TCOPY_FUNC(gemmtype, type, stype, max_unroll) \
+void gemmtype##_##type##_##stype##_tcopy_unroll##max_unroll(\
+  const type * __restrict__ src, stype * __restrict__ dst,\
+  uint32_t ld_dim, uint32_t dim1, uint32_t dim2) {\
+  uint32_t dim2_count = dim2;\
+  const type *src0 = src;\
+  for (; dim2_count > 3; dim2_count -= 4) {\
+    const type *src1 = src0;\
+    const type *src2 = src0 + ld_dim;\
+    const type *src3 = src0 + ld_dim * 2;\
+    const type *src4 = src2 + ld_dim * 2;\
+    stype *dst1;\
+    uint32_t dim1_count = dim1;\
+    TCOPY(max_unroll, type, stype, 4)\
+    src0 += ld_dim * 4;\
+  }\
+  for (; dim2_count > 0; dim2_count--) {\
+    const type *src1 = src0;\
+    stype *dst1;\
+    uint32_t dim1_count = dim1;\
+    TCOPY(max_unroll, type, stype, 1)\
+    src0 += ld_dim;\
+  }\
+}
+
+#endif
diff --git a/include/common/CommonDriver.h b/include/common/CommonDriver.h
new file mode 100644
index 0000000..22bd3f2
--- /dev/null
+++ b/include/common/CommonDriver.h
@@ -0,0 +1,497 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: CommonDriver.h
+ * Description: Common driver functions for GEMM operation. A driver function
+ *              does blocking and calls packing/kernel/skinny_kernel functions
+ *              to perform efficient matrix multiplication
+ *****************************************************************************/
+
+#include "ExpandMacro.h"
+#include "CommonSched.h"
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#ifndef EMLL_SERIAL_ONLY
+#include <omp.h>
+#endif
+
+#ifndef INCLUDE_COMMON_DRIVER
+#define INCLUDE_COMMON_DRIVER
+
+#define SKINNY_FUNC_BASE(FUNCTITLE, VOIDTITLE, ...) VOIDTITLE
+
+#define SKINNY_FUNC_LIST_ITEM(NO, FUNCTITLE, VOIDTITLE, ...)\
+  ,FUNCTITLE##NO##__VA_ARGS__
+
+#define SKINNY_GEMM_FUNC_LIST(NUM, FUNCTITLE, VOIDTITLE, ...)\
+  MACRO_EXP_##NUM(SKINNY_FUNC_BASE, SKINNY_FUNC_LIST_ITEM,\
+    FUNCTITLE, VOIDTITLE, ##__VA_ARGS__)
+
+/* blocking parameters */
+
+#ifndef GEMM_R_MN
+#define GEMM_R_MN 1024
+#endif
+
+#ifndef GEMM_D_MN
+#define GEMM_D_MN 192
+#endif
+
+#ifndef GEMM_D_K
+#define GEMM_D_K 192
+#endif
+
+/* GEMM_D_K * GEMM_UNROLL_M or GEMM_D_K * GEMM_UNROLL_N should fit in
+ * the L1 cache */
+/* GEMM_D_K * GEMM_D_MN should fit in the L2 cache */
+/* GEMM_R_MN is the last to optimize; it is not crucial to performance */
+
+#if GEMM_D_MN > GEMM_R_MN
+#define GEMM_S_MN GEMM_D_MN
+#else
+#define GEMM_S_MN GEMM_R_MN
+#endif
+
+#ifndef SCRATCH_K_CORD
+#define SCRATCH_K_CORD(k) (k)
+#endif
+
+#define SCRATCH_GEMM_D_K (SCRATCH_K_CORD(GEMM_D_K - 1) + 1)
+
+#define GEMM_STATIC_BUFFER(gemmtype, sbtype, satype) \
+__thread __attribute__((aligned(4096))) satype\
+  blas_##gemmtype##_sa[GEMM_S_MN * SCRATCH_GEMM_D_K];\
+__thread __attribute__((aligned(4096))) sbtype\
+  blas_##gemmtype##_sb[GEMM_S_MN * SCRATCH_GEMM_D_K];
+
+/* serial driver function with packing both source matrices,
+ * loop order: N { M { K } } */
+#define GEMM_SERIAL_FUNC_LM(gemmtype, atype, satype, btype, sbtype, ctype,\
+  unroll_m, unroll_n) \
+static void gemmtype##_serial_lm_m##unroll_m##n##unroll_n(\
+  int a_rowmajor, int b_rowmajor,\
+  const atype *A, const btype *B, ctype *C,\
+  uint32_t M, uint32_t N, uint32_t K, ctype beta_inp) {\
+\
+  satype * const sa = blas_##gemmtype##_sa;\
+  sbtype * const sb = blas_##gemmtype##_sb;\
+\
+  uint32_t m_pos, n_pos, k_pos, m_inc, n_inc, k_inc;\
+  for (k_pos = 0; k_pos < K; k_pos += k_inc) {\
+    k_inc = K - k_pos;\
+    if (k_inc >= (GEMM_D_K << 1))
k_inc = GEMM_D_K;\ + else if (k_inc > GEMM_D_K) k_inc >>= 1;\ + ctype beta = (k_pos == 0) ? beta_inp : 1;\ + for (n_pos = 0; n_pos < N; n_pos += n_inc) {\ + n_inc = N - n_pos;\ + if (n_inc >= (GEMM_R_MN << 1)) n_inc = GEMM_R_MN;\ + else if (n_inc > GEMM_R_MN) n_inc >>= 1;\ + if (b_rowmajor) {\ + gemmtype##_##btype##_##sbtype##_tcopy_unroll##unroll_n(\ + B + k_pos * N + n_pos, sb, N, n_inc, k_inc);\ + } else {\ + gemmtype##_##btype##_##sbtype##_ncopy_unroll##unroll_n(\ + B + n_pos * K + k_pos, sb, K, k_inc, n_inc);\ + }\ + for (m_pos = 0; m_pos < M; m_pos += m_inc) {\ + m_inc = M - m_pos;\ + if (m_inc > GEMM_D_MN) m_inc = GEMM_D_MN;\ + if (a_rowmajor) {\ + gemmtype##_##atype##_##satype##_ncopy_unroll##unroll_m(\ + A + m_pos * K + k_pos, sa, K, k_inc, m_inc);\ + } else {\ + gemmtype##_##atype##_##satype##_tcopy_unroll##unroll_m(\ + A + k_pos * M + m_pos, sa, M, m_inc, k_inc);\ + }\ + uint32_t scratch_k_inc = (k_inc == 0) ? 0 :\ + SCRATCH_K_CORD(k_inc - 1) + 1;\ + gemmtype##_kernel_lm_m##unroll_m##n##unroll_n(m_inc, n_inc,\ + scratch_k_inc,\ + beta, sa, sb, C + n_pos * M + m_pos, M);\ + }\ + }\ + }\ +} + +/* serial driver function with packing both source matrices, + * loop order: M { N { K } } */ +#define GEMM_SERIAL_FUNC_LN(gemmtype, atype, satype, btype, sbtype, ctype,\ + unroll_m, unroll_n) \ +static void gemmtype##_serial_ln_m##unroll_m##n##unroll_n(\ + int a_rowmajor, int b_rowmajor,\ + const atype *A, const btype *B, ctype *C,\ + uint32_t M, uint32_t N, uint32_t K, ctype beta_inp) {\ +\ + satype * const sa = blas_##gemmtype##_sa;\ + sbtype * const sb = blas_##gemmtype##_sb;\ +\ + uint32_t m_pos, n_pos, k_pos, m_inc, n_inc, k_inc;\ + for (k_pos = 0; k_pos < K; k_pos += k_inc) {\ + k_inc = K - k_pos;\ + if (k_inc >= (GEMM_D_K << 1)) k_inc = GEMM_D_K;\ + else if (k_inc > GEMM_D_K) k_inc >>= 1;\ + ctype beta = (k_pos == 0) ? beta_inp : 1;\ + for (m_pos = 0; m_pos < M; m_pos += m_inc) {\ + m_inc = M - m_pos;\ + if (m_inc >= (GEMM_R_MN << 1)) m_inc = GEMM_R_MN;\ + else if (m_inc > GEMM_R_MN) m_inc >>= 1;\ + if (a_rowmajor) {\ + gemmtype##_##atype##_##satype##_ncopy_unroll##unroll_m(\ + A + m_pos * K + k_pos, sa, K, k_inc, m_inc);\ + } else {\ + gemmtype##_##atype##_##satype##_tcopy_unroll##unroll_m(\ + A + k_pos * M + m_pos, sa, M, m_inc, k_inc);\ + }\ + for (n_pos = 0; n_pos < N; n_pos += n_inc) {\ + n_inc = N - n_pos;\ + if (n_inc > GEMM_D_MN) n_inc = GEMM_D_MN;\ + if (b_rowmajor) {\ + gemmtype##_##btype##_##sbtype##_tcopy_unroll##unroll_n(\ + B + k_pos * N + n_pos, sb, N, n_inc, k_inc);\ + } else {\ + gemmtype##_##btype##_##sbtype##_ncopy_unroll##unroll_n(\ + B + n_pos * K + k_pos, sb, K, k_inc, n_inc);\ + }\ + uint32_t scratch_k_inc = (k_inc == 0) ? 
0 :\ + SCRATCH_K_CORD(k_inc - 1) + 1;\ + gemmtype##_kernel_ln_m##unroll_m##n##unroll_n(m_inc, n_inc,\ + scratch_k_inc,\ + beta, sa, sb, C + n_pos * M + m_pos, M);\ + }\ + }\ + }\ +} + +/* inline function to check arguments */ +static inline bool inline_gemm_par_valid(const void *A, const void *B, + void *C, uint32_t M, uint32_t N, uint32_t K) { + + bool a_valid = A || (M == 0 || K == 0); + bool b_valid = B || (N == 0 || K == 0); + bool c_valid = C || (M == 0 || N == 0); + + return a_valid && b_valid && c_valid; +} + +/* serial GEMM driver function */ +#define GEMM_SERIAL_FUNC(gemmtype, atype, satype, btype, sbtype, ctype,\ + unroll_l2, unroll_l1, skin1_maxm, skin1_maxn, skin2_maxm, skin2_maxn, ...)\ +\ +GEMM_STATIC_BUFFER(gemmtype, sbtype, satype)\ +\ +GEMM_SERIAL_FUNC_LM(gemmtype, atype, satype, btype, sbtype, ctype,\ + unroll_l2, unroll_l1)\ +\ +GEMM_SERIAL_FUNC_LN(gemmtype, atype, satype, btype, sbtype, ctype,\ + unroll_l1, unroll_l2)\ +\ +static void arowmajor_bskinny_void(\ + const atype *A_mat, const btype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order, ctype beta_inp) { return; }\ +\ +static void bcolmajor_askinny_void(\ + const btype *A_mat, const atype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order, ctype beta_inp) { return; }\ +\ +static void acolmajor_bskinny_void(\ + const atype *A_mat, const btype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order, ctype beta_inp) { return; }\ +\ +static void browmajor_askinny_void(\ + const btype *A_mat, const atype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order, ctype beta_inp) { return; }\ +\ +static void (* gemmtype##_bskinny1[]) (\ + const atype *A_mat, const btype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order, ctype beta_inp) = {\ + SKINNY_GEMM_FUNC_LIST(skin1_maxn,\ + gemmtype##_arowmajor_bskinny_a##atype##_b##btype##_n,\ + arowmajor_bskinny_void) };\ +\ +static void (* gemmtype##_askinny1[]) (\ + const btype *A_mat, const atype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order, ctype beta_inp) = {\ + SKINNY_GEMM_FUNC_LIST(skin1_maxm,\ + gemmtype##_arowmajor_bskinny_a##btype##_b##atype##_n,\ + bcolmajor_askinny_void) };\ +\ +static void (* gemmtype##_bskinny2[]) (\ + const atype *A_mat, const btype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order, ctype beta_inp) = {\ + SKINNY_GEMM_FUNC_LIST(skin2_maxn,\ + gemmtype##_acolmajor_bskinny_a##atype##_b##btype##_n,\ + acolmajor_bskinny_void) };\ +\ +static void (* gemmtype##_askinny2[]) (\ + const btype *A_mat, const atype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order, ctype beta_inp) = {\ + SKINNY_GEMM_FUNC_LIST(skin2_maxm,\ + gemmtype##_acolmajor_bskinny_a##btype##_b##atype##_n,\ + browmajor_askinny_void) };\ +\ +int gemmtype##_serial(int a_rowmajor, int b_rowmajor,\ + const atype *A, const btype *B, ctype *C,\ + uint32_t M, uint32_t N, uint32_t K, ctype beta_inp) {\ +\ + if (!inline_gemm_par_valid(A, B, C, M, N, K)) return 1;\ + if (0 __VA_ARGS__) return 2;\ +\ + if (K == 0) {\ + if (beta_inp != (ctype)1.0) {\ + const uint64_t MN = (uint64_t)M * (uint64_t)N;\ + for (uint64_t pos = 0; pos < MN; ++pos) {\ + C[pos] *= beta_inp;\ + }\ + }\ + return 0;\ + }\ +\ + if (N <= skin1_maxn && a_rowmajor) {\ + (* gemmtype##_bskinny1[N])(A, B, C, M, K, b_rowmajor ? 1 : 0, beta_inp);\ + return 0;\ + }\ + if (M <= skin1_maxm && !b_rowmajor) {\ + (* gemmtype##_askinny1[M])(B, A, C, N, K, a_rowmajor ? 
2 : 3, beta_inp);\ + return 0;\ + }\ + if (N <= skin2_maxn && !a_rowmajor) {\ + (* gemmtype##_bskinny2[N])(A, B, C, M, K, b_rowmajor ? 1 : 0, beta_inp);\ + return 0;\ + }\ + if (M <= skin2_maxm && b_rowmajor) {\ + (* gemmtype##_askinny2[M])(B, A, C, N, K, a_rowmajor ? 2 : 3, beta_inp);\ + return 0;\ + }\ +\ + if ((N >> 1) > M) {\ + gemmtype##_serial_ln_m##unroll_l1##n##unroll_l2(\ + a_rowmajor, b_rowmajor, A, B, C, M, N, K, beta_inp);\ + } else {\ + gemmtype##_serial_lm_m##unroll_l2##n##unroll_l1(\ + a_rowmajor, b_rowmajor, A, B, C, M, N, K, beta_inp);\ + }\ + return 0;\ +} + +#ifdef EMLL_SERIAL_ONLY + +#define GEMM_PARALLEL_FUNC(gemmtype, atype, satype, btype, sbtype, ctype,\ + unroll_l2, unroll_l1, skin1_maxm, skin1_maxn, skin2_maxm, skin2_maxn, ...) \ +\ +GEMM_SERIAL_FUNC(gemmtype, atype, satype, btype, sbtype, ctype,\ + unroll_l2, unroll_l1, skin1_maxm, skin1_maxn, skin2_maxm, skin2_maxn, ##__VA_ARGS__)\ +int gemmtype(int a_rowmajor, int b_rowmajor,\ + const atype *A, const btype *B, ctype *C,\ + uint32_t M, uint32_t N, uint32_t K, ctype beta_inp, uint32_t num_threads) {\ +\ + return gemmtype##_serial(a_rowmajor, b_rowmajor, A, B, C, M, N, K, beta_inp);\ +} + +#else + +/* OpenMP GEMM driver function */ +#define GEMM_PARALLEL_FUNC(gemmtype, atype, satype, btype, sbtype, ctype,\ + unroll_l2, unroll_l1, skin1_maxm, skin1_maxn, skin2_maxm, skin2_maxn, ...) \ +\ +GEMM_SERIAL_FUNC(gemmtype, atype, satype, btype, sbtype, ctype,\ + unroll_l2, unroll_l1, skin1_maxm, skin1_maxn, skin2_maxm, skin2_maxn, ##__VA_ARGS__)\ +\ +static void arowmajor_bskinny_void_omp(\ + const atype *A_mat, const btype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + ctype beta_inp, uint32_t num_threads) { return; }\ +\ +static void bcolmajor_askinny_void_omp(\ + const btype *A_mat, const atype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + ctype beta_inp, uint32_t num_threads) { return; }\ +\ +static void acolmajor_bskinny_void_omp(\ + const atype *A_mat, const btype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + ctype beta_inp, uint32_t num_threads) { return; }\ +\ +static void browmajor_askinny_void_omp(\ + const btype *A_mat, const atype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + ctype beta_inp, uint32_t num_threads) { return; }\ +\ +static void (* gemmtype##_bskinny1_omp[]) (\ + const atype *A_mat, const btype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + ctype beta_inp, uint32_t num_threads) = {\ + SKINNY_GEMM_FUNC_LIST(skin1_maxn,\ + gemmtype##_arowmajor_bskinny_a##atype##_b##btype##_n,\ + arowmajor_bskinny_void_omp, _omp) };\ +\ +static void (* gemmtype##_askinny1_omp[]) (\ + const btype *A_mat, const atype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + ctype beta_inp, uint32_t num_threads) = {\ + SKINNY_GEMM_FUNC_LIST(skin1_maxm,\ + gemmtype##_arowmajor_bskinny_a##btype##_b##atype##_n,\ + bcolmajor_askinny_void_omp, _omp) };\ +\ +static void (* gemmtype##_bskinny2_omp[]) (\ + const atype *A_mat, const btype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + ctype beta_inp, uint32_t num_threads) = {\ + SKINNY_GEMM_FUNC_LIST(skin2_maxn,\ + gemmtype##_acolmajor_bskinny_a##atype##_b##btype##_n,\ + acolmajor_bskinny_void_omp, _omp) };\ +\ +static void (* gemmtype##_askinny2_omp[]) (\ + const btype *A_mat, const atype *B_skin, ctype *C_skin,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + ctype beta_inp, uint32_t num_threads) = {\ + 
SKINNY_GEMM_FUNC_LIST(skin2_maxm,\ + gemmtype##_acolmajor_bskinny_a##btype##_b##atype##_n,\ + browmajor_askinny_void_omp, _omp) };\ +\ +int gemmtype(int a_rowmajor, int b_rowmajor,\ + const atype *A, const btype *B, ctype *C,\ + uint32_t M, uint32_t N, uint32_t K, ctype beta_inp, uint32_t num_threads) {\ +\ + uint32_t rec_threads = (uint64_t)M * (uint64_t)N * (uint64_t)K \ + / (GEMM_D_K * GEMM_D_MN * unroll_l1) + 1;\ + if (num_threads > rec_threads) num_threads = rec_threads;\ + if (!num_threads) num_threads = rec_threads;\ + uint32_t max_threads = omp_get_max_threads();\ + if (num_threads > max_threads) num_threads = max_threads;\ +\ + if (num_threads == 1 || K == 0) {\ + return gemmtype##_serial(a_rowmajor, b_rowmajor, A, B, C, M, N, K, beta_inp);\ + }\ +\ + if (!inline_gemm_par_valid(A, B, C, M, N, K)) return 1;\ + if (0 __VA_ARGS__) return 2;\ +\ + omp_set_num_threads(num_threads);\ + if (N <= skin1_maxn && a_rowmajor) {\ + (* gemmtype##_bskinny1_omp[N])(\ + A, B, C, M, K, b_rowmajor ? 1 : 0, beta_inp, num_threads);\ + return 0;\ + }\ + if (M <= skin1_maxm && !b_rowmajor) {\ + (* gemmtype##_askinny1_omp[M])(\ + B, A, C, N, K, a_rowmajor ? 2 : 3, beta_inp, num_threads);\ + return 0;\ + }\ + if (N <= skin2_maxn && !a_rowmajor) {\ + (* gemmtype##_bskinny2_omp[N])(\ + A, B, C, M, K, b_rowmajor ? 1 : 0, beta_inp, num_threads);\ + return 0;\ + }\ + if (M <= skin2_maxm && b_rowmajor) {\ + (* gemmtype##_askinny2_omp[M])(\ + B, A, C, N, K, a_rowmajor ? 2 : 3, beta_inp, num_threads);\ + return 0;\ + }\ +\ + satype * const blas_master_sa = blas_##gemmtype##_sa;\ + sbtype * const blas_master_sb = blas_##gemmtype##_sb;\ + uint32_t acopy_dim_left, bcopy_dim_left;\ + uint64_t mn_task_end;\ +\ + _Pragma("omp parallel")\ + {\ + const uint32_t tid = omp_get_thread_num();\ + uint32_t k_pos, m_pos, n_pos, k_inc, m_inc, n_inc;\ + uint32_t bcopy_dim_start, bcopy_dim_end, acopy_dim_start, acopy_dim_end;\ + uint32_t gemm_mstart, gemm_mend, gemm_nstart, gemm_nend;\ + uint64_t gemm_mn_max;\ + for (k_pos = 0; k_pos < K; k_pos += k_inc) {\ + k_inc = K - k_pos;\ + if (k_inc >= (GEMM_D_K << 1)) k_inc = GEMM_D_K;\ + else if (k_inc > GEMM_D_K) k_inc >>= 1;\ + const uint32_t scratch_k_inc = (k_inc == 0) ? 0 :\ + SCRATCH_K_CORD(k_inc - 1) + 1;\ + const ctype beta = (k_pos == 0) ? 
beta_inp : 1;\ + for (n_pos = 0; n_pos < N; n_pos += n_inc) {\ + n_inc = N - n_pos;\ + if (n_inc >= (GEMM_R_MN << 1)) n_inc = GEMM_R_MN;\ + else if (n_inc > GEMM_R_MN) n_inc >>= 1;\ +\ + if (!tid) bcopy_dim_left = n_inc;\ + _Pragma("omp barrier");\ +\ + while (get_copy_task(&bcopy_dim_left, unroll_l1 << 3,\ + &bcopy_dim_start, &bcopy_dim_end)) {\ + if (b_rowmajor) {\ + gemmtype##_##btype##_##sbtype##_tcopy_unroll##unroll_l1(\ + B + k_pos * N + n_pos + bcopy_dim_start,\ + blas_master_sb + bcopy_dim_start * scratch_k_inc,\ + N, bcopy_dim_end - bcopy_dim_start, k_inc);\ + } else {\ + gemmtype##_##btype##_##sbtype##_ncopy_unroll##unroll_l1(\ + B + K * (n_pos + bcopy_dim_start) + k_pos,\ + blas_master_sb + bcopy_dim_start * scratch_k_inc,\ + K, k_inc, bcopy_dim_end - bcopy_dim_start);\ + }\ + }\ +\ + for (m_pos = 0; m_pos < M; m_pos += m_inc) {\ + m_inc = M - m_pos;\ + if (m_inc >= (GEMM_R_MN << 1)) m_inc = GEMM_R_MN;\ + else if (m_inc > GEMM_R_MN) m_inc >>= 1;\ +\ + if (!tid) acopy_dim_left = m_inc;\ + _Pragma("omp barrier");\ +\ + while (get_copy_task(&acopy_dim_left, unroll_l2 << 3,\ + &acopy_dim_start, &acopy_dim_end)) {\ + if (a_rowmajor) {\ + gemmtype##_##atype##_##satype##_ncopy_unroll##unroll_l2(\ + A + K * (m_pos + acopy_dim_start) + k_pos,\ + blas_master_sa + acopy_dim_start * scratch_k_inc,\ + K, k_inc, acopy_dim_end - acopy_dim_start);\ + } else {\ + gemmtype##_##atype##_##satype##_tcopy_unroll##unroll_l2(\ + A + M * k_pos + m_pos + acopy_dim_start,\ + blas_master_sa + acopy_dim_start * scratch_k_inc,\ + M, acopy_dim_end - acopy_dim_start, k_inc);\ + }\ + }\ +\ + if (!tid) mn_task_end = (uint64_t)n_pos << 32 | (uint64_t)m_pos;\ + gemm_mn_max = ((uint64_t)(n_pos + n_inc) << 32)\ + | (uint64_t)(m_pos + m_inc);\ + _Pragma("omp barrier");\ +\ + while (get_mn_task(&mn_task_end,\ + &gemm_mstart, &gemm_nstart, &gemm_mend, &gemm_nend,\ + ((uint64_t)unroll_l1 << 32) | ((GEMM_D_MN >> 2) / unroll_l2 * unroll_l2),\ + GEMM_D_MN, n_pos, gemm_mn_max, num_threads)) {\ +\ + gemmtype##_kernel_lm_m##unroll_l2##n##unroll_l1(\ + gemm_mend - gemm_mstart, gemm_nend - gemm_nstart, scratch_k_inc, beta,\ + blas_master_sa + (gemm_mstart - m_pos) * scratch_k_inc,\ + blas_master_sb + (gemm_nstart - n_pos) * scratch_k_inc,\ + C + M * gemm_nstart + gemm_mstart, M);\ + }\ + }\ + }\ + }\ + }\ + return 0;\ +} + +#endif + +#endif diff --git a/include/common/CommonKernel.h b/include/common/CommonKernel.h new file mode 100644 index 0000000..04dfbc3 --- /dev/null +++ b/include/common/CommonKernel.h @@ -0,0 +1,190 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: CommonKernel.h
+ * Description: The common skeleton of regular GEMM kernel functions with both
+ *              source matrices packed before computation
+ * Extension: To support a new CPU arch, the following steps are needed
+ *            in addition to including this header:
+ *            (1) implement a collection of inline GEMM functions, each with
+ *                fixed M & N but variable K (as an input parameter), for the
+ *                multiplication of a column-major matrix A with a row-major
+ *                matrix B, updating the results to a column-major matrix C.
+ *                An SGEMM inline function with M = 2 and N = 4 can be
+ *                implemented like this:
+ *                static inline void
+ *                  inline_dualpack_gemm_afloat_bfloat_cfloat_m2_n4(
+ *                    const float *a_head, const float *b_head,
+ *                    float *c_ptr, uint32_t K, float beta, uint32_t ldc) {
+ *                  float c0, c1, c2, c3, c4, c5, c6, c7;
+ *                  c0 = c1 = c2 = c3 = c4 = c5 = c6 = c7 = 0.0f;
+ *                  for (; K > 0; K--) {
+ *                    float a0 = a_head[0];
+ *                    float a1 = a_head[1]; a_head += 2;
+ *                    float b0 = b_head[0];
+ *                    float b1 = b_head[1];
+ *                    float b2 = b_head[2];
+ *                    float b3 = b_head[3]; b_head += 4;
+ *                    c0 += a0 * b0; c1 += a1 * b0;
+ *                    c2 += a0 * b1; c3 += a1 * b1;
+ *                    c4 += a0 * b2; c5 += a1 * b2;
+ *                    c6 += a0 * b3; c7 += a1 * b3;
+ *                  }
+ *                  c_ptr[0] = c_ptr[0] * beta + c0;
+ *                  c_ptr[1] = c_ptr[1] * beta + c1;
+ *                  c_ptr += ldc;
+ *                  c_ptr[0] = c_ptr[0] * beta + c2;
+ *                  c_ptr[1] = c_ptr[1] * beta + c3;
+ *                  c_ptr += ldc;
+ *                  c_ptr[0] = c_ptr[0] * beta + c4;
+ *                  c_ptr[1] = c_ptr[1] * beta + c5;
+ *                  c_ptr += ldc;
+ *                  c_ptr[0] = c_ptr[0] * beta + c6;
+ *                  c_ptr[1] = c_ptr[1] * beta + c7;
+ *                }
+ *            (2) Construct kernel functions with the aid of macros.
+ *                Please refer to src/neon_armv7a/SgemmKernel.c for an
+ *                example.
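+ *                A minimal sketch of step (2) (illustrative 8x8 unroll,
+ *                assuming the step-(1) inline functions exist for all the
+ *                (m, n) pairs the expansion needs):
+ *                  DUALPACK_KERNEL_FUNC_LM(sgemm, float, float, float, 8, 8)
+ *                  DUALPACK_KERNEL_FUNC_LN(sgemm, float, float, float, 8, 8)
+ *                These expand to sgemm_kernel_lm_m8n8() and
+ *                sgemm_kernel_ln_m8n8(), the kernel functions consumed by
+ *                the driver templates in CommonDriver.h.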
+ *****************************************************************************/
+
+#include "ExpandMacro.h"
+#include <stdint.h>
+
+#ifndef INCLUDE_COMMON_KERNEL
+#define INCLUDE_COMMON_KERNEL
+
+/* the macros COMPUTE_MmNn are architecture dependent,
+ * which should be defined in the source file including this header */
+
+#define COMPUTE_STD_INIT_SLICE(n_pos, mdim, ctype) \
+  ctype c_reg##n_pos[mdim];\
+  _Pragma("omp simd")\
+  for (int j = 0; j < mdim; ++j) {\
+    c_reg##n_pos[j] = 0;\
+  }
+
+#define COMPUTE_STD_ACC_SLICE(n_pos, mdim, ndim, k_off) \
+  _Pragma("omp simd")\
+  for (int j = 0; j < mdim; ++j) {\
+    c_reg##n_pos[j] += a_ptr[j + k_off * mdim] *\
+      b_ptr[n_pos - 1 + k_off * ndim];\
+  }
+
+#define COMPUTE_STD_SAVE_SLICE(n_pos, mdim, c_str) \
+  _Pragma("omp simd")\
+  for (int j = 0; j < mdim; ++j) {\
+    c_str[j] = c_str[j] * beta + c_reg##n_pos[j];\
+  }\
+  c_str += ldc;
+
+#define COMPUTE_STD(mdim, ndim, atype, btype, ctype) \
+static inline void\
+  inline_dualpack_gemm_a##atype##_b##btype##_c##ctype##_m##mdim##_n##ndim(\
+  const atype *a_head, const btype *b_head, ctype *c_ptr,\
+  uint32_t K, ctype beta, uint32_t ldc) {\
+  MACRO_EXP_##ndim(VOID_BASE, COMPUTE_STD_INIT_SLICE, mdim, ctype)\
+  const atype * a_ptr = a_head;\
+  const btype * b_ptr = b_head;\
+  uint32_t k_left = K;\
+  for (; k_left > 3; k_left -= 4) {\
+    MACRO_EXP_##ndim(VOID_BASE, COMPUTE_STD_ACC_SLICE, mdim, ndim, 0)\
+    MACRO_EXP_##ndim(VOID_BASE, COMPUTE_STD_ACC_SLICE, mdim, ndim, 1)\
+    MACRO_EXP_##ndim(VOID_BASE, COMPUTE_STD_ACC_SLICE, mdim, ndim, 2)\
+    MACRO_EXP_##ndim(VOID_BASE, COMPUTE_STD_ACC_SLICE, mdim, ndim, 3)\
+    a_ptr += mdim * 4;\
+    b_ptr += ndim * 4;\
+  }\
+  for (; k_left > 0; k_left--) {\
+    MACRO_EXP_##ndim(VOID_BASE, COMPUTE_STD_ACC_SLICE, mdim, ndim, 0)\
+    a_ptr += mdim;\
+    b_ptr += ndim;\
+  }\
+  ctype *c_str = c_ptr;\
+  MACRO_EXP_##ndim(VOID_BASE, COMPUTE_STD_SAVE_SLICE, mdim, c_str)\
+}
+
+#define MICRO_COMPUTE_LM_LOOP(mdim, ndim, atype, btype, ctype) \
+  for (; m_left >= mdim; m_left -= mdim) {\
+    inline_dualpack_gemm_a##atype##_b##btype##_c##ctype##_m##mdim##_n##ndim(\
+      a_head, b_head, c_ptr, K, beta, ldc);\
+    a_head += mdim * K;\
+    c_ptr += mdim;\
+  }
+
+#define MICRO_COMPUTE_LN_LOOP(ndim, mdim, atype, btype, ctype) \
+  for (; n_left >= ndim; n_left -= ndim) {\
+    inline_dualpack_gemm_a##atype##_b##btype##_c##ctype##_m##mdim##_n##ndim(\
+      a_head, b_head, c_ptr, K, beta, ldc);\
+    b_head += ndim * K;\
+    c_ptr += ndim * ldc;\
+  }
+
+#define MICRO_COMPUTE_LM(mdim, ndim, atype, btype, ctype) \
+  MACRO_EXPANSION_E_##mdim(MICRO_COMPUTE_LM_LOOP, ndim, atype, btype, ctype)
+
+#define MICRO_COMPUTE_LN(mdim, ndim, atype, btype, ctype) \
+  MACRO_EXPANSION_E_##ndim(MICRO_COMPUTE_LN_LOOP, mdim, atype, btype, ctype)
+
+#define DUALPACK_COMPUTE_LM(ndim, satype, sbtype, ctype, block_m_max) \
+  for (; n_left >= ndim; n_left -= ndim) {\
+    const satype *a_head = sa;\
+    ctype *c_ptr = c_head;\
+    uint32_t m_left = M;\
+    MICRO_COMPUTE_LM(block_m_max, ndim, satype, sbtype, ctype)\
+    b_head += K * ndim;\
+    c_head += ldc * ndim;\
+  }
+
+#define DUALPACK_COMPUTE_LN(mdim, satype, sbtype, ctype, block_n_max) \
+  for (; m_left >= mdim; m_left -= mdim) {\
+    const sbtype *b_head = sb;\
+    ctype *c_ptr = c_head;\
+    uint32_t n_left = N;\
+    MICRO_COMPUTE_LN(mdim, block_n_max, satype, sbtype, ctype)\
+    a_head += K * mdim;\
+    c_head += mdim;\
+  }
+
+#define ASSEMBLE_DUALPACK_COMPUTE_LM(ndim, satype, sbtype, ctype, block_m_max) \
+  MACRO_EXP_E_##ndim(DUALPACK_COMPUTE_LM, satype, sbtype, ctype, block_m_max)
+
+#define ASSEMBLE_DUALPACK_COMPUTE_LN(mdim, satype, sbtype, ctype, block_n_max) \
+  MACRO_EXP_E_##mdim(DUALPACK_COMPUTE_LN, satype, sbtype, ctype, block_n_max)
+
+#define DUALPACK_KERNEL_FUNC_LM(gemmtype, satype, sbtype, ctype, block_m_max, block_n_max) \
+void gemmtype##_kernel_lm_m##block_m_max##n##block_n_max(\
+  uint32_t M, uint32_t N, uint32_t K, ctype beta,\
+  const satype * __restrict__ sa, const sbtype * __restrict__ sb,\
+  ctype * __restrict__ C, uint32_t ldc) {\
+  uint32_t n_left = N;\
+  const sbtype *b_head = sb;\
+  ctype *c_head = C;\
+  ASSEMBLE_DUALPACK_COMPUTE_LM(block_n_max, satype, sbtype, ctype, block_m_max)\
+}
+
+#define DUALPACK_KERNEL_FUNC_LN(gemmtype, satype, sbtype, ctype, block_m_max, block_n_max) \
+void gemmtype##_kernel_ln_m##block_m_max##n##block_n_max(\
+  uint32_t M, uint32_t N, uint32_t K, ctype beta,\
+  const satype * __restrict__ sa, const sbtype * __restrict__ sb,\
+  ctype * __restrict__ C, uint32_t ldc) {\
+  uint32_t m_left = M;\
+  const satype *a_head = sa;\
+  ctype *c_head = C;\
+  ASSEMBLE_DUALPACK_COMPUTE_LN(block_m_max, satype, sbtype, ctype, block_n_max)\
+}
+
+#endif
diff --git a/include/common/CommonLayer.h b/include/common/CommonLayer.h
new file mode 100644
index 0000000..cbb7b63
--- /dev/null
+++ b/include/common/CommonLayer.h
@@ -0,0 +1,90 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: CommonLayer.h
+ * Description: Function templates for operations in neural network layers
+ *****************************************************************************/
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifndef INCLUDE_COMMON_LAYER
+#define INCLUDE_COMMON_LAYER
+
+/* function template for the fully-connected layer, serial & OpenMP */
+#define SIMPLE_FC_FUNC(gemmtype, wtype, itype, otype, ...)
\ +int fc##__VA_ARGS__(const itype *src, const wtype *weight,\ + const otype *bias, otype *output, int M, int K, int N,\ + int trans_src, int trans_weight, int num_threads) {\ +\ + int status = gemmtype(trans_weight, trans_src,\ + weight, src, output, N, M, K, 0, num_threads);\ + if (status) return status;\ + bias_##otype(output, 0.0, bias, 1.0, NULL, 0.0, N, M);\ + return status;\ +} + +/* function template for bias layer */ +#define STD_BIAS_FUNC(type) \ +void bias_##type(type *C, type bias_dim0, const type *bias_dim1,\ + type bias_dim1_scale, const type *bias_dim2, type bias_dim2_scale,\ + uint32_t dim1, uint32_t dim2) {\ +\ + if (!C) return;\ +\ + bool do_bias_0 = (bias_dim0 != 0);\ + bool do_bias_1 = bias_dim1 && (bias_dim1_scale != 0);\ + bool do_bias_2 = bias_dim2 && (bias_dim2_scale != 0);\ +\ + if (!do_bias_0 && !do_bias_1 && !do_bias_2) return;\ +\ + if (!do_bias_1 && (do_bias_0 || do_bias_2)) {\ + for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\ + type *c_ptr = C + dim2_pos * dim1;\ + const type bs = bias_dim0 + \ + (bias_dim2 ? bias_dim2[dim2_pos] * bias_dim2_scale : 0);\ + _Pragma("omp simd")\ + for (uint32_t dim1_pos = 0; dim1_pos < dim1; ++dim1_pos) {\ + c_ptr[dim1_pos] += bs;\ + }\ + }\ + } else if (do_bias_1 && !do_bias_0 && !do_bias_2) {\ + for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\ + type *c_ptr = C + dim2_pos * dim1;\ + const type *bias_ptr = bias_dim1;\ + _Pragma("omp simd")\ + for (uint32_t dim1_pos = 0; dim1_pos < dim1; ++dim1_pos) {\ + c_ptr[dim1_pos] += bias_ptr[dim1_pos] * bias_dim1_scale;\ + }\ + }\ + } else {\ + for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\ + type *c_ptr = C + dim2_pos * dim1;\ + const type bs = bias_dim0 + \ + (bias_dim2 ? bias_dim2[dim2_pos] * bias_dim2_scale : 0);\ + const type *bias_ptr = bias_dim1;\ + _Pragma("omp simd")\ + for (uint32_t dim1_pos = 0; dim1_pos < dim1; ++dim1_pos) {\ + c_ptr[dim1_pos] += bs +\ + bias_ptr[dim1_pos] * bias_dim1_scale;\ + }\ + }\ + }\ +} + +#endif diff --git a/include/common/CommonQuant.h b/include/common/CommonQuant.h new file mode 100644 index 0000000..700b160 --- /dev/null +++ b/include/common/CommonQuant.h @@ -0,0 +1,311 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/****************************************************************************** + * File: CommonQuant.h + * Description: Function templates for quant/dequant/requant functions. 
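+ * Example: instantiating QUANTIZE_ASYMMETRIC(32, 8) defines
+ *          quantize_asymmetric_f32_u8(input, output, &zero_point, &scale,
+ *          size, min, max), which maps float32 data to uint8 such that
+ *          input[i] ~= scale * ((float32_t)output[i] - zero_point).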
+ *****************************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+
+#ifndef INCLUDE_COMMON_QUANT
+#define INCLUDE_COMMON_QUANT
+
+/* function template for asymmetric quantization fp -> uint */
+#define QUANTIZE_ASYMMETRIC(inbits, outbits) \
+void quantize_asymmetric_f##inbits##_u##outbits(\
+  const float##inbits##_t *input, uint##outbits##_t *output,\
+  uint##outbits##_t *zero_point, float##inbits##_t *scale, uint32_t size,\
+  float##inbits##_t input_min, float##inbits##_t input_max) {\
+\
+  if (size == 0) return;\
+  float##inbits##_t min, max;\
+  if (input_min <= input_max) {\
+    min = input_min;\
+    max = input_max;\
+  } else {\
+    inline_find_extreme_float##inbits##_t(input, size, &min, &max);\
+  }\
+\
+  if (min > 0) min = 0.0;\
+  if (max < 0) max = 0.0;\
+  const float##inbits##_t max_diff = max - min;\
+  if (max_diff == 0.0) {\
+    memset(output, 0, size * (outbits >> 3));\
+    *zero_point = 0;\
+    *scale = 1.0;\
+    return;\
+  }\
+\
+  const float##inbits##_t sc = max_diff *\
+    (float##inbits##_t)(1.0 / (uint##outbits##_t)-1);\
+  *scale = sc;\
+  unsigned long long z = ((float##inbits##_t)0.0 - min) / sc\
+    + (float##inbits##_t)0.5;\
+  const uint##outbits##_t zp = z > (uint##outbits##_t)-1 ?\
+    (uint##outbits##_t)-1 : z;\
+  *zero_point = zp;\
+\
+  inline_quant_asym_u##outbits##_from_f##inbits(input, output, size, zp, sc);\
+}
+
+/* function template for symmetric quantization fp -> int */
+#define QUANTIZE_SYMMETRIC(inbits, outbits) \
+void quantize_symmetric_f##inbits##_s##outbits(\
+  const float##inbits##_t *input, int##outbits##_t *output,\
+  float##inbits##_t *scale, uint32_t size,\
+  float##inbits##_t input_min, float##inbits##_t input_max) {\
+\
+  if (size == 0) return;\
+  float##inbits##_t min, max;\
+  if (input_min <= input_max) {\
+    min = input_min;\
+    max = input_max;\
+  } else {\
+    inline_find_extreme_float##inbits##_t(input, size, &min, &max);\
+  }\
+\
+  const uint##outbits##_t out_abs_max = (uint##outbits##_t)-1 >> 1;\
+  const float##inbits##_t sc_positive = max *\
+    (float##inbits##_t)(1.0 / out_abs_max);\
+  const float##inbits##_t sc_negative = min *\
+    (float##inbits##_t)(-1.0 / (out_abs_max + 1));\
+  const float##inbits##_t sc =\
+    sc_positive > sc_negative ? sc_positive : sc_negative;\
+  if (sc == 0.0) {\
+    memset(output, 0, size * (outbits >> 3));\
+    *scale = 1.0;\
+    return;\
+  }\
+  *scale = sc;\
+\
+  inline_quant_sym_s##outbits##_from_f##inbits(input, output, size, sc);\
+}
+
+/******************************************************************************
+ * Template:    REQUANTIZE_ASYMMETRIC_MULHI
+ * Description: Function template of asymmetric requantization
+ *              based on "mulhi" operations.
+ *              Basically, the requantization can be done like this:
+ *              (1) determine the min and max of the input integers
+ *                  if min > 0, min is set to 0
+ *                  if max < 0, max is set to 0
+ *              (2) calculate the scaling factor Sint on the input integers:
+ *                  Sint = expression_range_of_output_uint / (max - min)
+ *              (3) calculate the zero point Z of the output
+ *                  Z = -min * Sint
+ *              (4) map the input integers {Ii} to output integers {Oi}
+ *                  for i in input index range
+ *                    Oi = Ii * Sint + Z
+ *              (5) update the scaling factor S
+ *                  S /= Sint
+ *              Steps (1) - (4) are identical to those in asymmetric
+ *              quantization when the inputs are floating-point numbers.
+ *              For integers the situation gets a bit more complicated:
+ *              the scaling factor Sint needs to be expressed by integer(s).
+ *              For precision reasons, the exponent and mantissa parts of
+ *              Sint should be stored in individual integers Bint and Eint:
+ *                Sint = (2^exp + mantissa) * (2^-exp) = Bint * (2^-Eint)
+ *                Bint = 2^exp + mantissa, Eint = exp
+ *              Also, the multiplication Ii * Sint in step (4) changes to
+ *              (Ii * Bint) >> Eint.
+ *
+ *              For integer multiplications on CPU, there are normally
+ *              3 types of operations:
+ *              (1) keep all bits of the product, so the length of the
+ *                  result is twice that of the input
+ *              (2) keep only the lower half of the product, with the
+ *                  output length unchanged: "mullo" operation
+ *              (3) keep only the higher half of the product, with the
+ *                  output length unchanged: "mulhi" operation
+ *              Among the 3 types of operations, type (2) is useful only
+ *              when the inputs are small enough (the sum of valid bits must
+ *              be no more than the input length). For type (1), keeping the
+ *              lower half of the product is not necessary if the input
+ *              numbers are big enough (near the expression limit). So we
+ *              choose type (3) for precision and efficiency reasons.
+ *              Generally, we determine a left-shift number L, a mult-factor
+ *              M, a right-shift number R and a zero-point Z according to
+ *              the min and max of the input integers. Then the following
+ *              steps are performed on each input integer Ii:
+ *              (1) left-shift Ii by L, which makes the min or max number
+ *                  approach the expression limit of the input integer type,
+ *                  so as to minimize the precision loss in the subsequent
+ *                  "mulhi" operation.
+ *              (2) perform the "mulhi" operation of the shifted Ii with the
+ *                  mult-factor M to yield the (rounded) higher-half product
+ *                  Pi. The value of M is also near the expression limit of
+ *                  its type.
+ *              (3) right (saturated rounded) shift of Pi by R.
+ *                  The right shift is needed to fit the results into
+ *                  the expression range of the output type.
+ *              (4) add the shifted Pi to Z to get the output integer Oi.
+ * Parameters:  fp: the type of the scale to update
+ *                  (float/double/float16_t/...)
+ *              inbits: the number of bits of the input integral type
+ *              outbits: the number of bits of the output integral type
+ *              accbits: must be 2 * inbits
+ * Dependency:  the following inline functions should be implemented prior
+ *              to the introduction of this macro:
+ *              (1) inline_find_extreme_int<inbits>_t(
+ *                    const int<inbits>_t *dat, uint32_t size,
+ *                    int<inbits>_t *min, int<inbits>_t *max) {...}
+ *                  This function determines the minimum (written to *min)
+ *                  and maximum (written to *max) value of the input dat[],
+ *                  which has "size" elements.
+ *              (2) inline_requant_asym_u<outbits>_from_s<inbits>_mulhi(
+ *                    const int<inbits>_t *input, uint<outbits>_t *output,\
+ *                    uint32_t size, uint8_t L, int<inbits>_t M,
+ *                    uint<outbits>_t Z) {...}
+ *                  This function left-shifts the input by L, then
+ *                  "mulhi"s it with the mult-factor M, right-shifts the
+ *                  product by R and adds Z to get the output,
+ *                  just as the 4 steps shown above.
+ *                  The right-shift value R is fixed to accbits-outbits-3
+ *                  so it is not in the parameter list.
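+ * Worked example (illustrative): with inbits = 32, outbits = 8 and
+ *              accbits = 64, inputs spanning [-500, 1500] give
+ *              range = 2000 and abs_max = 1500 (11 significant bits),
+ *              so L = 32 - 1 - 11 = 20,
+ *              M = (2^61 - 2^53) / (2000 << 20) ~= 1.095e9 and
+ *              Z = ((500 << 20) * M) >> 53 ~= 64; then I = 1500 maps
+ *              to ~255 and I = -500 maps to ~0.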
+ *****************************************************************************/
+#define REQUANTIZE_ASYMMETRIC_MULHI(fp, inbits, outbits, accbits) \
+void requantize_asymmetric_##inbits##to##outbits(\
+  const int##inbits##_t *input, uint##outbits##_t *output,\
+  fp *scale, uint##outbits##_t *zero_point, uint32_t size,\
+  int##inbits##_t input_min, int##inbits##_t input_max) {\
+\
+  if (size == 0) return;\
+  const fp scale_org = *scale;\
+  if (scale_org == 0.0) {\
+    *zero_point = 0;\
+    memset(output, 0, size * sizeof(uint##outbits##_t));\
+    return;\
+  }\
+\
+  int##inbits##_t min, max;\
+  if (input_min <= input_max) {\
+    min = input_min;\
+    max = input_max;\
+  } else {\
+    inline_find_extreme_int##inbits##_t(input, size, &min, &max);\
+  }\
+  max = max < 0 ? 0 : max;\
+  min = min > 0 ? 0 : min;\
+  if (min == max) {\
+    *zero_point = 0;\
+    memset(output, 0, size * sizeof(uint##outbits##_t));\
+    return;\
+  }\
+\
+  int##inbits##_t abs_max = -min;\
+  if (max > abs_max) abs_max = max;\
+  unsigned int max_digits = 0;\
+  for (; abs_max > 0; ++max_digits) abs_max >>= 1;\
+\
+  const int src_lshift = inbits - 1 - max_digits;\
+  const uint##inbits##_t range = (uint##inbits##_t)max - (uint##inbits##_t)min;\
+\
+  uint##accbits##_t mult_par = \
+    ((uint##accbits##_t)1 << (accbits - 3)) -\
+    ((uint##accbits##_t)1 << (accbits - outbits - 3));\
+\
+  int##accbits##_t lsh_range = (int##accbits##_t)range << src_lshift;\
+  int##inbits##_t mult_factor = mult_par / lsh_range;\
+  if (mult_par % lsh_range > lsh_range >> 1) {\
+    mult_factor++;\
+  }\
+\
+  int##accbits##_t z_mid = (int##accbits##_t)((-min) << src_lshift) * \
+    (int##accbits##_t)mult_factor;\
+  int##inbits##_t z_mid2 = z_mid >> (accbits - outbits - 3);\
+  if (z_mid & ((int##accbits##_t)1 << (accbits - outbits - 4))) z_mid2++;\
+  uint##outbits##_t zp = z_mid2 < 0 ?\
+    0 : (z_mid2 > (uint##outbits##_t)-1 ? (uint##outbits##_t)-1 : z_mid2);\
+  *zero_point = zp;\
+\
+  *scale = (*scale) * (fp)range * ((fp)1 / (fp)((uint##outbits##_t)-1));\
+  inline_requant_asym_u##outbits##_from_s##inbits##_mulhi(input, output,\
+    size, src_lshift, mult_factor, zp);\
+}
+
+/******************************************************************************
+ * Template:    REQUANTIZE_SYMMETRIC_MULHI
+ * Description: Function template of symmetric requantization
+ *              based on "mulhi" operations.
+ * Parameters:  fp: the type of the scale to update
+ *                  (float/double/float16_t/...)
+ *              inbits: the number of bits of the input integral type
+ *              outbits: the number of bits of the output integral type
+ *              accbits: must be 2 * inbits
+ * Dependency:  the following inline functions should be implemented prior
+ *              to the introduction of this macro:
+ *              (1) inline_find_extreme_int<inbits>_t(
+ *                    const int<inbits>_t *dat, uint32_t size,
+ *                    int<inbits>_t *min, int<inbits>_t *max) {...}
+ *                  This function determines the minimum (written to *min)
+ *                  and maximum (written to *max) value of the input dat[],
+ *                  which has "size" elements.
+ *              (2) inline_requant_sym_s<outbits>_from_s<inbits>_mulhi(
+ *                    const int<inbits>_t *input, int<outbits>_t *output,
+ *                    uint32_t size, uint8_t L, int<inbits>_t M) {...}
+ *                  This function left-shifts the input by L, then
+ *                  "mulhi"s it with the mult-factor M, and finally
+ *                  right-shifts the product by R to get the output.
+ *                  The right-shift value R is fixed to accbits-outbits-2
+ *                  so it is not in the parameter list.
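+ * Worked example (illustrative): with inbits = 32, outbits = 8 and
+ *              accbits = 64, inputs spanning [-1000, 800] give
+ *              max_abs = 1000 (10 significant bits), so
+ *              L = 32 - 1 - 10 = 21 and
+ *              M = (2^61 - 2^54) / (1000 << 21) ~= 1.091e9;
+ *              then I = -1000 maps to -127 and I = 800 maps to ~102.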
+ *****************************************************************************/
+#define REQUANTIZE_SYMMETRIC_MULHI(fp, inbits, outbits, accbits) \
+void requantize_symmetric_##inbits##to##outbits(\
+ const int##inbits##_t *input, int##outbits##_t *output,\
+ fp *scale, uint32_t size,\
+ int##inbits##_t input_min, int##inbits##_t input_max) {\
+\
+ if (size == 0) return;\
+ const fp scale_org = *scale;\
+ if (scale_org == 0.0) {\
+ memset(output, 0, size * sizeof(int##outbits##_t));\
+ return;\
+ }\
+\
+ int##inbits##_t min, max;\
+ if (input_min <= input_max) {\
+ min = input_min;\
+ max = input_max;\
+ } else {\
+ inline_find_extreme_int##inbits##_t(input, size, &min, &max);\
+ }\
+ int##inbits##_t max_abs = max;\
+ if (max_abs < -min) max_abs = -min;\
+ if (max_abs == 0) {\
+ memset(output, 0, size * sizeof(int##outbits##_t));\
+ return;\
+ }\
+\
+ int##inbits##_t tmp = max_abs;\
+ unsigned int max_digits = 0;\
+ for (; tmp > 0; ++max_digits) tmp >>= 1;\
+\
+ const int src_lshift = inbits - 1 - max_digits;\
+ uint##accbits##_t mult_par = \
+ ((uint##accbits##_t)1 << (accbits - 3)) -\
+ ((uint##accbits##_t)1 << (accbits - outbits - 2));\
+ uint##accbits##_t lsh_max_abs = max_abs << src_lshift;\
+ int##inbits##_t mult_factor = mult_par / lsh_max_abs;\
+ if (mult_par % lsh_max_abs > lsh_max_abs >> 1) {\
+ mult_factor++;\
+ }\
+\
+ *scale = (*scale) * (fp)max_abs * ((fp)1 / (fp)(((uint##outbits##_t)-1) >> 1));\
+ inline_requant_sym_s##outbits##_from_s##inbits##_mulhi(input, output,\
+ size, src_lshift, mult_factor);\
+}
+
+#endif
diff --git a/include/common/CommonSched.h b/include/common/CommonSched.h
new file mode 100644
index 0000000..4b21964
--- /dev/null
+++ b/include/common/CommonSched.h
@@ -0,0 +1,265 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+/*************************************************************************
+ * File: CommonSched.h
+ * Description: Functions associated with task distribution and
+ * synchronization in parallelized calculations
+*************************************************************************/
+#include <stdint.h>
+
+#ifndef INCLUDE_COMMON_SCHEDULE
+#define INCLUDE_COMMON_SCHEDULE
+
+/* The atomic compare-and-swap instructions are platform-
+ * specific, which need to be provided elsewhere.
+ * If the compiler is GCC, the simplest way is to activate the following
+ * macro before including this file */
+#ifdef GCC_BUILTIN_SYNC
+static uint32_t atomicCAS_U32(uint32_t comp, uint32_t write, uint32_t *dst) {
+ __atomic_compare_exchange_n(dst, &comp, write,
+ 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ return comp;
+}
+
+static uint64_t atomicCAS_U64(uint64_t comp, uint64_t write, uint64_t *dst) {
+ __atomic_compare_exchange_n(dst, &comp, write,
+ 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ return comp;
+}
+#endif
+
+#ifndef D_SCALE
+#define D_SCALE 8 //dynamic scaling factor in 2D task distribution
+#endif
+
+/******************************************************************************
+ * Function: get_mn_task
+ * Description: Function for a running OpenMP thread to get a GEMM task (m, n)
+ * from a (shared) task container atomically.
+ * Design: The task distribution in parallelized GEMM is done in the
+ * 2-dimensional (M, N) space. In a parallel run, several
+ * threads are deployed to compute an MxN output matrix, just
+ * like dividing a rectangular piece of paper with length = N
+ * and width = M.
+ * The MxN piece is cut into a number of tiny rectangular pieces
+ * that are distributed to threads. Instead of finishing all the
+ * cutting work before distribution, this function does cutting
+ * and distribution simultaneously. When a thread becomes idle
+ * in the parallel zone, it will call this function to cut a new
+ * piece (task) from the remaining paper and take ownership of
+ * the newly-cut small piece (then work on it), or go to a barrier
+ * and wait till other threads finish their work when the paper
+ * has been used up. The size of the newly-cut piece (task) is
+ * proportional to the area of the remaining paper.
+ * A typical cutting process is shown below:
+ *
+ *      ____________                          ______
+ *     |            |                        |      |
+ *     |            |    first cut      _____|m1    |
+ *   M |            |   ----------->   | n1         |
+ *     |            |      m1xn1       |  remain    |
+ *     |____________|                  |____________|
+ *           N                               |
+ *                       second cut          | m1xn2
+ *                     (m must be m1)        |
+ *                                           V
+ *                                                    __
+ *                                                 m1|  |
+ *      ____________     third cut       __________|  |
+ *     |            |   <-----------   |  n1+n2       |
+ *     |M-m1        |      m1xn3       |M-m1          |
+ *     |____________|   n3==N-n1-n2    |______________|
+ *           N         (m must be m1)        |
+ *                       fourth cut          | m2xn4
+ *                                           |
+ *                                           V
+ *                                        _______
+ *                                       |       |
+ *                                   ____|       |  ---> ---> ---> repeat till nothing left
+ *                                  |____________|
+ *
+ * Calls: atomicCAS_U64 (atomic compare and swap, makes the cuts atomic)
+ * Input: uint64_t *task_end: the address of a long integer recording
+ * the start point of the next task. The
+ * long integer represents the coordinates
+ * (low-32bit = m, high-32bit = n) of the
+ * vertex right above the inward-pointing one
+ * in the remaining paper when it is a
+ * concave hexagon, or the upper left corner
+ * when the remaining piece is a rectangle.
+ * uint32_t n_pos_min: input the lower bound of the N direction.
+ * uint64_t m_n_pos_max: input the upper bounds
+ * along the M and N axes
+ * low 32 bits: m_max
+ * high 32 bits: n_max.
+ * uint64_t m_n_task_min: the minimum task sizes of
+ * m (low 32 bits) and n (high 32 bits)
+ * uint32_t m_task_max: the maximum task size of m
+ * uint32_t num_threads: input the number of OpenMP threads.
+ * Output: uint32_t *m_start: to output the starting m of the new task.
+ * uint32_t *n_start: to output the starting n of the new task.
+ * uint32_t *m_end: to output the ending m of the new task.
+ * uint32_t *n_end: to output the ending n of the new task.
+ * Return: 0 if there's no task left, 1 if a new task has been acquired.
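+ * Note: a schematic caller (mirroring how the GEMM drivers later
+ * in this patch call it from an OpenMP parallel region):
+ * uint64_t task_end = 0; // shared cursor, zero-initialized
+ * _Pragma("omp parallel")
+ * {
+ * uint32_t ms, ns, me, ne;
+ * while (get_mn_task(&task_end, &ms, &ns, &me, &ne,
+ * m_n_task_min, m_task_max, 0, m_n_pos_max,
+ * num_threads)) {
+ * ... compute the [ms, me) x [ns, ne) tile ...
+ * }
+ * }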
+ *****************************************************************************/
+static uint32_t get_mn_task(uint64_t *task_end,
+ uint32_t *m_start, uint32_t *n_start, uint32_t *m_end, uint32_t *n_end,
+ uint64_t m_n_task_min, uint32_t m_task_max,
+ uint32_t n_pos_min, uint64_t m_n_pos_max, uint32_t num_threads) {
+
+ const uint32_t m_pos_max = m_n_pos_max & 0xFFFFFFFF;
+ const uint32_t n_pos_max = m_n_pos_max >> 32;
+ const uint32_t m_task_min_raw = m_n_task_min & 0xFFFFFFFF;
+ const uint32_t n_task_min_raw = m_n_task_min >> 32;
+ const uint32_t m_task_min = (m_task_min_raw) ? m_task_min_raw : 24;
+ const uint32_t n_task_min = (n_task_min_raw) ? n_task_min_raw : 8;
+
+ if (n_pos_max <= n_pos_min) return 0;
+
+ uint32_t mstart, nstart, mend, nend;
+ uint64_t task_end_read, task_end_load;
+
+ do {
+ task_end_load = *task_end;
+ mstart = task_end_load & 0xFFFFFFFF;
+ nstart = task_end_load >> 32;
+
+ /* if there is no task left, return 0 */
+ if (mstart >= m_pos_max || nstart >= n_pos_max) return 0;
+
+ /* determine how much work is left in the 2D space */
+ const uint64_t mn_left = (uint64_t)(n_pos_max - n_pos_min) *
+ (uint64_t)(m_pos_max - mstart);
+
+ /* determine the msize of the next task */
+ /* msize should only depend on mstart, not affected by nstart */
+ uint32_t msize = mn_left / (uint64_t)(num_threads * D_SCALE * n_task_min);
+ msize = msize / m_task_min * m_task_min;
+ if (msize > m_task_max) msize = m_task_max;
+ if (msize < m_task_min) msize = m_task_min;
+ if (msize > m_pos_max - mstart) msize = m_pos_max - mstart;
+
+ /* determine the nsize of the next task */
+ uint32_t n_inc = (nstart >= n_pos_min) ? nstart - n_pos_min : 0;
+ uint32_t nsize = (mn_left - (uint64_t)msize * (uint64_t)n_inc) /
+ (uint64_t)(num_threads * D_SCALE * msize);
+ nsize = nsize / n_task_min * n_task_min;
+ if (nsize < n_task_min) nsize = n_task_min;
+ if (nsize > n_pos_max - nstart) nsize = n_pos_max - nstart;
+
+ nend = nstart + nsize;
+ mend = mstart + msize;
+ uint32_t nextm = mstart;
+ uint32_t nextn = nend;
+ if (nend == n_pos_max) {
+ nextm = mend; nextn = n_pos_min;
+ }
+ uint64_t task_end_write = ((uint64_t)nextn << 32) | (uint64_t)nextm;
+ task_end_read = atomicCAS_U64(task_end_load, task_end_write, task_end);
+ } while (task_end_read != task_end_load);
+
+ /* write back task info */
+ *m_start = mstart;
+ *n_start = nstart;
+ *m_end = mend;
+ *n_end = nend;
+ /* if a task has been successfully acquired, return 1 */
+ return 1;
+}
+
+/******************************************************************************
+ * Function: get_copy_task
+ * Description: Function for a running thread to get a GEMM copy task (m or n)
+ * from a (shared) task container atomically
+ * Calls: atomicCAS_U32
+ * Input: uint32_t *dim_left: the address of an integer recording
+ * the amount of remaining work to do,
+ * which is shared among threads.
+ * uint32_t min_task: the default size of the task to get.
+ * Output: uint32_t *dim_start: to output the starting position
+ * of the new task.
+ * uint32_t *dim_end: to output the ending position
+ * of the new task.
+ * Return: 0 if there's no task left, 1 if a new task has been acquired.
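+ * Note: tasks are handed out from the top of the range downward
+ * (*dim_start receives the new remaining amount, *dim_end the
+ * old one). A schematic caller, as in the packing loops later
+ * in this patch:
+ * uint32_t left = total_work; // shared among threads
+ * uint32_t start, end;
+ * while (get_copy_task(&left, 32, &start, &end)) {
+ * ... process positions [start, end) ...
+ * }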
+ *****************************************************************************/ +static uint32_t get_copy_task(uint32_t *dim_left, uint32_t min_task, + uint32_t *dim_start, uint32_t *dim_end) { + + if (!min_task) min_task = 24; + + uint32_t dim_left_load, dim_left_read, dim_left_write, dim_get; + + do { + dim_left_load = *dim_left; + /* if no task left, return 0 */ + if (dim_left_load == 0) return 0; + + /* determine task size */ + dim_get = dim_left_load % min_task; + if (dim_get == 0) dim_get = min_task; + + dim_left_write = dim_left_load - dim_get; + dim_left_read = atomicCAS_U32(dim_left_load, dim_left_write, dim_left); + + } while (dim_left_read != dim_left_load); + + *dim_start = dim_left_write; + *dim_end = dim_left_load; + return 1; +} + +/****************************************************************************** + * Function: get_irreg_task + * Description: Function for a running thread to get 1D computing task + * from a (shared) task container atomically + * Calls: atomicCAS_U32 + * Input: uint32_t *dim_end: the address of an interger recording + * how much work has been done, + * which is shared among threads. + * uint32_t min_task: specify the default size of a task. + * uint32_t max_dim: input the amount of work to do. + * Output: uint32_t *task_start: to output the starting position + * of the new task. + * uint32_t *task_end: to output the ending position + * of the new task. + * Return: 0 if there's no task left, 1 if a new task has been acquired. + *****************************************************************************/ +static uint32_t get_irreg_task(uint32_t *dim_end, + uint32_t *task_start, uint32_t *task_end, + uint32_t min_task, uint32_t max_dim) { + + if (!min_task) min_task = 4; + uint32_t dim_end_load, dim_end_read, dim_end_write; + + do { + dim_end_load = *dim_end; + /* if no task left, return 0 */ + if (dim_end_load >= max_dim) return 0; + + dim_end_write = dim_end_load + min_task; + if (dim_end_write > max_dim) dim_end_write = max_dim; + + dim_end_read = atomicCAS_U32(dim_end_load, dim_end_write, dim_end); + } while (dim_end_read != dim_end_load); + + *task_start = dim_end_load; + *task_end = dim_end_write; + return 1; +} + +#endif + diff --git a/include/common/CommonSkinnyDot.h b/include/common/CommonSkinnyDot.h new file mode 100644 index 0000000..28e6175 --- /dev/null +++ b/include/common/CommonSkinnyDot.h @@ -0,0 +1,586 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/****************************************************************************** +* File: CommonSkinnyDot.h +* Description: Common building blocks for regular * skinny or skinny * regular +* matmul when the regular matrix is row-major in the former +* case or column-major in the latter case. 
These 2 kinds of matmul
+* involving skinny matrices require a special efficient kernel
+* different from that in regular * regular matmul. Specifically,
+* the regular matrix is no longer reordered (packed) during
+* calculation. Elements from the regular matrix are accessed
+* sequentially and only once. The GEMM calculation is decomposed
+* into sequential DOT operations. The skinny source matrix is
+* accessed repeatedly in DOT operations so it is always packed
+* in a scratch array.
+* Extension: To support a new CPU architecture, the following tasks should
+* be done in addition to including this header:
+* (1) Use typedef to define _skinnydot_[a/b/c]scalar and
+* _skinnydot_[a/b/c]vec[\d]. For example, when
+* developing avx2 SGEMM regular*skinny kernels, the following
+* lines should be added when the maximum vector length is 8
+* in the K dimension:
+* // scalar types in main memory
+* typedef float sgemm_skinnydot_ascalar;
+* typedef float sgemm_skinnydot_bscalar;
+* typedef float sgemm_skinnydot_cscalar;
+* // (converted) vector types in registers
+* typedef float sgemm_skinnydot_avec1;
+* typedef __m128 sgemm_skinnydot_avec4;
+* typedef __m256 sgemm_skinnydot_avec8;
+* typedef float sgemm_skinnydot_bvec1;
+* typedef __m128 sgemm_skinnydot_bvec4;
+* typedef __m256 sgemm_skinnydot_bvec8;
+* typedef float sgemm_skinnydot_cvec1;
+* typedef __m128 sgemm_skinnydot_cvec4;
+* typedef __m256 sgemm_skinnydot_cvec8;
+* (2) Implement inline functions for basic vector-vector
+* multiply-add operations. Here are examples of
+* inline functions of avx2 SGEMM with k_veclen = 8, 4 and 1.
+* Each of these functions multiplies every element in a_vec
+* with the corresponding element in b_vec and adds the result
+* to the corresponding element in c_vec:
+* GEMM_SKINNY_DOT_CALC_UNIT(sgemm, 8) {
+* return _mm256_fmadd_ps(a_vec, b_vec, c_vec);
+* }
+* GEMM_SKINNY_DOT_CALC_UNIT(sgemm, 4) {
+* return _mm_fmadd_ps(a_vec, b_vec, c_vec);
+* }
+* GEMM_SKINNY_DOT_CALC_UNIT(sgemm, 1) {
+* return a_vec * b_vec + c_vec;
+* }
+* (3) Implement load and store inline functions for matrices
+* a & b like this (one example per category, k_veclen = 8):
+* GEMM_SKINNY_DOT_LOADA_UNIT(sgemm, 8) {
+* _mm_prefetch((char *)(a_ptr + 24), _MM_HINT_T0);
+* return _mm256_loadu_ps(a_ptr);
+* }
+* GEMM_SKINNY_DOT_LOADB_UNIT(sgemm, 8) {
+* return _mm256_loadu_ps(b_ptr);
+* }
+* (4) Implement inline vectorized reduction functions:
+* // reduction from vec[8] to vec[4]
+* GEMM_SKINNY_DOT_REDUC_UNIT(sgemm, 8, 4) {
+* return _mm_add_ps(_mm256_extractf128_ps(c_vec, 0),
+* _mm256_extractf128_ps(c_vec, 1));
+* }
+* // reduction from vec[4] to vec[1]
+* GEMM_SKINNY_DOT_REDUC_UNIT(sgemm, 4, 1) {
+* __m128 z0 = _mm_setzero_ps();
+* c_vec = _mm_hadd_ps(c_vec, z0);
+* c_vec = _mm_hadd_ps(c_vec, z0);
+* return _mm_cvtss_f32(c_vec);
+* }
+* (5) Implement inline vector initialization functions.
+* A function in this category returns a vector filled with
+* zeros.
+* GEMM_SKINNY_DOT_INITC_UNIT(sgemm, 8) {
+* return _mm256_setzero_ps();
+* }
+* GEMM_SKINNY_DOT_INITC_UNIT(sgemm, 4) {
+* return _mm_setzero_ps();
+* }
+* GEMM_SKINNY_DOT_INITC_UNIT(sgemm, 1) {
+* return 0;
+* }
+* (6) Finally build kernel functions from the inline functions
+* defined above. For each kernel function only 1 line
+* is needed.
The following line defines regular*skinny
+* kernel functions (serial and OpenMP) for the minimum
+* dimension length = 2 with k_veclen = {1, 4, 8} and
+* m_unroll = {1, 2, 4}:
+* GEMM_SKINNY_DOT_PARALLEL_FUNC(sgemm, 2, 13, 7, 8192,
+* float, float)
+* The last 2 parameters in the macro are for function
+* name mangling, providing the data types for the regular
+* and skinny matrices respectively. The last number in the
+* macro parameters (8192) specifies the scratch size
+* for the skinny matrix, which should be adjusted to the size
+* of the L1 cache. The second number (13) is the sum of all
+* implemented k_veclen values. The third number (7) is
+* the sum of all m_unroll values covered.
+******************************************************************************/
+
+#include "common/ExpandMacro.h"
+#include "common/CommonSched.h"
+
+#include <stdbool.h>
+#ifndef EMLL_SERIAL_ONLY
+#include <omp.h>
+#endif
+
+#ifndef INCLUDE_COMMON_SKINNY_DOT
+#define INCLUDE_COMMON_SKINNY_DOT
+
+/* basic computation units in the skinny_dot functions */
+#define GEMM_SKINNY_DOT_CALC_UNIT(gemm, k_veclen) \
+static inline gemm##_skinnydot_cvec##k_veclen\
+ inline_##gemm##_arowmajor_bskinny_fma_unit_m1n1k##k_veclen(\
+ gemm##_skinnydot_cvec##k_veclen c_vec,\
+ gemm##_skinnydot_avec##k_veclen a_vec,\
+ gemm##_skinnydot_bvec##k_veclen b_vec)
+/* you should give a vectorized implementation equivalent to this:
+ * GEMM_SKINNY_DOT_CALC_UNIT(gemm, k_veclen) {
+ * gemm##_skinnydot_cvec##k_veclen ret;
+ * for (int i = 0; i < k_veclen; ++i) {
+ * ret[i] = a_vec[i] * b_vec[i] + c_vec[i];
+ * }
+ * return ret;
+ * }
+ */
+
+#define GEMM_SKINNY_DOT_LOADA_UNIT(gemm, k_veclen) \
+static inline gemm##_skinnydot_avec##k_veclen\
+ inline_##gemm##_arowmajor_bskinny_loada_unit_k##k_veclen(\
+ const gemm##_skinnydot_ascalar *a_ptr)
+/* you should give a vectorized implementation equivalent to this:
+ * GEMM_SKINNY_DOT_LOADA_UNIT(gemm, k_veclen) {
+ * gemm##_skinnydot_avec##k_veclen ret;
+ * for (int i = 0; i < k_veclen; ++i) {
+ * ret[i] = a_ptr[i];
+ * }
+ * prefetch(a_ptr + pref_distance);
+ * return ret;
+ * }
+ */
+
+#define GEMM_SKINNY_DOT_LOADB_UNIT(gemm, k_veclen) \
+static inline gemm##_skinnydot_bvec##k_veclen\
+ inline_##gemm##_arowmajor_bskinny_loadb_unit_k##k_veclen(\
+ const gemm##_skinnydot_bscalar *b_ptr)
+/* you should give a vectorized implementation equivalent to this:
+ * GEMM_SKINNY_DOT_LOADB_UNIT(gemm, k_veclen) {
+ * gemm##_skinnydot_bvec##k_veclen ret;
+ * for (int i = 0; i < k_veclen; ++i) {
+ * ret[i] = b_ptr[i];
+ * }
+ * return ret;
+ * }
+ */
+
+#define GEMM_SKINNY_DOT_REDUC_UNIT(gemm, old_k_vlen, new_k_vlen) \
+static inline gemm##_skinnydot_cvec##new_k_vlen\
+ inline_##gemm##_arowmajor_bskinny_reduc_unit_##new_k_vlen##from##old_k_vlen(\
+ gemm##_skinnydot_cvec##old_k_vlen c_vec)
+/* The sum of all elements of the returned vector should be
+ * equal to that of the input c_vec, here's an example:
+ * GEMM_SKINNY_DOT_REDUC_UNIT(gemm, old_kvlen, new_kvlen) {
+ * gemm##_skinnydot_cvec##new_kvlen ret;
+ * int i;
+ * for (i = 0; i < new_kvlen; ++i) {
+ * ret[i] = c_vec[i];
+ * }
+ * for (; i < old_kvlen; ++i) {
+ * ret[i % new_kvlen] += c_vec[i];
+ * }
+ * return ret;
+ * }
+ */
+
+#define GEMM_SKINNY_DOT_INITC_UNIT(gemm, k_veclen) \
+static inline gemm##_skinnydot_cvec##k_veclen\
+ inline_##gemm##_arowmajor_bskinny_initc_unit_k##k_veclen()
+/* you should give a vectorized implementation equivalent to this:
+ * GEMM_SKINNY_DOT_INITC_UNIT(gemm, k_veclen) {
+ * gemm##_skinnydot_cvec##k_veclen ret = {0};
+ * return ret;
+ * }
+ */
+
+/* construct
inline function from building blocks */ +#define GEMM_SKINNY_DOT_INIT_CVEC_ITEM(m_id, gemm, k_veclen, n_id) \ + gemm##_skinnydot_cvec##k_veclen c_##k_veclen##_##m_id##_##n_id =\ + inline_##gemm##_arowmajor_bskinny_initc_unit_k##k_veclen(); + +#define GEMM_SKINNY_DOT_INIT_CVEC_COL_ITEM(n_id, gemm, k_veclen, m_unroll) \ + MACRO_EXPANSION_##m_unroll(VOID_BASE, GEMM_SKINNY_DOT_INIT_CVEC_ITEM,\ + gemm, k_veclen, n_id) + +#define GEMM_SKINNY_DOT_INIT_CVEC(k_veclen, gemm, m_unroll, n_dim) \ + MACRO_EXP_##n_dim(VOID_BASE, GEMM_SKINNY_DOT_INIT_CVEC_COL_ITEM,\ + gemm, k_veclen, m_unroll) + +#define GEMM_SKINNY_DOT_CALC_ITEM(m_id, gemm, k_veclen, n_id) \ + c_##k_veclen##_##m_id##_##n_id =\ + inline_##gemm##_arowmajor_bskinny_fma_unit_m1n1k##k_veclen(\ + c_##k_veclen##_##m_id##_##n_id, a_##k_veclen##_##m_id,\ + b_##k_veclen##_##n_id); + +#define GEMM_SKINNY_DOT_CALC_COL_ITEM_PACK(n_id, gemm, k_veclen, m_unroll) \ + const gemm##_skinnydot_bvec##k_veclen b_##k_veclen##_##n_id =\ + inline_##gemm##_arowmajor_bskinny_loadb_unit_k##k_veclen(\ + b_ptr + (n_id - 1) * k_veclen);\ + MACRO_EXPANSION_##m_unroll(VOID_BASE, GEMM_SKINNY_DOT_CALC_ITEM,\ + gemm, k_veclen, n_id) + +#define GEMM_SKINNY_DOT_LOADA_ITEM(m_id, gemm, k_veclen) \ + const gemm##_skinnydot_avec##k_veclen a_##k_veclen##_##m_id =\ + inline_##gemm##_arowmajor_bskinny_loada_unit_k##k_veclen(a_ptr##m_id);\ + a_ptr##m_id += k_veclen; + + +#define GEMM_SKINNY_DOT_CALC_LOOPITEM_PACK(k_veclen, gemm, m_unroll, n_dim) \ +for (; k_left >= k_veclen; k_left -= k_veclen) {\ + MACRO_EXP_##m_unroll(VOID_BASE, GEMM_SKINNY_DOT_LOADA_ITEM, gemm, k_veclen)\ + MACRO_EXP_##n_dim(VOID_BASE, GEMM_SKINNY_DOT_CALC_COL_ITEM_PACK,\ + gemm, k_veclen, m_unroll)\ + b_ptr += n_dim * k_veclen;\ +} + +#define GEMM_SKINNY_DOT_REDUC_ITEM(m_id, old_kvlen, new_kvlen, gemm, n_id) \ + gemm##_skinnydot_cvec##new_kvlen c_##new_kvlen##_##m_id##_##n_id =\ + inline_##gemm##_arowmajor_bskinny_reduc_unit_##new_kvlen##from##old_kvlen(\ + c_##old_kvlen##_##m_id##_##n_id); + +#define GEMM_SKINNY_DOT_REDUC_COL_ITEM(n_id, m_unroll, gemm,\ + old_kvlen, new_kvlen) \ + MACRO_EXPANSION_##m_unroll(VOID_BASE, GEMM_SKINNY_DOT_REDUC_ITEM,\ + old_kvlen, new_kvlen, gemm, n_id) + +#define GEMM_SKINNY_DOT_REDUC_CROSS_ITEM(old_kvlen, new_kvlen, gemm,\ + m_unroll, n_dim)\ + MACRO_EXP_##n_dim(VOID_BASE, GEMM_SKINNY_DOT_REDUC_COL_ITEM,\ + m_unroll, gemm, old_kvlen, new_kvlen) + +#define GEMM_SKINNY_DOT_INIT_APTR_ITEM(m_id, gemm) \ + const gemm##_skinnydot_ascalar *a_ptr##m_id = A + (m_id - 1) * LDK; + +#define GEMM_SKINNY_DOT_STOREC_ITEM_CC(m_id, n_id) \ + c_ptr[m_id - 1] = c_ptr[m_id - 1] * beta + c_1_##m_id##_##n_id; + +#define GEMM_SKINNY_DOT_STOREC_ITEM_CR(n_id, m_id) \ + c_ptr[n_id - 1] = c_ptr[n_id - 1] * beta + c_1_##m_id##_##n_id; + +#define GEMM_SKINNY_DOT_STOREC_CC_COL_ITEM(n_id, m_unroll) \ + MACRO_EXPANSION_##m_unroll(VOID_BASE, GEMM_SKINNY_DOT_STOREC_ITEM_CC, n_id)\ + c_ptr += LDM; + +#define GEMM_SKINNY_DOT_STOREC_CR_ROW_ITEM(m_id, n_dim) \ + MACRO_EXPANSION_##n_dim(VOID_BASE, GEMM_SKINNY_DOT_STOREC_ITEM_CR, m_id)\ + c_ptr += n_dim; + +#define GEMM_SKINNY_DOT_INLINE_PACK_FUNC(gemm, m_unroll, n_dim, k_mask) \ +static inline void\ + inline_##gemm##_arowmajor_bskinny_m##m_unroll##n##n_dim(\ + const gemm##_skinnydot_ascalar *A, const gemm##_skinnydot_bscalar *b_ptr,\ + gemm##_skinnydot_cscalar *c_ptr, uint32_t k_left, uint32_t LDK, uint32_t LDM,\ + gemm##_skinnydot_cscalar beta, bool c_rowmajor) {\ +\ + MACRO_EXP_##m_unroll(VOID_BASE, GEMM_SKINNY_DOT_INIT_APTR_ITEM, gemm)\ + 
MACRO_EXPANSION_IMX_##k_mask(GEMM_SKINNY_DOT_INIT_CVEC,\ + GEMM_SKINNY_DOT_CALC_LOOPITEM_PACK,\ + GEMM_SKINNY_DOT_REDUC_CROSS_ITEM, gemm, m_unroll, n_dim)\ + if (c_rowmajor) {\ + MACRO_EXP_##m_unroll(VOID_BASE, GEMM_SKINNY_DOT_STOREC_CR_ROW_ITEM, n_dim)\ + } else {\ + MACRO_EXP_##n_dim(VOID_BASE, GEMM_SKINNY_DOT_STOREC_CC_COL_ITEM, m_unroll)\ + }\ +} + +#define GEMM_SKINNY_DOT_INLINE_FUNC_ITEM(m_unroll, gemm, n_dim, k_mask) \ + GEMM_SKINNY_DOT_INLINE_PACK_FUNC(gemm, m_unroll, n_dim, k_mask) + +#define GEMM_SKINNY_DOT_PACKK_BC_ITEM(k_id, n_id) \ + sb_ptr[k_id - 1] = b_ptr##n_id[k_id - 1]; + +#define GEMM_SKINNY_DOT_PACKK_BC_COL_ITEM(n_id, k_veclen) \ + MACRO_EXPANSION_##k_veclen(VOID_BASE, GEMM_SKINNY_DOT_PACKK_BC_ITEM, n_id)\ + b_ptr##n_id += k_veclen; sb_ptr += k_veclen; + +#define GEMM_SKINNY_DOT_PACKK_BC_LOOP(k_veclen, n_dim) \ + for (; k_left >= k_veclen; k_left -= k_veclen) {\ + MACRO_EXP_##n_dim(VOID_BASE, GEMM_SKINNY_DOT_PACKK_BC_COL_ITEM, k_veclen)\ + } + +#define GEMM_SKINNY_DOT_PACKK_BR_ITEM(n_id, k_id, k_veclen) \ + sb_ptr[(n_id - 1) * k_veclen + k_id - 1] = b_ptr[n_id - 1]; + +#define GEMM_SKINNY_DOT_PACKK_BR_ROW_ITEM(k_id, n_dim, k_veclen) \ + MACRO_EXPANSION_##n_dim(VOID_BASE, GEMM_SKINNY_DOT_PACKK_BR_ITEM,\ + k_id, k_veclen)\ + b_ptr += n_dim; + +#define GEMM_SKINNY_DOT_PACKK_BR_LOOP(k_veclen, n_dim) \ + for (; k_left >= k_veclen; k_left -= k_veclen) {\ + MACRO_EXP_##k_veclen(VOID_BASE, GEMM_SKINNY_DOT_PACKK_BR_ROW_ITEM,\ + n_dim, k_veclen)\ + sb_ptr += n_dim * k_veclen;\ + } + +#define GEMM_SKINNY_DOT_PACKK_BC_INIT_BPTR_ITEM(n_id, gemm) \ + const gemm##_skinnydot_bscalar *b_ptr##n_id = b_ptr + (n_id - 1) * K; + +#define GEMM_SKINNY_DOT_INLINE_CALL_LOOP(m_unroll, gemm, n_dim) \ + if (unroll_m##m_unroll) {\ + for (; m_left >= m_unroll; m_left -= m_unroll) {\ + inline_##gemm##_arowmajor_bskinny_m##m_unroll##n##n_dim(\ + a_ptr, b_ptr, c_ptr, k_inc, K, M, beta, c_rowmajor);\ + a_ptr += K * m_unroll;\ + c_ptr += c_m_inc * m_unroll;\ + }\ + } + +#define GEMM_SKINNY_DOT_UNROLL_TEST(m_unroll, unroll_test, n_dim) \ + const bool unroll_m##m_unroll = unroll_test##_m##m_unroll##n##n_dim(M, K)\ + || (m_unroll == 1); + +#define GEMM_SKINNY_DOT_UNROLL_TEST_DEFAULT(m_unroll, n_dim) \ +static inline bool unroll_test_m##m_unroll##n##n_dim(uint32_t M, uint32_t K) {\ + return true;\ +} + +#define GEMM_SKINNY_DOT_SERIAL_FUNC_NOINCINLINE(gemm, n_dim, k_mask, m_mask,\ + scratch_size, atype, btype, unroll_test) \ +__attribute__((aligned(4096))) static __thread gemm##_skinnydot_bscalar\ + blas_skinny_dot_b_scratch_##btype##n##n_dim[scratch_size];\ +void gemm##_arowmajor_bskinny_a##atype##_b##btype##_n##n_dim(\ + const gemm##_skinnydot_ascalar *A, const gemm##_skinnydot_bscalar *B,\ + gemm##_skinnydot_cscalar *C, uint32_t M, uint32_t K,\ + uint8_t b_c_order, gemm##_skinnydot_cscalar beta_inp) {\ +\ + if (K == 0) {\ + if (beta_inp != (gemm##_skinnydot_cscalar)1.0) {\ + uint64_t size = (uint64_t)M * n_dim;\ + for (uint64_t pos = 0; pos < size; ++pos) {\ + C[pos] *= beta_inp;\ + }\ + }\ + return;\ + }\ +\ + const bool b_rowmajor = b_c_order & 1;\ + const bool c_rowmajor = b_c_order & 2;\ + const uint32_t k_limit = ((scratch_size / n_dim) >> 5) << 5;\ + const uint32_t c_m_inc = c_rowmajor ? 
n_dim : 1;\
+ MACRO_EXPANSION_M_##m_mask(GEMM_SKINNY_DOT_UNROLL_TEST, unroll_test, n_dim)\
+\
+ uint32_t k_pos, k_inc;\
+ for (k_pos = 0; k_pos < K; k_pos += k_inc) {\
+ k_inc = K - k_pos;\
+ if (k_inc >= (k_limit << 1)) k_inc = k_limit;\
+ else if (k_inc > k_limit) k_inc >>= 1;\
+\
+ const gemm##_skinnydot_cscalar beta = (k_pos == 0) ? beta_inp : 1;\
+ if (n_dim == 1) {\
+ const gemm##_skinnydot_ascalar *a_ptr = A + k_pos;\
+ const gemm##_skinnydot_bscalar * const b_ptr = B + k_pos;\
+ gemm##_skinnydot_cscalar *c_ptr = C;\
+ uint32_t m_left = M;\
+ MACRO_EXPANSION_M_##m_mask(GEMM_SKINNY_DOT_INLINE_CALL_LOOP, gemm, n_dim)\
+ } else {\
+ if (b_rowmajor) {\
+ const gemm##_skinnydot_bscalar *b_ptr = B + k_pos * n_dim;\
+ gemm##_skinnydot_bscalar *sb_ptr =\
+ blas_skinny_dot_b_scratch_##btype##n##n_dim;\
+ uint32_t k_left = k_inc;\
+ MACRO_EXPANSION_M_##k_mask(GEMM_SKINNY_DOT_PACKK_BR_LOOP, n_dim)\
+ } else {\
+ const gemm##_skinnydot_bscalar *b_ptr = B + k_pos;\
+ MACRO_EXP_##n_dim(VOID_BASE, GEMM_SKINNY_DOT_PACKK_BC_INIT_BPTR_ITEM, gemm)\
+ gemm##_skinnydot_bscalar *sb_ptr =\
+ blas_skinny_dot_b_scratch_##btype##n##n_dim;\
+ uint32_t k_left = k_inc;\
+ MACRO_EXPANSION_M_##k_mask(GEMM_SKINNY_DOT_PACKK_BC_LOOP, n_dim)\
+ }\
+ const gemm##_skinnydot_ascalar *a_ptr = A + k_pos;\
+ const gemm##_skinnydot_bscalar * const b_ptr =\
+ blas_skinny_dot_b_scratch_##btype##n##n_dim;\
+ gemm##_skinnydot_cscalar *c_ptr = C;\
+ uint32_t m_left = M;\
+ MACRO_EXPANSION_M_##m_mask(GEMM_SKINNY_DOT_INLINE_CALL_LOOP, gemm, n_dim)\
+ }\
+ }\
+}
+
+/******************************************************************************
+ * Template: GEMM_SKINNY_DOT_SERIAL_FUNC
+ * Description: Construct a serial dot-based "regular * skinny" GEMM function
+ * from the general algorithm.
+ * Parameters: gemm: The type of GEMM, e.g. sgemm, hgemm, u8u32gemm, ...
+ * n_dim: The width of the skinny matrix that this function can
+ * handle. (Every such function can only process 1 width)
+ * k_mask: The sum of all supported accumulation vector widths.
+ * For example, if inline calculation functions with
+ * k_veclen = 1, 4 and 8 are available, this parameter
+ * should be 1 + 4 + 8 = 13. Note that every k_veclen
+ * should be a power of 2.
+ * m_mask: The sum of all supported unroll factors of M. During the
+ * calculation of dot values, usually several rows are
+ * read concurrently from the regular matrix to improve
+ * the ratio of arith/load. But if too many rows are loaded
+ * at the same time, there will not be enough registers to
+ * hold the dot values. So there's a balance. Let's say, if
+ * the optimal solution is to read 4 rows together in the
+ * bulk region and read one by one at the edge, this
+ * parameter can be set to 4 + 1 = 5.
+ * scratch_size: The size (number of elements) of the scratch
+ * array that holds the rearranged (packed) block from
+ * the skinny source matrix. Because the skinny
+ * source is accessed repeatedly during calculations,
+ * it's better to rearrange it to make the access
+ * to its elements fully sequential. This parameter
+ * should not exceed the capacity of the level-2 cache.
+ * atype: The data type of the regular source matrix. This parameter
+ * is only for naming the function properly so that it
+ * can be called correctly by the driver.
+ * btype: The data type of the skinny source matrix. This parameter
+ * is for naming the function only.
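+ * Note: as a sketch, an instantiation analogous to the
+ * documentation example above,
+ * GEMM_SKINNY_DOT_SERIAL_FUNC(sgemm, 2, 13, 7, 8192,
+ * float, float)
+ * would emit a serial kernel named
+ * sgemm_arowmajor_bskinny_afloat_bfloat_n2(A, B, C, M, K,
+ * b_c_order, beta)
+ * computing the Mx2 output C from the MxK row-major regular
+ * matrix A and the Kx2 skinny matrix B.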
+ *****************************************************************************/ +#define GEMM_SKINNY_DOT_SERIAL_FUNC(gemm, n_dim, k_mask, m_mask,\ + scratch_size, atype, btype) \ + MACRO_EXP_M_##m_mask(GEMM_SKINNY_DOT_INLINE_FUNC_ITEM, gemm, n_dim, k_mask)\ + MACRO_EXP_M_##m_mask(GEMM_SKINNY_DOT_UNROLL_TEST_DEFAULT, n_dim)\ + GEMM_SKINNY_DOT_SERIAL_FUNC_NOINCINLINE(gemm, n_dim, k_mask, m_mask,\ + scratch_size, atype, btype, unroll_test) + +#ifdef EMLL_SERIAL_ONLY + +#define GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(gemm, n_dim, k_mask, m_mask,\ + scratch_size, atype, btype, unroll_test) \ +GEMM_SKINNY_DOT_SERIAL_FUNC_NOINCINLINE(gemm, n_dim, k_mask, m_mask,\ + scratch_size, atype, btype, unroll_test) \ +void gemm##_arowmajor_bskinny_a##atype##_b##btype##_n##n_dim##_omp(\ + const gemm##_skinnydot_ascalar *A, const gemm##_skinnydot_bscalar *B,\ + gemm##_skinnydot_cscalar *C, uint32_t M, uint32_t K,\ + uint8_t b_c_order, gemm##_skinnydot_cscalar beta_inp, uint32_t num_threads) {\ +\ + gemm##_arowmajor_bskinny_a##atype##_b##btype##_n##n_dim(A, B, C,\ + M, K, b_c_order, beta_inp);\ +} + +#else + +#define GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(gemm, n_dim, k_mask, m_mask,\ + scratch_size, atype, btype, unroll_test) \ +GEMM_SKINNY_DOT_SERIAL_FUNC_NOINCINLINE(gemm, n_dim, k_mask, m_mask,\ + scratch_size, atype, btype, unroll_test) \ +/* in ARMv7, the arguments when creating a thread is limited to a certain */\ +/* number, so some arguments need to be wrapped into a struct to pass */\ +struct gemm##_skinnydot_a##atype##_b##btype##_##n_dim##_matrix_info {\ + const gemm##_skinnydot_ascalar *m_A;\ + const gemm##_skinnydot_bscalar *m_B;\ + gemm##_skinnydot_cscalar *m_C;\ + uint32_t m_M;\ +};\ +void gemm##_arowmajor_bskinny_a##atype##_b##btype##_n##n_dim##_omp(\ + const gemm##_skinnydot_ascalar *A, const gemm##_skinnydot_bscalar *B,\ + gemm##_skinnydot_cscalar *C, uint32_t M, uint32_t K,\ + uint8_t b_c_order, gemm##_skinnydot_cscalar beta_inp, uint32_t num_threads) {\ +\ + if (num_threads <= 1 || K == 0) {\ + gemm##_arowmajor_bskinny_a##atype##_b##btype##_n##n_dim(A, B, C,\ + M, K, b_c_order, beta_inp);\ + return;\ + }\ +\ + struct gemm##_skinnydot_a##atype##_b##btype##_##n_dim##_matrix_info thread_args;\ + thread_args.m_A = A;\ + thread_args.m_B = B;\ + thread_args.m_C = C;\ + thread_args.m_M = M;\ + /* use the tls scratch of master thread for shared buffer */\ + gemm##_skinnydot_bscalar * const b_scratch_master =\ + blas_skinny_dot_b_scratch_##btype##n##n_dim;\ + const bool b_rowmajor = b_c_order & 1;\ + const bool c_rowmajor = b_c_order & 2;\ + const uint32_t k_limit = ((scratch_size / n_dim) >> 5) << 5;\ + const uint32_t c_m_inc = c_rowmajor ? n_dim : 1;\ + MACRO_EXPANSION_M_##m_mask(GEMM_SKINNY_DOT_UNROLL_TEST, unroll_test, n_dim)\ +\ + uint32_t k_pos, k_inc;\ + for (k_pos = 0; k_pos < K; k_pos += k_inc) {\ + k_inc = K - k_pos;\ + if (k_inc >= (k_limit << 1)) k_inc = k_limit;\ + else if (k_inc > k_limit) k_inc >>= 1;\ +\ + const gemm##_skinnydot_cscalar beta = (k_pos == 0) ? 
beta_inp : 1;\ + if (n_dim == 1) {\ + uint32_t m_done = 0;\ + omp_set_num_threads(num_threads);\ + _Pragma("omp parallel")\ + {\ + uint32_t m_start, m_end;\ + while(get_irreg_task(&m_done, &m_start, &m_end,\ + ((((M - m_done) / num_threads) >> 2) / MACRO_EXP_M_FIRSTITEM_##m_mask + 1)\ + * MACRO_EXP_M_FIRSTITEM_##m_mask, M)) {\ + const gemm##_skinnydot_ascalar *a_ptr = A + k_pos + m_start * K;\ + const gemm##_skinnydot_bscalar * const b_ptr = B + k_pos;\ + gemm##_skinnydot_cscalar *c_ptr = C + m_start;\ + uint32_t m_left = m_end - m_start;\ + MACRO_EXPANSION_M_##m_mask(GEMM_SKINNY_DOT_INLINE_CALL_LOOP, gemm, n_dim)\ + }\ + }\ + } else {\ + uint32_t m_done = 0;\ + uint32_t k_left_shared = k_inc;\ + omp_set_num_threads(num_threads);\ + _Pragma("omp parallel")\ + {\ + const gemm##_skinnydot_ascalar * const A = thread_args.m_A;\ + const gemm##_skinnydot_bscalar * const B = thread_args.m_B;\ + gemm##_skinnydot_cscalar * const C = thread_args.m_C;\ + const uint32_t M = thread_args.m_M;\ + uint32_t k_start, k_end;\ + while(get_copy_task(&k_left_shared, MACRO_EXP_M_FIRSTITEM_##k_mask << 3,\ + &k_start, &k_end)) {\ + if (b_rowmajor) {\ + const gemm##_skinnydot_bscalar *b_ptr = B + (k_pos + k_start) * n_dim;\ + gemm##_skinnydot_bscalar *sb_ptr = b_scratch_master + k_start * n_dim;\ + uint32_t k_left = k_end - k_start;\ + MACRO_EXPANSION_M_##k_mask(GEMM_SKINNY_DOT_PACKK_BR_LOOP, n_dim)\ + } else {\ + const gemm##_skinnydot_bscalar *b_ptr = B + k_pos + k_start;\ + MACRO_EXP_##n_dim(VOID_BASE, GEMM_SKINNY_DOT_PACKK_BC_INIT_BPTR_ITEM, gemm)\ + gemm##_skinnydot_bscalar *sb_ptr = b_scratch_master + k_start * n_dim;\ + uint32_t k_left = k_end - k_start;\ + MACRO_EXPANSION_M_##k_mask(GEMM_SKINNY_DOT_PACKK_BC_LOOP, n_dim)\ + }\ + }\ + _Pragma("omp barrier")\ + uint32_t m_start, m_end;\ + while(get_irreg_task(&m_done, &m_start, &m_end,\ + ((((M - m_done) / num_threads) >> 2) / MACRO_EXP_M_FIRSTITEM_##m_mask + 1)\ + * MACRO_EXP_M_FIRSTITEM_##m_mask, M)) {\ + const gemm##_skinnydot_ascalar *a_ptr = A + k_pos + m_start * K;\ + const gemm##_skinnydot_bscalar * const b_ptr = b_scratch_master;\ + gemm##_skinnydot_cscalar *c_ptr = C + c_m_inc * m_start;\ + uint32_t m_left = m_end - m_start;\ + MACRO_EXPANSION_M_##m_mask(GEMM_SKINNY_DOT_INLINE_CALL_LOOP,\ + gemm, n_dim)\ + }\ + }\ + }\ + }\ +} + +#endif + +/****************************************************************************** + * Template: GEMM_SKINNY_DOT_PARALLEL_FUNC + * Description: Construct dot-based "regular * skinny" GEMM function + * paralleled by OpenMP. + * Parameters: the same as in GEMM_SKINNY_DOT_SERIAL_FUNC + *****************************************************************************/ +#define GEMM_SKINNY_DOT_PARALLEL_FUNC(gemm, n_dim, k_mask, m_mask,\ + scratch_size, atype, btype) \ + MACRO_EXP_M_##m_mask(GEMM_SKINNY_DOT_INLINE_FUNC_ITEM, gemm, n_dim, k_mask)\ + MACRO_EXP_M_##m_mask(GEMM_SKINNY_DOT_UNROLL_TEST_DEFAULT, n_dim)\ + GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(gemm, n_dim, k_mask, m_mask,\ + scratch_size, atype, btype, unroll_test) + +#endif diff --git a/include/common/CommonSkinnyGer.h b/include/common/CommonSkinnyGer.h new file mode 100644 index 0000000..b0af350 --- /dev/null +++ b/include/common/CommonSkinnyGer.h @@ -0,0 +1,526 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+/******************************************************************************
+* File: CommonSkinnyGer.h
+* Description: Common building blocks for regular * skinny or skinny * regular
+* matmul when the regular matrix is column-major in the former
+* case or row-major in the latter case. These 2 kinds of matmul
+* involving skinny matrices require a special efficient kernel
+* different from that in regular * regular matmul. Specifically,
+* the regular matrix is no longer reordered (packed) during
+* calculation. Elements from the regular matrix are accessed
+* sequentially and only once. The GEMM calculation is decomposed
+* into sequential GER operations rather than DOT ones. The
+* output matrix is accessed repeatedly in GER operations so
+* it is always packed in a scratch array.
+* Extension: To support a new CPU architecture, the following tasks should
+* be done in addition to including this header:
+* (1) Use typedef to define _skinnyger_[a/b/c]scalar and
+* _skinnyger_[a/b/c]vec[\d]. For example, when
+* developing avx2 SGEMM regular*skinny kernels, the following
+* lines should be added when the maximum vector length is 8
+* in M and 4 in K:
+* typedef float sgemm_skinnyger_ascalar;
+* typedef float sgemm_skinnyger_bscalar;
+* typedef float sgemm_skinnyger_cscalar;
+* // M vec length up to 8
+* typedef float sgemm_skinnyger_avec1;
+* typedef __m128 sgemm_skinnyger_avec4;
+* typedef __m256 sgemm_skinnyger_avec8;
+* typedef float sgemm_skinnyger_cvec1;
+* typedef __m128 sgemm_skinnyger_cvec4;
+* typedef __m256 sgemm_skinnyger_cvec8;
+* // K vec length up to 4
+* typedef float sgemm_skinnyger_bvec1;
+* typedef __m128 sgemm_skinnyger_bvec4;
+* (2) Implement inline functions for basic vector-scalar
+* multiply-add operations. Here is an example of
+* an inline function of avx2 SGEMM with
+* m_veclen = 8, k_veclen = 4 and k_laneid = 3,
+* which multiplies each element in a_vec with the
+* element at lane 3 in b_vec and adds the result
+* to the corresponding element in c_vec:
+* GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 4, 3) {
+* __m256 b_v0 = _mm256_broadcast_ss((float*)&b_vec + 2);
+* return _mm256_fmadd_ps(a_vec, b_v0, c_vec);
+* }
+* For every combination of m_veclen and k_veclen,
+* all related inline multiply-add functions
+* with k_laneid from 1 to k_veclen should be implemented.
+* (3) Implement load and store inline functions for matrices
+* a/b/c like this (one example per category):
+* // the 3 types of functions below should be written
+* // for each m_veclen
+* GEMM_SKINNY_GER_LOADA_UNIT(sgemm, 8) {
+* _mm_prefetch((char *)(a_ptr + 24), _MM_HINT_T0);
+* return _mm256_loadu_ps(a_ptr);
+* }
+* GEMM_SKINNY_GER_LOADC_UNIT(sgemm, 8) {
+* return _mm256_loadu_ps(c_ptr);
+* }
+* GEMM_SKINNY_GER_STOREC_UNIT(sgemm, 8) {
+* _mm256_storeu_ps(c_ptr, c_vec);
+* }
+* // the 2 types of functions below should be written
+* // for each k_veclen
+* GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(sgemm, 4) {
+* float e0 = *b_ptr; b_ptr += ldb;
+* float e1 = *b_ptr; b_ptr += ldb;
+* float e2 = *b_ptr; b_ptr += ldb;
+* float e3 = *b_ptr;
+* return _mm_set_ps(e3, e2, e1, e0);
+* }
+* GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(sgemm, 4) {
+* return _mm_loadu_ps(b_ptr);
+* }
+* (4) Finally build kernel functions from the inline functions
+* defined above. For each kernel function only 1 line
+* is needed. The following line defines regular*skinny
+* kernel functions (serial and OpenMP) for the minimum
+* dimension length = 3 with m_veclen = {1, 4, 8}
+* and k_veclen = {1, 4}:
+* GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 3, 5, 13, 8192,
+* float, float)
+* The last 2 parameters in the macro are for function
+* name mangling, providing the data types for the regular
+* and skinny matrices respectively. The last number in the
+* macro parameters (8192) specifies the scratch size
+* for the output matrix, which should be adjusted to the size
+* of the L1 cache. The second number (5) is the sum of all
+* implemented k_veclen values. The third number (13) is
+* the sum of all m_veclen values implemented.
+******************************************************************************/
+
+#define D_SCALE 2 //dynamic scaling factor in scheduling
+#include "common/ExpandMacro.h"
+#include "common/CommonSched.h"
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef EMLL_SERIAL_ONLY
+#include <omp.h>
+#endif
+
+#ifndef INCLUDE_COMMON_SKINNY_GER
+#define INCLUDE_COMMON_SKINNY_GER
+
+/* GEMM_SKINNY_GER_XXX_UNIT: basic computation units in the skinny_ger functions */
+/* below are only the headers of these 6 functions */
+/* the function bodies should be provided according to the CPU arch */
+
+#define GEMM_SKINNY_GER_CALC_UNIT(gemm, m_vlen, k_vlen, k_id) \
+static inline gemm##_skinnyger_cvec##m_vlen\
+ inline_##gemm##_acolmajor_bskinny_fma_unit_m##m_vlen##_kid##k_id##in##k_vlen(\
+ gemm##_skinnyger_cvec##m_vlen c_vec,\
+ gemm##_skinnyger_avec##m_vlen a_vec,\
+ gemm##_skinnyger_bvec##k_vlen b_vec)
+/* you should give a vectorized implementation equivalent to this:
+ * GEMM_SKINNY_GER_CALC_UNIT(gemm, m_vlen, k_vlen, k_id) {
+ * gemm##_skinnyger_cvec##m_vlen ret;
+ * for (int i = 0; i < m_vlen; ++i) {
+ * ret[i] = c_vec[i] + a_vec[i] * b_vec[k_id - 1];
+ * }
+ * return ret;
+ * }
+ */
+
+#define GEMM_SKINNY_GER_LOADA_UNIT(gemm, m_vlen) \
+static inline gemm##_skinnyger_avec##m_vlen\
+ inline_##gemm##_acolmajor_bskinny_loada_unit_m##m_vlen(\
+ const gemm##_skinnyger_ascalar *a_ptr)
+/* you should give a vectorized implementation equivalent to this:
+ * GEMM_SKINNY_GER_LOADA_UNIT(gemm, m_vlen) {
+ * gemm##_skinnyger_avec##m_vlen ret;
+ * for (int i = 0; i < m_vlen; ++i) {
+ * ret[i] = a_ptr[i];
+ * }
+ * prefetch(a_ptr + pref_distance);
+ * return ret;
+ * }
+ */
+
+#define GEMM_SKINNY_GER_LOADC_UNIT(gemm, m_vlen) \
+static inline gemm##_skinnyger_cvec##m_vlen\
+ inline_##gemm##_acolmajor_bskinny_loadc_unit_m##m_vlen(\
+ const gemm##_skinnyger_cscalar *c_ptr)
+/* you should give
a vectorized implementation equivalent to this:
+ * GEMM_SKINNY_GER_LOADC_UNIT(gemm, m_vlen) {
+ * gemm##_skinnyger_cvec##m_vlen ret;
+ * for (int i = 0; i < m_vlen; ++i) {
+ * ret[i] = c_ptr[i];
+ * }
+ * return ret;
+ * }
+ */
+
+#define GEMM_SKINNY_GER_STOREC_UNIT(gemm, m_vlen) \
+static inline void\
+ inline_##gemm##_acolmajor_bskinny_storec_unit_m##m_vlen(\
+ gemm##_skinnyger_cscalar *c_ptr,\
+ gemm##_skinnyger_cvec##m_vlen c_vec)
+/* you should give a vectorized implementation equivalent to this:
+ * GEMM_SKINNY_GER_STOREC_UNIT(gemm, m_vlen) {
+ * for (int i = 0; i < m_vlen; ++i) {
+ * c_ptr[i] = c_vec[i];
+ * }
+ * }
+ */
+
+#define GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(gemm, k_vlen) \
+static inline gemm##_skinnyger_bvec##k_vlen\
+ inline_##gemm##_acolmajor_bskinny_loadb_browmajor_unit_k##k_vlen(\
+ const gemm##_skinnyger_bscalar *b_ptr, uint32_t ldb)
+/* you should give an optimized implementation equivalent to this:
+ * GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(gemm, k_vlen) {
+ * gemm##_skinnyger_bvec##k_vlen ret;
+ * for (int i = 0; i < k_vlen; ++i) {
+ * ret[i] = *b_ptr; b_ptr += ldb;
+ * }
+ * return ret;
+ * }
+ */
+
+#define GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(gemm, k_vlen) \
+static inline gemm##_skinnyger_bvec##k_vlen\
+ inline_##gemm##_acolmajor_bskinny_loadb_bcolmajor_unit_k##k_vlen(\
+ const gemm##_skinnyger_bscalar *b_ptr)
+/* you should give a vectorized implementation equivalent to this:
+ * GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(gemm, k_vlen) {
+ * gemm##_skinnyger_bvec##k_vlen ret;
+ * for (int i = 0; i < k_vlen; ++i) {
+ * ret[i] = b_ptr[i];
+ * }
+ * return ret;
+ * }
+ */
+
+
+/* construct skinny_ger functions from computation units */
+#define GEMM_SKINNY_GER_CALC_UNIT_ITEM(n_id, gemm, m_vlen, k_vlen, k_id) \
+ c##m_vlen##_##n_id =\
+ inline_##gemm##_acolmajor_bskinny_fma_unit_m##m_vlen##_kid##k_id##in##k_vlen(\
+ c##m_vlen##_##n_id, a##m_vlen##_##k_id, b##k_vlen##_##n_id);
+
+#define GEMM_SKINNY_GER_CALC_UNIT_K1(k_id, gemm, m_vlen, k_vlen, n_dim) \
+ const gemm##_skinnyger_avec##m_vlen a##m_vlen##_##k_id =\
+ inline_##gemm##_acolmajor_bskinny_loada_unit_m##m_vlen(a_ptr##k_id);\
+ a_ptr##k_id += m_vlen;\
+ MACRO_EXPANSION_##n_dim(VOID_BASE, GEMM_SKINNY_GER_CALC_UNIT_ITEM,\
+ gemm, m_vlen, k_vlen, k_id)
+
+#define GEMM_SKINNY_GER_LOADC_ITEM(n_id, gemm, m_vlen) \
+ gemm##_skinnyger_cvec##m_vlen c##m_vlen##_##n_id =\
+ inline_##gemm##_acolmajor_bskinny_loadc_unit_m##m_vlen(\
+ c_ptr + (n_id - 1) * m_vlen);
+
+#define GEMM_SKINNY_GER_STOREC_ITEM(n_id, gemm, m_vlen) \
+ inline_##gemm##_acolmajor_bskinny_storec_unit_m##m_vlen(\
+ c_ptr + (n_id - 1) * m_vlen, c##m_vlen##_##n_id);
+
+#define GEMM_SKINNY_GER_COMPUTE_BLOCK(gemm, m_vlen, k_vlen, n_dim) \
+ MACRO_EXPANSION_##n_dim(VOID_BASE,\
+ GEMM_SKINNY_GER_LOADC_ITEM, gemm, m_vlen)\
+ MACRO_EXP_##k_vlen(VOID_BASE,\
+ GEMM_SKINNY_GER_CALC_UNIT_K1, gemm, m_vlen, k_vlen, n_dim)\
+ MACRO_EXPANSION_##n_dim(VOID_BASE,\
+ GEMM_SKINNY_GER_STOREC_ITEM, gemm, m_vlen)
+
+#define GEMM_SKINNY_GER_COMPUTE_BLOCK_LOOP(\
+ m_vlen, gemm, k_vlen, n_dim) \
+ for (; m_left >= m_vlen; m_left -= m_vlen) {\
+ GEMM_SKINNY_GER_COMPUTE_BLOCK(gemm, m_vlen, k_vlen, n_dim) \
+ c_ptr += n_dim * m_vlen;\
+ }
+
+#define GEMM_SKINNY_GER_DECLARE_B_ITEM(n_id, gemm, k_vlen) \
+ gemm##_skinnyger_bvec##k_vlen b##k_vlen##_##n_id;
+
+#define GEMM_SKINNY_GER_LOADB_BROWMAJOR_ITEM(n_id, gemm, k_vlen) \
+ b##k_vlen##_##n_id =\
+ inline_##gemm##_acolmajor_bskinny_loadb_browmajor_unit_k##k_vlen(\
+ b_ptr, LDB); b_ptr++;
+
+#define GEMM_SKINNY_GER_LOADB_BCOLMAJOR_ITEM(n_id, gemm, k_vlen) \
+ b##k_vlen##_##n_id =\
+ inline_##gemm##_acolmajor_bskinny_loadb_bcolmajor_unit_k##k_vlen(b_ptr);\ + b_ptr += LDB; + +#define GEMM_SKINNY_GER_INIT_APTR_ITEM(k_id, gemm) \ + const gemm##_skinnyger_ascalar *a_ptr##k_id = a_ptr + (k_id - 1) * LDA; + +/* define valid inline function */ +#define GEMM_SKINNY_GER_INLINE_FUNC(gemm, n_dim, k_vlen, m_mask) \ +static inline void inline_##gemm##_acolmajor_bskinny_k##k_vlen##n##n_dim(\ + const gemm##_skinnyger_ascalar *a_ptr,\ + const gemm##_skinnyger_bscalar *b_ptr,\ + gemm##_skinnyger_cscalar *c_ptr,\ + uint32_t m_left, uint32_t LDA, uint32_t LDB, bool b_rowmajor) {\ +\ + MACRO_EXP_##k_vlen(VOID_BASE, GEMM_SKINNY_GER_INIT_APTR_ITEM, gemm)\ + MACRO_EXP_##n_dim(VOID_BASE, GEMM_SKINNY_GER_DECLARE_B_ITEM, gemm, k_vlen)\ + if (b_rowmajor) {\ + MACRO_EXP_##n_dim(VOID_BASE,\ + GEMM_SKINNY_GER_LOADB_BROWMAJOR_ITEM, gemm, k_vlen)\ + } else {\ + MACRO_EXP_##n_dim(VOID_BASE,\ + GEMM_SKINNY_GER_LOADB_BCOLMAJOR_ITEM, gemm, k_vlen)\ + }\ +\ + MACRO_EXP_M_##m_mask(GEMM_SKINNY_GER_COMPUTE_BLOCK_LOOP,\ + gemm, k_vlen, n_dim)\ +} + +#define GEMM_SKINNY_GER_INLINE_FUNC_ITEM(k_vlen, gemm, n_dim, m_mask)\ + GEMM_SKINNY_GER_INLINE_FUNC(gemm, n_dim, k_vlen, m_mask) + +#define GEMM_SKINNY_GER_INLINE_FUNCS(gemm, n_dim, k_mask, m_mask) \ + MACRO_EXPANSION_M_##k_mask(GEMM_SKINNY_GER_INLINE_FUNC_ITEM, gemm, n_dim, m_mask) + +#define GEMM_SKINNY_GER_INLINE_CALL_LOOP(k_vlen, gemm, n_dim) \ + for (; k_left >= k_vlen; k_left -= k_vlen) {\ + inline_##gemm##_acolmajor_bskinny_k##k_vlen##n##n_dim(\ + a_ptr, b_ptr, c_scratch, m_inc, M, LDB, b_rowmajor);\ + a_ptr += k_vlen * M;\ + b_ptr += k_vlen * b_k_inc;\ + } + +#define GEMM_SKINNY_GER_BETA_FUNC(gemm, n_dim) \ +static inline void inline_##gemm##_acolmajor_bskinny_beta_##n_dim(\ + gemm##_skinnyger_cscalar *c_ptr, uint32_t M,\ + gemm##_skinnyger_cscalar beta) {\ +\ + if (beta == (gemm##_skinnyger_cscalar)1.0) {\ + return;\ + }\ +\ + uint64_t size = (uint64_t)M * n_dim;\ + for (; size > 7; size -= 8) {\ + c_ptr[0] *= beta; c_ptr[1] *= beta;\ + c_ptr[2] *= beta; c_ptr[3] *= beta;\ + c_ptr[4] *= beta; c_ptr[5] *= beta;\ + c_ptr[6] *= beta; c_ptr[7] *= beta;\ + c_ptr += 8;\ + }\ + for (; size > 0; size--) {\ + *c_ptr *= beta;\ + c_ptr++;\ + }\ +} + +/* params atype & btype here are for function name mangling only */ +#define GEMM_SKINNY_GER_SERIAL_FUNC(gemm, n_dim,\ + k_mask, m_mask, stack_size, atype, btype) \ +GEMM_SKINNY_GER_BETA_FUNC(gemm, n_dim)\ +GEMM_SKINNY_GER_INLINE_FUNCS(gemm, n_dim, k_mask, m_mask)\ +__attribute__((aligned(4096))) static __thread gemm##_skinnyger_cscalar\ + gemm##_acolmajor_bskinny_a##atype##_b##btype##_##n_dim##_cscratch[stack_size];\ +GEMM_SKINNY_GER_INLINE_DEPACK_FUNC(gemm, m_mask, n_dim)\ +void gemm##_acolmajor_bskinny_a##atype##_b##btype##_n##n_dim(\ + const gemm##_skinnyger_ascalar *A,\ + const gemm##_skinnyger_bscalar *B,\ + gemm##_skinnyger_cscalar *C,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + gemm##_skinnyger_cscalar beta_inp) {\ +\ + const bool b_rowmajor = b_c_order & 1;\ + const bool c_rowmajor = b_c_order & 2;\ + const uint32_t b_k_inc = b_rowmajor ? n_dim : 1;\ + const uint32_t LDB = b_rowmajor ? 
n_dim : K;\ +\ + if (n_dim == 1) {\ + uint32_t k_left = K;\ + const uint32_t m_inc = M;\ + const gemm##_skinnyger_ascalar *a_ptr = A;\ + const gemm##_skinnyger_bscalar *b_ptr = B;\ + gemm##_skinnyger_cscalar *c_scratch = C;\ + inline_##gemm##_acolmajor_bskinny_beta_##n_dim(c_scratch, M, beta_inp);\ + MACRO_EXP_M_##k_mask(GEMM_SKINNY_GER_INLINE_CALL_LOOP, gemm, 1)\ + return;\ + }\ +\ + const uint32_t m_limit = ((stack_size / n_dim) >> 5) << 5;\ + uint32_t m_pos, m_inc;\ + for (m_pos = 0; m_pos < M; m_pos += m_inc) {\ + m_inc = M - m_pos;\ + if (m_inc >= (m_limit << 1)) m_inc = m_limit;\ + else if (m_inc > m_limit) m_inc >>= 1;\ + uint32_t k_left = K;\ + const gemm##_skinnyger_ascalar *a_ptr = A + m_pos;\ + const gemm##_skinnyger_bscalar *b_ptr = B;\ + gemm##_skinnyger_cscalar *c_scratch =\ + gemm##_acolmajor_bskinny_a##atype##_b##btype##_##n_dim##_cscratch;\ + memset(c_scratch, 0, m_inc * n_dim * sizeof(gemm##_skinnyger_cscalar));\ + MACRO_EXP_M_##k_mask(GEMM_SKINNY_GER_INLINE_CALL_LOOP, gemm, n_dim)\ + inline_##gemm##_acolmajor_bskinny_depack_c_n##n_dim(c_rowmajor, C,\ + c_scratch, M, m_pos, m_inc, beta_inp);\ + }\ +} + +#ifdef EMLL_SERIAL_ONLY + +#define GEMM_SKINNY_GER_PARALLEL_FUNC(gemm, n_dim,\ + k_mask, m_mask, stack_size, atype, btype) \ +GEMM_SKINNY_GER_SERIAL_FUNC(gemm, n_dim, k_mask, m_mask, stack_size, atype, btype)\ +void gemm##_acolmajor_bskinny_a##atype##_b##btype##_n##n_dim##_omp(\ + const gemm##_skinnyger_ascalar *A,\ + const gemm##_skinnyger_bscalar *B,\ + gemm##_skinnyger_cscalar *C,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + gemm##_skinnyger_cscalar beta_inp, uint32_t num_threads) {\ +\ + gemm##_acolmajor_bskinny_a##atype##_b##btype##_n##n_dim(\ + A, B, C, M, K, b_c_order, beta_inp);\ +} + +#else + +/* params atype & btype here are for function name mangling only */ +#define GEMM_SKINNY_GER_PARALLEL_FUNC(gemm, n_dim,\ + k_mask, m_mask, stack_size, atype, btype) \ +struct gemm##_skinnyger_a##atype##_b##btype##_n##n_dim##_info {\ + const gemm##_skinnyger_ascalar *m_A;\ + const gemm##_skinnyger_bscalar *m_B;\ + gemm##_skinnyger_cscalar *m_C;\ + uint32_t m_M;\ +};\ +GEMM_SKINNY_GER_SERIAL_FUNC(gemm, n_dim, k_mask, m_mask, stack_size, atype, btype)\ +void gemm##_acolmajor_bskinny_a##atype##_b##btype##_n##n_dim##_omp(\ + const gemm##_skinnyger_ascalar *A,\ + const gemm##_skinnyger_bscalar *B,\ + gemm##_skinnyger_cscalar *C,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + gemm##_skinnyger_cscalar beta_inp, uint32_t num_threads) {\ +\ + if (num_threads <= 1) {\ + gemm##_acolmajor_bskinny_a##atype##_b##btype##_n##n_dim(\ + A, B, C, M, K, b_c_order, beta_inp);\ + return;\ + }\ +\ + inline_##gemm##_acolmajor_bskinny_beta_##n_dim(C, M, beta_inp);\ + const bool b_rowmajor = b_c_order & 1;\ + const bool c_rowmajor = b_c_order & 2;\ + const uint32_t b_k_inc = b_rowmajor ? n_dim : 1;\ + const uint32_t LDB = b_rowmajor ? n_dim : K;\ + const uint32_t m_limit = ((stack_size / n_dim) >> 5) << 5;\ + const uint32_t m_task_min = m_limit >= 256 ? 
256 : m_limit;\ + const uint64_t m_k_task_min = (16ULL << 32) | (uint64_t)m_task_min;\ + const uint64_t m_k_pos_max = ((uint64_t)K << 32) | (uint64_t)M;\ + uint64_t task_end = 0;\ +\ + struct gemm##_skinnyger_a##atype##_b##btype##_n##n_dim##_info task_info;\ + task_info.m_A = A;\ + task_info.m_B = B;\ + task_info.m_C = C;\ + task_info.m_M = M;\ +\ + omp_set_num_threads(num_threads);\ + _Pragma("omp parallel")\ + {\ + const gemm##_skinnyger_ascalar * const A = task_info.m_A;\ + const gemm##_skinnyger_bscalar * const B = task_info.m_B;\ + gemm##_skinnyger_cscalar * const C = task_info.m_C;\ + const uint32_t M = task_info.m_M;\ + uint32_t m_start, k_start, m_end, k_end, m_start_old, m_inc_old;\ + m_start_old = M; m_inc_old = 0;\ + gemm##_skinnyger_cscalar * const c_scratch = \ + gemm##_acolmajor_bskinny_a##atype##_b##btype##_##n_dim##_cscratch;\ + while(get_mn_task(&task_end, &m_start, &k_start, &m_end, &k_end,\ + m_k_task_min, m_limit, 0, m_k_pos_max, num_threads)) {\ +\ + uint32_t k_left = k_end - k_start;\ + const uint32_t m_inc = m_end - m_start;\ + const gemm##_skinnyger_ascalar *a_ptr = A + k_start * M + m_start;\ + const gemm##_skinnyger_bscalar *b_ptr = B + k_start * b_k_inc;\ + if (m_start != m_start_old) {\ + if (m_inc_old > 0) {\ + _Pragma("omp critical")\ + {\ + inline_##gemm##_acolmajor_bskinny_depack_c_n##n_dim(c_rowmajor, C,\ + c_scratch, M, m_start_old, m_inc_old, 1);\ + }\ + }\ + memset(c_scratch, 0, m_inc * n_dim * sizeof(gemm##_skinnyger_cscalar));\ + }\ + MACRO_EXP_M_##k_mask(GEMM_SKINNY_GER_INLINE_CALL_LOOP, gemm, n_dim)\ + m_start_old = m_start; m_inc_old = m_inc;\ + }\ + if (m_inc_old > 0) {\ + _Pragma("omp critical")\ + {\ + inline_##gemm##_acolmajor_bskinny_depack_c_n##n_dim(c_rowmajor, C,\ + c_scratch, M, m_start_old, m_inc_old, 1);\ + }\ + }\ + }\ +} + +#endif + +#define GEMM_SKINNY_GER_DEPACK_CRM_LOW_ITEM(n_id, m_id, m_vlen, n_dim) \ + c_wt[(m_id - 1) * n_dim + n_id - 1] =\ + c_wt[(m_id - 1) * n_dim + n_id - 1] * beta +\ + c_rd[(m_id - 1) + (n_id - 1) * m_vlen]; + +#define GEMM_SKINNY_GER_DEPACK_CRM_MID_ITEM(m_id, m_vlen, n_dim) \ + MACRO_EXPANSION_##n_dim(VOID_BASE,\ + GEMM_SKINNY_GER_DEPACK_CRM_LOW_ITEM, m_id, m_vlen, n_dim) + +#define GEMM_SKINNY_GER_DEPACK_CRM_BLOCK_LOOP(m_vlen, gemm, n_dim) \ + for (; m_left >= m_vlen; m_left -= m_vlen) {\ + MACRO_EXP_##m_vlen(VOID_BASE,\ + GEMM_SKINNY_GER_DEPACK_CRM_MID_ITEM, m_vlen, n_dim)\ + c_wt += m_vlen * n_dim;\ + c_rd += m_vlen * n_dim;\ + } + +#define GEMM_SKINNY_GER_DEPACK_CCM_LOW_ITEM(m_id, n_id, m_vlen) \ + c_wt1[m_id - 1] = c_wt1[m_id - 1] * beta +\ + c_rd[(n_id - 1) * m_vlen + m_id - 1]; + +#define GEMM_SKINNY_GER_DEPACK_CCM_MID_ITEM(n_id, m_vlen) \ + MACRO_EXPANSION_##m_vlen(VOID_BASE,\ + GEMM_SKINNY_GER_DEPACK_CCM_LOW_ITEM, n_id, m_vlen)\ + c_wt1 += M; + +#define GEMM_SKINNY_GER_DEPACK_CCM_BLOCK_LOOP(m_vlen, gemm, n_dim) \ + for (; m_left >= m_vlen; m_left -= m_vlen) {\ + gemm##_skinnyger_cscalar *c_wt1 = c_wt;\ + MACRO_EXP_##n_dim(VOID_BASE,\ + GEMM_SKINNY_GER_DEPACK_CCM_MID_ITEM, m_vlen)\ + c_wt += m_vlen;\ + c_rd += m_vlen * n_dim;\ + } + +#define GEMM_SKINNY_GER_INLINE_DEPACK_FUNC(gemm, m_mask, n_dim) \ +static void inline_##gemm##_acolmajor_bskinny_depack_c_n##n_dim(\ + bool c_rowmajor, gemm##_skinnyger_cscalar * __restrict__ C,\ + const gemm##_skinnyger_cscalar * __restrict__ c_scratch,\ + uint32_t M, uint32_t m_pos, uint32_t m_left,\ + gemm##_skinnyger_cscalar beta) {\ +\ + const gemm##_skinnyger_cscalar *c_rd = c_scratch;\ + if (c_rowmajor) {\ + gemm##_skinnyger_cscalar *c_wt = C + m_pos * n_dim;\ + 
MACRO_EXP_M_##m_mask(GEMM_SKINNY_GER_DEPACK_CRM_BLOCK_LOOP, gemm, n_dim)\
+ } else {\
+ gemm##_skinnyger_cscalar *c_wt = C + m_pos;\
+ MACRO_EXP_M_##m_mask(GEMM_SKINNY_GER_DEPACK_CCM_BLOCK_LOOP, gemm, n_dim)\
+ }\
+}
+
+#endif
diff --git a/include/common/CommonTest.h b/include/common/CommonTest.h
new file mode 100644
index 0000000..d0ca181
--- /dev/null
+++ b/include/common/CommonTest.h
@@ -0,0 +1,620 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+/******************************************************************************
+ * File: CommonTest.h
+ * Description: Common test framework for GEMM/Bias/Quantization functions
+ * Usage: Include this header, then define test functions via the macros,
+ * and finally call the test functions from main(). Please refer
+ * to test/Test*.c for examples.
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifndef INCLUDE_COMMON_TEST
+#define INCLUDE_COMMON_TEST
+
+#define STD_GEMM_AR_BC_CC(atype, btype, ctype, A, B, C, M, N, K, beta) {\
+ for (uint32_t n_pos = 0; n_pos < (N); ++n_pos) {\
+ ctype *c_ptr = (C) + n_pos * (M);\
+ const btype *b_ptr = (B) + n_pos * (K);\
+ for (uint32_t m_pos = 0; m_pos < (M); ++m_pos) {\
+ const atype *a_ptr = (A) + m_pos * (K);\
+ ctype sum = (ctype)0.0f;\
+ for (uint32_t k_pos = 0; k_pos < (K); ++k_pos) {\
+ sum += (ctype)a_ptr[k_pos] * (ctype)b_ptr[k_pos];\
+ }\
+ c_ptr[m_pos] = c_ptr[m_pos] * beta + sum;\
+ }\
+ }\
+}
+
+/* src: row-major; dst: column-major */
+#define STD_TRANSPOSE(T, src, dst, src_rows, src_cols) {\
+ for (uint32_t src_row_pos = 0; src_row_pos < src_rows; ++src_row_pos) {\
+ const T *src_ptr = src + src_row_pos * src_cols;\
+ T *dst_ptr = dst + src_row_pos;\
+ for (uint32_t src_col_pos = 0; src_col_pos < src_cols; ++src_col_pos) {\
+ dst_ptr[src_col_pos * src_rows] = src_ptr[src_col_pos];\
+ }\
+ }\
+}
+
+/* matrix C is column-major */
+#define STD_GEMM(gemmtype, atype, btype, ctype) \
+void std_##gemmtype(const atype *A, const btype *B, ctype *C,\
+ uint32_t M, uint32_t N, uint32_t K,\
+ bool a_rowmajor, bool b_colmajor, ctype beta) {\
+ atype *A_mat = NULL; const atype *A_rd = A;\
+ if (!a_rowmajor) {\
+ A_mat = (atype *)malloc(M * K * sizeof(atype));\
+ STD_TRANSPOSE(atype, A, A_mat, K, M)\
+ A_rd = A_mat;\
+ }\
+ btype *B_mat = NULL; const btype *B_rd = B;\
+ if (!b_colmajor) {\
+ B_mat = (btype *)malloc(N * K * sizeof(btype));\
+ STD_TRANSPOSE(btype, B, B_mat, K, N)\
+ B_rd = B_mat;\
+ }\
+ STD_GEMM_AR_BC_CC(atype, btype, ctype, A_rd, B_rd, C, M, N, K, beta)\
+ if (A_mat) free(A_mat);\
+ if (B_mat) free(B_mat);\
+}
+
+/* produce a random number a/b, where a is a random integer in [-c, c] */
+/* c = dividend_abs_max; b = divisor */
+#define STD_RAND(T, dat, size,
+
+/* produce random numbers of the form a/b, where a is a random integer */
+/* in [-c, c]; c = dividend_abs_max, b = divisor. If dividend_abs_max */
+/* is negative, a is drawn from [0, 2*|c|] instead, which keeps the */
+/* generated values non-negative (useful for unsigned types). */
+#define STD_RAND(T, dat, size, dividend_abs_max, divisor) {\
+ const int32_t abs_max_get = (dividend_abs_max) < 0 ? \
+ -(dividend_abs_max) : (dividend_abs_max);\
+ const int32_t offset_get = (dividend_abs_max) < 0 ? \
+ 0 : (dividend_abs_max);\
+ for (uint64_t pos = 0; pos < (size); ++pos) {\
+ int32_t rand_i = rand() % (2 * abs_max_get + 1);\
+ rand_i -= offset_get;\
+ float rand_f = (float)rand_i / (float)(divisor);\
+ *((dat) + pos) = (T)rand_f;\
+ }\
+}
+
+#define STD_MAXDIFF(T, max, dat1, dat2, size) {\
+ T tmp;\
+ max = (T)0.0f;\
+ for (uint64_t pos = 0; pos < (size); ++pos) {\
+ tmp = (*((dat2) + pos)) - (*((dat1) + pos));\
+ if (tmp < 0) tmp = (T)0.0f - tmp;\
+ if (tmp > max) max = tmp;\
+ }\
+}
+
+#define SRC_SIZE 160000000
+
+#define STD_TEST(gemmtype, btype, atype, ctype, dividend_abs_max, divisor) \
+STD_GEMM(gemmtype, atype, btype, ctype)\
+typedef int (*TestFunc_##gemmtype)(int, int, const atype*, const btype*, ctype*,\
+ uint32_t, uint32_t, uint32_t, ctype, uint32_t);\
+void std_test_##gemmtype(TestFunc_##gemmtype test_gemm,\
+ uint32_t M, uint32_t N, uint32_t K, uint8_t transAB,\
+ ctype beta, uint32_t num_threads) {\
+\
+ const int b_rowmajor = transAB & 2;\
+ const int a_rowmajor = transAB & 1;\
+\
+ const uint64_t a_size = (uint64_t)M * (uint64_t)K;\
+ const uint64_t b_size = (uint64_t)N * (uint64_t)K;\
+ const uint64_t c_size = (uint64_t)M * (uint64_t)N;\
+ const uint64_t iters = (uint64_t)SRC_SIZE / \
+ (a_size * sizeof(atype) + b_size * sizeof(btype) + 1);\
+ if (iters == 0) {\
+ printf("Problem size too large. return.\n");\
+ return;\
+ }\
+ atype * const A = (atype *)malloc(a_size * iters * sizeof(atype));\
+ btype * const B = (btype *)malloc(b_size * iters * sizeof(btype));\
+ ctype * const C_ref = (ctype *)malloc(c_size * sizeof(ctype));\
+ ctype * const C_tst = (ctype *)malloc(c_size * sizeof(ctype));\
+ if (A == NULL || B == NULL || C_ref == NULL || C_tst == NULL) {\
+ printf("Memory allocation failed. return.\n");\
+ free(A); free(B); free(C_ref); free(C_tst);\
+ return;\
+ }\
+ srand(time(NULL));\
+ STD_RAND(float, A, a_size, dividend_abs_max, divisor)\
+ for (uint64_t pos = 1; pos < iters; ++pos) {\
+ memcpy(A + pos * a_size, A, a_size * sizeof(atype));\
+ }\
+ STD_RAND(float, B, b_size, dividend_abs_max, divisor)\
+ for (uint64_t pos = 1; pos < iters; ++pos) {\
+ memcpy(B + pos * b_size, B, b_size * sizeof(btype));\
+ }\
+ STD_RAND(float, C_tst, c_size, dividend_abs_max, divisor)\
+ memcpy(C_ref, C_tst, c_size * sizeof(ctype));\
+ struct timespec st, et;\
+ std_##gemmtype(A, B, C_ref, M, N, K, a_rowmajor, !b_rowmajor, beta);\
+ clock_gettime(CLOCK_MONOTONIC, &st);\
+ int ret_status = test_gemm(a_rowmajor, b_rowmajor, A, B, C_tst,\
+ M, N, K, beta, num_threads);\
+ clock_gettime(CLOCK_MONOTONIC, &et);\
+ double nsec = (double)(et.tv_nsec - st.tv_nsec) + 1.0e9 * \
+ (double)(et.tv_sec - st.tv_sec);\
+ printf("Time elapsed for the first run: %.2e ns\n", nsec);\
+ if (ret_status) {\
+ printf("An error has occurred in the tested gemm, error code = %d\n",\
+ ret_status);\
+ free(A); free(B); free(C_ref); free(C_tst);\
+ return;\
+ }\
+ ctype max;\
+ STD_MAXDIFF(float, max, C_ref, C_tst, c_size)\
+ printf("Max diff. between test and std: %.2e\n", (double)max);\
+\
+ if (iters > 1) {\
+ clock_gettime(CLOCK_MONOTONIC, &st);\
+ for (uint64_t pos = 1; pos < iters; ++pos) {\
+ test_gemm(a_rowmajor, b_rowmajor, A + a_size * pos, B + b_size * pos, C_tst,\
+ M, N, K, -1, num_threads);\
+ }\
+ clock_gettime(CLOCK_MONOTONIC, &et);\
+ double nsec = (double)(et.tv_nsec - st.tv_nsec) + 1.0e9 * \
+ (double)(et.tv_sec - st.tv_sec);\
+ double ops = (double)M * (double)N * (double)(2 * K - 1) * \
+ (double)(iters - 1);\
+ printf("Averaged time for each run after warm-up: %.2e ns\n",\
+ nsec / (double)(iters - 1));\
+ printf("The performance of test: %.2e GFLOPS\n", ops / nsec);\
+ }\
+\
+ free(A); free(B); free(C_ref); free(C_tst);\
+ return;\
+}
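A minimal sketch of wiring the harness above to an implementation under test (the entry point my_sgemm is a placeholder; EMLL's own sgemm from include/Gemm.h is the intended candidate, assuming its signature matches the TestFunc typedef):

    STD_TEST(sgemm, float, float, float, 16, 16)  /* defines std_test_sgemm */

    /* any function matching TestFunc_sgemm can be passed in */
    extern int my_sgemm(int a_rowmajor, int b_rowmajor,
                        const float *A, const float *B, float *C,
                        uint32_t M, uint32_t N, uint32_t K,
                        float beta, uint32_t num_threads);

    int main(void) {
      /* M = N = K = 384; transAB = 0 means both A and B are taken as
       * column-major; beta = 0.5, single thread */
      std_test_sgemm(my_sgemm, 384, 384, 384, 0, 0.5f, 1);
      return 0;
    }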
+
+/* note: declares st/et/nsec at the current scope, so invoke this */
+/* macro at most once per enclosing block */
+#define TEST_1D_OPERATION_PERF(size, num_iters, FUNC_CALLER, ...) \
+ struct timespec st, et;\
+ clock_gettime(CLOCK_MONOTONIC, &st);\
+ for (uint32_t pos = 1; pos < num_iters; ++pos) {\
+ FUNC_CALLER(0, size, ##__VA_ARGS__)\
+ }\
+ clock_gettime(CLOCK_MONOTONIC, &et);\
+ double nsec = (double)(et.tv_nsec - st.tv_nsec) + 1.0e9 * (double)\
+ (et.tv_sec - st.tv_sec);\
+ printf("Avg. Perf.(repeat on the same data): %.2e G elements per second\n",\
+ (double)size * (double)(num_iters - 1) / nsec);\
+ clock_gettime(CLOCK_MONOTONIC, &st);\
+ for (uint32_t pos = 1; pos < num_iters; ++pos) {\
+ FUNC_CALLER(pos, size, ##__VA_ARGS__)\
+ }\
+ clock_gettime(CLOCK_MONOTONIC, &et);\
+ nsec = (double)(et.tv_nsec - st.tv_nsec) + 1.0e9 * (double)\
+ (et.tv_sec - st.tv_sec);\
+ printf("Avg. Perf.(no repeat of data region): %.2e G elements per second\n",\
+ (double)size * (double)(num_iters - 1) / nsec);
+
+#define FUNC_CALLER_QUANT_UNSYM(pos, size, inbits, outbits,\
+ src, tst_u, zero_addr, scale_addr) \
+ quantize_asymmetric_f##inbits##_u##outbits(\
+ src + pos * size, tst_u + pos * size,\
+ zero_addr, scale_addr, size, 0, -1);
+
+#define TEST_QUANT_UNSYM(inbits, outbits) \
+static void test_quant_asym_f##inbits##_u##outbits(uint32_t size) {\
+ if (size < 4) size = 4;\
+ printf("Test asymmetric quantization fp"#inbits" -> uint"#outbits":\n");\
+ printf("num_elements = %u\n", size);\
+\
+ const uint32_t num_iters = 40000000 / size;\
+ if (num_iters <= 2) {\
+ printf("Problem size too large.\n");\
+ return;\
+ }\
+\
+ uint##outbits##_t * const ref_u =\
+ (uint##outbits##_t *)malloc(size * (outbits >> 3));\
+ uint##outbits##_t * const tst_u =\
+ (uint##outbits##_t *)malloc(num_iters * size * (outbits >> 3));\
+ float##inbits##_t * const src =\
+ (float##inbits##_t *)malloc(num_iters * size * (inbits >> 3));\
+\
+ srand(time(NULL));\
+ for (uint32_t pos = 0; pos < size; ++pos) {\
+ ref_u[pos] = rand();\
+ }\
+ uint32_t min_pos = rand() % size;\
+ uint32_t max_pos = min_pos;\
+ while (max_pos == min_pos) {\
+ max_pos = rand() % size;\
+ }\
+ ref_u[min_pos] = 0;\
+ ref_u[max_pos] = (uint##outbits##_t)-1;\
+ const float##inbits##_t ref_scale =\
+ (float##inbits##_t)(rand() + 1) / (float##inbits##_t)(RAND_MAX >> 2);\
+ const uint##outbits##_t ref_zero = rand();\
+ printf("Generate src data with ref_zero = %u and ref_scale = %.2e\n",\
+ ref_zero, ref_scale);\
+ for (uint32_t pos = 0; pos < size; ++pos) {\
+ float##inbits##_t fluc =\
+ ((float##inbits##_t)rand() / RAND_MAX - (float##inbits##_t)0.5) *\
+ (float##inbits##_t)0.9875;\
+ if (pos == max_pos || pos == min_pos) fluc = 0.0;\
+ else if (ref_u[pos] == (uint##outbits##_t)-1 && fluc > 0) fluc *= -1.0;\
+ else if (ref_u[pos] == 0 && fluc < 0) fluc *= -1.0;\
+ src[pos] = ((float##inbits##_t)((long)ref_u[pos] - (long)ref_zero) + fluc)\
+ * 
ref_scale;\ + }\ + printf("First 4 elements of ref_u"#outbits"\n: %u, %u, %u, %u\n",\ + ref_u[0], ref_u[1], ref_u[2], ref_u[3]);\ + printf("First 4 elements of src_f"#inbits"\n: %.2e, %.2e, %.2e, %.2e\n",\ + src[0], src[1], src[2], src[3]);\ + for (uint32_t pos = 1; pos < num_iters; ++pos) {\ + memcpy(src + pos * size, src, size * (inbits >> 3));\ + }\ +\ + uint##outbits##_t tst_zero;\ + float##inbits##_t tst_scale;\ + quantize_asymmetric_f##inbits##_u##outbits(\ + src, tst_u, &tst_zero, &tst_scale, size, 0, -1);\ +\ + if (tst_zero != ref_zero) {\ + printf("tst_zero = %u, mismatch with ref_zero\n", tst_zero);\ + }\ + printf("relative difference between ref_scale and tst_scale: %.2e\n",\ + (tst_scale - ref_scale) / ref_scale);\ + int eql = 1;\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + if (eql != 0 && tst_u[pos] != ref_u[pos]) {\ + eql = 0;\ + printf("u"#outbits" results at pos %u are inconsistent: ref = %u, tst = %u\n",\ + pos, ref_u[pos], tst_u[pos]);\ + break;\ + }\ + }\ + if (eql != 0) {\ + printf("u"#outbits" results are equal\n");\ + TEST_1D_OPERATION_PERF(size, num_iters, FUNC_CALLER_QUANT_UNSYM,\ + inbits, outbits, src, tst_u, &tst_zero, &tst_scale)\ + }\ +\ + free(src);\ + free(ref_u);\ + free(tst_u);\ +} + +#define FUNC_CALLER_QUANT_SYM(pos, size, inbits, outbits, src, tst_s, scale_addr)\ + quantize_symmetric_f##inbits##_s##outbits(src + pos * size, tst_s + pos * size,\ + scale_addr, size, 0, -1); + +#define TEST_QUANT_SYM(inbits, outbits) \ +static void test_quant_sym_f##inbits##_s##outbits(uint32_t size) {\ + if (size < 4) size = 4;\ + printf("Test symmetrical quantization f"#inbits" -> s"#outbits":\n");\ + printf("num_elements = %u\n", size);\ +\ + const uint32_t num_iters = 40000000 / size;\ + if (num_iters <= 2) {\ + printf("Problem size too large.\n");\ + return;\ + }\ +\ + int##outbits##_t * const ref_s =\ + (int##outbits##_t *)malloc(size * (outbits >> 3));\ + int##outbits##_t * const tst_s =\ + (int##outbits##_t *)malloc(num_iters * size * (outbits >> 3));\ + float##inbits##_t * const src =\ + (float##inbits##_t *)malloc(num_iters * size * (inbits >> 3));\ +\ + const long sint_max = (uint##outbits##_t)-1 >> 1;\ + const long sint_min = (-sint_max) + (-1);\ + srand(time(NULL));\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + ref_s[pos] = (long)rand() % (2 * sint_max + 2) + sint_min;\ + }\ + const uint32_t extreme_pos = rand() % size;\ + ref_s[extreme_pos] = (rand() & 1) ? 
sint_min : sint_max;\ + const float##inbits##_t ref_scale =\ + (float##inbits##_t)(rand() + 1) / (RAND_MAX >> 2);\ + printf("Generate fp"#inbits" src data with ref_scale = %.2e\n", ref_scale);\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + float##inbits##_t fluc =\ + ((float##inbits##_t)rand() / RAND_MAX - (float##inbits##_t)0.5)\ + * (float##inbits##_t)0.9875;\ + if (pos == extreme_pos) fluc = 0.0;\ + else if (ref_s[pos] == sint_min && fluc < 0) fluc *= -1.0;\ + else if (ref_s[pos] == sint_max && fluc > 0) fluc *= -1.0;\ + src[pos] = ((float##inbits##_t)ref_s[pos] + fluc) * ref_scale;\ + }\ + for (uint32_t pos = 1; pos < num_iters; ++pos) {\ + memcpy(src + pos * size, src, size * (inbits >> 3));\ + }\ + printf("First 4 elements of fp"#inbits" src:\n%.2e, %.2e, %.2e, %.2e\n",\ + src[0], src[1], src[2], src[3]);\ + printf("First 4 elements of s"#outbits" ref_dst:\n%d, %d, %d, %d\n",\ + ref_s[0], ref_s[1], ref_s[2], ref_s[3]);\ +\ + float##inbits##_t tst_scale;\ + quantize_symmetric_f##inbits##_s##outbits(\ + src, tst_s, &tst_scale, size, 0, -1);\ +\ + printf("relative difference between ref_scale and tst_scale: %.2e\n",\ + (tst_scale - ref_scale) / ref_scale);\ + int eql = 1;\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + if (eql != 0 && tst_s[pos] != ref_s[pos]) {\ + eql = 0;\ + printf("s"#outbits" results at pos %u are inconsistent: ref = %d, tst = %d\n",\ + pos, ref_s[pos], tst_s[pos]);\ + break;\ + }\ + }\ + if (eql != 0) {\ + printf("s"#outbits" results are equal\n");\ + TEST_1D_OPERATION_PERF(size, num_iters, FUNC_CALLER_QUANT_SYM,\ + inbits, outbits, src, tst_s, &tst_scale)\ + }\ +\ + free(src);\ + free(ref_s);\ + free(tst_s);\ +} + +#define FUNC_CALLER_DEQUANT(pos, size, inbits, outbits, src, tst_f, scale) \ + dequantize_symmetric_f##outbits##_s##inbits(src + pos * size,\ + tst_f + pos * size, scale, size); + +#define TEST_DEQUANT_SYM(inbits, outbits) \ +static void test_dequant_sym_f##outbits##_s##inbits(uint32_t size) {\ + if (size < 4) size = 4;\ + printf("Test dequantization s"#inbits" -> f"#outbits":\n");\ + printf("num_elements = %u\n", size);\ +\ + const uint32_t num_iters = 40000000 / size;\ + if (num_iters <= 2) {\ + printf("Problem size too large.\n");\ + return;\ + }\ +\ + int##inbits##_t * const src =\ + (int##inbits##_t *)malloc(num_iters * size * (inbits >> 3));\ + float##outbits##_t * const ref_f =\ + (float##outbits##_t *)malloc(size * (outbits >> 3));\ + float##outbits##_t * const tst_f =\ + (float##outbits##_t *)malloc(num_iters * size * (outbits >> 3));\ +\ + srand(time(NULL));\ + const float##outbits##_t scale = (float##outbits##_t)rand() / RAND_MAX;\ + printf("Generate src with scale = %.2e\n", scale);\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + src[pos] = (long long)rand() - (long long)(RAND_MAX >> 1);\ + ref_f[pos] = scale * src[pos];\ + }\ + for (uint32_t pos = 1; pos < num_iters; ++pos) {\ + memcpy(src + pos * size, src, size * (inbits >> 3));\ + }\ + printf("First 4 elements of src:\n%d, %d, %d, %d\n",\ + src[0], src[1], src[2], src[3]);\ + printf("First 4 elements of ref:\n%.2e, %.2e, %.2e, %.2e\n",\ + ref_f[0], ref_f[1], ref_f[2], ref_f[3]);\ +\ + dequantize_symmetric_f##outbits##_s##inbits(src, tst_f, scale, size);\ +\ + float##outbits##_t max_diff = 0.0;\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + float##outbits##_t tmp = tst_f[pos] - ref_f[pos];\ + if (tmp < 0) tmp *= -1.0;\ + if (tmp > max_diff) max_diff = tmp;\ + }\ + printf("Max diff. between tst. 
and ref.: %.2e\n", max_diff);\ +\ + TEST_1D_OPERATION_PERF(size, num_iters, FUNC_CALLER_DEQUANT, inbits, outbits,\ + src, tst_f, scale)\ +\ + free(src);\ + free(ref_f);\ + free(tst_f);\ +} + +#define FUNC_CALLER_REQUANT_UNSYM(pos, size, inbits, fp, outbits,\ + src, dst, org_scale, zero_addr) \ + fp tmp_scale = org_scale;\ + requantize_asymmetric_##inbits##to##outbits(\ + src + pos * size, dst + pos * size, &tmp_scale, zero_addr, size, 0, -1); + +#define TEST_REQUANT_UNSYM(fp, inbits, outbits) \ +static void test_requant_int##inbits##_t_##fp##_uint##outbits##_t(\ + uint32_t size, int##inbits##_t min_src, int##inbits##_t max_src,\ + fp org_scale) {\ +\ + if (max_src < min_src) {\ + int##inbits##_t tmp = min_src;\ + min_src = max_src;\ + max_src = tmp;\ + }\ + if (size < 4) size = 4;\ + printf("Test unsymmetrical requantization int"#inbits"_t -> uint"#outbits"_t:\n");\ + printf("Range of src: %lld - %lld\n", (long long)min_src, (long long)max_src);\ + printf("original_scale = %.2e\n", org_scale);\ + printf("num_elements = %u\n", size);\ +\ + const uint32_t num_iters = 40000000 / size;\ + if (num_iters <= 2) {\ + printf("Problem size too large.\n");\ + return;\ + }\ +\ + int##inbits##_t * const src = (int##inbits##_t *)malloc(\ + num_iters * size * sizeof(int##inbits##_t));\ + uint##outbits##_t * const dst = (uint##outbits##_t *)malloc(\ + num_iters * size * sizeof(uint##outbits##_t));\ +\ + const double range = (long long)max_src - (long long)min_src;\ + srand(time(NULL));\ + if (range == 0) {\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + src[pos] = min_src;\ + }\ + } else {\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + double rv = (double)rand() / (double)RAND_MAX;\ + double v = rv * range + (double)min_src;\ + int##inbits##_t iv = v;\ + if (iv < min_src) iv = min_src;\ + if (iv > max_src) iv = max_src;\ + src[pos] = iv;\ + }\ + uint32_t min_pos = rand() % size;\ + uint32_t max_pos = rand() % size;\ + while(max_pos == min_pos) {\ + max_pos = rand() % size;\ + }\ + src[min_pos] = min_src;\ + src[max_pos] = max_src;\ + }\ + printf("First 4 src elements: %lld, %lld, %lld, %lld\n",\ + (long long)src[0], (long long)src[1], (long long)src[2], (long long)src[3]);\ + for (uint32_t it = 1; it < num_iters; ++it) {\ + memcpy(src + it * size, src, size * sizeof(int##inbits##_t));\ + }\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + dst[pos] = rand();\ + }\ +\ + const long long renorm_min_src = min_src > 0 ? 0 : min_src;\ + const long long renorm_max_src = max_src < 0 ? 
0 : max_src;\ + const fp ref_scale = (double)org_scale * \ + (double)(renorm_max_src - renorm_min_src) / ((uint##outbits##_t)-1);\ + printf("ref_scale = %.2e\n", ref_scale);\ +\ + uint##outbits##_t zero_point;\ + fp new_scale = org_scale;\ + requantize_asymmetric_##inbits##to##outbits(src, dst,\ + &new_scale, &zero_point, size, 0, -1);\ +\ + printf("tst_zero = %u\n", zero_point);\ + printf("tst_scale - ref_scale = %.2e\n", new_scale - ref_scale);\ + long min_out, max_out;\ + double max_diff_out = 0.0;\ + min_out = max_out = dst[0];\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + long ld = dst[pos];\ + if (ld < min_out) min_out = ld;\ + if (ld > max_out) max_out = ld;\ + double curr_fp = src[pos] * (double)org_scale;\ + double curr_i8 = curr_fp / (double)new_scale;\ + double curr_u8 = curr_i8 + (double)zero_point;\ + double tmp_diff_out = (double)ld - curr_u8;\ + if (tmp_diff_out < 0) tmp_diff_out *= -1.0;\ + if (tmp_diff_out > max_diff_out) max_diff_out = tmp_diff_out;\ + }\ + printf("range of requant u"#outbits": [%ld, %ld]\n", min_out, max_out);\ + printf("max deviation of requant u"#outbits": %.2e\n", max_diff_out);\ +\ + TEST_1D_OPERATION_PERF(size, num_iters, FUNC_CALLER_REQUANT_UNSYM,\ + inbits, fp, outbits, src, dst, org_scale, &zero_point)\ +\ + free(src);\ + free(dst);\ +} + +#define FUNC_CALLER_REQUANT_SYM(pos, size, inbits, fp, outbits, src, dst, org_scale) \ + fp tmp_scale = org_scale;\ + requantize_symmetric_##inbits##to##outbits(\ + src + pos * size, dst + pos * size, &tmp_scale, size, 0, -1); + +#define TEST_REQUANT_SYM(fp, inbits, outbits) \ +static void test_requant_int##inbits##_t_##fp##_int##outbits##_t(\ + uint32_t size, int##inbits##_t max_abs, fp org_scale) {\ +\ + if (max_abs < 0) max_abs = -max_abs;\ + if (size < 4) size = 4;\ + printf("Test symmetrical requantization int"#inbits"_t -> int"#outbits"_t:\n");\ + printf("Range of src: %d - %d\n", -max_abs, max_abs);\ + printf("original_scale = %.2e\n", org_scale);\ + printf("num_elements = %u\n", size);\ +\ + const uint32_t num_iters = 40000000 / size;\ + if (num_iters <= 2) {\ + printf("Problem size too large.\n");\ + return;\ + }\ +\ + int##inbits##_t * const src = (int##inbits##_t *)malloc(\ + num_iters * size * sizeof(int##inbits##_t));\ + int##outbits##_t * const dst = (int##outbits##_t *)malloc(\ + num_iters * size * sizeof(int##outbits##_t));\ +\ + srand(time(NULL));\ + if (max_abs == 0) {\ + memset(src, 0, size * sizeof(int##inbits##_t));\ + } else {\ + const double rand_range = 2.0 * (double)max_abs + 1.0;\ + const double rand_offset = -1.0 * (double)max_abs;\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + double rv = (double)rand() / (double)RAND_MAX;\ + double ra = rv * rand_range + rand_offset;\ + int##inbits##_t ia = ra;\ + if (ia < -max_abs) ia = -max_abs;\ + if (ia > max_abs) ia = max_abs;\ + src[pos] = ia;\ + }\ + uint32_t max_rand_pos = rand() % size;\ + src[max_rand_pos] = (rand() & 1) ? max_abs : -max_abs;\ + }\ + printf("The first 4 elements of src: %lld, %lld, %lld, %lld\n",\ + (long long)src[0], (long long)src[1], (long long)src[2], (long long)src[3]);\ + for (uint32_t it = 1; it < num_iters; ++it) {\ + memcpy(src + it * size, src, size * sizeof(int##inbits##_t));\ + }\ +\ + const fp ref_scale = (double)org_scale * (double)max_abs / \ + (double)(((uint##outbits##_t)-1) >> 1);\ + printf("ref_scale = %.2e\n", ref_scale);\ + fp new_scale = org_scale;\ + requantize_symmetric_##inbits##to##outbits(src, dst, &new_scale, size, 0, -1);\ + printf("diff. 
between ref_scale and tst_scale: %.2e\n",\ + new_scale - ref_scale);\ +\ + int##outbits##_t max_out_abs = 0;\ + double max_out_dev = 0.0;\ + for (uint32_t pos = 0; pos < size; ++pos) {\ + int##outbits##_t l1 = dst[pos];\ + if (l1 > max_out_abs) max_out_abs = l1;\ + if (-l1 > max_out_abs) max_out_abs = -l1;\ + if (new_scale != 0.0) {\ + double expected = (double)src[pos] * (double)org_scale / \ + (double)new_scale;\ + double tmp_dev = expected - (double)dst[pos];\ + if (tmp_dev < 0) tmp_dev *= -1.0;\ + if (tmp_dev > max_out_dev) max_out_dev = tmp_dev;\ + }\ + }\ + printf("max abs of output int"#outbits": %d\n", max_out_abs);\ + if (new_scale == 0.0) {\ + printf("max deviation of output int"#outbits" not determined.\n");\ + } else {\ + printf("max deviation of output int"#outbits": %.2e\n", max_out_dev);\ + }\ +\ + TEST_1D_OPERATION_PERF(size, num_iters, FUNC_CALLER_REQUANT_SYM,\ + inbits, fp, outbits, src, dst, org_scale)\ +\ + free(src);\ + free(dst);\ +} + +#endif diff --git a/include/common/ExpandMacro.h b/include/common/ExpandMacro.h new file mode 100644 index 0000000..62403e7 --- /dev/null +++ b/include/common/ExpandMacro.h @@ -0,0 +1,932 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +/****************************************************************************** + * File: ExpandMacro.h + * Description: Smart macros for manual unroll of tiny loops + * Example: Original loop: + * INITIALIZATION(parm1, parm2) + * for (int i = 1; i <= 8; ++i) { + * LOOP_ITEM(i, parm1, parm2) + * } + * Using macros to manually unroll the loop: + * MACRO_EXP_8(INITIALIZATION, LOOP_ITEM, parm1, parm2) + * Which is identical to the original loop. + *****************************************************************************/ +#ifndef INCLUDE_EXPAND_MACRO +#define INCLUDE_EXPAND_MACRO + +#define VOID_BASE(...) /* */ + +#define MACRO_EXP_0(BASE, ADD_ITEM, ...) \ + BASE(__VA_ARGS__) + +#define MACRO_EXP_1(BASE, ADD_ITEM, ...) \ + BASE(__VA_ARGS__)\ + ADD_ITEM(1, ##__VA_ARGS__) + +#define MACRO_EXP_2(BASE, ADD_ITEM, ...) \ + MACRO_EXP_1(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(2, ##__VA_ARGS__) + +#define MACRO_EXP_3(BASE, ADD_ITEM, ...) \ + MACRO_EXP_2(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(3, ##__VA_ARGS__) + +#define MACRO_EXP_4(BASE, ADD_ITEM, ...) \ + MACRO_EXP_3(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(4, ##__VA_ARGS__) + +#define MACRO_EXP_5(BASE, ADD_ITEM, ...) \ + MACRO_EXP_4(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(5, ##__VA_ARGS__) + +#define MACRO_EXP_6(BASE, ADD_ITEM, ...) \ + MACRO_EXP_5(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(6, ##__VA_ARGS__) + +#define MACRO_EXP_7(BASE, ADD_ITEM, ...) \ + MACRO_EXP_6(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(7, ##__VA_ARGS__) + +#define MACRO_EXP_8(BASE, ADD_ITEM, ...) 
\ + MACRO_EXP_7(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(8, ##__VA_ARGS__) + +#define MACRO_EXP_9(BASE, ADD_ITEM, ...) \ + MACRO_EXP_8(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(9, ##__VA_ARGS__) + +#define MACRO_EXP_10(BASE, ADD_ITEM, ...) \ + MACRO_EXP_9(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(10, ##__VA_ARGS__) + +#define MACRO_EXP_11(BASE, ADD_ITEM, ...) \ + MACRO_EXP_10(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(11, ##__VA_ARGS__) + +#define MACRO_EXP_12(BASE, ADD_ITEM, ...) \ + MACRO_EXP_11(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(12, ##__VA_ARGS__) + +#define MACRO_EXP_13(BASE, ADD_ITEM, ...) \ + MACRO_EXP_12(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(13, ##__VA_ARGS__) + +#define MACRO_EXP_14(BASE, ADD_ITEM, ...) \ + MACRO_EXP_13(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(14, ##__VA_ARGS__) + +#define MACRO_EXP_15(BASE, ADD_ITEM, ...) \ + MACRO_EXP_14(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(15, ##__VA_ARGS__) + +#define MACRO_EXP_16(BASE, ADD_ITEM, ...) \ + MACRO_EXP_15(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(16, ##__VA_ARGS__) + +#define MACRO_EXP_17(BASE, ADD_ITEM, ...) \ + MACRO_EXP_16(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(17, ##__VA_ARGS__) + +#define MACRO_EXP_18(BASE, ADD_ITEM, ...) \ + MACRO_EXP_17(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(18, ##__VA_ARGS__) + +#define MACRO_EXP_19(BASE, ADD_ITEM, ...) \ + MACRO_EXP_18(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(19, ##__VA_ARGS__) + +#define MACRO_EXP_20(BASE, ADD_ITEM, ...) \ + MACRO_EXP_19(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(20, ##__VA_ARGS__) + +#define MACRO_EXP_21(BASE, ADD_ITEM, ...) \ + MACRO_EXP_20(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(21, ##__VA_ARGS__) + +#define MACRO_EXP_22(BASE, ADD_ITEM, ...) \ + MACRO_EXP_21(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(22, ##__VA_ARGS__) + +#define MACRO_EXP_23(BASE, ADD_ITEM, ...) \ + MACRO_EXP_22(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(23, ##__VA_ARGS__) + +#define MACRO_EXP_24(BASE, ADD_ITEM, ...) \ + MACRO_EXP_23(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(24, ##__VA_ARGS__) + +#define MACRO_EXP_25(BASE, ADD_ITEM, ...) \ + MACRO_EXP_24(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(25, ##__VA_ARGS__) + +#define MACRO_EXP_26(BASE, ADD_ITEM, ...) \ + MACRO_EXP_25(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(26, ##__VA_ARGS__) + +#define MACRO_EXP_27(BASE, ADD_ITEM, ...) \ + MACRO_EXP_26(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(27, ##__VA_ARGS__) + +#define MACRO_EXP_28(BASE, ADD_ITEM, ...) \ + MACRO_EXP_27(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(28, ##__VA_ARGS__) + +#define MACRO_EXP_29(BASE, ADD_ITEM, ...) \ + MACRO_EXP_28(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(29, ##__VA_ARGS__) + +#define MACRO_EXP_30(BASE, ADD_ITEM, ...) \ + MACRO_EXP_29(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(30, ##__VA_ARGS__) + +#define MACRO_EXP_31(BASE, ADD_ITEM, ...) \ + MACRO_EXP_30(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(31, ##__VA_ARGS__) + +#define MACRO_EXP_32(BASE, ADD_ITEM, ...) \ + MACRO_EXP_31(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(32, ##__VA_ARGS__) + +#define MACRO_EXP_33(BASE, ADD_ITEM, ...) \ + MACRO_EXP_32(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(33, ##__VA_ARGS__) + +#define MACRO_EXP_34(BASE, ADD_ITEM, ...) \ + MACRO_EXP_33(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(34, ##__VA_ARGS__) + +#define MACRO_EXP_35(BASE, ADD_ITEM, ...) \ + MACRO_EXP_34(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(35, ##__VA_ARGS__) + +#define MACRO_EXP_36(BASE, ADD_ITEM, ...) 
\
+ MACRO_EXP_35(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(36, ##__VA_ARGS__)
+
+#define MACRO_EXP_37(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_36(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(37, ##__VA_ARGS__)
+
+#define MACRO_EXP_38(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_37(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(38, ##__VA_ARGS__)
+
+#define MACRO_EXP_39(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_38(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(39, ##__VA_ARGS__)
+
+#define MACRO_EXP_40(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_39(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(40, ##__VA_ARGS__)
+
+#define MACRO_EXP_41(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_40(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(41, ##__VA_ARGS__)
+
+#define MACRO_EXP_42(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_41(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(42, ##__VA_ARGS__)
+
+#define MACRO_EXP_43(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_42(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(43, ##__VA_ARGS__)
+
+#define MACRO_EXP_44(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_43(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(44, ##__VA_ARGS__)
+
+#define MACRO_EXP_45(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_44(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(45, ##__VA_ARGS__)
+
+#define MACRO_EXP_46(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_45(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(46, ##__VA_ARGS__)
+
+#define MACRO_EXP_47(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_46(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(47, ##__VA_ARGS__)
+
+#define MACRO_EXP_48(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_47(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(48, ##__VA_ARGS__)
+
+#define MACRO_EXP_49(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_48(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(49, ##__VA_ARGS__)
+
+#define MACRO_EXP_50(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_49(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(50, ##__VA_ARGS__)
+
+#define MACRO_EXP_51(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_50(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(51, ##__VA_ARGS__)
+
+#define MACRO_EXP_52(BASE, ADD_ITEM, ...) \
+ MACRO_EXP_51(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(52, ##__VA_ARGS__)
+
+/* MACRO_EXPANSION_X does the same as MACRO_EXP_X.
+ * This family is needed when 2D unrolling is required, since a macro
+ * cannot expand recursively: the inner and outer unroll levels must
+ * come from different macro families.
+ * Example:
+ * to unroll LOOP_ITEM(x, y, parms) for x = 1-5
+ * and y = 1-3, write:
+ * #define LOOP_X_UNROLL(y, params)\
+ * MACRO_EXP_5(VOID_BASE, LOOP_ITEM, y, params)
+ * MACRO_EXPANSION_3(VOID_BASE, LOOP_X_UNROLL, params)
+ * //you can't use MACRO_EXP_3 for the outer level here because
+ * //recursion cannot occur in macro expansion
+ */
+#define MACRO_EXPANSION_0(BASE, ADD_ITEM, ...) \
+ BASE(__VA_ARGS__)
+
+#define MACRO_EXPANSION_1(BASE, ADD_ITEM, ...) \
+ BASE(__VA_ARGS__)\
+ ADD_ITEM(1, ##__VA_ARGS__)
+
+#define MACRO_EXPANSION_2(BASE, ADD_ITEM, ...) \
+ MACRO_EXPANSION_1(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(2, ##__VA_ARGS__)
+
+#define MACRO_EXPANSION_3(BASE, ADD_ITEM, ...) \
+ MACRO_EXPANSION_2(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(3, ##__VA_ARGS__)
+
+#define MACRO_EXPANSION_4(BASE, ADD_ITEM, ...) \
+ MACRO_EXPANSION_3(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(4, ##__VA_ARGS__)
+
+#define MACRO_EXPANSION_5(BASE, ADD_ITEM, ...) \
+ MACRO_EXPANSION_4(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(5, ##__VA_ARGS__)
+
+#define MACRO_EXPANSION_6(BASE, ADD_ITEM, ...) \
+ MACRO_EXPANSION_5(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(6, ##__VA_ARGS__)
+
+#define MACRO_EXPANSION_7(BASE, ADD_ITEM, ...) \
+ MACRO_EXPANSION_6(BASE, ADD_ITEM, ##__VA_ARGS__)\
+ ADD_ITEM(7, ##__VA_ARGS__)
+
+#define MACRO_EXPANSION_8(BASE, ADD_ITEM, ...) 
\ + MACRO_EXPANSION_7(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(8, ##__VA_ARGS__) + +#define MACRO_EXPANSION_9(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_8(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(9, ##__VA_ARGS__) + +#define MACRO_EXPANSION_10(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_9(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(10, ##__VA_ARGS__) + +#define MACRO_EXPANSION_11(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_10(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(11, ##__VA_ARGS__) + +#define MACRO_EXPANSION_12(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_11(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(12, ##__VA_ARGS__) + +#define MACRO_EXPANSION_13(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_12(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(13, ##__VA_ARGS__) + +#define MACRO_EXPANSION_14(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_13(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(14, ##__VA_ARGS__) + +#define MACRO_EXPANSION_15(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_14(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(15, ##__VA_ARGS__) + +#define MACRO_EXPANSION_16(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_15(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(16, ##__VA_ARGS__) + +#define MACRO_EXPANSION_17(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_16(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(17, ##__VA_ARGS__) + +#define MACRO_EXPANSION_18(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_17(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(18, ##__VA_ARGS__) + +#define MACRO_EXPANSION_19(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_18(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(19, ##__VA_ARGS__) + +#define MACRO_EXPANSION_20(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_19(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(20, ##__VA_ARGS__) + +#define MACRO_EXPANSION_21(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_20(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(21, ##__VA_ARGS__) + +#define MACRO_EXPANSION_22(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_21(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(22, ##__VA_ARGS__) + +#define MACRO_EXPANSION_23(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_22(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(23, ##__VA_ARGS__) + +#define MACRO_EXPANSION_24(BASE, ADD_ITEM, ...) \ + MACRO_EXPANSION_23(BASE, ADD_ITEM, ##__VA_ARGS__)\ + ADD_ITEM(24, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_1(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + BASE(__VA_ARGS__)\ + ADD_ITEM_1(1, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_2(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + MACRO_EXPANSION_Q_1(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_1(2, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_3(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + MACRO_EXPANSION_Q_2(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_1(3, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_4(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + BASE(__VA_ARGS__)\ + ADD_ITEM_4(1, 2, 3, 4, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_5(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + MACRO_EXPANSION_Q_4(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_1(5, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_6(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + MACRO_EXPANSION_Q_5(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_1(6, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_7(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + MACRO_EXPANSION_Q_6(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_1(7, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_8(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) 
\ + MACRO_EXPANSION_Q_4(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_4(5, 6, 7, 8, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_9(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + MACRO_EXPANSION_Q_8(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_1(9, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_10(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + MACRO_EXPANSION_Q_9(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_1(10, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_11(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + MACRO_EXPANSION_Q_10(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_1(11, ##__VA_ARGS__) + +#define MACRO_EXPANSION_Q_12(BASE, ADD_ITEM_1, ADD_ITEM_4, ...) \ + MACRO_EXPANSION_Q_8(BASE, ADD_ITEM_1, ADD_ITEM_4, ##__VA_ARGS__)\ + ADD_ITEM_4(9, 10, 11, 12, ##__VA_ARGS__) + +#define MACRO_EXPANSION_E_1(LOOP_ITEM, ...) \ + LOOP_ITEM(1, ##__VA_ARGS__) + +#define MACRO_EXPANSION_E_2(LOOP_ITEM, ...) \ + LOOP_ITEM(2, ##__VA_ARGS__) MACRO_EXPANSION_E_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_E_4(LOOP_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXPANSION_E_2(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_E_6(LOOP_ITEM, ...) \ + LOOP_ITEM(6, ##__VA_ARGS__) MACRO_EXPANSION_E_4(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_E_8(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXPANSION_E_4(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_E_12(LOOP_ITEM, ...) \ + LOOP_ITEM(12, ##__VA_ARGS__) MACRO_EXPANSION_E_8(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_E_16(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_E_8(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_E_24(LOOP_ITEM, ...) \ + LOOP_ITEM(24, ##__VA_ARGS__) MACRO_EXPANSION_E_12(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_E_1(LOOP_ITEM, ...) \ + LOOP_ITEM(1, ##__VA_ARGS__) + +#define MACRO_EXP_E_2(LOOP_ITEM, ...) \ + LOOP_ITEM(2, ##__VA_ARGS__) MACRO_EXP_E_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_E_4(LOOP_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXP_E_2(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_E_6(LOOP_ITEM, ...) \ + LOOP_ITEM(6, ##__VA_ARGS__) MACRO_EXP_E_4(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_E_8(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXP_E_4(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_E_12(LOOP_ITEM, ...) \ + LOOP_ITEM(12, ##__VA_ARGS__) MACRO_EXP_E_8(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_E_16(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_E_8(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_E_24(LOOP_ITEM, ...) \ + LOOP_ITEM(24, ##__VA_ARGS__) MACRO_EXP_E_12(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_0(LOOP_ITEM, ...) \ + LOOP_ITEM(1, ##__VA_ARGS__) + +#define MACRO_EXP_M_1(LOOP_ITEM, ...) \ + LOOP_ITEM(1, ##__VA_ARGS__) + +#define MACRO_EXP_M_2(LOOP_ITEM, ...) \ + LOOP_ITEM(2, ##__VA_ARGS__) MACRO_EXP_M_0(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_3(LOOP_ITEM, ...) \ + LOOP_ITEM(2, ##__VA_ARGS__) MACRO_EXP_M_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_4(LOOP_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXP_M_0(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_5(LOOP_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXP_M_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_6(LOOP_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXP_M_2(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_7(LOOP_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXP_M_3(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_8(LOOP_ITEM, ...) 
\ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXP_M_0(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_9(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXP_M_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_10(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXP_M_2(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_11(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXP_M_3(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_12(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXP_M_4(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_13(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXP_M_5(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_14(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXP_M_6(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_15(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXP_M_7(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_16(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_0(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_17(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_18(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_2(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_19(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_3(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_20(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_4(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_21(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_5(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_22(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_6(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_23(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_7(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_24(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_8(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_25(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_9(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_26(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_10(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_27(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_11(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_28(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_12(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_29(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_13(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_30(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_14(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXP_M_31(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXP_M_15(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_0(LOOP_ITEM, ...) \ + LOOP_ITEM(1, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_1(LOOP_ITEM, ...) \ + LOOP_ITEM(1, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_2(LOOP_ITEM, ...) \ + LOOP_ITEM(2, ##__VA_ARGS__) MACRO_EXPANSION_M_0(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_3(LOOP_ITEM, ...) \ + LOOP_ITEM(2, ##__VA_ARGS__) MACRO_EXPANSION_M_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_4(LOOP_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXPANSION_M_0(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_5(LOOP_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXPANSION_M_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_6(LOOP_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXPANSION_M_2(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_7(LOOP_ITEM, ...) 
\ + LOOP_ITEM(4, ##__VA_ARGS__) MACRO_EXPANSION_M_3(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_8(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXPANSION_M_0(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_9(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXPANSION_M_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_10(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXPANSION_M_2(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_11(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXPANSION_M_3(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_12(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXPANSION_M_4(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_13(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXPANSION_M_5(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_14(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXPANSION_M_6(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_15(LOOP_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) MACRO_EXPANSION_M_7(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_16(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_0(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_17(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_1(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_18(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_2(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_19(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_3(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_20(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_4(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_21(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_5(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_22(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_6(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_23(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_7(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_24(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_8(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_25(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_9(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_26(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_10(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_27(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_11(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_28(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_12(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_29(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_13(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_30(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_14(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_M_31(LOOP_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) MACRO_EXPANSION_M_15(LOOP_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_0(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(1, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_1(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(1, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_2(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(2, ##__VA_ARGS__) CROSS_ITEM(2, 1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_0(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_3(LOOP_ITEM, CROSS_ITEM, ...) 
\ + LOOP_ITEM(2, ##__VA_ARGS__) CROSS_ITEM(2, 1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_1(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_4(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) CROSS_ITEM(4, 1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_0(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_5(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) CROSS_ITEM(4, 1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_1(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_6(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) CROSS_ITEM(4, 2, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_2(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_7(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(4, ##__VA_ARGS__) CROSS_ITEM(4, 2, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_3(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_8(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) CROSS_ITEM(8, 1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_0(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_9(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) CROSS_ITEM(8, 1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_1(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_10(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) CROSS_ITEM(8, 2, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_2(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_11(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) CROSS_ITEM(8, 2, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_3(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_12(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) CROSS_ITEM(8, 4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_4(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_13(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) CROSS_ITEM(8, 4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_5(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_14(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) CROSS_ITEM(8, 4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_6(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_15(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(8, ##__VA_ARGS__) CROSS_ITEM(8, 4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_7(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_16(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_0(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_17(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_1(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_18(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 2, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_2(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_19(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 2, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_3(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_20(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_4(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_21(LOOP_ITEM, CROSS_ITEM, ...) 
\ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_5(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_22(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_6(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_23(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_7(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_24(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_8(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_25(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_9(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_26(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_10(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_27(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_11(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_28(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_12(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_29(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_13(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_30(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_14(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_MX_31(LOOP_ITEM, CROSS_ITEM, ...) \ + LOOP_ITEM(16, ##__VA_ARGS__) CROSS_ITEM(16, 8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_15(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_0(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_0(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_1(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(1, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_1(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_2(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(2, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_2(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_3(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(2, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_3(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_4(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_4(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_5(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_5(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_6(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_6(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_7(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(4, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_7(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_8(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) 
\ + INIT_ITEM(8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_8(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_9(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_9(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_10(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_10(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_11(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_11(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_12(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_12(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_13(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_13(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_14(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_14(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_15(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(8, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_15(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_16(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_16(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_17(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_17(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_18(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_18(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_19(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_19(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_20(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_20(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_21(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_21(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_22(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_22(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_23(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_23(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_24(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_24(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_25(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_25(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_26(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_26(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_27(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_27(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_28(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \ + INIT_ITEM(16, ##__VA_ARGS__)\ + MACRO_EXPANSION_MX_28(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__) + +#define MACRO_EXPANSION_IMX_29(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) 
\
+ INIT_ITEM(16, ##__VA_ARGS__)\
+ MACRO_EXPANSION_MX_29(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__)
+
+#define MACRO_EXPANSION_IMX_30(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \
+ INIT_ITEM(16, ##__VA_ARGS__)\
+ MACRO_EXPANSION_MX_30(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__)
+
+#define MACRO_EXPANSION_IMX_31(INIT_ITEM, LOOP_ITEM, CROSS_ITEM, ...) \
+ INIT_ITEM(16, ##__VA_ARGS__)\
+ MACRO_EXPANSION_MX_31(LOOP_ITEM, CROSS_ITEM, ##__VA_ARGS__)
+
+#define MACRO_EXP_M_FIRSTITEM_0 1
+#define MACRO_EXP_M_FIRSTITEM_1 1
+#define MACRO_EXP_M_FIRSTITEM_2 2
+#define MACRO_EXP_M_FIRSTITEM_3 2
+#define MACRO_EXP_M_FIRSTITEM_4 4
+#define MACRO_EXP_M_FIRSTITEM_5 4
+#define MACRO_EXP_M_FIRSTITEM_6 4
+#define MACRO_EXP_M_FIRSTITEM_7 4
+#define MACRO_EXP_M_FIRSTITEM_8 8
+#define MACRO_EXP_M_FIRSTITEM_9 8
+#define MACRO_EXP_M_FIRSTITEM_10 8
+#define MACRO_EXP_M_FIRSTITEM_11 8
+#define MACRO_EXP_M_FIRSTITEM_12 8
+#define MACRO_EXP_M_FIRSTITEM_13 8
+#define MACRO_EXP_M_FIRSTITEM_14 8
+#define MACRO_EXP_M_FIRSTITEM_15 8
+#define MACRO_EXP_M_FIRSTITEM_16 16
+#define MACRO_EXP_M_FIRSTITEM_17 16
+#define MACRO_EXP_M_FIRSTITEM_18 16
+#define MACRO_EXP_M_FIRSTITEM_19 16
+#define MACRO_EXP_M_FIRSTITEM_20 16
+#define MACRO_EXP_M_FIRSTITEM_21 16
+#define MACRO_EXP_M_FIRSTITEM_22 16
+#define MACRO_EXP_M_FIRSTITEM_23 16
+#define MACRO_EXP_M_FIRSTITEM_24 16
+#define MACRO_EXP_M_FIRSTITEM_25 16
+#define MACRO_EXP_M_FIRSTITEM_26 16
+#define MACRO_EXP_M_FIRSTITEM_27 16
+#define MACRO_EXP_M_FIRSTITEM_28 16
+#define MACRO_EXP_M_FIRSTITEM_29 16
+#define MACRO_EXP_M_FIRSTITEM_30 16
+#define MACRO_EXP_M_FIRSTITEM_31 16
+
+#endif
+
diff --git a/include/neon_armv7a/Bias.h b/include/neon_armv7a/Bias.h
new file mode 100644
index 0000000..6fa7b9a
--- /dev/null
+++ b/include/neon_armv7a/Bias.h
@@ -0,0 +1,35 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+void bias_float(float *dst, float bias_dim0,
+ const float *bias_dim1, float bias_dim1_scale,
+ const float *bias_dim2, float bias_dim2_scale,
+ uint32_t dim1, uint32_t dim2);
+
+void bias_int32_t(int32_t *dst, int32_t bias_dim0,
+ const int32_t *bias_dim1, int32_t bias_dim1_scale,
+ const int32_t *bias_dim2, int32_t bias_dim2_scale,
+ uint32_t dim1, uint32_t dim2);
+
+void u8u32_sum(const uint8_t *src, uint32_t *dst,
+ uint32_t dim1, uint32_t dim2, uint8_t direction);
+
+void s16_sumsquare(const int16_t *dat, int32_t *sum,
+ int64_t *sumsquare, uint32_t size);
+
diff --git a/include/neon_armv7a/I8I32MlaGemmKernel.h b/include/neon_armv7a/I8I32MlaGemmKernel.h
new file mode 100644
index 0000000..9ff9a55
--- /dev/null
+++ b/include/neon_armv7a/I8I32MlaGemmKernel.h
@@ -0,0 +1,242 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. 
*/ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "arm_neon/NeonI8I32MlaGemmKernel.h" + +#ifndef INCLUDE_ARMV7A_I8I32MLA_KERNEL +#define INCLUDE_ARMV7A_I8I32MLA_KERNEL + +#define KERNEL_M6N8_UNIT(a_head, b_head) \ + I32X4 cq01, cq02, cq03, cq04, cq05, cq06;\ + I32X4 cq07, cq08, cq09, cq10, cq11, cq12;\ + COMMON_KERNEL_HEADER(a_head, b_head)\ + __asm__ __volatile__(\ + "vmov.i8 %q[cq01],#0; vmov.i8 %q[cq02],#0\n\t"\ + "vmov.i8 %q[cq03],#0; vmov.i8 %q[cq04],#0\n\t"\ + "vmov.i8 %q[cq05],#0; vmov.i8 %q[cq06],#0\n\t"\ + "vmov.i8 %q[cq07],#0; vmov.i8 %q[cq08],#0\n\t"\ + "vmov.i8 %q[cq09],#0; vmov.i8 %q[cq10],#0\n\t"\ + "vmov.i8 %q[cq11],#0; vmov.i8 %q[cq12],#0\n\t"\ + "cmp %[k_left],#2; blt 4f\n\t"\ + "vldr d0,[%[a_ptr]]; ldr r0,[%[a_ptr],#8]\n\t"\ + "ldr r1,[%[a_ptr],#12]; add %[a_ptr],%[a_ptr],#24\n\t"\ + "vldr d4,[%[b_ptr]]; vldr d5,[%[b_ptr],#8]; ldr r2,[%[b_ptr],#16]\n\t"\ + "ldr r3,[%[b_ptr],#20]; add %[b_ptr],%[b_ptr],#32\n\t"\ + "cmp %[k_left],#6; blt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + "vmov d6,r2,r3; vldr d7,[%[b_ptr],#-8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d4,d0[0]; ldr r2,[%[b_ptr]]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d5,d0[0]; ldr r3,[%[b_ptr],#4]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d4,d0[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d5,d0[1]\n\t"\ + "vmov d1,r0,r1; vldr d2,[%[a_ptr],#-8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d4,d0[2]; ldr r0,[%[a_ptr]]\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d5,d0[2]; ldr r1,[%[a_ptr],#4]\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d4,d0[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d5,d0[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d4,d1[0]; pld [%[a_ptr],#128]\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d5,d1[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d4,d1[1]; pld [%[b_ptr],#128]\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d5,d1[1]\n\t"\ + "vmov d4,r2,r3; vldr d5,[%[b_ptr],#8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d6,d1[2]; ldr r2,[%[b_ptr],#16]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d7,d1[2]; ldr r3,[%[b_ptr],#20]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d6,d1[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d7,d1[3]\n\t"\ + "vmov d0,r0,r1; vldr d1,[%[a_ptr],#8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d6,d2[0]; ldr r0,[%[a_ptr],#16]\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d7,d2[0]; ldr r1,[%[a_ptr],#20]\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d6,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d7,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d6,d2[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d7,d2[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d6,d2[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d7,d2[3]\n\t"\ + "vmov d6,r2,r3; vldr d7,[%[b_ptr],#24]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d4,d0[0]; ldr r2,[%[b_ptr],#32]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d5,d0[0]; ldr r3,[%[b_ptr],#36]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d4,d0[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d5,d0[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d4,d0[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d5,d0[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d4,d0[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d5,d0[3]\n\t"\ + "vmov d2,r0,r1; vldr 
d0,[%[a_ptr],#24]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d4,d1[0]; ldr r0,[%[a_ptr],#32]\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d5,d1[0]; ldr r1,[%[a_ptr],#36]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d4,d1[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d5,d1[1]\n\t"\ + "vmov d4,r2,r3; vldr d5,[%[b_ptr],#40]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d6,d1[2]; ldr r2,[%[b_ptr],#48]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d7,d1[2]; ldr r3,[%[b_ptr],#52]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d6,d1[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d7,d1[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d6,d2[0]; add %[a_ptr],%[a_ptr],#48\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d7,d2[0]; add %[b_ptr],%[b_ptr],#64\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d6,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d7,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d6,d2[2]; sub %[k_left],%[k_left],#4\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d7,d2[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d6,d2[3]; cmp %[k_left],#6\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d7,d2[3]; bge 1b\n\t"\ + "2:\n\t"\ + "cmp %[k_left],#4; blt 3f\n\t"\ + "vmov d6,r2,r3; vldr d7,[%[b_ptr],#-8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d4,d0[0]; ldr r2,[%[b_ptr]]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d5,d0[0]; ldr r3,[%[b_ptr],#4]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d4,d0[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d5,d0[1]\n\t"\ + "vmov d1,r0,r1; vldr d2,[%[a_ptr],#-8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d4,d0[2]; ldr r0,[%[a_ptr]]\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d5,d0[2]; ldr r1,[%[a_ptr],#4]\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d4,d0[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d5,d0[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d4,d1[0]; pld [%[a_ptr],#128]\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d5,d1[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d4,d1[1]; pld [%[b_ptr],#128]\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d5,d1[1]\n\t"\ + "vmov d4,r2,r3; vldr d5,[%[b_ptr],#8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d6,d1[2]; ldr r2,[%[b_ptr],#16]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d7,d1[2]; ldr r3,[%[b_ptr],#20]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d6,d1[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d7,d1[3]\n\t"\ + "vmov d0,r0,r1; vldr d1,[%[a_ptr],#8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d6,d2[0]; ldr r0,[%[a_ptr],#16]\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d7,d2[0]; ldr r1,[%[a_ptr],#20]\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d6,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d7,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d6,d2[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d7,d2[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d6,d2[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d7,d2[3]\n\t"\ + "vmov d6,r2,r3; vldr d7,[%[b_ptr],#24]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d4,d0[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d5,d0[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d4,d0[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d5,d0[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d4,d0[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d5,d0[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d4,d0[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d5,d0[3]\n\t"\ + "vmov d2,r0,r1\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d4,d1[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d5,d1[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d4,d1[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d5,d1[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d6,d1[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d7,d1[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d6,d1[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d7,d1[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d6,d2[0]; add %[a_ptr],%[a_ptr],#24\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d7,d2[0]; add %[b_ptr],%[b_ptr],#32\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d6,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d7,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d6,d2[2]; sub %[k_left],%[k_left],#4\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d7,d2[2]\n\t"\ + 
""ASM_VMLAL_I16" %q[cq11],d6,d2[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d7,d2[3]; b 4f\n\t"\ + "3:\n\t"\ + "vmov d6,r2,r3; vldr d7,[%[b_ptr],#-8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d4,d0[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d5,d0[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d4,d0[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d5,d0[1]\n\t"\ + "vmov d1,r0,r1; vldr d2,[%[a_ptr],#-8]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d4,d0[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d5,d0[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d4,d0[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d5,d0[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d4,d1[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d5,d1[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d4,d1[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d5,d1[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d6,d1[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq02],d7,d1[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d6,d1[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq04],d7,d1[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d6,d2[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq06],d7,d2[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d6,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq08],d7,d2[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d6,d2[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq10],d7,d2[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d6,d2[3]; sub %[k_left],%[k_left],#2\n\t"\ + ""ASM_VMLAL_I16" %q[cq12],d7,d2[3]\n\t"\ + "4:\n\t"\ + "cmp %[k_left],#1; blt 5f\n\t"\ + "vldr d4,[%[b_ptr]]; vldr d5,[%[b_ptr],#8]; add %[b_ptr],%[b_ptr],#16\n\t"\ + "vldr d0,[%[a_ptr]]; vldr s2,[%[a_ptr],#8]\n\t"\ + "add %[a_ptr],%[a_ptr],#12\n\t"\ + ""ASM_VMLAL_I16" %q[cq01],d4,d0[0]; "ASM_VMLAL_I16" %q[cq02],d5,d0[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq03],d4,d0[1]; "ASM_VMLAL_I16" %q[cq04],d5,d0[1]\n\t"\ + ""ASM_VMLAL_I16" %q[cq05],d4,d0[2]; "ASM_VMLAL_I16" %q[cq06],d5,d0[2]\n\t"\ + ""ASM_VMLAL_I16" %q[cq07],d4,d0[3]; "ASM_VMLAL_I16" %q[cq08],d5,d0[3]\n\t"\ + ""ASM_VMLAL_I16" %q[cq09],d4,d1[0]; "ASM_VMLAL_I16" %q[cq10],d5,d1[0]\n\t"\ + ""ASM_VMLAL_I16" %q[cq11],d4,d1[1]; "ASM_VMLAL_I16" %q[cq12],d5,d1[1]\n\t"\ + "sub %[k_left],%[k_left],#1\n\t"\ + "5:\n\t"\ + :[a_ptr]"+r"(a_ptr), [b_ptr]"+r"(b_ptr), [k_left]"+r"(k_left),\ + [cq01]"=w"(cq01), [cq02]"=w"(cq02), [cq03]"=w"(cq03), [cq04]"=w"(cq04),\ + [cq05]"=w"(cq05), [cq06]"=w"(cq06), [cq07]"=w"(cq07), [cq08]"=w"(cq08),\ + [cq09]"=w"(cq09), [cq10]"=w"(cq10), [cq11]"=w"(cq11), [cq12]"=w"(cq12)\ + ::"r0","r1","r2","r3","cc","memory","q0","q1","q2","q3"); + +static inline void pldw_c_6(const I32 *c) { + __asm__("pld [%0]; pld [%0,#20]\n\t"::"r"(c):); +} + +static inline void pldw_c_8(const I32 *c) { + __asm__("pld [%0]; pld [%0,#28]\n\t"::"r"(c):); +} + +#define KERNEL_M6N8 \ + I32 *c_pref = c_ptr;\ + pldw_c_6(c_pref); c_pref += ldc;\ + pldw_c_6(c_pref); c_pref += ldc;\ + pldw_c_6(c_pref); c_pref += ldc;\ + pldw_c_6(c_pref); c_pref += ldc;\ + pldw_c_6(c_pref); c_pref += ldc;\ + pldw_c_6(c_pref); c_pref += ldc;\ + pldw_c_6(c_pref); c_pref += ldc;\ + pldw_c_6(c_pref);\ + KERNEL_M6N8_UNIT(a_head, b_head) + +#define KERNEL_M8N6 \ + I32 *c_pref = c_ptr;\ + pldw_c_8(c_pref); c_pref += ldc;\ + pldw_c_8(c_pref); c_pref += ldc;\ + pldw_c_8(c_pref); c_pref += ldc;\ + pldw_c_8(c_pref); c_pref += ldc;\ + pldw_c_8(c_pref); c_pref += ldc;\ + pldw_c_8(c_pref);\ + KERNEL_M6N8_UNIT(b_head, a_head) + +#define SAVE_M6N8 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M6N4(cq01, cq03, cq05, cq07, cq09, cq11)\ + UNIT_SAVE_M6N4(cq02, cq04, cq06, cq08, cq10, cq12) + +#define SAVE_M8N6 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M8N2(cq01, cq02, cq03, cq04)\ + UNIT_SAVE_M8N2(cq05, cq06, cq07, cq08)\ + UNIT_SAVE_M8N2(cq09, cq10, cq11, cq12) + +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(6, 8, I16, I32) 
+IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 6, I16, I32) + +#endif diff --git a/include/neon_armv7a/S8S32MlaGemmCopy.h b/include/neon_armv7a/S8S32MlaGemmCopy.h new file mode 100644 index 0000000..47c5052 --- /dev/null +++ b/include/neon_armv7a/S8S32MlaGemmCopy.h @@ -0,0 +1,31 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include + +void s8s32mlagemm_int8_t_int16_t_ncopy_unroll6(const int8_t * __restrict__ src, + int16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void s8s32mlagemm_int8_t_int16_t_ncopy_unroll8(const int8_t * __restrict__ src, + int16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void s8s32mlagemm_int8_t_int16_t_tcopy_unroll6(const int8_t * __restrict__ src, + int16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void s8s32mlagemm_int8_t_int16_t_tcopy_unroll8(const int8_t * __restrict__ src, + int16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + diff --git a/include/neon_armv7a/S8S32MlaGemmDriver.h b/include/neon_armv7a/S8S32MlaGemmDriver.h new file mode 100644 index 0000000..26121fa --- /dev/null +++ b/include/neon_armv7a/S8S32MlaGemmDriver.h @@ -0,0 +1,28 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include + +int s8s32mlagemm_serial(int a_rowmajor, int b_rowmajor, + const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t N, uint32_t K, int32_t beta_inp); + +int s8s32mlagemm(int a_rowmajor, int b_rowmajor, + const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t N, uint32_t K, + int32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv7a/S8S32MlaGemmKernel.h b/include/neon_armv7a/S8S32MlaGemmKernel.h new file mode 100644 index 0000000..4dd3469 --- /dev/null +++ b/include/neon_armv7a/S8S32MlaGemmKernel.h @@ -0,0 +1,29 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. 
*/ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include + +void s8s32mlagemm_kernel_lm_m6n8(uint32_t M, uint32_t N, uint32_t K, + int32_t beta, + const int16_t * __restrict__ sa, const int16_t * __restrict__ sb, + int32_t * __restrict__ C, uint32_t ldc); + +void s8s32mlagemm_kernel_ln_m8n6(uint32_t M, uint32_t N, uint32_t K, + int32_t beta, + const int16_t * __restrict__ sa, const int16_t * __restrict__ sb, + int32_t * __restrict__ C, uint32_t ldc); + diff --git a/include/neon_armv7a/S8S32MlaGemmSkinnyDot.h b/include/neon_armv7a/S8S32MlaGemmSkinnyDot.h new file mode 100644 index 0000000..1d4765e --- /dev/null +++ b/include/neon_armv7a/S8S32MlaGemmSkinnyDot.h @@ -0,0 +1,47 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n1(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n2(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n3(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n4(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n1_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n2_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n3_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n4_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv7a/S8S32MlaGemmSkinnyGer.h b/include/neon_armv7a/S8S32MlaGemmSkinnyGer.h new file mode 100644 index 0000000..79e73a9 --- /dev/null +++ b/include/neon_armv7a/S8S32MlaGemmSkinnyGer.h @@ -0,0 +1,47 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n1(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n2(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n3(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n4(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n1_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n2_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n3_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n4_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv7a/SgemmCopy.h b/include/neon_armv7a/SgemmCopy.h new file mode 100644 index 0000000..ec11f82 --- /dev/null +++ b/include/neon_armv7a/SgemmCopy.h @@ -0,0 +1,31 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include + +void sgemm_float_float_ncopy_unroll6(const float * __restrict__ src, + float * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void sgemm_float_float_ncopy_unroll8(const float * __restrict__ src, + float * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void sgemm_float_float_tcopy_unroll6(const float * __restrict__ src, + float * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void sgemm_float_float_tcopy_unroll8(const float * __restrict__ src, + float * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + diff --git a/include/neon_armv7a/SgemmDriver.h b/include/neon_armv7a/SgemmDriver.h new file mode 100644 index 0000000..bfc4217 --- /dev/null +++ b/include/neon_armv7a/SgemmDriver.h @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. 
*/ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +int sgemm_serial(int a_rowmajor, int b_rowmajor, + const float *A, const float *B, float *C, + uint32_t M, uint32_t N, uint32_t K, float beta_inp); + +int sgemm(int a_rowmajor, int b_rowmajor, + const float *A, const float *B, float *C, + uint32_t M, uint32_t N, uint32_t K, float beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv7a/SgemmKernel.h b/include/neon_armv7a/SgemmKernel.h new file mode 100644 index 0000000..4535041 --- /dev/null +++ b/include/neon_armv7a/SgemmKernel.h @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void sgemm_kernel_lm_m6n8(uint32_t M, uint32_t N, uint32_t K, float beta, + const float * __restrict__ sa, const float * __restrict__ sb, + float * __restrict__ C, uint32_t ldc); + +void sgemm_kernel_ln_m8n6(uint32_t M, uint32_t N, uint32_t K, float beta, + const float * __restrict__ sa, const float * __restrict__ sb, + float * __restrict__ C, uint32_t ldc); + diff --git a/include/neon_armv7a/SgemmSkinnyDot.h b/include/neon_armv7a/SgemmSkinnyDot.h new file mode 100644 index 0000000..950d576 --- /dev/null +++ b/include/neon_armv7a/SgemmSkinnyDot.h @@ -0,0 +1,67 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License.
*/ +/*****************************************************************************/ + + +#include + +void sgemm_arowmajor_bskinny_afloat_bfloat_n1(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n2(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n3(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n4(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n5(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n6(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n7(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n8(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n1_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n2_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n3_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n4_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n5_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n6_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n7_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n8_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv7a/SgemmSkinnyGer.h b/include/neon_armv7a/SgemmSkinnyGer.h new file mode 100644 index 0000000..6466d79 --- /dev/null +++ b/include/neon_armv7a/SgemmSkinnyGer.h @@ -0,0 +1,67 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include + +void sgemm_acolmajor_bskinny_afloat_bfloat_n1(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n2(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n3(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n4(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n5(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n6(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n7(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n8(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n1_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n2_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n3_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n4_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n5_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n6_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n7_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n8_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv7a/U8U32MlaGemmCopy.h b/include/neon_armv7a/U8U32MlaGemmCopy.h new file mode 100644 index 0000000..cb78832 --- /dev/null +++ b/include/neon_armv7a/U8U32MlaGemmCopy.h @@ -0,0 +1,31 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. 
*/ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include + +void u8u32mlagemm_uint8_t_uint16_t_ncopy_unroll6(const uint8_t * __restrict__ src, + uint16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void u8u32mlagemm_uint8_t_uint16_t_ncopy_unroll8(const uint8_t * __restrict__ src, + uint16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void u8u32mlagemm_uint8_t_uint16_t_tcopy_unroll6(const uint8_t * __restrict__ src, + uint16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void u8u32mlagemm_uint8_t_uint16_t_tcopy_unroll8(const uint8_t * __restrict__ src, + uint16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + diff --git a/include/neon_armv7a/U8U32MlaGemmDriver.h b/include/neon_armv7a/U8U32MlaGemmDriver.h new file mode 100644 index 0000000..9477c3d --- /dev/null +++ b/include/neon_armv7a/U8U32MlaGemmDriver.h @@ -0,0 +1,28 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include + +int u8u32mlagemm_serial(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, uint32_t beta_inp); + +int u8u32mlagemm(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, + uint32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv7a/U8U32MlaGemmKernel.h b/include/neon_armv7a/U8U32MlaGemmKernel.h new file mode 100644 index 0000000..c0b79b8 --- /dev/null +++ b/include/neon_armv7a/U8U32MlaGemmKernel.h @@ -0,0 +1,29 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include + +void u8u32mlagemm_kernel_lm_m6n8(uint32_t M, uint32_t N, uint32_t K, + uint32_t beta, + const uint16_t * __restrict__ sa, const uint16_t * __restrict__ sb, + uint32_t * __restrict__ C, uint32_t ldc); + +void u8u32mlagemm_kernel_ln_m8n6(uint32_t M, uint32_t N, uint32_t K, + uint32_t beta, + const uint16_t * __restrict__ sa, const uint16_t * __restrict__ sb, + uint32_t * __restrict__ C, uint32_t ldc); + diff --git a/include/neon_armv7a/U8U32MlaGemmSkinnyDot.h b/include/neon_armv7a/U8U32MlaGemmSkinnyDot.h new file mode 100644 index 0000000..59df381 --- /dev/null +++ b/include/neon_armv7a/U8U32MlaGemmSkinnyDot.h @@ -0,0 +1,47 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n1(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n2(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n3(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n4(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n1_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n2_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n3_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n4_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv7a/U8U32MlaGemmSkinnyGer.h b/include/neon_armv7a/U8U32MlaGemmSkinnyGer.h new file mode 100644 index 0000000..5c121de --- /dev/null +++ b/include/neon_armv7a/U8U32MlaGemmSkinnyGer.h @@ -0,0 +1,47 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n1(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n2(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n3(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n4(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n1_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n2_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n3_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n4_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/Bias.h b/include/neon_armv8a/Bias.h new file mode 100644 index 0000000..4ee8eb5 --- /dev/null +++ b/include/neon_armv8a/Bias.h @@ -0,0 +1,36 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include +#include + +void bias_float(float *dst, float bias_dim0, + const float *bias_dim1, float bias_dim1_scale, + const float *bias_dim2, float bias_dim2_scale, + uint32_t dim1, uint32_t dim2); + +void bias_int32_t(int32_t *dst, int32_t bias_dim0, + const int32_t *bias_dim1, int32_t bias_dim1_scale, + const int32_t *bias_dim2, int32_t bias_dim2_scale, + uint32_t dim1, uint32_t dim2); + +void u8u32_sum(const uint8_t *src, uint32_t *dst, + uint32_t dim1, uint32_t dim2, uint8_t direction); + +void s16_sumsquare(const int16_t *dat, int32_t *sum, + int64_t *sumsquare, uint32_t size); + diff --git a/include/neon_armv8a/HgemmCopy.h b/include/neon_armv8a/HgemmCopy.h new file mode 100644 index 0000000..7ce17cd --- /dev/null +++ b/include/neon_armv8a/HgemmCopy.h @@ -0,0 +1,32 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. 
*/ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <arm_neon.h> +#include <stdint.h> + +void hgemm_float16_t_float16_t_ncopy_unroll8(const float16_t * __restrict__ src, + float16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void hgemm_float16_t_float16_t_ncopy_unroll16(const float16_t * __restrict__ src, + float16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void hgemm_float16_t_float16_t_tcopy_unroll8(const float16_t * __restrict__ src, + float16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void hgemm_float16_t_float16_t_tcopy_unroll16(const float16_t * __restrict__ src, + float16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + diff --git a/include/neon_armv8a/HgemmDriver.h b/include/neon_armv8a/HgemmDriver.h new file mode 100644 index 0000000..931a1e9 --- /dev/null +++ b/include/neon_armv8a/HgemmDriver.h @@ -0,0 +1,25 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <arm_neon.h> +#include <stdint.h> + +int hgemm_serial(uint8_t transAB, const float16_t *A, const float16_t *B, float16_t *C, + uint32_t M, uint32_t N, uint32_t K, float16_t beta_inp); + +int hgemm(uint8_t transAB, const float16_t *A, const float16_t *B, float16_t *C, + uint32_t M, uint32_t N, uint32_t K, float16_t beta_inp, uint32_t num_threads); diff --git a/include/neon_armv8a/HgemmKernel.h b/include/neon_armv8a/HgemmKernel.h new file mode 100644 index 0000000..c778284 --- /dev/null +++ b/include/neon_armv8a/HgemmKernel.h @@ -0,0 +1,28 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include +#include + +void hgemm_kernel_lm_m8n16(uint32_t M, uint32_t N, uint32_t K, float16_t beta, + const float16_t * __restrict__ sa, const float16_t * __restrict__ sb, + float16_t * __restrict__ C, uint32_t ldc); + +void hgemm_kernel_ln_m16n8(uint32_t M, uint32_t N, uint32_t K, float16_t beta, + const float16_t * __restrict__ sa, const float16_t * __restrict__ sb, + float16_t * __restrict__ C, uint32_t ldc); + diff --git a/include/neon_armv8a/HgemmSkinnyDot.h b/include/neon_armv8a/HgemmSkinnyDot.h new file mode 100644 index 0000000..601c56b --- /dev/null +++ b/include/neon_armv8a/HgemmSkinnyDot.h @@ -0,0 +1,116 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include +#include + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n1(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n2(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n3(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n4(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n5(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n6(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n7(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n8(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n9(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n10(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n11(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t 
K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n12(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n1_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n2_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n3_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n4_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n5_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n6_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n7_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n8_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n9_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n10_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n11_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_arowmajor_bskinny_afloat16_t_bfloat16_t_n12_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/HgemmSkinnyGer.h b/include/neon_armv8a/HgemmSkinnyGer.h new file mode 100644 index 0000000..44a1d2b --- /dev/null +++ b/include/neon_armv8a/HgemmSkinnyGer.h @@ -0,0 +1,116 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include +#include + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n1(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n2(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n3(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n4(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n5(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n6(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n7(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n8(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n9(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n10(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n11(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n12(const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, float16_t beta_inp); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n1_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n2_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n3_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n4_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n5_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n6_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + 
float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n7_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n8_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n9_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n10_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n11_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + +void hgemm_acolmajor_bskinny_afloat16_t_bfloat16_t_n12_omp( + const float16_t *A, const float16_t *B, + float16_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + float16_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/I8I32DotGemmCopy.h b/include/neon_armv8a/I8I32DotGemmCopy.h new file mode 100644 index 0000000..aa8addd --- /dev/null +++ b/include/neon_armv8a/I8I32DotGemmCopy.h @@ -0,0 +1,454 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include "arm_neon/NeonIntOpSign.h" + +#ifndef INCLUDE_I8I32DOT_COPY +#define INCLUDE_I8I32DOT_COPY + +static inline void pref_ab(const I8 *dat) { + __asm__ ("prfm pldl1keep,[%0,#64]\n\t"::"r"(dat):); +} + +#define NCOPY_NEON_LOOP_K16_UNROLL4(inc, dst_ptr, src1, src2, src3, src4) \ + for (dim1_count = dim1; dim1_count > 15; dim1_count -= 16) {\ + I32X4X4 t1;\ + t1.val[0] = VREINTERPRETQ_I32_I8(VLD1Q_I8(src1));\ + src1 += 16; pref_ab(src1);\ + t1.val[1] = VREINTERPRETQ_I32_I8(VLD1Q_I8(src2));\ + src2 += 16; pref_ab(src2);\ + t1.val[2] = VREINTERPRETQ_I32_I8(VLD1Q_I8(src3));\ + src3 += 16; pref_ab(src3);\ + t1.val[3] = VREINTERPRETQ_I32_I8(VLD1Q_I8(src4));\ + src4 += 16; pref_ab(src4);\ + VST4Q_LANE_I32(dst_ptr, t1, 0);\ + VST4Q_LANE_I32(dst_ptr + inc, t1, 1);\ + VST4Q_LANE_I32(dst_ptr + inc * 2, t1, 2);\ + VST4Q_LANE_I32(dst_ptr + inc * 3, t1, 3);\ + dst_ptr += inc * 4;\ + }\ + if (dim1_count > 7) {\ + I32X2X4 t1;\ + t1.val[0] = VREINTERPRET_I32_I8(VLD1_I8(src1)); src1 += 8;\ + t1.val[1] = VREINTERPRET_I32_I8(VLD1_I8(src2)); src2 += 8;\ + t1.val[2] = VREINTERPRET_I32_I8(VLD1_I8(src3)); src3 += 8;\ + t1.val[3] = VREINTERPRET_I32_I8(VLD1_I8(src4)); src4 += 8;\ + VST4_LANE_I32(dst_ptr, t1, 0);\ + VST4_LANE_I32(dst_ptr + inc, t1, 1);\ + dst_ptr += inc * 2; dim1_count -= 8;\ + }\ + if (dim1_count > 3) {\ + __asm__(\ + "ldr w0,[%0],#4; ldr w1,[%1],#4; ldr w2,[%2],#4; ldr w3,[%3],#4\n\t"\ + "str w0,[%4]; str w1,[%4,#4]; str w2,[%4,#8]; str w3,[%4,#12]\n\t"\ + :"+r"(src1),"+r"(src2),"+r"(src3),"+r"(src4):"r"(dst_ptr)\ + :"cc","memory","x0","x1","x2","x3");\ + dst_ptr += inc; dim1_count -= 4;\ + }\ + if (dim1_count > 0) {\ + uint32_t *dst_cast = (uint32_t *)dst_ptr; dst_ptr += inc;\ + uint8_t *src1_cast = (uint8_t *)src1; src1 += dim1_count;\ + uint8_t *src2_cast = (uint8_t *)src2; src2 += dim1_count;\ + uint8_t *src3_cast = (uint8_t *)src3; src3 += dim1_count;\ + uint8_t *src4_cast = (uint8_t *)src4; src4 += dim1_count;\ + uint32_t d0, d1, d2, d3;\ + d0 = *src1_cast; d1 = *src2_cast;\ + d2 = *src3_cast; d3 = *src4_cast;\ + if (dim1_count >= 2) {\ + d0 |= ((uint32_t)src1_cast[1]) << 8;\ + d1 |= ((uint32_t)src2_cast[1]) << 8;\ + d2 |= ((uint32_t)src3_cast[1]) << 8;\ + d3 |= ((uint32_t)src4_cast[1]) << 8;\ + }\ + if (dim1_count >= 3) {\ + d0 |= ((uint32_t)src1_cast[2]) << 16;\ + d1 |= ((uint32_t)src2_cast[2]) << 16;\ + d2 |= ((uint32_t)src3_cast[2]) << 16;\ + d3 |= ((uint32_t)src4_cast[2]) << 16;\ + }\ + dst_cast[0] = d0; dst_cast[1] = d1;\ + dst_cast[2] = d2; dst_cast[3] = d3;\ + } + +#define NCOPY_UNROLL_12 {\ + I32 *dst_h1 = dst1;\ + NCOPY_NEON_LOOP_K16_UNROLL4(12, dst_h1, src1, src2, src3, src4)\ + dst_h1 = dst1 + 4;\ + NCOPY_NEON_LOOP_K16_UNROLL4(12, dst_h1, src5, src6, src7, src8)\ + dst_h1 = dst1 + 8;\ + NCOPY_NEON_LOOP_K16_UNROLL4(12, dst_h1, src9, src10, src11, src12)\ + dst1 = dst_h1 - 8;\ +} + +#define NCOPY_UNROLL_8 {\ + I32 *dst_h1 = dst1;\ + NCOPY_NEON_LOOP_K16_UNROLL4(8, dst_h1, src1, src2, src3, src4)\ + dst_h1 = dst1 + 4;\ + NCOPY_NEON_LOOP_K16_UNROLL4(8, dst_h1, src5, src6, src7, src8)\ + dst1 = dst_h1 - 4;\ +} + +#define NCOPY_UNROLL_4 {\ + NCOPY_NEON_LOOP_K16_UNROLL4(4, dst1, src1, src2, src3, src4)\ +} + +#define NCOPY_UNROLL_2 {\ + for (dim1_count = dim1; dim1_count > 15; dim1_count -= 16) {\ + I32X4X2 t1;\ + t1.val[0] = VREINTERPRETQ_I32_I8(VLD1Q_I8(src1));\ + src1 += 16; pref_ab(src1);\ + t1.val[1] = VREINTERPRETQ_I32_I8(VLD1Q_I8(src2));\ + src2 += 16; pref_ab(src2);\ + VST2Q_I32(dst1, 
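/* st2 interleaves the two sources; each 32-bit lane packs 4 consecutive int8 along K for the dot-product kernel */\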
t1);\ + dst1 += 8;\ + }\ + if (dim1_count > 7) {\ + I32X2X2 t1;\ + t1.val[0] = VREINTERPRET_I32_I8(VLD1_I8(src1)); src1 += 8;\ + t1.val[1] = VREINTERPRET_I32_I8(VLD1_I8(src2)); src2 += 8;\ + VST2_I32(dst1, t1);\ + dst1 += 4; dim1_count -= 8;\ + }\ + if (dim1_count > 3) {\ + __asm__(\ + "ldr w0,[%0],#4; ldr w1,[%1],#4\n\t"\ + "str w0,[%2]; str w1,[%2,#4]\n\t"\ + :"+r"(src1),"+r"(src2):"r"(dst1)\ + :"cc","memory","x0","x1");\ + dst1 += 2; dim1_count -= 4;\ + }\ + if (dim1_count > 0) {\ + uint32_t *dst_cast = (uint32_t *)dst1; dst1 += 2;\ + uint8_t *src1_cast = (uint8_t *)src1; src1 += dim1_count;\ + uint8_t *src2_cast = (uint8_t *)src2; src2 += dim1_count;\ + uint32_t d0, d1;\ + d0 = *src1_cast; d1 = *src2_cast;\ + if (dim1_count >= 2) {\ + d0 |= ((uint32_t)src1_cast[1]) << 8;\ + d1 |= ((uint32_t)src2_cast[1]) << 8;\ + }\ + if (dim1_count >= 3) {\ + d0 |= ((uint32_t)src1_cast[2]) << 16;\ + d1 |= ((uint32_t)src2_cast[2]) << 16;\ + }\ + dst_cast[0] = d0; dst_cast[1] = d1;\ + }\ +} + +#define NCOPY_UNROLL_1 {\ + for (dim1_count = dim1; dim1_count > 15; dim1_count -= 16) {\ + I32X4 t1 = VREINTERPRETQ_I32_I8(VLD1Q_I8(src1));\ + src1 += 16;\ + VST1Q_I32(dst1, t1);\ + dst1 += 4;\ + }\ + if (dim1_count > 7) {\ + I32X2 t1 = VREINTERPRET_I32_I8(VLD1_I8(src1)); src1 += 8;\ + VST1_I32(dst1, t1);\ + dst1 += 2; dim1_count -= 8;\ + }\ + if (dim1_count > 3) {\ + __asm__(\ + "ldr w0,[%0],#4; str w0,[%1]\n\t"\ + :"+r"(src1):"r"(dst1)\ + :"cc","memory","x0","x1");\ + dst1++; dim1_count -= 4;\ + }\ + if (dim1_count > 0) {\ + uint32_t *dst_cast = (uint32_t *)dst1; dst1++;\ + uint8_t *src1_cast = (uint8_t *)src1; src1 += dim1_count;\ + uint32_t d0 = *src1_cast;\ + if (dim1_count >= 2) {\ + d0 |= ((uint32_t)src1_cast[1]) << 8;\ + }\ + if (dim1_count >= 3) {\ + d0 |= ((uint32_t)src1_cast[2]) << 16;\ + }\ + dst_cast[0] = d0;\ + }\ +} + +#ifdef GEMM_UNSIGNED_INT +#define NCOPY_uint8_t_uint32_t(unroll) NCOPY_UNROLL_##unroll +#else +#define NCOPY_int8_t_int32_t(unroll) NCOPY_UNROLL_##unroll +#endif + +#define TCOPY_K4N8 {\ + uint8_t *src1_cast = (uint8_t *)src1; src1 += 8; pref_ab(src1);\ + uint8_t *src2_cast = (uint8_t *)src2; src2 += 8; pref_ab(src2);\ + uint8_t *src3_cast = (uint8_t *)src3; src3 += 8; pref_ab(src3);\ + uint8_t *src4_cast = (uint8_t *)src4; src4 += 8; pref_ab(src4);\ + uint8_t *dst1_cast = (uint8_t *)dst1; dst1 += 8;\ + uint8x8x4_t t1;\ + t1.val[0] = vld1_u8(src1_cast);\ + t1.val[1] = vld1_u8(src2_cast);\ + t1.val[2] = vld1_u8(src3_cast);\ + t1.val[3] = vld1_u8(src4_cast);\ + vst4_u8(dst1_cast, t1);\ +} + +#define TCOPY_K3N8 {\ + uint8_t *src1_cast = (uint8_t *)src1; src1 += 8; pref_ab(src1);\ + uint8_t *src2_cast = (uint8_t *)src2; src2 += 8; pref_ab(src2);\ + uint8_t *src3_cast = (uint8_t *)src3; src3 += 8; pref_ab(src3);\ + uint8_t *dst1_cast = (uint8_t *)dst1; dst1 += 8;\ + uint8x8x4_t t1;\ + t1.val[0] = vld1_u8(src1_cast);\ + t1.val[1] = vld1_u8(src2_cast);\ + t1.val[2] = vld1_u8(src3_cast);\ + t1.val[3] = vdup_n_u8(0);\ + vst4_u8(dst1_cast, t1);\ +} + +#define TCOPY_K2N8 {\ + uint8_t *src1_cast = (uint8_t *)src1; src1 += 8; pref_ab(src1);\ + uint8_t *src2_cast = (uint8_t *)src2; src2 += 8; pref_ab(src2);\ + uint8_t *dst1_cast = (uint8_t *)dst1; dst1 += 8;\ + uint8x8x4_t t1;\ + t1.val[0] = vld1_u8(src1_cast);\ + t1.val[1] = vld1_u8(src2_cast);\ + t1.val[2] = vdup_n_u8(0);\ + t1.val[3] = vdup_n_u8(0);\ + vst4_u8(dst1_cast, t1);\ +} + +#define TCOPY_K1N8 {\ + uint8_t *src1_cast = (uint8_t *)src1; src1 += 8;\ + uint8_t *dst1_cast = (uint8_t *)dst1; dst1 += 8;\ + uint8x8x4_t t1;\ + t1.val[0] = 
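/* last k-remainder line: the three missing k-lines are zero-filled below so each packed group still holds 4 bytes */\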
vld1_u8(src1_cast);\ + t1.val[1] = vdup_n_u8(0);\ + t1.val[2] = vdup_n_u8(0);\ + t1.val[3] = vdup_n_u8(0);\ + vst4_u8(dst1_cast, t1);\ +} + +#define LOAD_4_INCPTR_I8(ptr, v) \ + __asm__ __volatile__("ldr %s["#v"],[%["#ptr"]],#4\n\t"\ + :[v]"=w"(v),[ptr]"+r"(ptr)::"memory"); + +#define STORE_4X4_INTERLEAVE_I8(v1, v2, v3, v4, dst) \ + __asm__ __volatile__(\ + "zip1 %["#v1"].8b,%["#v1"].8b,%["#v2"].8b\n\t"\ + "zip1 %["#v3"].8b,%["#v3"].8b,%["#v4"].8b\n\t"\ + "zip1 %["#v1"].8h,%["#v1"].8h,%["#v3"].8h\n\t"\ + "str %q["#v1"],[%["#dst"]],#16\n\t"\ + :[v1]"+w"(v1), [v2]"+w"(v2), [v3]"+w"(v3), [v4]"+w"(v4), [dst]"+r"(dst)\ + ::"memory"); + +#define TCOPY_K4N4 {\ + I8X8 t1, t2, t3, t4;\ + LOAD_4_INCPTR_I8(src1, t1)\ + LOAD_4_INCPTR_I8(src2, t2)\ + LOAD_4_INCPTR_I8(src3, t3)\ + LOAD_4_INCPTR_I8(src4, t4)\ + STORE_4X4_INTERLEAVE_I8(t1, t2, t3, t4, dst1)\ +} + +#define TCOPY_K3N4 {\ + I8X8 t1, t2, t3, t4;\ + LOAD_4_INCPTR_I8(src1, t1)\ + LOAD_4_INCPTR_I8(src2, t2)\ + LOAD_4_INCPTR_I8(src3, t3)\ + t4 = VDUP_N_I8(0);\ + STORE_4X4_INTERLEAVE_I8(t1, t2, t3, t4, dst1)\ +} + +#define TCOPY_K2N4 {\ + I8X8 t1, t2, t3, t4;\ + LOAD_4_INCPTR_I8(src1, t1)\ + LOAD_4_INCPTR_I8(src2, t2)\ + t3 = VDUP_N_I8(0);\ + t4 = VDUP_N_I8(0);\ + STORE_4X4_INTERLEAVE_I8(t1, t2, t3, t4, dst1)\ +} + +#define TCOPY_K1N4 {\ + I8X8 t1, t2, t3, t4;\ + LOAD_4_INCPTR_I8(src1, t1)\ + t2 = VDUP_N_I8(0);\ + t3 = VDUP_N_I8(0);\ + t4 = VDUP_N_I8(0);\ + STORE_4X4_INTERLEAVE_I8(t1, t2, t3, t4, dst1)\ +} + +#define TCOPY_K4N2 \ + __asm__ __volatile__(\ + "ldr h0,[%0],#2; ldr h1,[%1],#2\n\t"\ + "ldr h2,[%2],#2; ldr h3,[%3],#2\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[0],[%4],#4\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[1],[%4],#4\n\t"\ + :"+r"(src1),"+r"(src2),"+r"(src3),"+r"(src4),"+r"(dst1)\ + ::"cc","memory","v0","v1","v2","v3"); + +#define TCOPY_K3N2 \ + __asm__ __volatile__(\ + "ldr h0,[%0],#2; ldr h1,[%1],#2\n\t"\ + "ldr h2,[%2],#2; movi v3.8b,#0\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[0],[%3],#4\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[1],[%3],#4\n\t"\ + :"+r"(src1),"+r"(src2),"+r"(src3),"+r"(dst1)\ + ::"cc","memory","v0","v1","v2","v3"); + +#define TCOPY_K2N2 \ + __asm__ __volatile__(\ + "ldr h0,[%0],#2; ldr h1,[%1],#2\n\t"\ + "movi v2.8b,#0; movi v3.8b,#0\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[0],[%2],#4\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[1],[%2],#4\n\t"\ + :"+r"(src1),"+r"(src2),"+r"(dst1)\ + ::"cc","memory","v0","v1","v2","v3"); + +#define TCOPY_K1N2 \ + __asm__ __volatile__(\ + "ldr h0,[%0],#2; movi v1.8b,#0\n\t"\ + "movi v2.8b,#0; movi v3.8b,#0\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[0],[%1],#4\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[1],[%1],#4\n\t"\ + :"+r"(src1),"+r"(dst1)\ + ::"cc","memory","v0","v1","v2","v3"); + +#define TCOPY_K4N1 \ + __asm__ __volatile__(\ + "ldr b0,[%0],#1; ldr b1,[%1],#1\n\t"\ + "ldr b2,[%2],#1; ldr b3,[%3],#1\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[0],[%4]\n\t"\ + :"+r"(src1),"+r"(src2),"+r"(src3),"+r"(src4):"r"(dst1)\ + :"cc","memory","v0","v1","v2","v3"); + +#define TCOPY_K3N1 \ + __asm__ __volatile__(\ + "ldr b0,[%0],#1; ldr b1,[%1],#1\n\t"\ + "ldr b2,[%2],#1; movi v3.8b,#0\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[0],[%3]\n\t"\ + :"+r"(src1),"+r"(src2),"+r"(src3):"r"(dst1)\ + :"cc","memory","v0","v1","v2","v3"); + +#define TCOPY_K2N1 \ + __asm__ __volatile__(\ + "ldr b0,[%0],#1; ldr b1,[%1],#1\n\t"\ + "movi v2.8b,#0; movi v3.8b,#0\n\t"\ + "st4 {v0.b,v1.b,v2.b,v3.b}[0],[%2]\n\t"\ + :"+r"(src1),"+r"(src2):"r"(dst1)\ + :"cc","memory","v0","v1","v2","v3"); + +#define TCOPY_K1N1 \ + __asm__ __volatile__(\ + "ldr b0,[%0],#1; str s0,[%1]\n\t"\ + 
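/* ldr b0 zero-fills the rest of v0, so str s0 stores the byte plus three zero pad bytes */\
+ 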
:"+r"(src1):"r"(dst1)\ + :"cc","memory","v0"); + + +#define TCOPY_NMAX12_TEMPLATE(kdim) \ + dst1 = dst + chunk_k_pass * 12;\ + for (; dim1_count > 11; dim1_count -= 12) {\ + TCOPY_K##kdim##N4 TCOPY_K##kdim##N8\ + dst1 += chunk_k_num * 12 - 12;\ + }\ + dst1 -= chunk_k_pass * 4;\ + if (dim1_count > 7) {\ + TCOPY_K##kdim##N8\ + dst1 += chunk_k_num * 8 - 8;\ + dim1_count -= 8;\ + }\ + dst1 -= chunk_k_pass * 4;\ + if (dim1_count > 3) {\ + TCOPY_K##kdim##N4\ + dst1 += chunk_k_num * 4 - 4;\ + dim1_count -= 4;\ + }\ + dst1 -= chunk_k_pass * 2;\ + if (dim1_count > 1) {\ + TCOPY_K##kdim##N2\ + dst1 += chunk_k_num * 2 - 2;\ + dim1_count -= 2;\ + }\ + dst1 -= chunk_k_pass;\ + if (dim1_count > 0) {\ + TCOPY_K##kdim##N1\ + } + +#define TCOPY_NMAX8_TEMPLATE(kdim) \ + dst1 = dst + chunk_k_pass * 8;\ + for (; dim1_count > 7; dim1_count -= 8) {\ + TCOPY_K##kdim##N8\ + dst1 += chunk_k_num * 8 - 8;\ + }\ + dst1 -= chunk_k_pass * 4;\ + if (dim1_count > 3) {\ + TCOPY_K##kdim##N4\ + dst1 += chunk_k_num * 4 - 4;\ + dim1_count -= 4;\ + }\ + dst1 -= chunk_k_pass * 2;\ + if (dim1_count > 1) {\ + TCOPY_K##kdim##N2\ + dst1 += chunk_k_num * 2 - 2;\ + dim1_count -= 2;\ + }\ + dst1 -= chunk_k_pass;\ + if (dim1_count > 0) {\ + TCOPY_K##kdim##N1\ + } + + +#define TCOPY_FUNC_TEMPLATE(funcname, maxunroll) \ +void funcname##maxunroll(\ + const I8 * __restrict__ src,\ + I32 * __restrict__ dst, uint32_t ld_dim,\ + uint32_t dim1, uint32_t dim2) {\ + if (!dim2) return;\ + uint32_t dim2_count = dim2;\ + const uint32_t chunk_k_num = ((dim2 - 1) >> 2) + 1;\ + const I8 *src0 = src;\ + for (; dim2_count > 3; dim2_count -= 4) {\ + const I8 *src1 = src0;\ + const I8 *src2 = src0 + ld_dim;\ + const I8 *src3 = src0 + ld_dim * 2;\ + const I8 *src4 = src0 + ld_dim * 3;\ + src0 += ld_dim * 4;\ + I32 *dst1;\ + uint32_t dim1_count = dim1;\ + const uint32_t chunk_k_pass = (dim2 - dim2_count) / 4;\ + TCOPY_NMAX##maxunroll##_TEMPLATE(4)\ + }\ + if (dim2_count == 3) {\ + const I8 *src1 = src0;\ + const I8 *src2 = src0 + ld_dim;\ + const I8 *src3 = src0 + ld_dim * 2;\ + I32 *dst1;\ + uint32_t dim1_count = dim1;\ + const uint32_t chunk_k_pass = chunk_k_num - 1;\ + TCOPY_NMAX##maxunroll##_TEMPLATE(3)\ + } else if (dim2_count == 2) {\ + const I8 *src1 = src0;\ + const I8 *src2 = src0 + ld_dim;\ + I32 *dst1;\ + uint32_t dim1_count = dim1;\ + const uint32_t chunk_k_pass = chunk_k_num - 1;\ + TCOPY_NMAX##maxunroll##_TEMPLATE(2)\ + } else if (dim2_count == 1) {\ + const I8 *src1 = src0;\ + I32 *dst1;\ + uint32_t dim1_count = dim1;\ + const uint32_t chunk_k_pass = chunk_k_num - 1;\ + TCOPY_NMAX##maxunroll##_TEMPLATE(1)\ + }\ +} + +#endif diff --git a/include/neon_armv8a/I8I32DotGemmKernel.h b/include/neon_armv8a/I8I32DotGemmKernel.h new file mode 100644 index 0000000..104f8f7 --- /dev/null +++ b/include/neon_armv8a/I8I32DotGemmKernel.h @@ -0,0 +1,1030 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include "arm_neon/NeonIntOpSign.h" + +#ifndef INCLUDE_I8I32DOT_GEMM_KERNEL +#define INCLUDE_I8I32DOT_GEMM_KERNEL + +static inline void pref_c(I32 *dat) { + __asm__ ("prfm pstl1keep,[%0]\n\t"::"r"(dat):); +} + +#define PREF_N1 pref_c(c_pref); c_pref += ldc; +#define PREF_N2 PREF_N1 PREF_N1 +#define PREF_N4 PREF_N2 PREF_N2 +#define PREF_N8 PREF_N4 PREF_N4 +#define PREF_N12 PREF_N8 PREF_N4 + +/* NOTE that the K actually means k/4 IN THIS FILE */ + +/* unaligned load of 4 8-bit int to a S register */ +#define UNALIGNED_LD4B_SREG(var, ptr) \ + __asm__("ldr %s0,[%1]\n\t":"=w"(var):"r"(ptr):"memory") + +#define VLD1(ptr) VREINTERPRET_I8_I32(VLD1_I32(ptr)) + +#define VLD1Q(ptr) VREINTERPRETQ_I8_I32(VLD1Q_I32(ptr)) + +#define NORMAL_KERNEL_SETUP(a_head, b_head) \ + uint32_t kdiv4_left = K;\ + const I32 *a_rd = a_head;\ + const I32 *b_rd = b_head; + +#define KERNEL_M1N1 \ + I32X4 cq1, cq2;\ + cq1 = cq2 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, aq2, bq1, bq2;\ + if (kdiv4_left > 3) {\ + aq1 = VLD1Q(a_rd); a_rd += 4;\ + bq1 = VLD1Q(b_rd); b_rd += 4;\ + }\ + for (; kdiv4_left > 11; kdiv4_left -= 8) {\ + aq2 = VLD1Q(a_rd);\ + bq2 = VLD1Q(b_rd);\ + cq1 = VDOTQ_I32(cq1, aq1, bq1);\ + aq1 = VLD1Q(a_rd + 4); a_rd += 8;\ + bq1 = VLD1Q(b_rd + 4); b_rd += 8;\ + cq2 = VDOTQ_I32(cq2, aq2, bq2);\ + }\ + if (kdiv4_left > 7) {\ + aq2 = VLD1Q(a_rd); a_rd += 4;\ + bq2 = VLD1Q(b_rd); b_rd += 4;\ + cq1 = VDOTQ_I32(cq1, aq1, bq1);\ + cq2 = VDOTQ_I32(cq2, aq2, bq2);\ + kdiv4_left -= 8;\ + } else if (kdiv4_left > 3) {\ + cq1 = VDOTQ_I32(cq1, aq1, bq1);\ + kdiv4_left -= 4;\ + }\ + cq1 = VADDQ_I32(cq1, cq2);\ + I32X2 cd1 = VADD_I32(VGET_LOW_I32(cq1), VGET_HIGH_I32(cq1));\ + if (kdiv4_left > 1) {\ + I8X8 ad1 = VLD1(a_rd); a_rd += 2;\ + I8X8 bd1 = VLD1(b_rd); b_rd += 2;\ + cd1 = VDOT_I32(cd1, ad1, bd1);\ + kdiv4_left -= 2;\ + }\ + if (kdiv4_left > 0) {\ + I8X8 ad1, bd1;\ + UNALIGNED_LD4B_SREG(ad1, a_rd); a_rd++;\ + UNALIGNED_LD4B_SREG(bd1, b_rd); b_rd++;\ + cd1 = VDOT_I32(cd1, ad1, bd1);\ + }\ + I32 cs1 = VGET_LANE_I32(cd1, 0) + VGET_LANE_I32(cd1, 1); + +#define SAVE_M1N1 *c_ptr = c_ptr[0] * beta + cs1; + +#define KERNEL_M2N1_UNIT(a_head, b_head) \ + I32X2 cd1, cd2;\ + cd1 = cd2 = VDUP_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X8 ad1, ad2, bd1;\ + if (kdiv4_left > 1) {\ + ad1 = VLD1(a_rd); ad2 = VLD1(a_rd + 2); a_rd += 4;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + for (; kdiv4_left > 3; kdiv4_left -= 2) {\ + cd1 = VDOT_LANE_I32(cd1, ad1, bd1, 0); ad1 = VLD1(a_rd);\ + cd2 = VDOT_LANE_I32(cd2, ad2, bd1, 1); ad2 = VLD1(a_rd + 2);\ + a_rd += 4; bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + if (kdiv4_left > 1) {\ + cd1 = VDOT_LANE_I32(cd1, ad1, bd1, 0);\ + cd2 = VDOT_LANE_I32(cd2, ad2, bd1, 1);\ + kdiv4_left -= 2;\ + }\ + cd1 = VADD_I32(cd1, cd2);\ + if (kdiv4_left > 0) {\ + UNALIGNED_LD4B_SREG(bd1, b_rd); b_rd++;\ + ad1 = VLD1(a_rd); a_rd += 2;\ + cd1 = VDOT_LANE_I32(cd1, ad1, bd1, 0);\ + } + +#define KERNEL_M2N1 KERNEL_M2N1_UNIT(a_head, b_head) +#define KERNEL_M1N2 KERNEL_M2N1_UNIT(b_head, a_head) + +#define SAVE_M2N1 \ + cd1 = VMLA_N_I32(cd1, VLD1_I32(c_ptr), beta);\ + VST1_I32(c_ptr, cd1); + +#define SAVE_M1N2 \ + c_ptr[0] = c_ptr[0] * beta + VGET_LANE_I32(cd1, 0);\ + c_ptr[ldc] = c_ptr[ldc] * beta + VGET_LANE_I32(cd1, 1); + +#define KERNEL_M2N2 \ + I32X2 cd1, cd2;\ + cd1 = cd2 = VDUP_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X8 ad1, bd1;\ + if (kdiv4_left > 0) {\ + ad1 = VLD1(a_rd); a_rd += 2;\ + 
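/* preload the first operands; the loop body overlaps the next loads with the dot products */\
+ 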
bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + for (; kdiv4_left > 1; kdiv4_left--) {\ + cd1 = VDOT_LANE_I32(cd1, ad1, bd1, 0);\ + cd2 = VDOT_LANE_I32(cd2, ad1, bd1, 1);\ + ad1 = VLD1(a_rd); a_rd += 2;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + if (kdiv4_left > 0) {\ + cd1 = VDOT_LANE_I32(cd1, ad1, bd1, 0);\ + cd2 = VDOT_LANE_I32(cd2, ad1, bd1, 1);\ + } + +#define SAVE_M2N2 \ + cd1 = VMLA_N_I32(cd1, VLD1_I32(c_ptr), beta);\ + cd2 = VMLA_N_I32(cd2, VLD1_I32(c_ptr + ldc), beta);\ + VST1_I32(c_ptr, cd1); VST1_I32(c_ptr + ldc, cd2); + +#define KERNEL_M4N1_UNIT(a_head, b_head) \ + I32X4 cq1, cq2;\ + cq1 = cq2 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, aq2;\ + I8X8 bd1;\ + if (kdiv4_left > 1) {\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4); a_rd += 8;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + for (; kdiv4_left > 3; kdiv4_left -= 2) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0); aq1 = VLD1Q(a_rd);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 1); aq2 = VLD1Q(a_rd + 4);\ + a_rd += 8; bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + if (kdiv4_left > 1) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 1);\ + kdiv4_left -= 2;\ + }\ + cq1 = VADDQ_I32(cq1, cq2);\ + if (kdiv4_left > 0) {\ + UNALIGNED_LD4B_SREG(bd1, b_rd); b_rd++;\ + aq1 = VLD1Q(a_rd); a_rd += 4;\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + } + +#define KERNEL_M4N1 KERNEL_M4N1_UNIT(a_head, b_head) +#define KERNEL_M1N4 KERNEL_M4N1_UNIT(b_head, a_head) + +#define SAVE_M4N1 \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_ptr), beta);\ + VST1Q_I32(c_ptr, cq1); + +#define UNIT_SAVE_M1N4(cq1) \ + c_tmp[0] = c_tmp[0] * beta + VGETQ_LANE_I32(cq1, 0);\ + c_tmp[ldc] = c_tmp[ldc] * beta + VGETQ_LANE_I32(cq1, 1);\ + c_tmp += ldc * 2;\ + c_tmp[0] = c_tmp[0] * beta + VGETQ_LANE_I32(cq1, 2);\ + c_tmp[ldc] = c_tmp[ldc] * beta + VGETQ_LANE_I32(cq1, 3);\ + c_tmp += ldc * 2; + +#define SAVE_M1N4 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M1N4(cq1) + +#define KERNEL_M4N2_UNIT(a_head, b_head) \ + I32X4 cq1, cq2;\ + cq1 = cq2 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1; I8X8 bd1;\ + if (kdiv4_left > 0) {\ + aq1 = VLD1Q(a_rd); a_rd += 4;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + for (; kdiv4_left > 1; kdiv4_left--) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq2 = VDOTQ_LANE_I32(cq2, aq1, bd1, 1);\ + aq1 = VLD1Q(a_rd); a_rd += 4;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + if (kdiv4_left > 0) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq2 = VDOTQ_LANE_I32(cq2, aq1, bd1, 1);\ + } + +#define KERNEL_M4N2 KERNEL_M4N2_UNIT(a_head, b_head) +#define KERNEL_M2N4 KERNEL_M4N2_UNIT(b_head, a_head) + +#define UNIT_SAVE_M4N2(cq1, cq2) \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_tmp), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_tmp + ldc), beta);\ + VST1Q_I32(c_tmp, cq1); VST1Q_I32(c_tmp + ldc, cq2);\ + c_tmp += ldc * 2; + +#define SAVE_M4N2 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M4N2(cq1, cq2) + +#define UNIT_SAVE_M2N4(cq1, cq2) {\ + I32X4 t1 = VZIP1Q_I32(cq1, cq2);\ + I32X4 t2 = VZIP2Q_I32(cq1, cq2);\ + I32X2 l1 = VMLA_N_I32(VGET_LOW_I32(t1), VLD1_I32(c_tmp), beta);\ + I32X2 l2 = VMLA_N_I32(VGET_HIGH_I32(t1), VLD1_I32(c_tmp + ldc), beta);\ + VST1_I32(c_tmp, l1); VST1_I32(c_tmp + ldc, l2); c_tmp += ldc * 2;\ + I32X2 l3 = VMLA_N_I32(VGET_LOW_I32(t2), VLD1_I32(c_tmp), beta);\ + I32X2 l4 = VMLA_N_I32(VGET_HIGH_I32(t2), VLD1_I32(c_tmp + ldc), beta);\ + VST1_I32(c_tmp, l3); VST1_I32(c_tmp + ldc, l4); c_tmp += ldc * 2;\ +} + +#define SAVE_M2N4 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M2N4(cq1, cq2) + +#define KERNEL_M4N4 \ + I32X4 cq1, cq2, cq3, cq4;\ + cq1 
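/* one accumulator register per B column of the 4x4 output tile */\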
= cq2 = cq3 = cq4 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, bq1;\ + if (kdiv4_left > 0) {\ + aq1 = VLD1Q(a_rd); a_rd += 4;\ + bq1 = VLD1Q(b_rd); b_rd += 4;\ + }\ + for (; kdiv4_left > 1; kdiv4_left--) {\ + cq1 = VDOTQ_LANEQ_I32(cq1, aq1, bq1, 0);\ + cq2 = VDOTQ_LANEQ_I32(cq2, aq1, bq1, 1);\ + cq3 = VDOTQ_LANEQ_I32(cq3, aq1, bq1, 2);\ + cq4 = VDOTQ_LANEQ_I32(cq4, aq1, bq1, 3);\ + aq1 = VLD1Q(a_rd); a_rd += 4;\ + bq1 = VLD1Q(b_rd); b_rd += 4;\ + }\ + if (kdiv4_left > 0) {\ + cq1 = VDOTQ_LANEQ_I32(cq1, aq1, bq1, 0);\ + cq2 = VDOTQ_LANEQ_I32(cq2, aq1, bq1, 1);\ + cq3 = VDOTQ_LANEQ_I32(cq3, aq1, bq1, 2);\ + cq4 = VDOTQ_LANEQ_I32(cq4, aq1, bq1, 3);\ + } + +#define SAVE_M4N4 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M4N2(cq1, cq2) UNIT_SAVE_M4N2(cq3, cq4) + +#define KERNEL_M8N1_UNIT(a_head, b_head) \ + I32X4 cq1, cq2, cq3, cq4;\ + cq1 = cq2 = cq3 = cq4 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, aq2, aq3, aq4;\ + I8X8 bd1;\ + if (kdiv4_left > 1) {\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4);\ + aq3 = VLD1Q(a_rd + 8); aq4 = VLD1Q(a_rd + 12); a_rd += 16;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + for (; kdiv4_left > 3; kdiv4_left -= 2) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0); aq1 = VLD1Q(a_rd);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0); aq2 = VLD1Q(a_rd + 4);\ + cq3 = VDOTQ_LANE_I32(cq3, aq3, bd1, 1); aq3 = VLD1Q(a_rd + 8);\ + cq4 = VDOTQ_LANE_I32(cq4, aq4, bd1, 1); aq4 = VLD1Q(a_rd + 12);\ + a_rd += 16; bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + if (kdiv4_left > 1) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0);\ + cq3 = VDOTQ_LANE_I32(cq3, aq3, bd1, 1);\ + cq4 = VDOTQ_LANE_I32(cq4, aq4, bd1, 1);\ + kdiv4_left -= 2;\ + }\ + cq1 = VADDQ_I32(cq1, cq3); cq2 = VADDQ_I32(cq2, cq4);\ + if (kdiv4_left > 0) {\ + UNALIGNED_LD4B_SREG(bd1, b_rd); b_rd++;\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4); a_rd += 8;\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0);\ + } + +#define KERNEL_M8N1 KERNEL_M8N1_UNIT(a_head, b_head) +#define KERNEL_M1N8 KERNEL_M8N1_UNIT(b_head, a_head) + +#define SAVE_M8N1 \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_ptr), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_ptr + 4), beta);\ + VST1Q_I32(c_ptr, cq1); VST1Q_I32(c_ptr + 4, cq2); + +#define SAVE_M1N8 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M1N4(cq1) UNIT_SAVE_M1N4(cq2) + +#define KERNEL_M8N2_UNIT(a_head, b_head) \ + I32X4 cq1, cq2, cq3, cq4;\ + cq1 = cq2 = cq3 = cq4 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, aq2;\ + I8X8 bd1;\ + if (kdiv4_left > 0) {\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4); a_rd += 8;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + for (; kdiv4_left > 1; kdiv4_left--) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq3 = VDOTQ_LANE_I32(cq3, aq1, bd1, 1);\ + aq1 = VLD1Q(a_rd);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0);\ + cq4 = VDOTQ_LANE_I32(cq4, aq2, bd1, 1);\ + aq2 = VLD1Q(a_rd + 4); a_rd += 8;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + if (kdiv4_left > 0) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq3 = VDOTQ_LANE_I32(cq3, aq1, bd1, 1);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0);\ + cq4 = VDOTQ_LANE_I32(cq4, aq2, bd1, 1);\ + } + +#define KERNEL_M8N2 KERNEL_M8N2_UNIT(a_head, b_head) +#define KERNEL_M2N8 KERNEL_M8N2_UNIT(b_head, a_head) + +#define UNIT_SAVE_M8N2(cq1, cq2, cq3, cq4) \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_tmp), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_tmp + 4), beta);\ + cq3 = VMLAQ_N_I32(cq3, VLD1Q_I32(c_tmp + ldc), beta);\ + cq4 = VMLAQ_N_I32(cq4, VLD1Q_I32(c_tmp + 
ldc + 4), beta);\ + VST1Q_I32(c_tmp, cq1); VST1Q_I32(c_tmp + 4, cq2);\ + VST1Q_I32(c_tmp + ldc, cq3); VST1Q_I32(c_tmp + ldc + 4, cq4);\ + c_tmp += ldc * 2; + +#define SAVE_M8N2 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M8N2(cq1, cq2, cq3, cq4) + +#define SAVE_M2N8 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M2N4(cq1, cq3) UNIT_SAVE_M2N4(cq2, cq4) + +#define KERNEL_M8N4_UNIT(a_head, b_head) \ + I32X4 cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = cq7 = cq8 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, aq2, bq1;\ + if (kdiv4_left > 0) {\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4); a_rd += 8;\ + bq1 = VLD1Q(b_rd); b_rd += 4;\ + }\ + for (; kdiv4_left > 1; kdiv4_left--) {\ + cq1 = VDOTQ_LANEQ_I32(cq1, aq1, bq1, 0);\ + cq3 = VDOTQ_LANEQ_I32(cq3, aq1, bq1, 1);\ + cq5 = VDOTQ_LANEQ_I32(cq5, aq1, bq1, 2);\ + cq7 = VDOTQ_LANEQ_I32(cq7, aq1, bq1, 3);\ + aq1 = VLD1Q(a_rd);\ + cq2 = VDOTQ_LANEQ_I32(cq2, aq2, bq1, 0);\ + cq4 = VDOTQ_LANEQ_I32(cq4, aq2, bq1, 1);\ + cq6 = VDOTQ_LANEQ_I32(cq6, aq2, bq1, 2);\ + cq8 = VDOTQ_LANEQ_I32(cq8, aq2, bq1, 3);\ + aq2 = VLD1Q(a_rd + 4); a_rd += 8;\ + bq1 = VLD1Q(b_rd); b_rd += 4;\ + }\ + if (kdiv4_left > 0) {\ + cq1 = VDOTQ_LANEQ_I32(cq1, aq1, bq1, 0);\ + cq3 = VDOTQ_LANEQ_I32(cq3, aq1, bq1, 1);\ + cq5 = VDOTQ_LANEQ_I32(cq5, aq1, bq1, 2);\ + cq7 = VDOTQ_LANEQ_I32(cq7, aq1, bq1, 3);\ + cq2 = VDOTQ_LANEQ_I32(cq2, aq2, bq1, 0);\ + cq4 = VDOTQ_LANEQ_I32(cq4, aq2, bq1, 1);\ + cq6 = VDOTQ_LANEQ_I32(cq6, aq2, bq1, 2);\ + cq8 = VDOTQ_LANEQ_I32(cq8, aq2, bq1, 3);\ + } + +#define KERNEL_M8N4 KERNEL_M8N4_UNIT(a_head, b_head) +#define KERNEL_M4N8 KERNEL_M8N4_UNIT(b_head, a_head) + +#define SAVE_M8N4 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M8N2(cq1, cq2, cq3, cq4) UNIT_SAVE_M8N2(cq5, cq6, cq7, cq8) + +#define UNIT_SAVE_M4N4_TRANS(cq1, cq2, cq3, cq4) {\ + I32X4 l1 = VLD1Q_I32(c_tmp);\ + I32X4 l2 = VLD1Q_I32(c_tmp + ldc);\ + I32X4 l3 = VLD1Q_I32(c_tmp + ldc * 2);\ + I32X4 l4 = VLD1Q_I32(c_tmp + ldc * 3);\ + I64X2 t1 = VREINTERPRETQ_I64_I32(VZIP1Q_I32(cq1, cq2));\ + I64X2 t2 = VREINTERPRETQ_I64_I32(VZIP1Q_I32(cq3, cq4));\ + I64X2 t3 = VREINTERPRETQ_I64_I32(VZIP2Q_I32(cq1, cq2));\ + I64X2 t4 = VREINTERPRETQ_I64_I32(VZIP2Q_I32(cq3, cq4));\ + cq1 = VREINTERPRETQ_I32_I64(VZIP1Q_I64(t1, t2));\ + cq2 = VREINTERPRETQ_I32_I64(VZIP2Q_I64(t1, t2));\ + cq3 = VREINTERPRETQ_I32_I64(VZIP1Q_I64(t3, t4));\ + cq4 = VREINTERPRETQ_I32_I64(VZIP2Q_I64(t3, t4));\ + cq1 = VMLAQ_N_I32(cq1, l1, beta); cq2 = VMLAQ_N_I32(cq2, l2, beta);\ + cq3 = VMLAQ_N_I32(cq3, l3, beta); cq4 = VMLAQ_N_I32(cq4, l4, beta);\ + VST1Q_I32(c_tmp, cq1); VST1Q_I32(c_tmp + ldc, cq2);\ + VST1Q_I32(c_tmp + ldc * 2, cq3); VST1Q_I32(c_tmp + ldc * 3, cq4);\ + c_tmp += ldc * 4;\ +} + +#define SAVE_M4N8 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M4N4_TRANS(cq1, cq3, cq5, cq7)\ + UNIT_SAVE_M4N4_TRANS(cq2, cq4, cq6, cq8) + +#define KERNEL_M8N8 \ + I32X4 cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + I32X4 cq09, cq10, cq11, cq12, cq13, cq14, cq15, cq16;\ + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = cq07 = cq08 = VDUPQ_N_I32(0);\ + cq09 = cq10 = cq11 = cq12 = cq13 = cq14 = cq15 = cq16 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, aq2, bq1, bq2;\ + if (kdiv4_left > 0) {\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4); a_rd += 8;\ + bq1 = VLD1Q(b_rd); bq2 = VLD1Q(b_rd + 4); b_rd += 8;\ + }\ + for (; kdiv4_left > 1; kdiv4_left--) {\ + cq01 = VDOTQ_LANEQ_I32(cq01, aq1, bq1, 0);\ + cq03 = VDOTQ_LANEQ_I32(cq03, aq1, bq1, 1);\ + cq05 = VDOTQ_LANEQ_I32(cq05, aq1, bq1, 2);\ + cq07 = 
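/* each dot instruction accumulates a 4-element int8 dot product into every 32-bit lane */\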
VDOTQ_LANEQ_I32(cq07, aq1, bq1, 3);\ + cq09 = VDOTQ_LANEQ_I32(cq09, aq1, bq2, 0);\ + cq11 = VDOTQ_LANEQ_I32(cq11, aq1, bq2, 1);\ + cq13 = VDOTQ_LANEQ_I32(cq13, aq1, bq2, 2);\ + cq15 = VDOTQ_LANEQ_I32(cq15, aq1, bq2, 3);\ + aq1 = VLD1Q(a_rd);\ + cq02 = VDOTQ_LANEQ_I32(cq02, aq2, bq1, 0);\ + cq04 = VDOTQ_LANEQ_I32(cq04, aq2, bq1, 1);\ + cq06 = VDOTQ_LANEQ_I32(cq06, aq2, bq1, 2);\ + cq08 = VDOTQ_LANEQ_I32(cq08, aq2, bq1, 3);\ + bq1 = VLD1Q(b_rd);\ + cq10 = VDOTQ_LANEQ_I32(cq10, aq2, bq2, 0);\ + cq12 = VDOTQ_LANEQ_I32(cq12, aq2, bq2, 1);\ + cq14 = VDOTQ_LANEQ_I32(cq14, aq2, bq2, 2);\ + cq16 = VDOTQ_LANEQ_I32(cq16, aq2, bq2, 3);\ + aq2 = VLD1Q(a_rd + 4); a_rd += 8;\ + bq2 = VLD1Q(b_rd + 4); b_rd += 8;\ + }\ + if (kdiv4_left > 0) {\ + cq01 = VDOTQ_LANEQ_I32(cq01, aq1, bq1, 0);\ + cq03 = VDOTQ_LANEQ_I32(cq03, aq1, bq1, 1);\ + cq05 = VDOTQ_LANEQ_I32(cq05, aq1, bq1, 2);\ + cq07 = VDOTQ_LANEQ_I32(cq07, aq1, bq1, 3);\ + cq09 = VDOTQ_LANEQ_I32(cq09, aq1, bq2, 0);\ + cq11 = VDOTQ_LANEQ_I32(cq11, aq1, bq2, 1);\ + cq13 = VDOTQ_LANEQ_I32(cq13, aq1, bq2, 2);\ + cq15 = VDOTQ_LANEQ_I32(cq15, aq1, bq2, 3);\ + cq02 = VDOTQ_LANEQ_I32(cq02, aq2, bq1, 0);\ + cq04 = VDOTQ_LANEQ_I32(cq04, aq2, bq1, 1);\ + cq06 = VDOTQ_LANEQ_I32(cq06, aq2, bq1, 2);\ + cq08 = VDOTQ_LANEQ_I32(cq08, aq2, bq1, 3);\ + cq10 = VDOTQ_LANEQ_I32(cq10, aq2, bq2, 0);\ + cq12 = VDOTQ_LANEQ_I32(cq12, aq2, bq2, 1);\ + cq14 = VDOTQ_LANEQ_I32(cq14, aq2, bq2, 2);\ + cq16 = VDOTQ_LANEQ_I32(cq16, aq2, bq2, 3);\ + } + +#define SAVE_M8N8 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M8N2(cq01, cq02, cq03, cq04)\ + UNIT_SAVE_M8N2(cq05, cq06, cq07, cq08)\ + UNIT_SAVE_M8N2(cq09, cq10, cq11, cq12)\ + UNIT_SAVE_M8N2(cq13, cq14, cq15, cq16) + +#define KERNEL_M12N1_UNIT(a_head, b_head) \ + I32X4 cq1, cq2, cq3, cq4, cq5, cq6;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, aq2, aq3, aq4, aq5, aq6;\ + I8X8 bd1;\ + if (kdiv4_left > 1) {\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4);\ + aq3 = VLD1Q(a_rd + 8); aq4 = VLD1Q(a_rd + 12);\ + aq5 = VLD1Q(a_rd + 16); aq6 = VLD1Q(a_rd + 20); a_rd += 24;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + for (; kdiv4_left > 3; kdiv4_left -= 2) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0); aq1 = VLD1Q(a_rd);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0); aq2 = VLD1Q(a_rd + 4);\ + cq3 = VDOTQ_LANE_I32(cq3, aq3, bd1, 0); aq3 = VLD1Q(a_rd + 8);\ + cq4 = VDOTQ_LANE_I32(cq4, aq4, bd1, 1); aq4 = VLD1Q(a_rd + 12);\ + cq5 = VDOTQ_LANE_I32(cq5, aq5, bd1, 1); aq5 = VLD1Q(a_rd + 16);\ + cq6 = VDOTQ_LANE_I32(cq6, aq6, bd1, 1); aq6 = VLD1Q(a_rd + 20);\ + a_rd += 24; bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + if (kdiv4_left > 1) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0);\ + cq3 = VDOTQ_LANE_I32(cq3, aq3, bd1, 0);\ + cq4 = VDOTQ_LANE_I32(cq4, aq4, bd1, 1);\ + cq5 = VDOTQ_LANE_I32(cq5, aq5, bd1, 1);\ + cq6 = VDOTQ_LANE_I32(cq6, aq6, bd1, 1);\ + kdiv4_left -= 2;\ + }\ + cq1 = VADDQ_I32(cq1, cq4);\ + cq2 = VADDQ_I32(cq2, cq5);\ + cq3 = VADDQ_I32(cq3, cq6);\ + if (kdiv4_left > 0) {\ + UNALIGNED_LD4B_SREG(bd1, b_rd); b_rd++;\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4);\ + aq3 = VLD1Q(a_rd + 8); a_rd += 12;\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0);\ + cq3 = VDOTQ_LANE_I32(cq3, aq3, bd1, 0);\ + } + +#define KERNEL_M12N1 KERNEL_M12N1_UNIT(a_head, b_head) +#define KERNEL_M1N12 KERNEL_M12N1_UNIT(b_head, a_head) + +#define SAVE_M12N1 \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_ptr), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_ptr + 4), 
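/* result = accumulator + beta * C */\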
beta);\ + cq3 = VMLAQ_N_I32(cq3, VLD1Q_I32(c_ptr + 8), beta);\ + VST1Q_I32(c_ptr, cq1); VST1Q_I32(c_ptr + 4, cq2); VST1Q_I32(c_ptr + 8, cq3); + +#define SAVE_M1N12 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M1N4(cq1)\ + UNIT_SAVE_M1N4(cq2) UNIT_SAVE_M1N4(cq3) + +#define KERNEL_M12N2_UNIT(a_head, b_head) \ + I32X4 cq1, cq2, cq3, cq4, cq5, cq6;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, aq2, aq3;\ + I8X8 bd1;\ + if (kdiv4_left > 0) {\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4);\ + aq3 = VLD1Q(a_rd + 8); a_rd += 12;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + for (; kdiv4_left > 1; kdiv4_left--) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq4 = VDOTQ_LANE_I32(cq4, aq1, bd1, 1);\ + aq1 = VLD1Q(a_rd);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0);\ + cq5 = VDOTQ_LANE_I32(cq5, aq2, bd1, 1);\ + aq2 = VLD1Q(a_rd + 4);\ + cq3 = VDOTQ_LANE_I32(cq3, aq3, bd1, 0);\ + cq6 = VDOTQ_LANE_I32(cq6, aq3, bd1, 1);\ + aq3 = VLD1Q(a_rd + 8); a_rd += 12;\ + bd1 = VLD1(b_rd); b_rd += 2;\ + }\ + if (kdiv4_left > 0) {\ + cq1 = VDOTQ_LANE_I32(cq1, aq1, bd1, 0);\ + cq4 = VDOTQ_LANE_I32(cq4, aq1, bd1, 1);\ + cq2 = VDOTQ_LANE_I32(cq2, aq2, bd1, 0);\ + cq5 = VDOTQ_LANE_I32(cq5, aq2, bd1, 1);\ + cq3 = VDOTQ_LANE_I32(cq3, aq3, bd1, 0);\ + cq6 = VDOTQ_LANE_I32(cq6, aq3, bd1, 1);\ + } + +#define KERNEL_M12N2 KERNEL_M12N2_UNIT(a_head, b_head) +#define KERNEL_M2N12 KERNEL_M12N2_UNIT(b_head, a_head) + +#define UNIT_SAVE_M12N2(cq1, cq2, cq3, cq4, cq5, cq6) \ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_tmp), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_tmp + 4), beta);\ + cq3 = VMLAQ_N_I32(cq3, VLD1Q_I32(c_tmp + 8), beta);\ + cq4 = VMLAQ_N_I32(cq4, VLD1Q_I32(c_tmp + ldc), beta);\ + cq5 = VMLAQ_N_I32(cq5, VLD1Q_I32(c_tmp + ldc + 4), beta);\ + cq6 = VMLAQ_N_I32(cq6, VLD1Q_I32(c_tmp + ldc + 8), beta);\ + VST1Q_I32(c_tmp, cq1); VST1Q_I32(c_tmp + 4, cq2);\ + VST1Q_I32(c_tmp + 8, cq3); VST1Q_I32(c_tmp + ldc, cq4);\ + VST1Q_I32(c_tmp + ldc + 4, cq5); VST1Q_I32(c_tmp + ldc + 8, cq6);\ + c_tmp += ldc * 2; + +#define SAVE_M12N2 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M12N2(cq1, cq2, cq3, cq4, cq5, cq6) + +#define SAVE_M2N12 \ + I32 *c_tmp = c_ptr; UNIT_SAVE_M2N4(cq1, cq4) UNIT_SAVE_M2N4(cq2, cq5)\ + UNIT_SAVE_M2N4(cq3, cq6) + +#define KERNEL_M12N4_UNIT(a_head, b_head) \ + I32X4 cq01, cq02, cq03, cq04, cq05, cq06;\ + I32X4 cq07, cq08, cq09, cq10, cq11, cq12;\ + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = VDUPQ_N_I32(0);\ + cq07 = cq08 = cq09 = cq10 = cq11 = cq12 = VDUPQ_N_I32(0);\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + I8X16 aq1, aq2, aq3, bq1;\ + if (kdiv4_left > 0) {\ + aq1 = VLD1Q(a_rd); aq2 = VLD1Q(a_rd + 4);\ + aq3 = VLD1Q(a_rd + 8); a_rd += 12;\ + bq1 = VLD1Q(b_rd); b_rd += 4;\ + }\ + for (; kdiv4_left > 1; kdiv4_left--) {\ + cq01 = VDOTQ_LANEQ_I32(cq01, aq1, bq1, 0);\ + cq04 = VDOTQ_LANEQ_I32(cq04, aq1, bq1, 1);\ + cq07 = VDOTQ_LANEQ_I32(cq07, aq1, bq1, 2);\ + cq10 = VDOTQ_LANEQ_I32(cq10, aq1, bq1, 3);\ + aq1 = VLD1Q(a_rd);\ + cq02 = VDOTQ_LANEQ_I32(cq02, aq2, bq1, 0);\ + cq05 = VDOTQ_LANEQ_I32(cq05, aq2, bq1, 1);\ + cq08 = VDOTQ_LANEQ_I32(cq08, aq2, bq1, 2);\ + cq11 = VDOTQ_LANEQ_I32(cq11, aq2, bq1, 3);\ + aq2 = VLD1Q(a_rd + 4);\ + cq03 = VDOTQ_LANEQ_I32(cq03, aq3, bq1, 0);\ + cq06 = VDOTQ_LANEQ_I32(cq06, aq3, bq1, 1);\ + cq09 = VDOTQ_LANEQ_I32(cq09, aq3, bq1, 2);\ + cq12 = VDOTQ_LANEQ_I32(cq12, aq3, bq1, 3);\ + aq3 = VLD1Q(a_rd + 8); a_rd += 12;\ + bq1 = VLD1Q(b_rd); b_rd += 4;\ + }\ + if (kdiv4_left > 0) {\ + cq01 = VDOTQ_LANEQ_I32(cq01, aq1, bq1, 0);\ + cq04 = VDOTQ_LANEQ_I32(cq04, aq1, 
bq1, 1);\ + cq07 = VDOTQ_LANEQ_I32(cq07, aq1, bq1, 2);\ + cq10 = VDOTQ_LANEQ_I32(cq10, aq1, bq1, 3);\ + cq02 = VDOTQ_LANEQ_I32(cq02, aq2, bq1, 0);\ + cq05 = VDOTQ_LANEQ_I32(cq05, aq2, bq1, 1);\ + cq08 = VDOTQ_LANEQ_I32(cq08, aq2, bq1, 2);\ + cq11 = VDOTQ_LANEQ_I32(cq11, aq2, bq1, 3);\ + cq03 = VDOTQ_LANEQ_I32(cq03, aq3, bq1, 0);\ + cq06 = VDOTQ_LANEQ_I32(cq06, aq3, bq1, 1);\ + cq09 = VDOTQ_LANEQ_I32(cq09, aq3, bq1, 2);\ + cq12 = VDOTQ_LANEQ_I32(cq12, aq3, bq1, 3);\ + } + +#define KERNEL_M12N4 KERNEL_M12N4_UNIT(a_head, b_head) +#define KERNEL_M4N12 KERNEL_M12N4_UNIT(b_head, a_head) + +#define SAVE_M12N4 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M12N2(cq01, cq02, cq03, cq04, cq05, cq06)\ + UNIT_SAVE_M12N2(cq07, cq08, cq09, cq10, cq11, cq12) + +#define SAVE_M4N12 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M4N4_TRANS(cq01, cq04, cq07, cq10)\ + UNIT_SAVE_M4N4_TRANS(cq02, cq05, cq08, cq11)\ + UNIT_SAVE_M4N4_TRANS(cq03, cq06, cq09, cq12) + +#define LDQ_STEP1_IDX_A55(v, ptr, idx) "ldr d"#v",["#ptr"],#"#idx"\n\t" +#define LDQ_STEP1_OFF_A55(v, ptr, off) "ldr d"#v",["#ptr",#"#off"]\n\t" +#define LDQ_STEP2_A55(r, ptr, off) "ldr x"#r",["#ptr",#"#off"]\n\t" +#define LDQ_STEP3_A55(r, v) "fmov v"#v".d[1],x"#r"\n\t" +#define LDQ_STEP1_IDX_A76(v, ptr, idx) "ldr q"#v",["#ptr"],#"#idx"\n\t" +#define LDQ_STEP1_OFF_A76(v, ptr, off) "ldr q"#v",["#ptr",#"#off"]\n\t" +#define LDQ_STEP2_A76(r, ptr, off) "" +#define LDQ_STEP3_A76(r, v) "" + +#define KERNEL_M8N12_TEMPLATE(cpu) \ + I32 *c_pref = c_ptr + 7; PREF_N12\ + I32X4 cq01, cq02, cq03, cq04, cq05, cq06;\ + I32X4 cq07, cq08, cq09, cq10, cq11, cq12;\ + I32X4 cq13, cq14, cq15, cq16, cq17, cq18;\ + I32X4 cq19, cq20, cq21, cq22, cq23, cq24;\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + __asm__ __volatile__(\ + "movi %[cq01].16b,#0; movi %[cq02].16b,#0\n\t"\ + "movi %[cq03].16b,#0; movi %[cq04].16b,#0\n\t"\ + "movi %[cq05].16b,#0; movi %[cq06].16b,#0\n\t"\ + "movi %[cq07].16b,#0; movi %[cq08].16b,#0\n\t"\ + "movi %[cq09].16b,#0; movi %[cq10].16b,#0\n\t"\ + "movi %[cq11].16b,#0; movi %[cq12].16b,#0\n\t"\ + "movi %[cq13].16b,#0; movi %[cq14].16b,#0\n\t"\ + "movi %[cq15].16b,#0; movi %[cq16].16b,#0\n\t"\ + "movi %[cq17].16b,#0; movi %[cq18].16b,#0\n\t"\ + "movi %[cq19].16b,#0; movi %[cq20].16b,#0\n\t"\ + "movi %[cq21].16b,#0; movi %[cq22].16b,#0\n\t"\ + "movi %[cq23].16b,#0; movi %[cq24].16b,#0\n\t"\ + "cmp %w[kdiv4_left],#1; b.lt 4f\n\t"\ + "ldr q0,[%[a_rd]]; ldr q1,[%[a_rd],#16]; add %[a_rd],%[a_rd],#32\n\t"\ + "ldr q4,[%[b_rd]]; ldr q5,[%[b_rd],#16]; add %[b_rd],%[b_rd],#48\n\t"\ + "cmp %w[kdiv4_left],#3; b.lt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + ""IDOT" %[cq01].4s,v0.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(6, %[b_rd], -16)\ + ""IDOT" %[cq02].4s,v1.16b,v4.4b[0]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -8)\ + ""IDOT" %[cq03].4s,v0.16b,v4.4b[1]\n\t" LDQ_STEP1_IDX_##cpu(2, %[a_rd], 64)\ + ""IDOT" %[cq04].4s,v1.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq05].4s,v0.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq06].4s,v1.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq07].4s,v0.16b,v4.4b[3]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -56)\ + ""IDOT" %[cq08].4s,v1.16b,v4.4b[3]\n\t" LDQ_STEP3_##cpu(1, 6)\ + ""IDOT" %[cq09].4s,v0.16b,v5.4b[0]\n\t" LDQ_STEP1_IDX_##cpu(4, %[b_rd], 96)\ + ""IDOT" %[cq10].4s,v1.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq11].4s,v0.16b,v5.4b[1]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -88)\ + ""IDOT" %[cq12].4s,v1.16b,v5.4b[1]\n\t" LDQ_STEP3_##cpu(0, 2)\ + ""IDOT" %[cq13].4s,v0.16b,v5.4b[2]\n\t" LDQ_STEP1_OFF_##cpu(3, %[a_rd], -48)\ + ""IDOT" %[cq14].4s,v1.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq15].4s,v0.16b,v5.4b[3]\n\t"\ + ""IDOT" 
%[cq16].4s,v1.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(1, 4)\ + ""IDOT" %[cq17].4s,v0.16b,v6.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(5, %[b_rd], -80)\ + ""IDOT" %[cq18].4s,v1.16b,v6.4b[0]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -72)\ + ""IDOT" %[cq19].4s,v0.16b,v6.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -40)\ + ""IDOT" %[cq20].4s,v1.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq21].4s,v0.16b,v6.4b[2]\n\t"\ + ""IDOT" %[cq22].4s,v1.16b,v6.4b[2]\n\t" LDQ_STEP3_##cpu(0, 3)\ + ""IDOT" %[cq23].4s,v0.16b,v6.4b[3]\n\t"\ + ""IDOT" %[cq24].4s,v1.16b,v6.4b[3]\n\t" LDQ_STEP3_##cpu(1, 5)\ + ""IDOT" %[cq01].4s,v2.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(6, %[b_rd], -64)\ + ""IDOT" %[cq02].4s,v3.16b,v4.4b[0]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -56)\ + ""IDOT" %[cq03].4s,v2.16b,v4.4b[1]\n\t" LDQ_STEP1_OFF_##cpu(0, %[a_rd], -32)\ + ""IDOT" %[cq04].4s,v3.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq05].4s,v2.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq06].4s,v3.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq07].4s,v2.16b,v4.4b[3]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -24)\ + ""IDOT" %[cq08].4s,v3.16b,v4.4b[3]\n\t" LDQ_STEP3_##cpu(1, 6)\ + ""IDOT" %[cq09].4s,v2.16b,v5.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(4, %[b_rd], -48)\ + ""IDOT" %[cq10].4s,v3.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq11].4s,v2.16b,v5.4b[1]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -40)\ + ""IDOT" %[cq12].4s,v3.16b,v5.4b[1]\n\t" LDQ_STEP3_##cpu(0, 0)\ + ""IDOT" %[cq13].4s,v2.16b,v5.4b[2]\n\t" LDQ_STEP1_OFF_##cpu(1, %[a_rd], -16)\ + ""IDOT" %[cq14].4s,v3.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq15].4s,v2.16b,v5.4b[3]\n\t"\ + ""IDOT" %[cq16].4s,v3.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(1, 4)\ + ""IDOT" %[cq17].4s,v2.16b,v6.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(5, %[b_rd], -32)\ + ""IDOT" %[cq18].4s,v3.16b,v6.4b[0]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -24)\ + ""IDOT" %[cq19].4s,v2.16b,v6.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -8)\ + ""IDOT" %[cq20].4s,v3.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq21].4s,v2.16b,v6.4b[2]; sub %w[kdiv4_left],%w[kdiv4_left],#2\n\t"\ + ""IDOT" %[cq22].4s,v3.16b,v6.4b[2]\n\t" LDQ_STEP3_##cpu(0, 1)\ + ""IDOT" %[cq23].4s,v2.16b,v6.4b[3]; cmp %w[kdiv4_left],#3\n\t"\ + ""IDOT" %[cq24].4s,v3.16b,v6.4b[3]\n\t" LDQ_STEP3_##cpu(1, 5)\ + "b.ge 1b; 2:\n\t"\ + "cmp %w[kdiv4_left],#2; b.lt 3f\n\t"\ + ""IDOT" %[cq01].4s,v0.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(6, %[b_rd], -16)\ + ""IDOT" %[cq02].4s,v1.16b,v4.4b[0]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -8)\ + ""IDOT" %[cq03].4s,v0.16b,v4.4b[1]\n\t" LDQ_STEP1_IDX_##cpu(2, %[a_rd], 32)\ + ""IDOT" %[cq04].4s,v1.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq05].4s,v0.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq06].4s,v1.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq07].4s,v0.16b,v4.4b[3]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -24)\ + ""IDOT" %[cq08].4s,v1.16b,v4.4b[3]\n\t" LDQ_STEP3_##cpu(1, 6)\ + ""IDOT" %[cq09].4s,v0.16b,v5.4b[0]\n\t" LDQ_STEP1_IDX_##cpu(4, %[b_rd], 48)\ + ""IDOT" %[cq10].4s,v1.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq11].4s,v0.16b,v5.4b[1]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -40)\ + ""IDOT" %[cq12].4s,v1.16b,v5.4b[1]\n\t" LDQ_STEP3_##cpu(0, 2)\ + ""IDOT" %[cq13].4s,v0.16b,v5.4b[2]\n\t" LDQ_STEP1_OFF_##cpu(3, %[a_rd], -16)\ + ""IDOT" %[cq14].4s,v1.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq15].4s,v0.16b,v5.4b[3]\n\t"\ + ""IDOT" %[cq16].4s,v1.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(1, 4)\ + ""IDOT" %[cq17].4s,v0.16b,v6.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(5, %[b_rd], -32)\ + ""IDOT" %[cq18].4s,v1.16b,v6.4b[0]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -24)\ + ""IDOT" %[cq19].4s,v0.16b,v6.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -8)\ + ""IDOT" %[cq20].4s,v1.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq21].4s,v0.16b,v6.4b[2]\n\t"\ + ""IDOT" %[cq22].4s,v1.16b,v6.4b[2]\n\t" LDQ_STEP3_##cpu(0, 3)\ + ""IDOT" 
%[cq23].4s,v0.16b,v6.4b[3]\n\t"\ + ""IDOT" %[cq24].4s,v1.16b,v6.4b[3]\n\t" LDQ_STEP3_##cpu(1, 5)\ + ""IDOT" %[cq01].4s,v2.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(6, %[b_rd], -16)\ + ""IDOT" %[cq02].4s,v3.16b,v4.4b[0]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -8)\ + ""IDOT" %[cq03].4s,v2.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq04].4s,v3.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq05].4s,v2.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq06].4s,v3.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq07].4s,v2.16b,v4.4b[3]\n\t"\ + ""IDOT" %[cq08].4s,v3.16b,v4.4b[3]\n\t" LDQ_STEP3_##cpu(1, 6)\ + ""IDOT" %[cq09].4s,v2.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq10].4s,v3.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq11].4s,v2.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq12].4s,v3.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq13].4s,v2.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq14].4s,v3.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq15].4s,v2.16b,v5.4b[3]\n\t"\ + ""IDOT" %[cq16].4s,v3.16b,v5.4b[3]\n\t"\ + ""IDOT" %[cq17].4s,v2.16b,v6.4b[0]\n\t"\ + ""IDOT" %[cq18].4s,v3.16b,v6.4b[0]\n\t"\ + ""IDOT" %[cq19].4s,v2.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq20].4s,v3.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq21].4s,v2.16b,v6.4b[2]\n\t"\ + ""IDOT" %[cq22].4s,v3.16b,v6.4b[2]\n\t"\ + ""IDOT" %[cq23].4s,v2.16b,v6.4b[3]\n\t"\ + ""IDOT" %[cq24].4s,v3.16b,v6.4b[3]\n\t"\ + "b 4f; 3:\n\t"\ + ""IDOT" %[cq01].4s,v0.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(6, %[b_rd], -16)\ + ""IDOT" %[cq02].4s,v1.16b,v4.4b[0]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -8)\ + ""IDOT" %[cq03].4s,v0.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq04].4s,v1.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq05].4s,v0.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq06].4s,v1.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq07].4s,v0.16b,v4.4b[3]\n\t"\ + ""IDOT" %[cq08].4s,v1.16b,v4.4b[3]\n\t" LDQ_STEP3_##cpu(1, 6)\ + ""IDOT" %[cq09].4s,v0.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq10].4s,v1.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq11].4s,v0.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq12].4s,v1.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq13].4s,v0.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq14].4s,v1.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq15].4s,v0.16b,v5.4b[3]\n\t"\ + ""IDOT" %[cq16].4s,v1.16b,v5.4b[3]\n\t"\ + ""IDOT" %[cq17].4s,v0.16b,v6.4b[0]\n\t"\ + ""IDOT" %[cq18].4s,v1.16b,v6.4b[0]\n\t"\ + ""IDOT" %[cq19].4s,v0.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq20].4s,v1.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq21].4s,v0.16b,v6.4b[2]\n\t"\ + ""IDOT" %[cq22].4s,v1.16b,v6.4b[2]\n\t"\ + ""IDOT" %[cq23].4s,v0.16b,v6.4b[3]\n\t"\ + ""IDOT" %[cq24].4s,v1.16b,v6.4b[3]\n\t"\ + "4:\n\t"\ + :[cq01]"=w"(cq01),[cq02]"=w"(cq02),[cq03]"=w"(cq03),[cq04]"=w"(cq04),\ + [cq05]"=w"(cq05),[cq06]"=w"(cq06),[cq07]"=w"(cq07),[cq08]"=w"(cq08),\ + [cq09]"=w"(cq09),[cq10]"=w"(cq10),[cq11]"=w"(cq11),[cq12]"=w"(cq12),\ + [cq13]"=w"(cq13),[cq14]"=w"(cq14),[cq15]"=w"(cq15),[cq16]"=w"(cq16),\ + [cq17]"=w"(cq17),[cq18]"=w"(cq18),[cq19]"=w"(cq19),[cq20]"=w"(cq20),\ + [cq21]"=w"(cq21),[cq22]"=w"(cq22),[cq23]"=w"(cq23),[cq24]"=w"(cq24),\ + [kdiv4_left]"+r"(kdiv4_left),[a_rd]"+r"(a_rd),[b_rd]"+r"(b_rd)\ + ::"cc","memory","x0","x1","v0","v1","v2","v3","v4","v5","v6"); + +#define SAVE_M8N12 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M8N2(cq01, cq02, cq03, cq04)\ + UNIT_SAVE_M8N2(cq05, cq06, cq07, cq08)\ + UNIT_SAVE_M8N2(cq09, cq10, cq11, cq12)\ + UNIT_SAVE_M8N2(cq13, cq14, cq15, cq16)\ + UNIT_SAVE_M8N2(cq17, cq18, cq19, cq20)\ + UNIT_SAVE_M8N2(cq21, cq22, cq23, cq24) + +#define KERNEL_M12N8_TEMPLATE(cpu) \ + I32 *c_pref = c_ptr + 11; PREF_N8\ + I32X4 cq01, cq02, cq03, cq04, cq05, cq06;\ + I32X4 cq07, cq08, cq09, cq10, cq11, cq12;\ + I32X4 cq13, cq14, cq15, cq16, cq17, cq18;\ + I32X4 cq19, cq20, cq21, cq22, cq23, cq24;\ + NORMAL_KERNEL_SETUP(a_head, b_head)\ + __asm__ 
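/* the LDQ_STEP*_A55 macros split each 128-bit load into ldr d + ldr x + fmov so the in-order A55 pipeline can dual-issue them; the A76 variants keep a single ldr q */\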
__volatile__(\ + "movi %[cq01].16b,#0; movi %[cq02].16b,#0\n\t"\ + "movi %[cq03].16b,#0; movi %[cq04].16b,#0\n\t"\ + "movi %[cq05].16b,#0; movi %[cq06].16b,#0\n\t"\ + "movi %[cq07].16b,#0; movi %[cq08].16b,#0\n\t"\ + "movi %[cq09].16b,#0; movi %[cq10].16b,#0\n\t"\ + "movi %[cq11].16b,#0; movi %[cq12].16b,#0\n\t"\ + "movi %[cq13].16b,#0; movi %[cq14].16b,#0\n\t"\ + "movi %[cq15].16b,#0; movi %[cq16].16b,#0\n\t"\ + "movi %[cq17].16b,#0; movi %[cq18].16b,#0\n\t"\ + "movi %[cq19].16b,#0; movi %[cq20].16b,#0\n\t"\ + "movi %[cq21].16b,#0; movi %[cq22].16b,#0\n\t"\ + "movi %[cq23].16b,#0; movi %[cq24].16b,#0\n\t"\ + "cmp %w[kdiv4_left],#1; b.lt 4f\n\t"\ + "ldr q0,[%[a_rd]]; ldr q1,[%[a_rd],#16]; add %[a_rd],%[a_rd],#48\n\t"\ + "ldr q4,[%[b_rd]]; ldr q5,[%[b_rd],#16]; add %[b_rd],%[b_rd],#32\n\t"\ + "cmp %w[kdiv4_left],#3; b.lt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + ""IDOT" %[cq01].4s,v0.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(2, %[a_rd], -16)\ + ""IDOT" %[cq04].4s,v0.16b,v4.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -8)\ + ""IDOT" %[cq07].4s,v0.16b,v4.4b[2]\n\t" LDQ_STEP1_IDX_##cpu(6, %[b_rd], 64)\ + ""IDOT" %[cq10].4s,v0.16b,v4.4b[3]\n\t"\ + ""IDOT" %[cq13].4s,v0.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq16].4s,v0.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq19].4s,v0.16b,v5.4b[2]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -56)\ + ""IDOT" %[cq22].4s,v0.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(0, 2)\ + ""IDOT" %[cq02].4s,v1.16b,v4.4b[0]\n\t" LDQ_STEP1_IDX_##cpu(0, %[a_rd], 96)\ + ""IDOT" %[cq05].4s,v1.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq08].4s,v1.16b,v4.4b[2]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -88)\ + ""IDOT" %[cq11].4s,v1.16b,v4.4b[3]\n\t" LDQ_STEP3_##cpu(1, 6)\ + ""IDOT" %[cq14].4s,v1.16b,v5.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(7, %[b_rd], -48)\ + ""IDOT" %[cq17].4s,v1.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq20].4s,v1.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq23].4s,v1.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(0, 0)\ + ""IDOT" %[cq03].4s,v2.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(1, %[a_rd], -80)\ + ""IDOT" %[cq06].4s,v2.16b,v4.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -72)\ + ""IDOT" %[cq09].4s,v2.16b,v4.4b[2]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -40)\ + ""IDOT" %[cq12].4s,v2.16b,v4.4b[3]\n\t"\ + ""IDOT" %[cq15].4s,v2.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq18].4s,v2.16b,v5.4b[1]\n\t" LDQ_STEP3_##cpu(1, 7)\ + ""IDOT" %[cq21].4s,v2.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq24].4s,v2.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(0, 1)\ + ""IDOT" %[cq01].4s,v0.16b,v6.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(2, %[a_rd], -64)\ + ""IDOT" %[cq04].4s,v0.16b,v6.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -56)\ + ""IDOT" %[cq07].4s,v0.16b,v6.4b[2]\n\t" LDQ_STEP1_OFF_##cpu(4, %[b_rd], -32)\ + ""IDOT" %[cq10].4s,v0.16b,v6.4b[3]\n\t"\ + ""IDOT" %[cq13].4s,v0.16b,v7.4b[0]\n\t"\ + ""IDOT" %[cq16].4s,v0.16b,v7.4b[1]\n\t"\ + ""IDOT" %[cq19].4s,v0.16b,v7.4b[2]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -24)\ + ""IDOT" %[cq22].4s,v0.16b,v7.4b[3]\n\t" LDQ_STEP3_##cpu(0, 2)\ + ""IDOT" %[cq02].4s,v1.16b,v6.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(0, %[a_rd], -48)\ + ""IDOT" %[cq05].4s,v1.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq08].4s,v1.16b,v6.4b[2]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -40)\ + ""IDOT" %[cq11].4s,v1.16b,v6.4b[3]\n\t" LDQ_STEP3_##cpu(1, 4)\ + ""IDOT" %[cq14].4s,v1.16b,v7.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(5, %[b_rd], -16)\ + ""IDOT" %[cq17].4s,v1.16b,v7.4b[1]\n\t"\ + ""IDOT" %[cq20].4s,v1.16b,v7.4b[2]\n\t"\ + ""IDOT" %[cq23].4s,v1.16b,v7.4b[3]\n\t" LDQ_STEP3_##cpu(0, 0)\ + ""IDOT" %[cq03].4s,v2.16b,v6.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(1, %[a_rd], -32)\ + ""IDOT" %[cq06].4s,v2.16b,v6.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -24)\ + ""IDOT" %[cq09].4s,v2.16b,v6.4b[2]\n\t" 
LDQ_STEP2_##cpu(1, %[b_rd], -8)\ + ""IDOT" %[cq12].4s,v2.16b,v6.4b[3]\n\t"\ + ""IDOT" %[cq15].4s,v2.16b,v7.4b[0]; sub %w[kdiv4_left],%w[kdiv4_left],#2\n\t"\ + ""IDOT" %[cq18].4s,v2.16b,v7.4b[1]\n\t" LDQ_STEP3_##cpu(1, 5)\ + ""IDOT" %[cq21].4s,v2.16b,v7.4b[2]; cmp %w[kdiv4_left],#3\n\t"\ + ""IDOT" %[cq24].4s,v2.16b,v7.4b[3]\n\t" LDQ_STEP3_##cpu(0, 1)\ + "b.ge 1b; 2:\n\t"\ + "cmp %w[kdiv4_left],#2; b.lt 3f\n\t"\ + ""IDOT" %[cq01].4s,v0.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(2, %[a_rd], -16)\ + ""IDOT" %[cq04].4s,v0.16b,v4.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -8)\ + ""IDOT" %[cq07].4s,v0.16b,v4.4b[2]\n\t" LDQ_STEP1_IDX_##cpu(6, %[b_rd], 32)\ + ""IDOT" %[cq10].4s,v0.16b,v4.4b[3]\n\t"\ + ""IDOT" %[cq13].4s,v0.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq16].4s,v0.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq19].4s,v0.16b,v5.4b[2]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -24)\ + ""IDOT" %[cq22].4s,v0.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(0, 2)\ + ""IDOT" %[cq02].4s,v1.16b,v4.4b[0]\n\t" LDQ_STEP1_IDX_##cpu(0, %[a_rd], 48)\ + ""IDOT" %[cq05].4s,v1.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq08].4s,v1.16b,v4.4b[2]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -40)\ + ""IDOT" %[cq11].4s,v1.16b,v4.4b[3]\n\t" LDQ_STEP3_##cpu(1, 6)\ + ""IDOT" %[cq14].4s,v1.16b,v5.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(7, %[b_rd], -16)\ + ""IDOT" %[cq17].4s,v1.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq20].4s,v1.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq23].4s,v1.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(0, 0)\ + ""IDOT" %[cq03].4s,v2.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(1, %[a_rd], -32)\ + ""IDOT" %[cq06].4s,v2.16b,v4.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -24)\ + ""IDOT" %[cq09].4s,v2.16b,v4.4b[2]\n\t" LDQ_STEP2_##cpu(1, %[b_rd], -8)\ + ""IDOT" %[cq12].4s,v2.16b,v4.4b[3]\n\t"\ + ""IDOT" %[cq15].4s,v2.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq18].4s,v2.16b,v5.4b[1]\n\t" LDQ_STEP3_##cpu(1, 7)\ + ""IDOT" %[cq21].4s,v2.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq24].4s,v2.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(0, 1)\ + ""IDOT" %[cq01].4s,v0.16b,v6.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(2, %[a_rd], -16)\ + ""IDOT" %[cq04].4s,v0.16b,v6.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -8)\ + ""IDOT" %[cq07].4s,v0.16b,v6.4b[2]\n\t"\ + ""IDOT" %[cq10].4s,v0.16b,v6.4b[3]\n\t"\ + ""IDOT" %[cq13].4s,v0.16b,v7.4b[0]\n\t"\ + ""IDOT" %[cq16].4s,v0.16b,v7.4b[1]\n\t"\ + ""IDOT" %[cq19].4s,v0.16b,v7.4b[2]\n\t"\ + ""IDOT" %[cq22].4s,v0.16b,v7.4b[3]\n\t" LDQ_STEP3_##cpu(0, 2)\ + ""IDOT" %[cq02].4s,v1.16b,v6.4b[0]\n\t"\ + ""IDOT" %[cq05].4s,v1.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq08].4s,v1.16b,v6.4b[2]\n\t"\ + ""IDOT" %[cq11].4s,v1.16b,v6.4b[3]\n\t"\ + ""IDOT" %[cq14].4s,v1.16b,v7.4b[0]\n\t"\ + ""IDOT" %[cq17].4s,v1.16b,v7.4b[1]\n\t"\ + ""IDOT" %[cq20].4s,v1.16b,v7.4b[2]\n\t"\ + ""IDOT" %[cq23].4s,v1.16b,v7.4b[3]\n\t"\ + ""IDOT" %[cq03].4s,v2.16b,v6.4b[0]\n\t"\ + ""IDOT" %[cq06].4s,v2.16b,v6.4b[1]\n\t"\ + ""IDOT" %[cq09].4s,v2.16b,v6.4b[2]\n\t"\ + ""IDOT" %[cq12].4s,v2.16b,v6.4b[3]\n\t"\ + ""IDOT" %[cq15].4s,v2.16b,v7.4b[0]\n\t"\ + ""IDOT" %[cq18].4s,v2.16b,v7.4b[1]\n\t"\ + ""IDOT" %[cq21].4s,v2.16b,v7.4b[2]\n\t"\ + ""IDOT" %[cq24].4s,v2.16b,v7.4b[3]\n\t"\ + "b 4f; 3:\n\t"\ + ""IDOT" %[cq01].4s,v0.16b,v4.4b[0]\n\t" LDQ_STEP1_OFF_##cpu(2, %[a_rd], -16)\ + ""IDOT" %[cq04].4s,v0.16b,v4.4b[1]\n\t" LDQ_STEP2_##cpu(0, %[a_rd], -8)\ + ""IDOT" %[cq07].4s,v0.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq10].4s,v0.16b,v4.4b[3]\n\t"\ + ""IDOT" %[cq13].4s,v0.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq16].4s,v0.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq19].4s,v0.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq22].4s,v0.16b,v5.4b[3]\n\t" LDQ_STEP3_##cpu(0, 2)\ + ""IDOT" %[cq02].4s,v1.16b,v4.4b[0]\n\t"\ + ""IDOT" 
%[cq05].4s,v1.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq08].4s,v1.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq11].4s,v1.16b,v4.4b[3]\n\t"\ + ""IDOT" %[cq14].4s,v1.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq17].4s,v1.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq20].4s,v1.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq23].4s,v1.16b,v5.4b[3]\n\t"\ + ""IDOT" %[cq03].4s,v2.16b,v4.4b[0]\n\t"\ + ""IDOT" %[cq06].4s,v2.16b,v4.4b[1]\n\t"\ + ""IDOT" %[cq09].4s,v2.16b,v4.4b[2]\n\t"\ + ""IDOT" %[cq12].4s,v2.16b,v4.4b[3]\n\t"\ + ""IDOT" %[cq15].4s,v2.16b,v5.4b[0]\n\t"\ + ""IDOT" %[cq18].4s,v2.16b,v5.4b[1]\n\t"\ + ""IDOT" %[cq21].4s,v2.16b,v5.4b[2]\n\t"\ + ""IDOT" %[cq24].4s,v2.16b,v5.4b[3]\n\t"\ + "4:\n\t"\ + :[cq01]"=w"(cq01),[cq02]"=w"(cq02),[cq03]"=w"(cq03),[cq04]"=w"(cq04),\ + [cq05]"=w"(cq05),[cq06]"=w"(cq06),[cq07]"=w"(cq07),[cq08]"=w"(cq08),\ + [cq09]"=w"(cq09),[cq10]"=w"(cq10),[cq11]"=w"(cq11),[cq12]"=w"(cq12),\ + [cq13]"=w"(cq13),[cq14]"=w"(cq14),[cq15]"=w"(cq15),[cq16]"=w"(cq16),\ + [cq17]"=w"(cq17),[cq18]"=w"(cq18),[cq19]"=w"(cq19),[cq20]"=w"(cq20),\ + [cq21]"=w"(cq21),[cq22]"=w"(cq22),[cq23]"=w"(cq23),[cq24]"=w"(cq24),\ + [kdiv4_left]"+r"(kdiv4_left),[a_rd]"+r"(a_rd),[b_rd]"+r"(b_rd)\ + ::"cc","memory","x0","x1","v0","v1","v2","v4","v5","v6","v7"); + +#define SAVE_M12N8 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M12N2(cq01, cq02, cq03, cq04, cq05, cq06)\ + UNIT_SAVE_M12N2(cq07, cq08, cq09, cq10, cq11, cq12)\ + UNIT_SAVE_M12N2(cq13, cq14, cq15, cq16, cq17, cq18)\ + UNIT_SAVE_M12N2(cq19, cq20, cq21, cq22, cq23, cq24) + +#define NEON_IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(mdim, ndim, srcint, dstint) \ +static inline void\ + inline_dualpack_gemm_a##srcint##_b##srcint##_c##dstint##_m##mdim##_n##ndim(\ + const srcint *a_head, const srcint *b_head, dstint *c_ptr,\ + uint32_t K, dstint beta, uint32_t ldc) {\ + KERNEL_M##mdim##N##ndim\ + SAVE_M##mdim##N##ndim\ +} + +#define IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(mdim, ndim, srcint, dstint)\ + NEON_IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(mdim, ndim, srcint, dstint) + + +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 1, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 2, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 1, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 2, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 4, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 4, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 1, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 2, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 4, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 8, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 8, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 8, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 1, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 2, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 4, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 8, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 12, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 12, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 12, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 1, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 2, I32, I32) +IDOTGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 4, I32, I32) + +#endif diff --git a/include/neon_armv8a/I8I32MlaGemmKernel.h b/include/neon_armv8a/I8I32MlaGemmKernel.h new file mode 100644 index 0000000..5ed9fe6 --- /dev/null +++ b/include/neon_armv8a/I8I32MlaGemmKernel.h @@ -0,0 +1,378 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. 
*/ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "arm_neon/NeonI8I32MlaGemmKernel.h" + +#ifndef INCLUDE_ARMV8A_I8I32MLA_ASM_KERNEL +#define INCLUDE_ARMV8A_I8I32MLA_ASM_KERNEL + +static inline void pref_c_8(const I32 *c) { + __asm__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#32]\n\t"::"r"(c):); +} + +static inline void pref_c_12(const I32 *c) { + __asm__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#48]\n\t"::"r"(c):); +} + +#define KERNEL_M8N12 \ + const I32 *c_pref = c_ptr;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref); c_pref += ldc;\ + pref_c_8(c_pref);\ + COMMON_KERNEL_HEADER(a_head, b_head)\ + I32X4 cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + I32X4 cq09, cq10, cq11, cq12, cq13, cq14, cq15, cq16;\ + I32X4 cq17, cq18, cq19, cq20, cq21, cq22, cq23, cq24;\ + __asm__ __volatile__(\ + "movi %[cq01].16b,#0; movi %[cq02].16b,#0\n\t"\ + "movi %[cq03].16b,#0; movi %[cq04].16b,#0\n\t"\ + "movi %[cq05].16b,#0; movi %[cq06].16b,#0\n\t"\ + "movi %[cq07].16b,#0; movi %[cq08].16b,#0\n\t"\ + "movi %[cq09].16b,#0; movi %[cq10].16b,#0\n\t"\ + "movi %[cq11].16b,#0; movi %[cq12].16b,#0\n\t"\ + "movi %[cq13].16b,#0; movi %[cq14].16b,#0\n\t"\ + "movi %[cq15].16b,#0; movi %[cq16].16b,#0\n\t"\ + "movi %[cq17].16b,#0; movi %[cq18].16b,#0\n\t"\ + "movi %[cq19].16b,#0; movi %[cq20].16b,#0\n\t"\ + "movi %[cq21].16b,#0; movi %[cq22].16b,#0\n\t"\ + "movi %[cq23].16b,#0; movi %[cq24].16b,#0\n\t"\ + "cmp %w[k_left],#1; b.lt 4f\n\t"\ + "ldr q0,[%[a_ptr]],#16\n\t"\ + "ldr q2,[%[b_ptr]]; ldr d3,[%[b_ptr],#16]; add %[b_ptr],%[b_ptr],#24\n\t"\ + "cmp %w[k_left],#3; b.lt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + ""IMLAL" %[cq01].4s,v0.4h,v2.h[0]; ldr x0,[%[b_ptr]],#48\n\t"\ + ""IMLAL2" %[cq02].4s,v0.8h,v2.h[0]\n\t"\ + ""IMLAL" %[cq03].4s,v0.4h,v2.h[1]\n\t"\ + ""IMLAL2" %[cq04].4s,v0.8h,v2.h[1]\n\t"\ + ""IMLAL" %[cq05].4s,v0.4h,v2.h[2]\n\t"\ + ""IMLAL2" %[cq06].4s,v0.8h,v2.h[2]\n\t"\ + ""IMLAL" %[cq07].4s,v0.4h,v2.h[3]\n\t"\ + ""IMLAL2" %[cq08].4s,v0.8h,v2.h[3]\n\t"\ + "fmov v3.d[1],x0; ldr d1,[%[a_ptr]],#32\n\t"\ + ""IMLAL" %[cq09].4s,v0.4h,v2.h[4]\n\t"\ + ""IMLAL2" %[cq10].4s,v0.8h,v2.h[4]; ldr x0,[%[a_ptr],#-24]\n\t"\ + ""IMLAL" %[cq11].4s,v0.4h,v2.h[5]\n\t"\ + ""IMLAL2" %[cq12].4s,v0.8h,v2.h[5]\n\t"\ + ""IMLAL" %[cq13].4s,v0.4h,v2.h[6]\n\t"\ + ""IMLAL2" %[cq14].4s,v0.8h,v2.h[6]\n\t"\ + ""IMLAL" %[cq15].4s,v0.4h,v2.h[7]\n\t"\ + ""IMLAL2" %[cq16].4s,v0.8h,v2.h[7]\n\t"\ + "fmov v1.d[1],x0; ldr d4,[%[b_ptr],#-40]\n\t"\ + ""IMLAL" %[cq17].4s,v0.4h,v3.h[0]; ldr x0,[%[b_ptr],#-32]\n\t"\ + ""IMLAL2" %[cq18].4s,v0.8h,v3.h[0]\n\t"\ + ""IMLAL" 
%[cq19].4s,v0.4h,v3.h[1]\n\t"\ + ""IMLAL2" %[cq20].4s,v0.8h,v3.h[1]\n\t"\ + ""IMLAL" %[cq21].4s,v0.4h,v3.h[2]\n\t"\ + ""IMLAL2" %[cq22].4s,v0.8h,v3.h[2]\n\t"\ + ""IMLAL" %[cq23].4s,v0.4h,v3.h[3]\n\t"\ + ""IMLAL2" %[cq24].4s,v0.8h,v3.h[3]\n\t"\ + "fmov v4.d[1],x0; ldr d0,[%[a_ptr],#-16]\n\t"\ + ""IMLAL" %[cq01].4s,v1.4h,v3.h[4]; ldr x0,[%[a_ptr],#-8]\n\t"\ + ""IMLAL2" %[cq02].4s,v1.8h,v3.h[4]\n\t"\ + ""IMLAL" %[cq03].4s,v1.4h,v3.h[5]\n\t"\ + ""IMLAL2" %[cq04].4s,v1.8h,v3.h[5]\n\t"\ + ""IMLAL" %[cq05].4s,v1.4h,v3.h[6]\n\t"\ + ""IMLAL2" %[cq06].4s,v1.8h,v3.h[6]\n\t"\ + ""IMLAL" %[cq07].4s,v1.4h,v3.h[7]\n\t"\ + ""IMLAL2" %[cq08].4s,v1.8h,v3.h[7]\n\t"\ + "fmov v0.d[1],x0; ldr d2,[%[b_ptr],#-24]\n\t"\ + ""IMLAL" %[cq09].4s,v1.4h,v4.h[0]; ldr x0,[%[b_ptr],#-16]\n\t"\ + ""IMLAL2" %[cq10].4s,v1.8h,v4.h[0]\n\t"\ + ""IMLAL" %[cq11].4s,v1.4h,v4.h[1]\n\t"\ + ""IMLAL2" %[cq12].4s,v1.8h,v4.h[1]\n\t"\ + ""IMLAL" %[cq13].4s,v1.4h,v4.h[2]\n\t"\ + ""IMLAL2" %[cq14].4s,v1.8h,v4.h[2]\n\t"\ + ""IMLAL" %[cq15].4s,v1.4h,v4.h[3]\n\t"\ + ""IMLAL2" %[cq16].4s,v1.8h,v4.h[3]\n\t"\ + "fmov v2.d[1],x0; ldr d3,[%[b_ptr],#-8]\n\t"\ + ""IMLAL" %[cq17].4s,v1.4h,v4.h[4]\n\t"\ + ""IMLAL2" %[cq18].4s,v1.8h,v4.h[4]\n\t"\ + ""IMLAL" %[cq19].4s,v1.4h,v4.h[5]\n\t"\ + ""IMLAL2" %[cq20].4s,v1.8h,v4.h[5]; sub %w[k_left],%w[k_left],#2\n\t"\ + ""IMLAL" %[cq21].4s,v1.4h,v4.h[6]\n\t"\ + ""IMLAL2" %[cq22].4s,v1.8h,v4.h[6]; cmp %w[k_left],#3\n\t"\ + ""IMLAL" %[cq23].4s,v1.4h,v4.h[7]\n\t"\ + ""IMLAL2" %[cq24].4s,v1.8h,v4.h[7]; b.ge 1b\n\t"\ + "2:\n\t"\ + "cmp %w[k_left],#2; b.lt 3f\n\t"\ + ""IMLAL" %[cq01].4s,v0.4h,v2.h[0]; ldr x0,[%[b_ptr]],#24\n\t"\ + ""IMLAL2" %[cq02].4s,v0.8h,v2.h[0]\n\t"\ + ""IMLAL" %[cq03].4s,v0.4h,v2.h[1]\n\t"\ + ""IMLAL2" %[cq04].4s,v0.8h,v2.h[1]\n\t"\ + ""IMLAL" %[cq05].4s,v0.4h,v2.h[2]\n\t"\ + ""IMLAL2" %[cq06].4s,v0.8h,v2.h[2]\n\t"\ + ""IMLAL" %[cq07].4s,v0.4h,v2.h[3]\n\t"\ + ""IMLAL2" %[cq08].4s,v0.8h,v2.h[3]\n\t"\ + "fmov v3.d[1],x0; ldr d1,[%[a_ptr]],#16\n\t"\ + ""IMLAL" %[cq09].4s,v0.4h,v2.h[4]\n\t"\ + ""IMLAL2" %[cq10].4s,v0.8h,v2.h[4]; ldr x0,[%[a_ptr],#-8]\n\t"\ + ""IMLAL" %[cq11].4s,v0.4h,v2.h[5]\n\t"\ + ""IMLAL2" %[cq12].4s,v0.8h,v2.h[5]\n\t"\ + ""IMLAL" %[cq13].4s,v0.4h,v2.h[6]\n\t"\ + ""IMLAL2" %[cq14].4s,v0.8h,v2.h[6]\n\t"\ + ""IMLAL" %[cq15].4s,v0.4h,v2.h[7]\n\t"\ + ""IMLAL2" %[cq16].4s,v0.8h,v2.h[7]\n\t"\ + "fmov v1.d[1],x0; ldr d4,[%[b_ptr],#-16]\n\t"\ + ""IMLAL" %[cq17].4s,v0.4h,v3.h[0]; ldr x0,[%[b_ptr],#-8]\n\t"\ + ""IMLAL2" %[cq18].4s,v0.8h,v3.h[0]\n\t"\ + ""IMLAL" %[cq19].4s,v0.4h,v3.h[1]\n\t"\ + ""IMLAL2" %[cq20].4s,v0.8h,v3.h[1]\n\t"\ + ""IMLAL" %[cq21].4s,v0.4h,v3.h[2]\n\t"\ + ""IMLAL2" %[cq22].4s,v0.8h,v3.h[2]\n\t"\ + ""IMLAL" %[cq23].4s,v0.4h,v3.h[3]\n\t"\ + ""IMLAL2" %[cq24].4s,v0.8h,v3.h[3]\n\t"\ + "fmov v4.d[1],x0\n\t"\ + ""IMLAL" %[cq01].4s,v1.4h,v3.h[4]\n\t"\ + ""IMLAL2" %[cq02].4s,v1.8h,v3.h[4]\n\t"\ + ""IMLAL" %[cq03].4s,v1.4h,v3.h[5]\n\t"\ + ""IMLAL2" %[cq04].4s,v1.8h,v3.h[5]\n\t"\ + ""IMLAL" %[cq05].4s,v1.4h,v3.h[6]\n\t"\ + ""IMLAL2" %[cq06].4s,v1.8h,v3.h[6]\n\t"\ + ""IMLAL" %[cq07].4s,v1.4h,v3.h[7]\n\t"\ + ""IMLAL2" %[cq08].4s,v1.8h,v3.h[7]\n\t"\ + ""IMLAL" %[cq09].4s,v1.4h,v4.h[0]\n\t"\ + ""IMLAL2" %[cq10].4s,v1.8h,v4.h[0]\n\t"\ + ""IMLAL" %[cq11].4s,v1.4h,v4.h[1]\n\t"\ + ""IMLAL2" %[cq12].4s,v1.8h,v4.h[1]\n\t"\ + ""IMLAL" %[cq13].4s,v1.4h,v4.h[2]\n\t"\ + ""IMLAL2" %[cq14].4s,v1.8h,v4.h[2]\n\t"\ + ""IMLAL" %[cq15].4s,v1.4h,v4.h[3]\n\t"\ + ""IMLAL2" %[cq16].4s,v1.8h,v4.h[3]\n\t"\ + ""IMLAL" %[cq17].4s,v1.4h,v4.h[4]\n\t"\ + ""IMLAL2" %[cq18].4s,v1.8h,v4.h[4]\n\t"\ + ""IMLAL" 
%[cq19].4s,v1.4h,v4.h[5]\n\t"\ + ""IMLAL2" %[cq20].4s,v1.8h,v4.h[5]; sub %w[k_left],%w[k_left],#2\n\t"\ + ""IMLAL" %[cq21].4s,v1.4h,v4.h[6]\n\t"\ + ""IMLAL2" %[cq22].4s,v1.8h,v4.h[6]\n\t"\ + ""IMLAL" %[cq23].4s,v1.4h,v4.h[7]\n\t"\ + ""IMLAL2" %[cq24].4s,v1.8h,v4.h[7]; b 4f\n\t"\ + "3:\n\t"\ + ""IMLAL" %[cq01].4s,v0.4h,v2.h[0]; "IMLAL2" %[cq02].4s,v0.8h,v2.h[0]\n\t"\ + ""IMLAL" %[cq03].4s,v0.4h,v2.h[1]; "IMLAL2" %[cq04].4s,v0.8h,v2.h[1]\n\t"\ + ""IMLAL" %[cq05].4s,v0.4h,v2.h[2]; "IMLAL2" %[cq06].4s,v0.8h,v2.h[2]\n\t"\ + ""IMLAL" %[cq07].4s,v0.4h,v2.h[3]; "IMLAL2" %[cq08].4s,v0.8h,v2.h[3]\n\t"\ + ""IMLAL" %[cq09].4s,v0.4h,v2.h[4]; "IMLAL2" %[cq10].4s,v0.8h,v2.h[4]\n\t"\ + ""IMLAL" %[cq11].4s,v0.4h,v2.h[5]; "IMLAL2" %[cq12].4s,v0.8h,v2.h[5]\n\t"\ + ""IMLAL" %[cq13].4s,v0.4h,v2.h[6]; "IMLAL2" %[cq14].4s,v0.8h,v2.h[6]\n\t"\ + ""IMLAL" %[cq15].4s,v0.4h,v2.h[7]; "IMLAL2" %[cq16].4s,v0.8h,v2.h[7]\n\t"\ + ""IMLAL" %[cq17].4s,v0.4h,v3.h[0]; "IMLAL2" %[cq18].4s,v0.8h,v3.h[0]\n\t"\ + ""IMLAL" %[cq19].4s,v0.4h,v3.h[1]; "IMLAL2" %[cq20].4s,v0.8h,v3.h[1]\n\t"\ + ""IMLAL" %[cq21].4s,v0.4h,v3.h[2]; "IMLAL2" %[cq22].4s,v0.8h,v3.h[2]\n\t"\ + ""IMLAL" %[cq23].4s,v0.4h,v3.h[3]; "IMLAL2" %[cq24].4s,v0.8h,v3.h[3]\n\t"\ + "sub %w[k_left],%w[k_left],#1\n\t"\ + "4:\n\t"\ + :[a_ptr]"+r"(a_ptr), [b_ptr]"+r"(b_ptr), [k_left]"+r"(k_left),\ + [cq01]"=w"(cq01), [cq02]"=w"(cq02), [cq03]"=w"(cq03), [cq04]"=w"(cq04),\ + [cq05]"=w"(cq05), [cq06]"=w"(cq06), [cq07]"=w"(cq07), [cq08]"=w"(cq08),\ + [cq09]"=w"(cq09), [cq10]"=w"(cq10), [cq11]"=w"(cq11), [cq12]"=w"(cq12),\ + [cq13]"=w"(cq13), [cq14]"=w"(cq14), [cq15]"=w"(cq15), [cq16]"=w"(cq16),\ + [cq17]"=w"(cq17), [cq18]"=w"(cq18), [cq19]"=w"(cq19), [cq20]"=w"(cq20),\ + [cq21]"=w"(cq21), [cq22]"=w"(cq22), [cq23]"=w"(cq23), [cq24]"=w"(cq24)\ + ::"cc","memory","x0","v0","v1","v2","v3","v4"); + +#define SAVE_M8N12 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M8N2(cq01, cq02, cq03, cq04)\ + UNIT_SAVE_M8N2(cq05, cq06, cq07, cq08)\ + UNIT_SAVE_M8N2(cq09, cq10, cq11, cq12)\ + UNIT_SAVE_M8N2(cq13, cq14, cq15, cq16)\ + UNIT_SAVE_M8N2(cq17, cq18, cq19, cq20)\ + UNIT_SAVE_M8N2(cq21, cq22, cq23, cq24) + +#define KERNEL_M12N8 \ + const I32 *c_pref = c_ptr;\ + pref_c_12(c_pref); c_pref += ldc;\ + pref_c_12(c_pref); c_pref += ldc;\ + pref_c_12(c_pref); c_pref += ldc;\ + pref_c_12(c_pref); c_pref += ldc;\ + pref_c_12(c_pref); c_pref += ldc;\ + pref_c_12(c_pref); c_pref += ldc;\ + pref_c_12(c_pref); c_pref += ldc;\ + pref_c_12(c_pref);\ + COMMON_KERNEL_HEADER(a_head, b_head)\ + I32X4 cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + I32X4 cq09, cq10, cq11, cq12, cq13, cq14, cq15, cq16;\ + I32X4 cq17, cq18, cq19, cq20, cq21, cq22, cq23, cq24;\ + __asm__ __volatile__(\ + "movi %[cq01].16b,#0; movi %[cq02].16b,#0\n\t"\ + "movi %[cq03].16b,#0; movi %[cq04].16b,#0\n\t"\ + "movi %[cq05].16b,#0; movi %[cq06].16b,#0\n\t"\ + "movi %[cq07].16b,#0; movi %[cq08].16b,#0\n\t"\ + "movi %[cq09].16b,#0; movi %[cq10].16b,#0\n\t"\ + "movi %[cq11].16b,#0; movi %[cq12].16b,#0\n\t"\ + "movi %[cq13].16b,#0; movi %[cq14].16b,#0\n\t"\ + "movi %[cq15].16b,#0; movi %[cq16].16b,#0\n\t"\ + "movi %[cq17].16b,#0; movi %[cq18].16b,#0\n\t"\ + "movi %[cq19].16b,#0; movi %[cq20].16b,#0\n\t"\ + "movi %[cq21].16b,#0; movi %[cq22].16b,#0\n\t"\ + "movi %[cq23].16b,#0; movi %[cq24].16b,#0\n\t"\ + "cmp %w[k_left],#1; b.lt 4f\n\t"\ + "ldr q0,[%[a_ptr]]; ldr d1,[%[a_ptr],#16]; add %[a_ptr],%[a_ptr],#24\n\t"\ + "ldr q3,[%[b_ptr]],#16\n\t"\ + "cmp %w[k_left],#3; b.lt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + ""IMLAL" %[cq01].4s,v0.4h,v3.h[0]; ldr 
x0,[%[a_ptr]],#48\n\t"\ + ""IMLAL" %[cq04].4s,v0.4h,v3.h[1]\n\t"\ + ""IMLAL" %[cq07].4s,v0.4h,v3.h[2]\n\t"\ + ""IMLAL" %[cq10].4s,v0.4h,v3.h[3]\n\t"\ + ""IMLAL" %[cq13].4s,v0.4h,v3.h[4]\n\t"\ + ""IMLAL" %[cq16].4s,v0.4h,v3.h[5]\n\t"\ + ""IMLAL" %[cq19].4s,v0.4h,v3.h[6]\n\t"\ + ""IMLAL" %[cq22].4s,v0.4h,v3.h[7]\n\t"\ + "fmov v1.d[1],x0; ldr d4,[%[b_ptr]],#32\n\t"\ + ""IMLAL2" %[cq02].4s,v0.8h,v3.h[0]\n\t"\ + ""IMLAL2" %[cq05].4s,v0.8h,v3.h[1]; ldr x0,[%[b_ptr],#-24]\n\t"\ + ""IMLAL2" %[cq08].4s,v0.8h,v3.h[2]\n\t"\ + ""IMLAL2" %[cq11].4s,v0.8h,v3.h[3]\n\t"\ + ""IMLAL2" %[cq14].4s,v0.8h,v3.h[4]\n\t"\ + ""IMLAL2" %[cq17].4s,v0.8h,v3.h[5]\n\t"\ + ""IMLAL2" %[cq20].4s,v0.8h,v3.h[6]\n\t"\ + ""IMLAL2" %[cq23].4s,v0.8h,v3.h[7]\n\t"\ + "fmov v4.d[1],x0; ldr d2,[%[a_ptr],#-40]\n\t"\ + ""IMLAL" %[cq03].4s,v1.4h,v3.h[0]; ldr x0,[%[a_ptr],#-32]\n\t"\ + ""IMLAL" %[cq06].4s,v1.4h,v3.h[1]\n\t"\ + ""IMLAL" %[cq09].4s,v1.4h,v3.h[2]\n\t"\ + ""IMLAL" %[cq12].4s,v1.4h,v3.h[3]\n\t"\ + ""IMLAL" %[cq15].4s,v1.4h,v3.h[4]\n\t"\ + ""IMLAL" %[cq18].4s,v1.4h,v3.h[5]\n\t"\ + ""IMLAL" %[cq21].4s,v1.4h,v3.h[6]\n\t"\ + ""IMLAL" %[cq24].4s,v1.4h,v3.h[7]\n\t"\ + "fmov v2.d[1],x0; ldr d3,[%[b_ptr],#-16]\n\t"\ + ""IMLAL2" %[cq01].4s,v1.8h,v4.h[0]; ldr x0,[%[b_ptr],#-8]\n\t"\ + ""IMLAL2" %[cq04].4s,v1.8h,v4.h[1]\n\t"\ + ""IMLAL2" %[cq07].4s,v1.8h,v4.h[2]\n\t"\ + ""IMLAL2" %[cq10].4s,v1.8h,v4.h[3]\n\t"\ + ""IMLAL2" %[cq13].4s,v1.8h,v4.h[4]\n\t"\ + ""IMLAL2" %[cq16].4s,v1.8h,v4.h[5]\n\t"\ + ""IMLAL2" %[cq19].4s,v1.8h,v4.h[6]\n\t"\ + ""IMLAL2" %[cq22].4s,v1.8h,v4.h[7]\n\t"\ + "fmov v3.d[1],x0; ldr d0,[%[a_ptr],#-24]\n\t"\ + ""IMLAL" %[cq02].4s,v2.4h,v4.h[0]; ldr x0,[%[a_ptr],#-16]\n\t"\ + ""IMLAL" %[cq05].4s,v2.4h,v4.h[1]\n\t"\ + ""IMLAL" %[cq08].4s,v2.4h,v4.h[2]\n\t"\ + ""IMLAL" %[cq11].4s,v2.4h,v4.h[3]\n\t"\ + ""IMLAL" %[cq14].4s,v2.4h,v4.h[4]\n\t"\ + ""IMLAL" %[cq17].4s,v2.4h,v4.h[5]\n\t"\ + ""IMLAL" %[cq20].4s,v2.4h,v4.h[6]\n\t"\ + ""IMLAL" %[cq23].4s,v2.4h,v4.h[7]\n\t"\ + "fmov v0.d[1],x0; ldr d1,[%[a_ptr],#-8]\n\t"\ + ""IMLAL2" %[cq03].4s,v2.8h,v4.h[0]\n\t"\ + ""IMLAL2" %[cq06].4s,v2.8h,v4.h[1]\n\t"\ + ""IMLAL2" %[cq09].4s,v2.8h,v4.h[2]\n\t"\ + ""IMLAL2" %[cq12].4s,v2.8h,v4.h[3]; sub %w[k_left],%w[k_left],#2\n\t"\ + ""IMLAL2" %[cq15].4s,v2.8h,v4.h[4]\n\t"\ + ""IMLAL2" %[cq18].4s,v2.8h,v4.h[5]; cmp %w[k_left],#3\n\t"\ + ""IMLAL2" %[cq21].4s,v2.8h,v4.h[6]\n\t"\ + ""IMLAL2" %[cq24].4s,v2.8h,v4.h[7]; b.ge 1b\n\t"\ + "2:\n\t"\ + "cmp %w[k_left],#2; b.lt 3f\n\t"\ + ""IMLAL" %[cq01].4s,v0.4h,v3.h[0]; ldr x0,[%[a_ptr]],#24\n\t"\ + ""IMLAL" %[cq04].4s,v0.4h,v3.h[1]\n\t"\ + ""IMLAL" %[cq07].4s,v0.4h,v3.h[2]\n\t"\ + ""IMLAL" %[cq10].4s,v0.4h,v3.h[3]\n\t"\ + ""IMLAL" %[cq13].4s,v0.4h,v3.h[4]\n\t"\ + ""IMLAL" %[cq16].4s,v0.4h,v3.h[5]\n\t"\ + ""IMLAL" %[cq19].4s,v0.4h,v3.h[6]\n\t"\ + ""IMLAL" %[cq22].4s,v0.4h,v3.h[7]\n\t"\ + "fmov v1.d[1],x0; ldr d4,[%[b_ptr]],#16\n\t"\ + ""IMLAL2" %[cq02].4s,v0.8h,v3.h[0]\n\t"\ + ""IMLAL2" %[cq05].4s,v0.8h,v3.h[1]; ldr x0,[%[b_ptr],#-8]\n\t"\ + ""IMLAL2" %[cq08].4s,v0.8h,v3.h[2]\n\t"\ + ""IMLAL2" %[cq11].4s,v0.8h,v3.h[3]\n\t"\ + ""IMLAL2" %[cq14].4s,v0.8h,v3.h[4]\n\t"\ + ""IMLAL2" %[cq17].4s,v0.8h,v3.h[5]\n\t"\ + ""IMLAL2" %[cq20].4s,v0.8h,v3.h[6]\n\t"\ + ""IMLAL2" %[cq23].4s,v0.8h,v3.h[7]\n\t"\ + "fmov v4.d[1],x0; ldr d2,[%[a_ptr],#-16]\n\t"\ + ""IMLAL" %[cq03].4s,v1.4h,v3.h[0]; ldr x0,[%[a_ptr],#-8]\n\t"\ + ""IMLAL" %[cq06].4s,v1.4h,v3.h[1]\n\t"\ + ""IMLAL" %[cq09].4s,v1.4h,v3.h[2]\n\t"\ + ""IMLAL" %[cq12].4s,v1.4h,v3.h[3]\n\t"\ + ""IMLAL" %[cq15].4s,v1.4h,v3.h[4]\n\t"\ + ""IMLAL" 
%[cq18].4s,v1.4h,v3.h[5]\n\t"\ + ""IMLAL" %[cq21].4s,v1.4h,v3.h[6]\n\t"\ + ""IMLAL" %[cq24].4s,v1.4h,v3.h[7]\n\t"\ + "fmov v2.d[1],x0\n\t"\ + ""IMLAL2" %[cq01].4s,v1.8h,v4.h[0]\n\t"\ + ""IMLAL2" %[cq04].4s,v1.8h,v4.h[1]\n\t"\ + ""IMLAL2" %[cq07].4s,v1.8h,v4.h[2]\n\t"\ + ""IMLAL2" %[cq10].4s,v1.8h,v4.h[3]\n\t"\ + ""IMLAL2" %[cq13].4s,v1.8h,v4.h[4]\n\t"\ + ""IMLAL2" %[cq16].4s,v1.8h,v4.h[5]\n\t"\ + ""IMLAL2" %[cq19].4s,v1.8h,v4.h[6]\n\t"\ + ""IMLAL2" %[cq22].4s,v1.8h,v4.h[7]\n\t"\ + ""IMLAL" %[cq02].4s,v2.4h,v4.h[0]\n\t"\ + ""IMLAL" %[cq05].4s,v2.4h,v4.h[1]\n\t"\ + ""IMLAL" %[cq08].4s,v2.4h,v4.h[2]\n\t"\ + ""IMLAL" %[cq11].4s,v2.4h,v4.h[3]\n\t"\ + ""IMLAL" %[cq14].4s,v2.4h,v4.h[4]\n\t"\ + ""IMLAL" %[cq17].4s,v2.4h,v4.h[5]\n\t"\ + ""IMLAL" %[cq20].4s,v2.4h,v4.h[6]\n\t"\ + ""IMLAL" %[cq23].4s,v2.4h,v4.h[7]\n\t"\ + ""IMLAL2" %[cq03].4s,v2.8h,v4.h[0]\n\t"\ + ""IMLAL2" %[cq06].4s,v2.8h,v4.h[1]\n\t"\ + ""IMLAL2" %[cq09].4s,v2.8h,v4.h[2]\n\t"\ + ""IMLAL2" %[cq12].4s,v2.8h,v4.h[3]; sub %w[k_left],%w[k_left],#2\n\t"\ + ""IMLAL2" %[cq15].4s,v2.8h,v4.h[4]\n\t"\ + ""IMLAL2" %[cq18].4s,v2.8h,v4.h[5]\n\t"\ + ""IMLAL2" %[cq21].4s,v2.8h,v4.h[6]\n\t"\ + ""IMLAL2" %[cq24].4s,v2.8h,v4.h[7]; b 4f\n\t"\ + "3:\n\t"\ + ""IMLAL" %[cq01].4s,v0.4h,v3.h[0]; "IMLAL" %[cq04].4s,v0.4h,v3.h[1]\n\t"\ + ""IMLAL" %[cq07].4s,v0.4h,v3.h[2]; "IMLAL" %[cq10].4s,v0.4h,v3.h[3]\n\t"\ + ""IMLAL" %[cq13].4s,v0.4h,v3.h[4]; "IMLAL" %[cq16].4s,v0.4h,v3.h[5]\n\t"\ + ""IMLAL" %[cq19].4s,v0.4h,v3.h[6]; "IMLAL" %[cq22].4s,v0.4h,v3.h[7]\n\t"\ + ""IMLAL2" %[cq02].4s,v0.8h,v3.h[0]; "IMLAL2" %[cq05].4s,v0.8h,v3.h[1]\n\t"\ + ""IMLAL2" %[cq08].4s,v0.8h,v3.h[2]; "IMLAL2" %[cq11].4s,v0.8h,v3.h[3]\n\t"\ + ""IMLAL2" %[cq14].4s,v0.8h,v3.h[4]; "IMLAL2" %[cq17].4s,v0.8h,v3.h[5]\n\t"\ + ""IMLAL2" %[cq20].4s,v0.8h,v3.h[6]; "IMLAL2" %[cq23].4s,v0.8h,v3.h[7]\n\t"\ + ""IMLAL" %[cq03].4s,v1.4h,v3.h[0]; "IMLAL" %[cq06].4s,v1.4h,v3.h[1]\n\t"\ + ""IMLAL" %[cq09].4s,v1.4h,v3.h[2]; "IMLAL" %[cq12].4s,v1.4h,v3.h[3]\n\t"\ + ""IMLAL" %[cq15].4s,v1.4h,v3.h[4]; "IMLAL" %[cq18].4s,v1.4h,v3.h[5]\n\t"\ + ""IMLAL" %[cq21].4s,v1.4h,v3.h[6]; "IMLAL" %[cq24].4s,v1.4h,v3.h[7]\n\t"\ + "sub %w[k_left],%w[k_left],#1\n\t"\ + "4:\n\t"\ + :[a_ptr]"+r"(a_ptr), [b_ptr]"+r"(b_ptr), [k_left]"+r"(k_left),\ + [cq01]"=w"(cq01), [cq02]"=w"(cq02), [cq03]"=w"(cq03), [cq04]"=w"(cq04),\ + [cq05]"=w"(cq05), [cq06]"=w"(cq06), [cq07]"=w"(cq07), [cq08]"=w"(cq08),\ + [cq09]"=w"(cq09), [cq10]"=w"(cq10), [cq11]"=w"(cq11), [cq12]"=w"(cq12),\ + [cq13]"=w"(cq13), [cq14]"=w"(cq14), [cq15]"=w"(cq15), [cq16]"=w"(cq16),\ + [cq17]"=w"(cq17), [cq18]"=w"(cq18), [cq19]"=w"(cq19), [cq20]"=w"(cq20),\ + [cq21]"=w"(cq21), [cq22]"=w"(cq22), [cq23]"=w"(cq23), [cq24]"=w"(cq24)\ + ::"cc","memory","x0","v0","v1","v2","v3","v4"); + +#define SAVE_M12N8 \ + I32 *c_tmp = c_ptr;\ + UNIT_SAVE_M12N2(cq01, cq02, cq03, cq04, cq05, cq06)\ + UNIT_SAVE_M12N2(cq07, cq08, cq09, cq10, cq11, cq12)\ + UNIT_SAVE_M12N2(cq13, cq14, cq15, cq16, cq17, cq18)\ + UNIT_SAVE_M12N2(cq19, cq20, cq21, cq22, cq23, cq24) + +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(12, 8, I16, I32) +IMLAGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 12, I16, I32) + +#endif diff --git a/include/neon_armv8a/I8I32MlaGemmSkinnyDot.h b/include/neon_armv8a/I8I32MlaGemmSkinnyDot.h new file mode 100644 index 0000000..049a1ca --- /dev/null +++ b/include/neon_armv8a/I8I32MlaGemmSkinnyDot.h @@ -0,0 +1,501 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. 
*/ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "arm_neon/NeonI8I32MlaGemmSkinnyDot.h" + +#ifndef INCLUDE_ARMV8_I8I32_SKINNYDOT_ASM +#define INCLUDE_ARMV8_I8I32_SKINNYDOT_ASM + +#define I8I32MLA_SKINNYDOT_INLINE_M4N1(gemm) \ +static inline void inline_##gemm##_arowmajor_bskinny_m4n1(\ + const I8 *a_ptr1, const I8 *b_ptr, I32 *c_ptr,\ + uint32_t k_left, uint32_t LDK, uint32_t LDM,\ + I32 beta, bool c_rowmajor) {\ +\ + const I8 *a_ptr2 = a_ptr1 + LDK;\ + const I8 *a_ptr3 = a_ptr1 + LDK * 2;\ + const I8 *a_ptr4 = a_ptr2 + LDK * 2;\ + I32X2 cd1, cd2;\ + const uint32_t next_pref = (LDK * 4 - k_left) + 16;\ + __asm__ __volatile__ (\ + "movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t"\ + "cmp %w[k_left],#16; b.lt 3f\n\t"\ + "ldr q0,[%[a_ptr1]],#16; ldr q1,[%[a_ptr2]],#16\n\t"\ + "ldr q2,[%[a_ptr3]],#16; ldr q3,[%[a_ptr4]],#16\n\t"\ + "ldr q4,[%[b_ptr]],#16\n\t"\ + "cmp %w[k_left],#32; b.lt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + ""IMULL" v12.8h,v0.8b,v4.8b; prfm pldl1keep,[%[a_ptr1],#64]\n\t"\ + ""IMULL" v13.8h,v1.8b,v4.8b; prfm pldl1keep,[%[a_ptr2],#64]\n\t"\ + ""IMULL" v14.8h,v2.8b,v4.8b; prfm pldl1keep,[%[a_ptr3],#64]\n\t"\ + ""IMULL" v15.8h,v3.8b,v4.8b; prfm pldl1keep,[%[a_ptr4],#64]\n\t"\ + ""IADALP" v8.4s,v12.8h; "IMULL"2 v12.8h,v0.16b,v4.16b\n\t"\ + "ldr q0,[%[a_ptr1]],#16\n\t"\ + ""IADALP" v9.4s,v13.8h; "IMULL"2 v13.8h,v1.16b,v4.16b\n\t"\ + "ldr q1,[%[a_ptr2]],#16\n\t"\ + ""IADALP" v10.4s,v14.8h; "IMULL"2 v14.8h,v2.16b,v4.16b\n\t"\ + "ldr q2,[%[a_ptr3]],#16\n\t"\ + ""IADALP" v11.4s,v15.8h; "IMULL"2 v15.8h,v3.16b,v4.16b\n\t"\ + "ldr q3,[%[a_ptr4]],#16\n\t"\ + "ldr q4,[%[b_ptr]],#16\n\t"\ + ""IADALP" v8.4s,v12.8h; sub %w[k_left],%w[k_left],#16\n\t"\ + ""IADALP" v9.4s,v13.8h\n\t"\ + ""IADALP" v10.4s,v14.8h; cmp %w[k_left],#32\n\t"\ + ""IADALP" v11.4s,v15.8h; b.ge 1b\n\t"\ + "2:\n\t"\ + ""IMULL" v12.8h,v0.8b,v4.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr1],%w[next_pref],SXTW #0]\n\t"\ + ""IMULL" v13.8h,v1.8b,v4.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr2],%w[next_pref],SXTW #0]\n\t"\ + ""IMULL" v14.8h,v2.8b,v4.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr3],%w[next_pref],SXTW #0]\n\t"\ + ""IMULL" v15.8h,v3.8b,v4.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr4],%w[next_pref],SXTW #0]\n\t"\ + ""IADALP" v8.4s,v12.8h; "IMULL"2 v12.8h,v0.16b,v4.16b\n\t"\ + ""IADALP" v9.4s,v13.8h; "IMULL"2 v13.8h,v1.16b,v4.16b\n\t"\ + ""IADALP" v10.4s,v14.8h; "IMULL"2 v14.8h,v2.16b,v4.16b\n\t"\ + ""IADALP" v11.4s,v15.8h; "IMULL"2 v15.8h,v3.16b,v4.16b\n\t"\ + ""IADALP" v8.4s,v12.8h; "IADALP" v9.4s,v13.8h\n\t"\ + "sub %w[k_left],%w[k_left],#16\n\t"\ + ""IADALP" v10.4s,v14.8h; "IADALP" v11.4s,v15.8h\n\t"\ + "3:\n\t"\ + "cmp %w[k_left],#8; b.lt 4f\n\t"\ + "ldr d0,[%[a_ptr1]],#8; ldr d1,[%[a_ptr2]],#8\n\t"\ + "ldr d2,[%[a_ptr3]],#8; ldr d3,[%[a_ptr4]],#8\n\t"\ + "ldr d4,[%[b_ptr]],#8; sub %w[k_left],%w[k_left],#8\n\t"\ + ""IMULL" v12.8h,v0.8b,v4.8b; "IMULL" v13.8h,v1.8b,v4.8b\n\t"\ + ""IMULL" 
v14.8h,v2.8b,v4.8b; "IMULL" v15.8h,v3.8b,v4.8b\n\t"\ + ""IADALP" v8.4s,v12.8h; "IADALP" v9.4s,v13.8h\n\t"\ + ""IADALP" v10.4s,v14.8h; "IADALP" v11.4s,v15.8h\n\t"\ + "4:\n\t"\ + "movi v12.16b,#0\n\t"\ + "addp v8.4s,v8.4s,v12.4s; addp v9.4s,v9.4s,v12.4s\n\t"\ + "addp v10.4s,v10.4s,v12.4s; addp v11.4s,v11.4s,v12.4s\n\t"\ + "cmp %w[k_left],#4; b.lt 5f\n\t"\ + "ldr s0,[%[a_ptr1]],#4; ldr s1,[%[a_ptr2]],#4\n\t"\ + "ldr s2,[%[a_ptr3]],#4; ldr s3,[%[a_ptr4]],#4\n\t"\ + "ldr s4,[%[b_ptr]],#4; sub %w[k_left],%w[k_left],#4\n\t"\ + ""IMULL" v12.8h,v0.8b,v4.8b; "IMULL" v13.8h,v1.8b,v4.8b\n\t"\ + ""IMULL" v14.8h,v2.8b,v4.8b; "IMULL" v15.8h,v3.8b,v4.8b\n\t"\ + ""IADALP" v8.2s,v12.4h; "IADALP" v9.2s,v13.4h\n\t"\ + ""IADALP" v10.2s,v14.4h; "IADALP" v11.2s,v15.4h\n\t"\ + "5:\n\t"\ + "cmp %w[k_left],#2; b.lt 6f\n\t"\ + "ldr h0,[%[a_ptr1]],#2; ldr h1,[%[a_ptr2]],#2\n\t"\ + "ldr h2,[%[a_ptr3]],#2; ldr h3,[%[a_ptr4]],#2\n\t"\ + "ldr h4,[%[b_ptr]],#2; sub %w[k_left],%w[k_left],#2\n\t"\ + ""IXTL" v0.8h,v0.8b; "IXTL" v1.8h,v1.8b\n\t"\ + ""IXTL" v2.8h,v2.8b; "IXTL" v3.8h,v3.8b; "IXTL" v4.8h,v4.8b\n\t"\ + ""IMLAL" v8.4s,v0.4h,v4.4h; "IMLAL" v9.4s,v1.4h,v4.4h\n\t"\ + ""IMLAL" v10.4s,v2.4h,v4.4h; "IMLAL" v11.4s,v3.4h,v4.4h\n\t"\ + "6:\n\t"\ + "addp %[cd1].2s,v8.2s,v9.2s; addp %[cd2].2s,v10.2s,v11.2s\n\t"\ + "cmp %w[k_left],#1; b.lt 7f\n\t"\ + "ldr b0,[%[a_ptr1]],#1; ldr b1,[%[a_ptr2]],#1\n\t"\ + "ldr b2,[%[a_ptr3]],#1; ldr b3,[%[a_ptr4]],#1\n\t"\ + "ldr b4,[%[b_ptr]],#1; sub %w[k_left],%w[k_left],#1\n\t"\ + "ins v0.b[1],v1.b[0]; ins v2.b[1],v3.b[0]\n\t"\ + ""IXTL" v0.8h,v0.8b; "IXTL" v2.8h,v2.8b; "IXTL" v4.8h,v4.8b\n\t"\ + ""IMLAL" %[cd1].4s,v0.4h,v4.h[0]; "IMLAL" %[cd2].4s,v2.4h,v4.h[0]\n\t"\ + "7:\n\t"\ + :[cd1]"=w"(cd1), [cd2]"=w"(cd2), [k_left]"+r"(k_left), [b_ptr]"+r"(b_ptr),\ + [a_ptr1]"+r"(a_ptr1), [a_ptr2]"+r"(a_ptr2),\ + [a_ptr3]"+r"(a_ptr3), [a_ptr4]"+r"(a_ptr4)\ + :[next_pref]"r"(next_pref)\ + :"cc","memory","v0","v1","v2","v3","v4",\ + "v8","v9","v10","v11","v12","v13","v14","v15");\ +\ + cd1 = VMLA_N_I32(cd1, VLD1_I32(c_ptr), beta);\ + cd2 = VMLA_N_I32(cd2, VLD1_I32(c_ptr + 2), beta);\ + VST1_I32(c_ptr, cd1);\ + VST1_I32(c_ptr + 2, cd2);\ +} + +/* k_mask = 31 */ +#define I8I32MLA_SKINNYDOT_INLINE_M4N2(gemm) \ +static inline void inline_##gemm##_arowmajor_bskinny_m4n2(\ + const I8 *a_ptr1, const I8 *b_ptr, I32 *c_ptr,\ + uint32_t k_left, uint32_t LDK, uint32_t LDM,\ + I32 beta, bool c_rowmajor) {\ +\ + const I8 *a_ptr2 = a_ptr1 + LDK;\ + const I8 *a_ptr3 = a_ptr1 + LDK * 2;\ + const I8 *a_ptr4 = a_ptr2 + LDK * 2;\ + I32X4 cq1, cq2, cq3, cq4; /* higher 2 elements not used */\ + const uint32_t next_pref = (LDK * 4 - k_left) + 16;\ + __asm__ __volatile__ (\ + "movi v6.16b,#0; movi v7.16b,#0\n\t"\ + "movi v8.16b,#0; movi v9.16b,#0\n\t"\ + "movi v10.16b,#0; movi v11.16b,#0\n\t"\ + "movi v12.16b,#0; movi v13.16b,#0\n\t"\ + "cmp %w[k_left],#16; b.lt 3f\n\t"\ + "ldr q0,[%[a_ptr1]],#16; ldr q1,[%[a_ptr2]],#16\n\t"\ + "ldr q2,[%[a_ptr3]],#16; ldr q3,[%[a_ptr4]],#16\n\t"\ + "ldr q4,[%[b_ptr]]; ldr q5,[%[b_ptr],#16]; add %[b_ptr],%[b_ptr],#32\n\t"\ + "cmp %w[k_left],#32; b.lt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + ""IMULL" v14.8h,v0.8b,v4.8b; "IMULL" v18.8h,v0.8b,v5.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr1],#64]\n\t"\ + ""IMULL" v15.8h,v1.8b,v4.8b; "IMULL" v19.8h,v1.8b,v5.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr2],#64]\n\t"\ + ""IMULL" v16.8h,v2.8b,v4.8b; "IMULL" v20.8h,v2.8b,v5.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr3],#64]\n\t"\ + ""IMULL" v17.8h,v3.8b,v4.8b; "IMULL" v21.8h,v3.8b,v5.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr4],#64]\n\t"\ + 
""IADALP" v6.4s,v14.8h; "IMULL"2 v14.8h,v0.16b,v4.16b\n\t"\ + ""IADALP" v10.4s,v18.8h; "IMULL"2 v18.8h,v0.16b,v5.16b\n\t"\ + "ldr q0,[%[a_ptr1]],#16\n\t"\ + ""IADALP" v7.4s,v15.8h; "IMULL"2 v15.8h,v1.16b,v4.16b\n\t"\ + ""IADALP" v11.4s,v19.8h; "IMULL"2 v19.8h,v1.16b,v5.16b\n\t"\ + "ldr q1,[%[a_ptr2]],#16\n\t"\ + ""IADALP" v8.4s,v16.8h; "IMULL"2 v16.8h,v2.16b,v4.16b\n\t"\ + ""IADALP" v12.4s,v20.8h; "IMULL"2 v20.8h,v2.16b,v5.16b\n\t"\ + "ldr q2,[%[a_ptr3]],#16\n\t"\ + ""IADALP" v9.4s,v17.8h; "IMULL"2 v17.8h,v3.16b,v4.16b\n\t"\ + ""IADALP" v13.4s,v21.8h; "IMULL"2 v21.8h,v3.16b,v5.16b\n\t"\ + "ldr q3,[%[a_ptr4]],#16\n\t"\ + ""IADALP" v6.4s,v14.8h; "IADALP" v10.4s,v18.8h\n\t"\ + "ldr q4,[%[b_ptr]],#32\n\t"\ + ""IADALP" v7.4s,v15.8h; "IADALP" v11.4s,v19.8h\n\t"\ + "ldr q5,[%[b_ptr],#-16]\n\t"\ + ""IADALP" v8.4s,v16.8h; sub %w[k_left],%w[k_left],#16\n\t"\ + ""IADALP" v12.4s,v20.8h; cmp %w[k_left],#32\n\t"\ + ""IADALP" v9.4s,v17.8h; "IADALP" v13.4s,v21.8h\n\t"\ + "b.ge 1b\n\t"\ + "2:\n\t"\ + ""IMULL" v14.8h,v0.8b,v4.8b; "IMULL" v18.8h,v0.8b,v5.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr1],%w[next_pref],SXTW #0]\n\t"\ + ""IMULL" v15.8h,v1.8b,v4.8b; "IMULL" v19.8h,v1.8b,v5.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr2],%w[next_pref],SXTW #0]\n\t"\ + ""IMULL" v16.8h,v2.8b,v4.8b; "IMULL" v20.8h,v2.8b,v5.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr3],%w[next_pref],SXTW #0]\n\t"\ + ""IMULL" v17.8h,v3.8b,v4.8b; "IMULL" v21.8h,v3.8b,v5.8b\n\t"\ + "prfm pldl1keep,[%[a_ptr4],%w[next_pref],SXTW #0]\n\t"\ + ""IADALP" v6.4s,v14.8h; "IMULL"2 v14.8h,v0.16b,v4.16b\n\t"\ + ""IADALP" v10.4s,v18.8h; "IMULL"2 v18.8h,v0.16b,v5.16b\n\t"\ + ""IADALP" v7.4s,v15.8h; "IMULL"2 v15.8h,v1.16b,v4.16b\n\t"\ + ""IADALP" v11.4s,v19.8h; "IMULL"2 v19.8h,v1.16b,v5.16b\n\t"\ + ""IADALP" v8.4s,v16.8h; "IMULL"2 v16.8h,v2.16b,v4.16b\n\t"\ + ""IADALP" v12.4s,v20.8h; "IMULL"2 v20.8h,v2.16b,v5.16b\n\t"\ + ""IADALP" v9.4s,v17.8h; "IMULL"2 v17.8h,v3.16b,v4.16b\n\t"\ + ""IADALP" v13.4s,v21.8h; "IMULL"2 v21.8h,v3.16b,v5.16b\n\t"\ + ""IADALP" v6.4s,v14.8h; "IADALP" v10.4s,v18.8h\n\t"\ + ""IADALP" v7.4s,v15.8h; "IADALP" v11.4s,v19.8h\n\t"\ + ""IADALP" v8.4s,v16.8h; sub %w[k_left],%w[k_left],#16\n\t"\ + ""IADALP" v12.4s,v20.8h\n\t"\ + ""IADALP" v9.4s,v17.8h; "IADALP" v13.4s,v21.8h\n\t"\ + "3:\n\t"\ + "cmp %w[k_left],#8; b.lt 4f\n\t"\ + "ldr d0,[%[a_ptr1]],#8; ldr d1,[%[a_ptr2]],#8\n\t"\ + "ldr d2,[%[a_ptr3]],#8; ldr d3,[%[a_ptr4]],#8\n\t"\ + "ldr d4,[%[b_ptr]],#16; ldr d5,[%[b_ptr],#-8]\n\t"\ + ""IMULL" v14.8h,v0.8b,v4.8b; "IMULL" v18.8h,v0.8b,v5.8b\n\t"\ + ""IMULL" v15.8h,v1.8b,v4.8b; "IMULL" v19.8h,v1.8b,v5.8b\n\t"\ + ""IMULL" v16.8h,v2.8b,v4.8b; "IMULL" v20.8h,v2.8b,v5.8b\n\t"\ + ""IMULL" v17.8h,v3.8b,v4.8b; "IMULL" v21.8h,v3.8b,v5.8b\n\t"\ + "sub %w[k_left],%w[k_left],#8\n\t"\ + ""IADALP" v6.4s,v14.8h; "IADALP" v10.4s,v18.8h\n\t"\ + ""IADALP" v7.4s,v15.8h; "IADALP" v11.4s,v19.8h\n\t"\ + ""IADALP" v8.4s,v16.8h; "IADALP" v12.4s,v20.8h\n\t"\ + ""IADALP" v9.4s,v17.8h; "IADALP" v13.4s,v21.8h\n\t"\ + "4:\n\t"\ + "addp v6.4s,v6.4s,v10.4s; addp v7.4s,v7.4s,v11.4s\n\t"\ + "addp v8.4s,v8.4s,v12.4s; addp v9.4s,v9.4s,v13.4s\n\t"\ + "cmp %w[k_left],#4; b.lt 5f\n\t"\ + "ldr s4,[%[b_ptr]],#8; ldr s5,[%[b_ptr],#-4]\n\t"\ + "ld1r {v0.2s},[%[a_ptr1]],#4; ld1r {v1.2s},[%[a_ptr2]],#4\n\t"\ + "ins v4.s[1],v5.s[0]\n\t"\ + "ld1r {v2.2s},[%[a_ptr3]],#4; ld1r {v3.2s},[%[a_ptr4]],#4\n\t"\ + "sub %w[k_left],%w[k_left],#4\n\t"\ + ""IMULL" v14.8h,v0.8b,v4.8b; "IMULL" v15.8h,v1.8b,v4.8b\n\t"\ + ""IMULL" v16.8h,v2.8b,v4.8b; "IMULL" v17.8h,v3.8b,v4.8b\n\t"\ + ""IADALP" v6.4s,v14.8h; "IADALP" 
v7.4s,v15.8h\n\t"\ + ""IADALP" v8.4s,v16.8h; "IADALP" v9.4s,v17.8h\n\t"\ + "5:\n\t"\ + "movi v14.16b,#0\n\t"\ + "addp %[cq1].4s,v6.4s,v14.4s; addp %[cq2].4s,v7.4s,v14.4s\n\t"\ + "addp %[cq3].4s,v8.4s,v14.4s; addp %[cq4].4s,v9.4s,v14.4s\n\t"\ + "cmp %w[k_left],#2; b.lt 6f\n\t"\ + "ldr h4,[%[b_ptr]],#4; ldr h5,[%[b_ptr],#-2]\n\t"\ + "ld1r {v0.4h},[%[a_ptr1]],#2; ld1r {v1.4h},[%[a_ptr2]],#2\n\t"\ + "sub %w[k_left],%w[k_left],#2\n\t"\ + "ins v4.h[1],v5.h[0]\n\t"\ + "ld1r {v2.4h},[%[a_ptr3]],#2; ld1r {v3.4h},[%[a_ptr4]],#2\n\t"\ + ""IMULL" v14.8h,v0.8b,v4.8b; "IMULL" v15.8h,v1.8b,v4.8b\n\t"\ + ""IMULL" v16.8h,v2.8b,v4.8b; "IMULL" v17.8h,v3.8b,v4.8b\n\t"\ + ""IADALP" %[cq1].2s,v14.4h; "IADALP" %[cq2].2s,v15.4h\n\t"\ + ""IADALP" %[cq3].2s,v16.4h; "IADALP" %[cq4].2s,v17.4h\n\t"\ + "6:\n\t"\ + "cmp %w[k_left],#1; b.lt 7f\n\t"\ + "ldr b0,[%[a_ptr1]],#1; ldr b1,[%[a_ptr2]],#1\n\t"\ + "ldr b2,[%[a_ptr3]],#1; ldr b3,[%[a_ptr4]],#1\n\t"\ + "ldr b4,[%[b_ptr]],#2; ldr b5,[%[b_ptr],#-1]\n\t"\ + "ins v0.b[1],v1.b[0]; ins v2.b[1],v3.b[0]; ins v4.b[1],v5.b[0]\n\t"\ + ""IXTL" v0.8h,v0.8b; "IXTL" v2.8h,v2.8b; "IXTL" v4.8h,v4.8b\n\t"\ + "sub %w[k_left],%w[k_left],#1\n\t"\ + ""IMLAL" %[cq1].4s,v4.4h,v0.h[0]; "IMLAL" %[cq2].4s,v4.4h,v0.h[1]\n\t"\ + ""IMLAL" %[cq3].4s,v4.4h,v2.h[0]; "IMLAL" %[cq4].4s,v4.4h,v2.h[1]\n\t"\ + "7:\n\t"\ + :[cq1]"=w"(cq1), [cq2]"=w"(cq2), [cq3]"=w"(cq3), [cq4]"=w"(cq4),\ + [a_ptr1]"+r"(a_ptr1), [a_ptr2]"+r"(a_ptr2), [a_ptr3]"+r"(a_ptr3),\ + [a_ptr4]"+r"(a_ptr4), [b_ptr]"+r"(b_ptr),\ + [k_left]"+r"(k_left)\ + :[next_pref]"r"(next_pref)\ + :"cc","memory","v0","v1","v2","v3","v4","v5",\ + "v6","v7","v8","v9","v10","v11","v12","v13",\ + "v14","v15","v16","v17","v18","v19","v20","v21");\ +\ + I32X2 cd1 = VGET_LOW_I32(cq1);\ + I32X2 cd2 = VGET_LOW_I32(cq2);\ + I32X2 cd3 = VGET_LOW_I32(cq3);\ + I32X2 cd4 = VGET_LOW_I32(cq4);\ + if (c_rowmajor) {\ + cd1 = VMLA_N_I32(cd1, VLD1_I32(c_ptr), beta);\ + cd2 = VMLA_N_I32(cd2, VLD1_I32(c_ptr + 2), beta);\ + cd3 = VMLA_N_I32(cd3, VLD1_I32(c_ptr + 4), beta);\ + cd4 = VMLA_N_I32(cd4, VLD1_I32(c_ptr + 6), beta);\ + VST1_I32(c_ptr, cd1);\ + VST1_I32(c_ptr + 2, cd2);\ + VST1_I32(c_ptr + 4, cd3);\ + VST1_I32(c_ptr + 6, cd4);\ + } else {\ + I32 *c_ptr2 = c_ptr + LDM;\ + I32X2 cdl1 = VZIP1_I32(cd1, cd2);\ + I32X2 cdl2 = VZIP1_I32(cd3, cd4);\ + I32X2 cdl3 = VZIP2_I32(cd1, cd2);\ + I32X2 cdl4 = VZIP2_I32(cd3, cd4);\ + cdl1 = VMLA_N_I32(cdl1, VLD1_I32(c_ptr), beta);\ + cdl2 = VMLA_N_I32(cdl2, VLD1_I32(c_ptr + 2), beta);\ + cdl3 = VMLA_N_I32(cdl3, VLD1_I32(c_ptr2), beta);\ + cdl4 = VMLA_N_I32(cdl4, VLD1_I32(c_ptr2 + 2), beta);\ + VST1_I32(c_ptr, cdl1); VST1_I32(c_ptr + 2, cdl2);\ + VST1_I32(c_ptr2, cdl3); VST1_I32(c_ptr2 + 2, cdl4);\ + }\ +} + +/* k_mask = 31 */ +#define I8I32MLA_SKINNYDOT_INLINE_M4N3(gemm) \ +static inline void inline_##gemm##_arowmajor_bskinny_m4n3(\ + const I8 *a_ptr1, const I8 *b_ptr, I32 *c_ptr,\ + uint32_t k_left, uint32_t LDK, uint32_t LDM,\ + I32 beta, bool c_rowmajor) {\ +\ + const I8 *a_ptr2 = a_ptr1 + LDK;\ + const I8 *a_ptr3 = a_ptr1 + LDK * 2;\ + const I8 *a_ptr4 = a_ptr2 + LDK * 2;\ + I32X4 cq1, cq2, cq3;\ + const uint32_t next_pref = (LDK * 4 - k_left) + 16;\ + __asm__ __volatile__ (\ + "movi %[q1].16b,#0; movi %[q2].16b,#0; movi %[q3].16b,#0\n\t"\ + "movi v10.16b,#0; movi v11.16b,#0; movi v12.16b,#0\n\t"\ + "movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t"\ + "movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0\n\t"\ + "cmp %w[k_left],#16; b.lt 3f\n\t"\ + "ldr q0,[%[a_ptr1]],#16; ldr q1,[%[a_ptr2]],#16\n\t"\ + "ldr 
q2,[%[a_ptr3]],#16; ldr q3,[%[a_ptr4]],#16\n\t"\ + "ldr q4,[%[b_ptr]]; ldr q5,[%[b_ptr],#16]\n\t"\ + "ldr q6,[%[b_ptr],#32]; add %[b_ptr],%[b_ptr],#48\n\t"\ + "cmp %w[k_left],#32; b.lt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + ""IMULL" v19.8h,v0.8b,v4.8b; "IMULL" v20.8h,v0.8b,v5.8b\n\t"\ + ""IMULL" v21.8h,v0.8b,v6.8b; prfm pldl1keep,[%[a_ptr1],#64]\n\t"\ + ""IMULL" v22.8h,v1.8b,v4.8b; "IMULL" v23.8h,v1.8b,v5.8b\n\t"\ + ""IMULL" v24.8h,v1.8b,v6.8b; prfm pldl1keep,[%[a_ptr2],#64]\n\t"\ + ""IMULL" v25.8h,v2.8b,v4.8b; "IMULL" v26.8h,v2.8b,v5.8b\n\t"\ + ""IMULL" v27.8h,v2.8b,v6.8b; prfm pldl1keep,[%[a_ptr3],#64]\n\t"\ + ""IMULL" v28.8h,v3.8b,v4.8b; "IMULL" v29.8h,v3.8b,v5.8b\n\t"\ + ""IMULL" v30.8h,v3.8b,v6.8b; prfm pldl1keep,[%[a_ptr4],#64]\n\t"\ + ""IADALP" %[q1].4s,v19.8h; "IMULL"2 v19.8h,v0.16b,v4.16b\n\t"\ + ""IADALP" %[q2].4s,v20.8h; "IMULL"2 v20.8h,v0.16b,v5.16b\n\t"\ + ""IADALP" %[q3].4s,v21.8h; "IMULL"2 v21.8h,v0.16b,v6.16b\n\t"\ + "ldr q0,[%[a_ptr1]],#16\n\t"\ + ""IADALP" v10.4s,v22.8h; "IMULL"2 v22.8h,v1.16b,v4.16b\n\t"\ + ""IADALP" v11.4s,v23.8h; "IMULL"2 v23.8h,v1.16b,v5.16b\n\t"\ + ""IADALP" v12.4s,v24.8h; "IMULL"2 v24.8h,v1.16b,v6.16b\n\t"\ + "ldr q1,[%[a_ptr2]],#16\n\t"\ + ""IADALP" v13.4s,v25.8h; "IMULL"2 v25.8h,v2.16b,v4.16b\n\t"\ + ""IADALP" v14.4s,v26.8h; "IMULL"2 v26.8h,v2.16b,v5.16b\n\t"\ + ""IADALP" v15.4s,v27.8h; "IMULL"2 v27.8h,v2.16b,v6.16b\n\t"\ + "ldr q2,[%[a_ptr3]],#16\n\t"\ + ""IADALP" v16.4s,v28.8h; "IMULL"2 v28.8h,v3.16b,v4.16b\n\t"\ + ""IADALP" v17.4s,v29.8h; "IMULL"2 v29.8h,v3.16b,v5.16b\n\t"\ + ""IADALP" v18.4s,v30.8h; "IMULL"2 v30.8h,v3.16b,v6.16b\n\t"\ + "ldr q3,[%[a_ptr4]],#16\n\t"\ + ""IADALP" %[q1].4s,v19.8h; "IADALP" %[q2].4s,v20.8h; "IADALP" %[q3].4s,v21.8h\n\t"\ + "ldr q4,[%[b_ptr]]; sub %w[k_left],%w[k_left],#16\n\t"\ + ""IADALP" v10.4s,v22.8h; "IADALP" v11.4s,v23.8h; "IADALP" v12.4s,v24.8h\n\t"\ + "ldr q5,[%[b_ptr],#16]\n\t"\ + ""IADALP" v13.4s,v25.8h; "IADALP" v14.4s,v26.8h; "IADALP" v15.4s,v27.8h\n\t"\ + "ldr q6,[%[b_ptr],#32]; add %[b_ptr],%[b_ptr],#48; cmp %w[k_left],#32\n\t"\ + ""IADALP" v16.4s,v28.8h; "IADALP" v17.4s,v29.8h; "IADALP" v18.4s,v30.8h\n\t"\ + "b.ge 1b\n\t"\ + "2:\n\t"\ + ""IMULL" v19.8h,v0.8b,v4.8b; "IMULL" v20.8h,v0.8b,v5.8b\n\t"\ + ""IMULL" v21.8h,v0.8b,v6.8b; prfm pldl1keep,[%[a_ptr1],%w[pref],SXTW #0]\n\t"\ + ""IMULL" v22.8h,v1.8b,v4.8b; "IMULL" v23.8h,v1.8b,v5.8b\n\t"\ + ""IMULL" v24.8h,v1.8b,v6.8b; prfm pldl1keep,[%[a_ptr2],%w[pref],SXTW #0]\n\t"\ + ""IMULL" v25.8h,v2.8b,v4.8b; "IMULL" v26.8h,v2.8b,v5.8b\n\t"\ + ""IMULL" v27.8h,v2.8b,v6.8b; prfm pldl1keep,[%[a_ptr3],%w[pref],SXTW #0]\n\t"\ + ""IMULL" v28.8h,v3.8b,v4.8b; "IMULL" v29.8h,v3.8b,v5.8b\n\t"\ + ""IMULL" v30.8h,v3.8b,v6.8b; prfm pldl1keep,[%[a_ptr4],%w[pref],SXTW #0]\n\t"\ + ""IADALP" %[q1].4s,v19.8h; "IMULL"2 v19.8h,v0.16b,v4.16b\n\t"\ + ""IADALP" %[q2].4s,v20.8h; "IMULL"2 v20.8h,v0.16b,v5.16b\n\t"\ + ""IADALP" %[q3].4s,v21.8h; "IMULL"2 v21.8h,v0.16b,v6.16b\n\t"\ + ""IADALP" v10.4s,v22.8h; "IMULL"2 v22.8h,v1.16b,v4.16b\n\t"\ + ""IADALP" v11.4s,v23.8h; "IMULL"2 v23.8h,v1.16b,v5.16b\n\t"\ + ""IADALP" v12.4s,v24.8h; "IMULL"2 v24.8h,v1.16b,v6.16b\n\t"\ + ""IADALP" v13.4s,v25.8h; "IMULL"2 v25.8h,v2.16b,v4.16b\n\t"\ + ""IADALP" v14.4s,v26.8h; "IMULL"2 v26.8h,v2.16b,v5.16b\n\t"\ + ""IADALP" v15.4s,v27.8h; "IMULL"2 v27.8h,v2.16b,v6.16b\n\t"\ + ""IADALP" v16.4s,v28.8h; "IMULL"2 v28.8h,v3.16b,v4.16b\n\t"\ + ""IADALP" v17.4s,v29.8h; "IMULL"2 v29.8h,v3.16b,v5.16b\n\t"\ + ""IADALP" v18.4s,v30.8h; "IMULL"2 v30.8h,v3.16b,v6.16b\n\t"\ + ""IADALP" %[q1].4s,v19.8h; "IADALP" %[q2].4s,v20.8h; 
"IADALP" %[q3].4s,v21.8h\n\t"\ + "sub %w[k_left],%w[k_left],#16\n\t"\ + ""IADALP" v10.4s,v22.8h; "IADALP" v11.4s,v23.8h; "IADALP" v12.4s,v24.8h\n\t"\ + ""IADALP" v13.4s,v25.8h; "IADALP" v14.4s,v26.8h; "IADALP" v15.4s,v27.8h\n\t"\ + ""IADALP" v16.4s,v28.8h; "IADALP" v17.4s,v29.8h; "IADALP" v18.4s,v30.8h\n\t"\ + "3:\n\t"\ + "cmp %w[k_left],#8; b.lt 4f\n\t"\ + "ldr d0,[%[a_ptr1]],#8; ldr d1,[%[a_ptr2]],#8\n\t"\ + "ldr d2,[%[a_ptr3]],#8; ldr d3,[%[a_ptr4]],#8\n\t"\ + "ldr d4,[%[b_ptr]]; ldr d5,[%[b_ptr],#8]\n\t"\ + "ldr d6,[%[b_ptr],#16]; add %[b_ptr],%[b_ptr],#24\n\t"\ + "sub %w[k_left],%w[k_left],#8\n\t"\ + ""IMULL" v19.8h,v0.8b,v4.8b; "IMULL" v20.8h,v0.8b,v5.8b\n\t"\ + ""IMULL" v21.8h,v0.8b,v6.8b; "IMULL" v22.8h,v1.8b,v4.8b\n\t"\ + ""IMULL" v23.8h,v1.8b,v5.8b; "IMULL" v24.8h,v1.8b,v6.8b\n\t"\ + ""IMULL" v25.8h,v2.8b,v4.8b; "IMULL" v26.8h,v2.8b,v5.8b\n\t"\ + ""IMULL" v27.8h,v2.8b,v6.8b; "IMULL" v28.8h,v3.8b,v4.8b\n\t"\ + ""IMULL" v29.8h,v3.8b,v5.8b; "IMULL" v30.8h,v3.8b,v6.8b\n\t"\ + ""IADALP" %[q1].4s,v19.8h; "IADALP" %[q2].4s,v20.8h; "IADALP" %[q3].4s,v21.8h\n\t"\ + ""IADALP" v10.4s,v22.8h; "IADALP" v11.4s,v23.8h; "IADALP" v12.4s,v24.8h\n\t"\ + ""IADALP" v13.4s,v25.8h; "IADALP" v14.4s,v26.8h; "IADALP" v15.4s,v27.8h\n\t"\ + ""IADALP" v16.4s,v28.8h; "IADALP" v17.4s,v29.8h; "IADALP" v18.4s,v30.8h\n\t"\ + "4:\n\t"\ + "addp %[q1].4s,%[q1].4s,v10.4s; addp v13.4s,v13.4s,v16.4s\n\t"\ + "addp %[q2].4s,%[q2].4s,v11.4s; addp v14.4s,v14.4s,v17.4s\n\t"\ + "addp %[q3].4s,%[q3].4s,v12.4s; addp v15.4s,v15.4s,v18.4s\n\t"\ + "cmp %w[k_left],#4; b.lt 5f\n\t"\ + "ldr s0,[%[a_ptr1]],#4; ldr s1,[%[a_ptr2]],#4\n\t"\ + "ldr s2,[%[a_ptr3]],#4; ldr s3,[%[a_ptr4]],#4\n\t"\ + "ld1r {v4.2s},[%[b_ptr]],#4; ins v0.s[1],v1.s[0]\n\t"\ + "ld1r {v5.2s},[%[b_ptr]],#4; ins v2.s[1],v3.s[0]\n\t"\ + "ld1r {v6.2s},[%[b_ptr]],#4\n\t"\ + "sub %w[k_left],%w[k_left],#4\n\t"\ + ""IMULL" v19.8h,v0.8b,v4.8b; "IMULL" v20.8h,v0.8b,v5.8b\n\t"\ + ""IMULL" v21.8h,v0.8b,v6.8b; "IMULL" v25.8h,v2.8b,v4.8b\n\t"\ + ""IMULL" v26.8h,v2.8b,v5.8b; "IMULL" v27.8h,v2.8b,v6.8b\n\t"\ + ""IADALP" %[q1].4s,v19.8h; "IADALP" %[q2].4s,v20.8h; "IADALP" %[q3].4s,v21.8h\n\t"\ + ""IADALP" v13.4s,v25.8h; "IADALP" v14.4s,v26.8h; "IADALP" v15.4s,v27.8h\n\t"\ + "5:\n\t"\ + "addp %[q1].4s,%[q1].4s,v13.4s\n\t"\ + "addp %[q2].4s,%[q2].4s,v14.4s\n\t"\ + "addp %[q3].4s,%[q3].4s,v15.4s\n\t"\ + "cmp %w[k_left],#2; b.lt 6f\n\t"\ + "ldr h0,[%[a_ptr1]],#2; ldr h1,[%[a_ptr2]],#2\n\t"\ + "ldr h2,[%[a_ptr3]],#2; ldr h3,[%[a_ptr4]],#2\n\t"\ + "ld1r {v4.4h},[%[b_ptr]],#2; ins v0.h[1],v1.h[0]\n\t"\ + "ld1r {v5.4h},[%[b_ptr]],#2; ins v2.h[1],v3.h[0]\n\t"\ + "ld1r {v6.4h},[%[b_ptr]],#2\n\t"\ + "sub %w[k_left],%w[k_left],#2\n\t"\ + "ins v0.s[1],v2.s[0]\n\t"\ + ""IMULL" v19.8h,v0.8b,v4.8b\n\t"\ + ""IMULL" v20.8h,v0.8b,v5.8b\n\t"\ + ""IMULL" v21.8h,v0.8b,v6.8b\n\t"\ + ""IADALP" %[q1].4s,v19.8h; "IADALP" %[q2].4s,v20.8h; "IADALP" %[q3].4s,v21.8h\n\t"\ + "6:\n\t"\ + "cmp %w[k_left],#1; b.lt 7f\n\t"\ + "ldr b0,[%[a_ptr1]],#1; ldr b1,[%[a_ptr2]],#1\n\t"\ + "ldr b2,[%[a_ptr3]],#1; ldr b3,[%[a_ptr4]],#1\n\t"\ + "ldr b4,[%[b_ptr]]; ins v0.b[1],v1.b[0]\n\t"\ + "ldr b5,[%[b_ptr],#1]; ins v2.b[1],v3.b[0]\n\t"\ + "ldr b6,[%[b_ptr],#2]; add %[b_ptr],%[b_ptr],#3\n\t"\ + "ins v4.b[1],v5.b[0]\n\t"\ + "ins v0.h[1],v2.h[0]; ins v4.b[2],v6.b[0]\n\t"\ + "sub %w[k_left],%w[k_left],#1\n\t"\ + ""IXTL" v0.8h,v0.8b; "IXTL" v4.8h,v4.8b\n\t"\ + ""IMLAL" %[q1].4s,v0.4h,v4.h[0]\n\t"\ + ""IMLAL" %[q2].4s,v0.4h,v4.h[1]\n\t"\ + ""IMLAL" %[q3].4s,v0.4h,v4.h[2]\n\t"\ + "7:\n\t"\ + :[q1]"=w"(cq1), [q2]"=w"(cq2), 
[q3]"=w"(cq3), [k_left]"+r"(k_left),\ + [a_ptr1]"+r"(a_ptr1), [a_ptr2]"+r"(a_ptr2), [a_ptr3]"+r"(a_ptr3),\ + [a_ptr4]"+r"(a_ptr4), [b_ptr]"+r"(b_ptr)\ + :[pref]"r"(next_pref)\ + :"cc","memory","v0","v1","v2","v3","v4","v5","v6",\ + "v10","v11","v12","v13","v14","v15","v16","v17","v18",\ + "v19","v20","v21","v22","v23","v24","v25","v26","v27",\ + "v28","v29","v30");\ +\ + if (c_rowmajor) {\ + I32X4X3 cqt1 = VLD3Q_I32(c_ptr);\ + cqt1.val[0] = VMLAQ_N_I32(cq1, cqt1.val[0], beta);\ + cqt1.val[1] = VMLAQ_N_I32(cq2, cqt1.val[1], beta);\ + cqt1.val[2] = VMLAQ_N_I32(cq3, cqt1.val[2], beta);\ + VST3Q_I32(c_ptr, cqt1);\ + } else {\ + cq1 = VMLAQ_N_I32(cq1, VLD1Q_I32(c_ptr), beta);\ + cq2 = VMLAQ_N_I32(cq2, VLD1Q_I32(c_ptr + LDM), beta);\ + cq3 = VMLAQ_N_I32(cq3, VLD1Q_I32(c_ptr + LDM * 2), beta);\ + VST1Q_I32(c_ptr, cq1); c_ptr += LDM;\ + VST1Q_I32(c_ptr, cq2); c_ptr += LDM;\ + VST1Q_I32(c_ptr, cq3);\ + }\ +} + + + +#define I8I32MLA_SKINNY_DOT_INLINE_FUNCS_M4(gemm) \ + I8I32MLA_SKINNYDOT_INLINE_M4N1(gemm)\ + I8I32MLA_SKINNYDOT_INLINE_M4N2(gemm)\ + I8I32MLA_SKINNYDOT_INLINE_M4N3(gemm) + +I8I32MLA_SKINNY_DOT_INLINE_FUNCS_M4(I8I32MLAGEMM) + +#define GEMM_SKINNY_DOT_INLINE_FUNC_DEDUCE(a, b, c, d)\ + GEMM_SKINNY_DOT_INLINE_PACK_FUNC(a, b, c, d) + +GEMM_SKINNY_DOT_INLINE_FUNC_DEDUCE(I8I32MLAGEMM, 1, 1, 31) +GEMM_SKINNY_DOT_INLINE_FUNC_DEDUCE(I8I32MLAGEMM, 1, 2, 31) +GEMM_SKINNY_DOT_INLINE_FUNC_DEDUCE(I8I32MLAGEMM, 1, 3, 31) + +static inline bool unroll_test_m4n1(uint32_t M, uint32_t K) { + return K <= 16384; +} + +static inline bool unroll_test_m1n1(uint32_t M, uint32_t K) { + return true; +} + +static inline bool unroll_test_m4n2(uint32_t M, uint32_t K) { + return K <= 16384; +} + +static inline bool unroll_test_m1n2(uint32_t M, uint32_t K) { + return true; +} + +static inline bool unroll_test_m4n3(uint32_t M, uint32_t K) { + return K <= 16384; +} + +static inline bool unroll_test_m1n3(uint32_t M, uint32_t K) { + return true; +} + +#endif \ No newline at end of file diff --git a/include/neon_armv8a/S8S32DotGemmCopy.h b/include/neon_armv8a/S8S32DotGemmCopy.h new file mode 100644 index 0000000..d3e2d8c --- /dev/null +++ b/include/neon_armv8a/S8S32DotGemmCopy.h @@ -0,0 +1,31 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
diff --git a/include/neon_armv8a/S8S32DotGemmCopy.h b/include/neon_armv8a/S8S32DotGemmCopy.h
new file mode 100644
index 0000000..d3e2d8c
--- /dev/null
+++ b/include/neon_armv8a/S8S32DotGemmCopy.h
@@ -0,0 +1,31 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+void s8s32dotgemm_int8_t_int32_t_tcopy_unroll8(const int8_t * __restrict__ src,
+    int32_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2);
+
+void s8s32dotgemm_int8_t_int32_t_tcopy_unroll12(const int8_t * __restrict__ src,
+    int32_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2);
+
+void s8s32dotgemm_int8_t_int32_t_ncopy_unroll8(const int8_t * __restrict__ src,
+    int32_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2);
+
+void s8s32dotgemm_int8_t_int32_t_ncopy_unroll12(const int8_t * __restrict__ src,
+    int32_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2);
+
diff --git a/include/neon_armv8a/S8S32DotGemmDriver.h b/include/neon_armv8a/S8S32DotGemmDriver.h
new file mode 100644
index 0000000..06e3d93
--- /dev/null
+++ b/include/neon_armv8a/S8S32DotGemmDriver.h
@@ -0,0 +1,28 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+int s8s32dotgemm_serial(int a_rowmajor, int b_rowmajor,
+    const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t N, uint32_t K, int32_t beta_inp);
+
+int s8s32dotgemm(int a_rowmajor, int b_rowmajor,
+    const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t N, uint32_t K,
+    int32_t beta_inp, uint32_t num_threads);
+
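A hypothetical call site for the driver declared above. The matrix shapes, the two-thread request, and the "0 means success" convention are assumptions for the demo, not documented guarantees; this variant presumably runs only on cores with the ARMv8.2 int8 dot-product extension, which is why the top-level driver selects between the dot and mla paths.

```c
#include <stdint.h>
#include "neon_armv8a/S8S32DotGemmDriver.h"

static int8_t A[64 * 128];   /* 64x128, row-major */
static int8_t B[128 * 32];   /* 128x32, row-major */
static int32_t C[64 * 32];   /* 64x32 result */

int demo_s8s32dotgemm(void) {
  /* C = A * B; beta_inp = 0 discards the old C, num_threads = 2 */
  return s8s32dotgemm(1, 1, A, B, C, 64, 32, 128, 0, 2);
}
```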
diff --git a/include/neon_armv8a/S8S32DotGemmKernel.h b/include/neon_armv8a/S8S32DotGemmKernel.h
new file mode 100644
index 0000000..8b55075
--- /dev/null
+++ b/include/neon_armv8a/S8S32DotGemmKernel.h
@@ -0,0 +1,29 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+void s8s32dotgemm_kernel_lm_m8n12(uint32_t M, uint32_t N, uint32_t kdiv4,
+    int32_t beta,
+    const int32_t * __restrict__ sa, const int32_t * __restrict__ sb,
+    int32_t * __restrict__ C, uint32_t ldc);
+
+void s8s32dotgemm_kernel_ln_m12n8(uint32_t M, uint32_t N, uint32_t Kdiv4,
+    int32_t beta,
+    const int32_t * __restrict__ sa, const int32_t * __restrict__ sb,
+    int32_t * __restrict__ C, uint32_t ldc);
+
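Note that the dot-product kernels take `const int32_t *` operands even though the inputs are int8: the copy routines pack four consecutive int8 values along K into each int32 lane, so the kernel advances in `kdiv4` packed steps. The sketch below illustrates the lane layout for a single row only; it assumes kdiv4 = (K + 3) / 4 with a zero-padded tail and ignores the interleaving across rows that the real copy routines perform, so treat it as a model of the data type, not of the packing order.

```c
#include <stdint.h>
#include <string.h>

/* Pack one K-run of int8 into int32 lanes, 4 bytes per lane (illustrative).
 * On little-endian AArch64 the first int8 lands in the low byte of a lane. */
static void pack4_one_row(const int8_t *src, uint32_t K, int32_t *dst) {
  uint32_t kdiv4 = (K + 3) / 4;               /* assumed rounding convention */
  memset(dst, 0, (size_t)kdiv4 * sizeof(int32_t)); /* zero-pad the K tail */
  memcpy(dst, src, K);                         /* 4 int8 per int32 lane */
}
```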
diff --git a/include/neon_armv8a/S8S32DotGemmSkinnyDot.h b/include/neon_armv8a/S8S32DotGemmSkinnyDot.h
new file mode 100644
index 0000000..58c0639
--- /dev/null
+++ b/include/neon_armv8a/S8S32DotGemmSkinnyDot.h
@@ -0,0 +1,103 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n1(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n2(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n3(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n4(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n5(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n6(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n7(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n8(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n9(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n10(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n11(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n12(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n1_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n2_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n3_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n4_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n5_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n6_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n7_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n8_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n9_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n10_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n11_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n12_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
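The skinny-matrix path is specialized per N (1..12 here). One way a caller might dispatch over these fixed-N entry points is a function-pointer table; the real selection logic lives in the GEMM driver, so this is only a sketch of the pattern the naming scheme invites.

```c
#include <stdint.h>
#include "neon_armv8a/S8S32DotGemmSkinnyDot.h"

typedef void (*skinny_dot_fn)(const int8_t *, const int8_t *, int32_t *,
                              uint32_t, uint32_t, uint8_t, int32_t);

static const skinny_dot_fn skinny_dot_tab[12] = {
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n1,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n2,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n3,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n4,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n5,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n6,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n7,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n8,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n9,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n10,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n11,
  s8s32dotgemm_arowmajor_bskinny_aint8_t_bint8_t_n12,
};

/* Returns NULL outside the specialized range; a caller would then fall
 * back to the packed GEMM path. */
static skinny_dot_fn pick_skinny_dot(uint32_t N) {
  return (N >= 1 && N <= 12) ? skinny_dot_tab[N - 1] : (skinny_dot_fn)0;
}
```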
diff --git a/include/neon_armv8a/S8S32MlaGemmCopy.h b/include/neon_armv8a/S8S32MlaGemmCopy.h
new file mode 100644
index 0000000..2041972
--- /dev/null
+++ b/include/neon_armv8a/S8S32MlaGemmCopy.h
@@ -0,0 +1,31 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+void s8s32mlagemm_int8_t_int16_t_ncopy_unroll8(const int8_t * __restrict__ src,
+    int16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2);
+
+void s8s32mlagemm_int8_t_int16_t_ncopy_unroll12(const int8_t * __restrict__ src,
+    int16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2);
+
+void s8s32mlagemm_int8_t_int16_t_tcopy_unroll8(const int8_t * __restrict__ src,
+    int16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2);
+
+void s8s32mlagemm_int8_t_int16_t_tcopy_unroll12(const int8_t * __restrict__ src,
+    int16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2);
+
diff --git a/include/neon_armv8a/S8S32MlaGemmDriver.h b/include/neon_armv8a/S8S32MlaGemmDriver.h
new file mode 100644
index 0000000..26121fa
--- /dev/null
+++ b/include/neon_armv8a/S8S32MlaGemmDriver.h
@@ -0,0 +1,28 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+int s8s32mlagemm_serial(int a_rowmajor, int b_rowmajor,
+    const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t N, uint32_t K, int32_t beta_inp);
+
+int s8s32mlagemm(int a_rowmajor, int b_rowmajor,
+    const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t N, uint32_t K,
+    int32_t beta_inp, uint32_t num_threads);
+
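The MLA-based driver shares the dot-based driver's signature (it widens int8 to int16 and uses smlal instead of the dot-product instruction), so callers can treat the two as interchangeable; the assumption here is that the top-level s8s32 driver picks between them from CPU features at runtime. A minimal sketch of a serial accumulating call:

```c
#include <stdint.h>
#include "neon_armv8a/S8S32MlaGemmDriver.h"

/* beta_inp = 1 accumulates onto the existing contents of C;
 * A is row-major (1), B is column-major (0). */
int accumulate_s8s32(const int8_t *A, const int8_t *B, int32_t *C,
                     uint32_t M, uint32_t N, uint32_t K) {
  return s8s32mlagemm_serial(1, 0, A, B, C, M, N, K, 1);
}
```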
diff --git a/include/neon_armv8a/S8S32MlaGemmKernel.h b/include/neon_armv8a/S8S32MlaGemmKernel.h
new file mode 100644
index 0000000..330a8a2
--- /dev/null
+++ b/include/neon_armv8a/S8S32MlaGemmKernel.h
@@ -0,0 +1,29 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+void s8s32mlagemm_kernel_lm_m8n12(uint32_t M, uint32_t N, uint32_t K,
+    int32_t beta,
+    const int16_t * __restrict__ sa, const int16_t * __restrict__ sb,
+    int32_t * __restrict__ C, uint32_t ldc);
+
+void s8s32mlagemm_kernel_ln_m12n8(uint32_t M, uint32_t N, uint32_t K,
+    int32_t beta,
+    const int16_t * __restrict__ sa, const int16_t * __restrict__ sb,
+    int32_t * __restrict__ C, uint32_t ldc);
+
diff --git a/include/neon_armv8a/S8S32MlaGemmSkinnyDot.h b/include/neon_armv8a/S8S32MlaGemmSkinnyDot.h
new file mode 100644
index 0000000..ffe895b
--- /dev/null
+++ b/include/neon_armv8a/S8S32MlaGemmSkinnyDot.h
@@ -0,0 +1,75 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.          */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software       */
+/* distributed under the License is distributed on an "AS IS" BASIS,         */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
+/* See the License for the specific language governing permissions and       */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n1(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n2(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n3(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n4(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n5(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n6(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n7(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n8(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n1_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n2_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
+void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n3_omp(const int8_t *A, const int8_t *B,
+    int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order,
+    int32_t beta_inp, uint32_t num_threads);
+
int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n5_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n6_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n7_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_arowmajor_bskinny_aint8_t_bint8_t_n8_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/S8S32MlaGemmSkinnyGer.h b/include/neon_armv8a/S8S32MlaGemmSkinnyGer.h new file mode 100644 index 0000000..8600d13 --- /dev/null +++ b/include/neon_armv8a/S8S32MlaGemmSkinnyGer.h @@ -0,0 +1,74 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include <stdint.h> + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n1(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n2(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n3(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n4(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n5(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n6(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n7(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n8(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, int32_t beta_inp); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n1_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n2_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n3_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n4_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n5_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n6_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n7_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); + +void s8s32mlagemm_acolmajor_bskinny_aint8_t_bint8_t_n8_omp(const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + int32_t beta_inp, uint32_t num_threads); diff --git a/include/neon_armv8a/SgemmCopy.h b/include/neon_armv8a/SgemmCopy.h new file mode 100644 index 0000000..7a74074 --- /dev/null +++ b/include/neon_armv8a/SgemmCopy.h @@ -0,0 +1,31 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License.
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void sgemm_float_float_ncopy_unroll8(const float * __restrict__ src, + float * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void sgemm_float_float_ncopy_unroll12(const float * __restrict__ src, + float * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void sgemm_float_float_tcopy_unroll8(const float * __restrict__ src, + float * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void sgemm_float_float_tcopy_unroll12(const float * __restrict__ src, + float * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + diff --git a/include/neon_armv8a/SgemmDriver.h b/include/neon_armv8a/SgemmDriver.h new file mode 100644 index 0000000..bfc4217 --- /dev/null +++ b/include/neon_armv8a/SgemmDriver.h @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +int sgemm_serial(int a_rowmajor, int b_rowmajor, + const float *A, const float *B, float *C, + uint32_t M, uint32_t N, uint32_t K, float beta_inp); + +int sgemm(int a_rowmajor, int b_rowmajor, + const float *A, const float *B, float *C, + uint32_t M, uint32_t N, uint32_t K, float beta_inp, uint32_t num_threads); +
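sgemm and sgemm_serial mirror the int8 driver shown earlier, so a shorter sketch suffices; here the second call is assumed to accumulate into C via beta_inp = 1.0f, and num_threads = 4 requests a four-thread run (both are this sketch's assumptions, following the same conventions as before, not a documented contract):

```
#include <stdint.h>

/* Sketch: two single-precision products into the same MxN buffer.
 * The first call overwrites C (beta_inp = 0.0f); the second is assumed
 * to add its result on top (beta_inp = 1.0f), using 4 threads. */
void accumulate_two_products(const float *A1, const float *B1,
                             const float *A2, const float *B2, float *C,
                             uint32_t M, uint32_t N, uint32_t K) {
    sgemm(1, 1, A1, B1, C, M, N, K, 0.0f, 4);
    sgemm(1, 1, A2, B2, C, M, N, K, 1.0f, 4);
}
```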
diff --git a/include/neon_armv8a/SgemmKernel.h b/include/neon_armv8a/SgemmKernel.h new file mode 100644 index 0000000..936160c --- /dev/null +++ b/include/neon_armv8a/SgemmKernel.h @@ -0,0 +1,26 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void sgemm_kernel_lm_m8n12(uint32_t M, uint32_t N, uint32_t K, float beta, + const float * __restrict__ sa, const float * __restrict__ sb, + float * __restrict__ C, uint32_t ldc); + +void sgemm_kernel_ln_m12n8(uint32_t M, uint32_t N, uint32_t K, float beta, + const float * __restrict__ sa, const float * __restrict__ sb, + float * __restrict__ C, uint32_t ldc); diff --git a/include/neon_armv8a/SgemmSkinnyDot.h b/include/neon_armv8a/SgemmSkinnyDot.h new file mode 100644 index 0000000..d40593e --- /dev/null +++ b/include/neon_armv8a/SgemmSkinnyDot.h @@ -0,0 +1,319 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void sgemm_arowmajor_bskinny_afloat_bfloat_n1(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n2(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n3(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n4(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n5(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n6(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n7(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n8(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n9(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n10(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n11(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n12(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n13(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n14(const float *A, const float *B,
float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n15(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n16(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n17(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n18(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n19(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n20(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n21(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n22(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n23(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n24(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n25(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n26(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n27(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n28(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n29(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n30(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n31(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n32(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n33(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n34(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n35(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n36(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n37(const float *A, const float *B, float *C, + 
uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n38(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n39(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n40(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n41(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n42(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n43(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n44(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n45(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n46(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n47(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n48(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n49(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n50(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n1_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n2_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n3_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n4_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n5_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n6_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n7_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n8_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n9_omp(const 
float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n10_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n11_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n12_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n13_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n14_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n15_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n16_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n17_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n18_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n19_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n20_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n21_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n22_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n23_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n24_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n25_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n26_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n27_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n28_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void 
sgemm_arowmajor_bskinny_afloat_bfloat_n29_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n30_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n31_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n32_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n33_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n34_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n35_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n36_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n37_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n38_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n39_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n40_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n41_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n42_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n43_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n44_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n45_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n46_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n47_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n48_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, 
float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n49_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_arowmajor_bskinny_afloat_bfloat_n50_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/SgemmSkinnyGer.h b/include/neon_armv8a/SgemmSkinnyGer.h new file mode 100644 index 0000000..fdc16b3 --- /dev/null +++ b/include/neon_armv8a/SgemmSkinnyGer.h @@ -0,0 +1,91 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void sgemm_acolmajor_bskinny_afloat_bfloat_n1(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n2(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n3(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n4(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n5(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n6(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n7(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n8(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n9(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n10(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n11(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n12(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n1_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n2_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K,
uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n3_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n4_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n5_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n6_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n7_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n8_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n9_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n10_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n11_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_acolmajor_bskinny_afloat_bfloat_n12_omp(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint8_t b_c_order, float beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/U8U32DotGemmCopy.h b/include/neon_armv8a/U8U32DotGemmCopy.h new file mode 100644 index 0000000..b33c964 --- /dev/null +++ b/include/neon_armv8a/U8U32DotGemmCopy.h @@ -0,0 +1,31 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include <stdint.h> + +void u8u32dotgemm_uint8_t_uint32_t_tcopy_unroll8(const uint8_t * __restrict__ src, + uint32_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void u8u32dotgemm_uint8_t_uint32_t_tcopy_unroll12(const uint8_t * __restrict__ src, + uint32_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void u8u32dotgemm_uint8_t_uint32_t_ncopy_unroll8(const uint8_t * __restrict__ src, + uint32_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void u8u32dotgemm_uint8_t_uint32_t_ncopy_unroll12(const uint8_t * __restrict__ src, + uint32_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + diff --git a/include/neon_armv8a/U8U32DotGemmDriver.h b/include/neon_armv8a/U8U32DotGemmDriver.h new file mode 100644 index 0000000..170723a --- /dev/null +++ b/include/neon_armv8a/U8U32DotGemmDriver.h @@ -0,0 +1,28 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +int u8u32dotgemm_serial(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, uint32_t beta_inp); + +int u8u32dotgemm(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, + uint32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/U8U32DotGemmKernel.h b/include/neon_armv8a/U8U32DotGemmKernel.h new file mode 100644 index 0000000..910ef46 --- /dev/null +++ b/include/neon_armv8a/U8U32DotGemmKernel.h @@ -0,0 +1,29 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License.
*/ +/*****************************************************************************/ + + +#include <stdint.h> + +void u8u32dotgemm_kernel_lm_m8n12(uint32_t M, uint32_t N, uint32_t kdiv4, + uint32_t beta, + const uint32_t * __restrict__ sa, const uint32_t * __restrict__ sb, + uint32_t * __restrict__ C, uint32_t ldc); + +void u8u32dotgemm_kernel_ln_m12n8(uint32_t M, uint32_t N, uint32_t Kdiv4, + uint32_t beta, + const uint32_t * __restrict__ sa, const uint32_t * __restrict__ sb, + uint32_t * __restrict__ C, uint32_t ldc); + diff --git a/include/neon_armv8a/U8U32DotGemmSkinnyDot.h b/include/neon_armv8a/U8U32DotGemmSkinnyDot.h new file mode 100644 index 0000000..5c8a646 --- /dev/null +++ b/include/neon_armv8a/U8U32DotGemmSkinnyDot.h @@ -0,0 +1,115 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n1(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n2(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n3(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n4(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n5(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n6(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n7(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n8(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n9(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n10(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n11(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n12(const uint8_t *A, const
uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n1_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n2_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n3_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n4_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n5_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n6_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n7_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n8_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n9_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n10_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n11_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32dotgemm_arowmajor_bskinny_auint8_t_buint8_t_n12_omp( + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/U8U32MlaGemmCopy.h b/include/neon_armv8a/U8U32MlaGemmCopy.h new file mode 100644 index 0000000..18a6eef --- /dev/null +++ b/include/neon_armv8a/U8U32MlaGemmCopy.h @@ -0,0 +1,31 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include <stdint.h> + +void u8u32mlagemm_uint8_t_uint16_t_ncopy_unroll8(const uint8_t * __restrict__ src, + uint16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void u8u32mlagemm_uint8_t_uint16_t_ncopy_unroll12(const uint8_t * __restrict__ src, + uint16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void u8u32mlagemm_uint8_t_uint16_t_tcopy_unroll8(const uint8_t * __restrict__ src, + uint16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + +void u8u32mlagemm_uint8_t_uint16_t_tcopy_unroll12(const uint8_t * __restrict__ src, + uint16_t * __restrict__ dst, uint32_t ld_dim, uint32_t dim1, uint32_t dim2); + diff --git a/include/neon_armv8a/U8U32MlaGemmDriver.h b/include/neon_armv8a/U8U32MlaGemmDriver.h new file mode 100644 index 0000000..9477c3d --- /dev/null +++ b/include/neon_armv8a/U8U32MlaGemmDriver.h @@ -0,0 +1,28 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +int u8u32mlagemm_serial(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, uint32_t beta_inp); + +int u8u32mlagemm(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, + uint32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/U8U32MlaGemmKernel.h b/include/neon_armv8a/U8U32MlaGemmKernel.h new file mode 100644 index 0000000..34f1285 --- /dev/null +++ b/include/neon_armv8a/U8U32MlaGemmKernel.h @@ -0,0 +1,29 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License.
*/ +/*****************************************************************************/ + + +#include <stdint.h> + +void u8u32mlagemm_kernel_lm_m8n12(uint32_t M, uint32_t N, uint32_t K, + uint32_t beta, + const uint16_t * __restrict__ sa, const uint16_t * __restrict__ sb, + uint32_t * __restrict__ C, uint32_t ldc); + +void u8u32mlagemm_kernel_ln_m12n8(uint32_t M, uint32_t N, uint32_t K, + uint32_t beta, + const uint16_t * __restrict__ sa, const uint16_t * __restrict__ sb, + uint32_t * __restrict__ C, uint32_t ldc); + diff --git a/include/neon_armv8a/U8U32MlaGemmSkinnyDot.h b/include/neon_armv8a/U8U32MlaGemmSkinnyDot.h new file mode 100644 index 0000000..e1ea3e4 --- /dev/null +++ b/include/neon_armv8a/U8U32MlaGemmSkinnyDot.h @@ -0,0 +1,75 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n1(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n2(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n3(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n4(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n5(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n6(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n7(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n8(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n1_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n2_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n3_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void
u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n4_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n5_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n6_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n7_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_arowmajor_bskinny_auint8_t_buint8_t_n8_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + diff --git a/include/neon_armv8a/U8U32MlaGemmSkinnyGer.h b/include/neon_armv8a/U8U32MlaGemmSkinnyGer.h new file mode 100644 index 0000000..d72b3bf --- /dev/null +++ b/include/neon_armv8a/U8U32MlaGemmSkinnyGer.h @@ -0,0 +1,74 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include <stdint.h> + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n1(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n2(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n3(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n4(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n5(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n6(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n7(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n8(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, uint32_t beta_inp); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n1_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n2_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n3_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n4_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n5_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n6_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n7_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); + +void u8u32mlagemm_acolmajor_bskinny_auint8_t_buint8_t_n8_omp(const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t K, uint8_t b_c_order, + uint32_t beta_inp, uint32_t num_threads); diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/ReadME.md b/include/neon_armv8a/sgemm_skinny_dot_kernel/ReadME.md new file mode 100644 index 0000000..2ae6f90 --- /dev/null +++ b/include/neon_armv8a/sgemm_skinny_dot_kernel/ReadME.md @@ -0,0 +1,23 @@ +# Tuned ARMv8a SGEMM functions for skinny matrices + +### Supported shapes and orders +``` +C(MxN) = A(MxK) B(KxN) +(1). 4 < M < 51, N >> 50, K >> 50, matrix B is column-major; +(2). 4 < N < 51, M >> 50, K >> 50, matrix A is row-major. +``` + +### Interface +``` +sgemm_skinny1_arowmajor_nXXX_YYY(const float *A, const float *B, float *C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +XXX: a number representing the length of dimension N +YYY: letters indicating the tuned ARM CPU, e.g. a35/a53/a7x
b_c_order: the order of skinny matrices B & C + 0: B & C column-major; + 1: B row-major, C column-major; + 2: B column-major, C row-major; + 3: B & C row-major. +```
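To make the naming scheme concrete, the sketch below calls the N = 8 Cortex-A35 variant declared in the next header with B and C column-major (b_c_order = 0); the leading-dimension choices (LDA = K for row-major A, LDB = K and LDC = M for column-major B and C) are this sketch's reading of the ReadME above, not text taken from it:

```
#include <stdint.h>

/* Sketch: C(Mx8) = A(MxK) * B(Kx8), A row-major, B and C column-major. */
void skinny_example(const float *A, const float *B, float *C,
                    uint32_t M, uint32_t K) {
    sgemm_skinny1_arowmajor_n8_a35(A, B, C, M, K,
                                   K,     /* LDA: row-major A    */
                                   K,     /* LDB: column-major B */
                                   M,     /* LDC: column-major C */
                                   0,     /* b_c_order           */
                                   0.0f   /* beta_inp            */);
}
```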
diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.h b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.h new file mode 100644 index 0000000..9e9b4b7 --- /dev/null +++ b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.h @@ -0,0 +1,488 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void sgemm_skinny1_arowmajor_n4_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n5_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n6_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n7_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n8_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n9_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n10_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n11_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n12_a35(const float * __restrict__ A, + const float
diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.h b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.h new file mode 100644 index 0000000..9e9b4b7 --- /dev/null +++ b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.h @@ -0,0 +1,488 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void sgemm_skinny1_arowmajor_n4_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n5_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n6_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n7_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n8_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n9_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n10_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n11_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n12_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n13_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n14_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n15_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n16_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n17_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n18_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n19_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n20_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n21_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n22_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n23_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n24_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n25_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n26_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n27_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K,
uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n28_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n29_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n30_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n31_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n32_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n33_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n34_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n35_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n36_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n37_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n38_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n39_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n40_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n41_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n42_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float 
beta_inp); + +void sgemm_skinny1_arowmajor_n43_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n44_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n45_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n46_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n47_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n48_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n49_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n50_a35(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n4_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n5_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n6_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n7_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n8_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n9_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n10_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, 
uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n11_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n12_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n13_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n14_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n15_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n16_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n17_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n18_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n19_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n20_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n21_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n22_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n23_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n24_a35_omp(const float * __restrict__ A, + const 
float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n25_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n26_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n27_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n28_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n29_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n30_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n31_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n32_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n33_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n34_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n35_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n36_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n37_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void 
sgemm_skinny1_arowmajor_n38_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n39_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n40_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n41_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n42_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n43_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n44_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n45_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n46_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n47_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n48_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n49_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n50_a35_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA53.h b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA53.h new file mode 100644 index 0000000..1b2575f --- /dev/null +++ 
b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA53.h @@ -0,0 +1,488 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void sgemm_skinny1_arowmajor_n4_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n5_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n6_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n7_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n8_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n9_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n10_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n11_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n12_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n13_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n14_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n15_a53(const float * __restrict__ A, + const float * __restrict__ B, float *
__restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n16_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n17_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n18_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n19_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n20_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n21_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n22_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n23_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n24_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n25_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n26_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n27_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n28_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n29_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n30_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, 
uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n31_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n32_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n33_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n34_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n35_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n36_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n37_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n38_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n39_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n40_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n41_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n42_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n43_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n44_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n45_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void 
sgemm_skinny1_arowmajor_n46_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n47_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n48_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n49_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n50_a53(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n4_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n5_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n6_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n7_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n8_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n9_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n10_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n11_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n12_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n13_a53_omp(const float * __restrict__ A, + const 
float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n14_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n15_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n16_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n17_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n18_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n19_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n20_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n21_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n22_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n23_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n24_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n25_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n26_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void 
sgemm_skinny1_arowmajor_n27_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n28_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n29_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n30_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n31_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n32_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n33_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n34_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n35_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n36_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n37_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n38_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n39_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n40_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t 
b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n41_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n42_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n43_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n44_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n45_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n46_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n47_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n48_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n49_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n50_a53_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA7x.h b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA7x.h new file mode 100644 index 0000000..b9b61f3 --- /dev/null +++ b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA7x.h @@ -0,0 +1,488 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <stdint.h> + +void sgemm_skinny1_arowmajor_n4_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n5_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n6_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n7_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n8_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n9_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n10_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n11_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n12_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n13_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n14_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n15_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n16_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n17_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n18_a7x(const float * __restrict__ A, + const float * __restrict__ B,
float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n19_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n20_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n21_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n22_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n23_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n24_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n25_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n26_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n27_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n28_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n29_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n30_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n31_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n32_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n33_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t 
LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n34_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n35_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n36_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n37_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n38_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n39_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n40_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n41_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n42_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n43_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n44_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n45_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n46_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n47_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n48_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void 
sgemm_skinny1_arowmajor_n49_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n50_a7x(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp); + +void sgemm_skinny1_arowmajor_n4_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n5_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n6_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n7_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n8_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n9_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n10_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n11_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n12_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n13_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n14_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n15_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void 
sgemm_skinny1_arowmajor_n16_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n17_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n18_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n19_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n20_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n21_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n22_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n23_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n24_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n25_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n26_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n27_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n28_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n29_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t 
b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n30_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n31_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n32_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n33_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n34_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n35_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n36_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n37_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n38_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n39_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n40_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n41_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n42_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC, + uint8_t b_c_order, float beta_inp, uint32_t num_threads); + +void sgemm_skinny1_arowmajor_n43_a7x_omp(const float * __restrict__ A, + const float * __restrict__ B, float * __restrict__ C, + uint32_t M, 
uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,
+ uint8_t b_c_order, float beta_inp, uint32_t num_threads);
+
+void sgemm_skinny1_arowmajor_n44_a7x_omp(const float * __restrict__ A,
+ const float * __restrict__ B, float * __restrict__ C,
+ uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,
+ uint8_t b_c_order, float beta_inp, uint32_t num_threads);
+
+void sgemm_skinny1_arowmajor_n45_a7x_omp(const float * __restrict__ A,
+ const float * __restrict__ B, float * __restrict__ C,
+ uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,
+ uint8_t b_c_order, float beta_inp, uint32_t num_threads);
+
+void sgemm_skinny1_arowmajor_n46_a7x_omp(const float * __restrict__ A,
+ const float * __restrict__ B, float * __restrict__ C,
+ uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,
+ uint8_t b_c_order, float beta_inp, uint32_t num_threads);
+
+void sgemm_skinny1_arowmajor_n47_a7x_omp(const float * __restrict__ A,
+ const float * __restrict__ B, float * __restrict__ C,
+ uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,
+ uint8_t b_c_order, float beta_inp, uint32_t num_threads);
+
+void sgemm_skinny1_arowmajor_n48_a7x_omp(const float * __restrict__ A,
+ const float * __restrict__ B, float * __restrict__ C,
+ uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,
+ uint8_t b_c_order, float beta_inp, uint32_t num_threads);
+
+void sgemm_skinny1_arowmajor_n49_a7x_omp(const float * __restrict__ A,
+ const float * __restrict__ B, float * __restrict__ C,
+ uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,
+ uint8_t b_c_order, float beta_inp, uint32_t num_threads);
+
+void sgemm_skinny1_arowmajor_n50_a7x_omp(const float * __restrict__ A,
+ const float * __restrict__ B, float * __restrict__ C,
+ uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,
+ uint8_t b_c_order, float beta_inp, uint32_t num_threads);
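+
+/* For illustration only: a minimal call sketch for the N=8 serial driver
+ * declared above (the sizes here are hypothetical). It computes
+ * C = A * B + beta * C with A (MxK) stored row-major; bit 0 of b_c_order
+ * selects row-major storage for B and bit 1 selects row-major storage for
+ * C, so b_c_order = 3 makes both B (Kx8) and C (Mx8) row-major:
+ *
+ *   float A[64 * 128], B[128 * 8], C[64 * 8] = {0};
+ *   // ... fill A and B ...
+ *   sgemm_skinny1_arowmajor_n8_a7x(A, B, C,
+ *       64, 128,    // M, K
+ *       128, 8, 8,  // LDA, LDB, LDC (row strides here)
+ *       3, 0.0f);   // b_c_order, beta_inp
+ */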
diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.h b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.h
new file mode 100644
index 0000000..55778b1
--- /dev/null
+++ b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.h
@@ -0,0 +1,108 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+
+/* In the calculation, the content of the skinny matrix B is read multiple
+ * times. We rearrange its elements so that the reads are sequential and
+ * contiguous. Such a rearrangement is called packing.
+ */
+
+/* There are 5 packing types used for skinny matrix B */
+/* type_0: row-major contiguous pattern */
+/* type_1: partitioned in 4-row chunks, row-major bulk + col-major edge */
+/* type_2: partitioned in 2-row chunks, row-major bulk + col-major edge */
+/* type_3: partitioned in 2-row chunks, col-major in each chunk */
+/* type_4: partitioned in 2-row chunks, x-type interleave like shoelaces */
+
+/* The selection of packing type depends on CPU architecture and problem size */
+/* cortex-a35: type_3 when N < 10, type_0 for even N, type_4 for odd N */
+/* cortex-a53: type_1 when N < 15, type_2 when 14 < N < 23, type_0 for big N */
+/* cortex-a55: the same as cortex-a53 */
+/* cortex-a76 & cortex-a72: always type_1 */
+
+/* Example 1 */
+/* source matrix B (4x5):
+ * a b c d e
+ * f g h i j
+ * k l m n o
+ * p q r s t */
+/* pack results to b_scr[] */
+/* type_0 pack: abcdefghijklmnopqrst */
+/* type_1 pack: abcdfghiklmnpqrsejot */
+/* type_2 pack: abcdfghiejklmnpqrsot */
+/* type_3 pack: afbgchdiejkplqmrnsot */
+/* type_4 pack: agciejfbhdkqmsotplrn */
+
+/* Example 2 */
+/* source matrix B (6x6):
+ * 11-12-13-14-15-16
+ * 21-22-23-24-25-26
+ * 31-32-33-34-35-36
+ * 41-42-43-44-45-46
+ * 51-52-53-54-55-56
+ * 61-62-63-64-65-66 */
+/* type_0 pack: 11-12-13-14-15-16-21-22-23-24-25-26-31-32-33-34-
+ * 35-36-41-42-43-44-45-46-51-52-53-54-55-56-61-62-63-64-65-66 */
+/* type_1 pack: 11-12-13-14-21-22-23-24-31-32-33-34-41-42-43-44-
+ * 15-25-35-45-16-26-36-46-51-52-53-54-55-56-61-62-63-64-65-66 */
+/* type_2 pack: 11-12-13-14-21-22-23-24-15-25-16-26-31-32-33-34-
+ * 41-42-43-44-35-45-36-46-51-52-53-54-61-62-63-64-55-65-56-66 */
+/* type_3 pack: 11-21-12-22-13-23-14-24-15-25-16-26-31-41-32-42-
+ * 33-43-34-44-35-45-36-46-51-61-52-62-53-63-54-64-55-65-56-66 */
+/* type_4 pack: 11-22-13-24-15-26-21-12-23-14-25-16-31-42-33-44-
+ * 35-46-41-32-43-34-45-36-51-62-53-64-55-66-61-52-63-54-65-56 */
+
+/* type_0 pack from col-major B */
+void pack_0_from_cm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
+
+/* type_1 pack from col-major B */
+void pack_1_from_cm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
+
+/* type_2 pack from col-major B */
+void pack_2_from_cm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
+
+/* type_3 pack from col-major B */
+void pack_3_from_cm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
+
+/* type_4 pack from col-major B */
+void pack_4_from_cm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
+
+/* type_0 pack from row-major B */
+void pack_0_from_rm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
+
+/* type_1 pack from row-major B */
+void pack_1_from_rm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
+
+/* type_2 pack from row-major B */
+void pack_2_from_rm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
+
+/* type_3 pack from row-major B */
+void pack_3_from_rm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
+
+/* type_4 pack from row-major B */
+void pack_4_from_rm(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N);
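+
+/* For illustration only: a scalar reference of the type_0 pack from a
+ * col-major source. pack_0_from_cm_ref is a hypothetical helper, not one
+ * of the optimized routines declared above. Element B(k, n) of the
+ * col-major K x N block is B[n * LDB + k]; the packed output is the same
+ * block stored row-major and contiguous, as in the type_0 examples. */
+static inline void pack_0_from_cm_ref(float * __restrict__ b_scr,
+ const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) {
+  uint32_t k, n;
+  for (k = 0; k < K; ++k) {    /* one output row per source row k */
+    for (n = 0; n < N; ++n) {  /* N contiguous elements per row */
+      b_scr[k * N + n] = B[n * LDB + k];
+    }
+  }
+}
+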
diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotDriver.h b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotDriver.h
new file mode 100644
index 0000000..183ee01
--- /dev/null
+++ b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotDriver.h
@@ -0,0 +1,485 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+#include "arm_neon/ARMCompareAndSwap.h"
+#include "common/CommonSched.h"
+#ifndef EMLL_SERIAL_ONLY
+#include <omp.h>
+#endif
+
+#ifndef INCLUDE_SKINNY1_DRIVER
+#define INCLUDE_SKINNY1_DRIVER
+
+#define DRIVER_PURE_PACK_SERIAL(cpu, ndim, K_BATCH, pack_type, unroll_m) \
+void sgemm_skinny1_arowmajor_n##ndim##_##cpu(const float * __restrict__ A,\
+ const float * __restrict__ B, float * __restrict__ C,\
+ uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,\
+ uint8_t b_c_order, float beta_inp) {\
+\
+ const uint8_t b_rowmajor = b_c_order & 1;\
+ const uint8_t c_rowmajor = b_c_order & 2;\
+\
+ __attribute__((aligned(4096))) float b_scr[ndim * K_BATCH];\
+\
+ uint32_t k_pos, k_inc;\
+ for (k_pos = 0; k_pos < K; k_pos += k_inc) {\
+ k_inc = K - k_pos;\
+ if (k_inc >= K_BATCH * 2) k_inc = K_BATCH;\
+ else if (k_inc > K_BATCH) k_inc >>= 1;\
+ if (b_rowmajor == 0) {\
+ pack_##pack_type##_from_cm(b_scr, B + k_pos, LDB, k_inc, ndim);\
+ } else {\
+ pack_##pack_type##_from_rm(b_scr, B + k_pos * LDB, LDB, k_inc, ndim);\
+ }\
+ uint32_t m_pos = M;\
+ const float *a_ptr = A + k_pos;\
+ float *c_ptr = C;\
+ const uint32_t c_incr = (c_rowmajor == 0) ? 1 : LDC;\
+ const float beta = (k_pos == 0) ? beta_inp : 1.0f;\
+ for (; m_pos >= unroll_m; m_pos -= unroll_m) {\
+ sgemm_skinny1_##cpu##_m##unroll_m##n##ndim(a_ptr, b_scr, c_ptr,\
+ k_inc, LDA, LDC, c_rowmajor, &beta);\
+ a_ptr += LDA * unroll_m;\
+ c_ptr += c_incr * unroll_m;\
+ }\
+ for (; m_pos > 0; m_pos--) {\
+ sgemm_skinny1_##cpu##_m1n##ndim(a_ptr, b_scr, c_ptr, k_inc, LDC,\
+ c_rowmajor, beta);\
+ a_ptr += LDA;\
+ c_ptr += c_incr;\
+ }\
+ }\
+}
+
+#define DRIVER_PURE_PACK_OMP(cpu, ndim, K_BATCH, pack_type, unroll_m) \
+void sgemm_skinny1_arowmajor_n##ndim##_##cpu##_omp(\
+ const float * __restrict__ A,\
+ const float * __restrict__ B, float * __restrict__ C,\
+ uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,\
+ uint8_t b_c_order, float beta_inp, uint32_t num_threads) {\
+\
+ if (num_threads <= 1) {\
+ sgemm_skinny1_arowmajor_n##ndim##_##cpu(A, B, C, M, K,\
+ LDA, LDB, LDC, b_c_order, beta_inp);\
+ return;\
+ }\
+ omp_set_num_threads(num_threads);\
+\
+ const uint8_t b_rowmajor = b_c_order & 1;\
+ const uint8_t c_rowmajor = b_c_order & 2;\
+ const uint32_t c_m_inc = (c_rowmajor == 0) ?
1 : LDC;\ +\ + __attribute__((aligned(4096))) float b_scr[ndim * K_BATCH];\ +\ + uint32_t k_pos, k_inc;\ + for (k_pos = 0; k_pos < K; k_pos += k_inc) {\ + k_inc = K - k_pos;\ + if (k_inc >= K_BATCH * 2) k_inc = K_BATCH;\ + else if (k_inc > K_BATCH) k_inc >>= 1;\ + const float beta = (k_pos == 0) ? beta_inp : 1.0f;\ +\ + uint32_t k_copy_left = k_inc;\ + uint32_t m_calc_done = 0;\ + _Pragma("omp parallel")\ + {\ + uint32_t k_copy_start, k_copy_end;\ + while(get_copy_task(&k_copy_left, 64, &k_copy_start, &k_copy_end)) {\ + if (b_rowmajor == 0) {\ + pack_##pack_type##_from_cm(b_scr + k_copy_start * ndim,\ + B + k_pos + k_copy_start, LDB,\ + k_copy_end - k_copy_start, ndim);\ + } else {\ + pack_##pack_type##_from_rm(b_scr + k_copy_start * ndim,\ + B + (k_pos + k_copy_start) * LDB, LDB,\ + k_copy_end - k_copy_start, ndim);\ + }\ + }\ + _Pragma("omp barrier")\ + uint32_t m_calc_start, m_calc_end;\ + while(get_irreg_task(&m_calc_done, &m_calc_start, &m_calc_end,\ + unroll_m << 2, M)) {\ + const float *a_ptr = A + m_calc_start * LDA + k_pos;\ + float *c_ptr = C + m_calc_start * c_m_inc;\ + uint32_t sub_m_left = m_calc_end - m_calc_start;\ + for (; sub_m_left >= unroll_m; sub_m_left -= unroll_m) {\ + sgemm_skinny1_##cpu##_m##unroll_m##n##ndim(a_ptr, b_scr, c_ptr,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + a_ptr += LDA * unroll_m;\ + c_ptr += c_m_inc * unroll_m;\ + }\ + for (; sub_m_left > 0; sub_m_left--) {\ + sgemm_skinny1_##cpu##_m1n##ndim(a_ptr, b_scr, c_ptr, k_inc, LDC,\ + c_rowmajor, beta);\ + a_ptr += LDA;\ + c_ptr += c_m_inc;\ + }\ + }\ + }\ + }\ +} + +#define DRIVER_MIX2_PACK_SERIAL(cpu, ndim, K_BATCH, pack1, pack2, n_pack1, n_pack2, unroll_m) \ +void sgemm_skinny1_arowmajor_n##ndim##_##cpu(const float * __restrict__ A,\ + const float * __restrict__ B, float * __restrict__ C,\ + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,\ + uint8_t b_c_order, float beta_inp) {\ +\ + const uint8_t b_rowmajor = b_c_order & 1;\ + const uint8_t c_rowmajor = b_c_order & 2;\ +\ + __attribute__((aligned(4096))) float b_scr[ndim * K_BATCH];\ + float * const b_scr2 = b_scr + n_pack1 * K_BATCH;\ +\ + uint32_t k_pos, k_inc;\ + for (k_pos = 0; k_pos < K; k_pos += k_inc) {\ + k_inc = K - k_pos;\ + if (k_inc >= K_BATCH * 2) k_inc = K_BATCH;\ + else if (k_inc > K_BATCH) k_inc >>= 1;\ + if (b_rowmajor == 0) {\ + pack_##pack1##_from_cm(b_scr, B + k_pos, LDB, k_inc, n_pack1);\ + pack_##pack2##_from_cm(b_scr2, B + k_pos + n_pack1 * LDB,\ + LDB, k_inc, n_pack2);\ + } else {\ + pack_##pack1##_from_rm(b_scr, B + k_pos * LDB, LDB, k_inc, n_pack1);\ + pack_##pack2##_from_rm(b_scr2, B + k_pos * LDB + n_pack1,\ + LDB, k_inc, n_pack2);\ + }\ + uint32_t m_pos = M;\ + const float *a_ptr = A + k_pos;\ + float *c_ptr1 = C;\ + float *c_ptr2 = (c_rowmajor == 0) ? C + n_pack1 * LDC : C + n_pack1;\ + const uint32_t c_incr = (c_rowmajor == 0) ? 1 : LDC;\ + const float beta = (k_pos == 0) ? 
beta_inp : 1.0f;\ + for (; m_pos >= unroll_m; m_pos -= unroll_m) {\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack1(a_ptr, b_scr, c_ptr1,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack2(a_ptr, b_scr2, c_ptr2,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + a_ptr += LDA * unroll_m;\ + c_ptr1 += c_incr * unroll_m;\ + c_ptr2 += c_incr * unroll_m;\ + }\ + for (; m_pos > 0; m_pos--) {\ + sgemm_skinny1_##cpu##_m1n##n_pack1(a_ptr, b_scr, c_ptr1, k_inc, LDC,\ + c_rowmajor, beta);\ + sgemm_skinny1_##cpu##_m1n##n_pack2(a_ptr, b_scr2, c_ptr2, k_inc, LDC,\ + c_rowmajor, beta);\ + a_ptr += LDA;\ + c_ptr1 += c_incr;\ + c_ptr2 += c_incr;\ + }\ + }\ +} + +#define DRIVER_MIX2_PACK_OMP(cpu, ndim, K_BATCH, pack1, pack2, n_pack1, n_pack2, unroll_m) \ +void sgemm_skinny1_arowmajor_n##ndim##_##cpu##_omp(\ + const float * __restrict__ A,\ + const float * __restrict__ B, float * __restrict__ C,\ + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,\ + uint8_t b_c_order, float beta_inp, uint32_t num_threads) {\ +\ + if (num_threads <= 1) {\ + sgemm_skinny1_arowmajor_n##ndim##_##cpu(A, B, C, M, K,\ + LDA, LDB, LDC, b_c_order, beta_inp);\ + return;\ + }\ +\ + const uint8_t b_rowmajor = b_c_order & 1;\ + const uint8_t c_rowmajor = b_c_order & 2;\ + const uint32_t c_m_inc = (c_rowmajor == 0) ? 1 : LDC;\ +\ + __attribute__((aligned(4096))) float b_scr[ndim * K_BATCH];\ + float * const b_scr2 = b_scr + n_pack1 * K_BATCH;\ +\ + uint32_t k_pos, k_inc;\ + for (k_pos = 0; k_pos < K; k_pos += k_inc) {\ + k_inc = K - k_pos;\ + if (k_inc >= K_BATCH * 2) k_inc = K_BATCH;\ + else if (k_inc > K_BATCH) k_inc >>= 1;\ + const float beta = (k_pos == 0) ? beta_inp : 1.0f;\ +\ + uint32_t k_copy_left = k_inc;\ + uint32_t m_calc_done = 0;\ + _Pragma("omp parallel")\ + {\ + uint32_t k_copy_start, k_copy_end;\ + while(get_copy_task(&k_copy_left, 64, &k_copy_start, &k_copy_end)) {\ + if (b_rowmajor == 0) {\ + pack_##pack1##_from_cm(b_scr + k_copy_start * n_pack1,\ + B + (k_pos + k_copy_start), LDB,\ + k_copy_end - k_copy_start, n_pack1);\ + pack_##pack2##_from_cm(b_scr2 + k_copy_start * n_pack2,\ + B + (k_pos + k_copy_start) + n_pack1 * LDB, LDB,\ + k_copy_end - k_copy_start, n_pack2);\ + } else {\ + pack_##pack1##_from_rm(b_scr + k_copy_start * n_pack1,\ + B + (k_pos + k_copy_start) * LDB, LDB,\ + k_copy_end - k_copy_start, n_pack1);\ + pack_##pack2##_from_rm(b_scr2 + k_copy_start * n_pack2,\ + B + (k_pos + k_copy_start) * LDB + n_pack1, LDB,\ + k_copy_end - k_copy_start, n_pack2);\ + }\ + }\ + _Pragma("omp barrier")\ + uint32_t m_calc_start, m_calc_end;\ + while(get_irreg_task(&m_calc_done, &m_calc_start, &m_calc_end,\ + unroll_m << 2, M)) {\ + const float *a_ptr = A + m_calc_start * LDA + k_pos;\ + float *c_ptr1 = C + m_calc_start * c_m_inc;\ + float *c_ptr2 = (c_rowmajor == 0) ?\ + c_ptr1 + n_pack1 * LDC : c_ptr1 + n_pack1;\ + uint32_t sub_m_left = m_calc_end - m_calc_start;\ + for (; sub_m_left >= unroll_m; sub_m_left -= unroll_m) {\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack1(a_ptr, b_scr, c_ptr1,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack2(a_ptr, b_scr2, c_ptr2,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + a_ptr += LDA * unroll_m;\ + c_ptr1 += c_m_inc * unroll_m;\ + c_ptr2 += c_m_inc * unroll_m;\ + }\ + for (; sub_m_left > 0; sub_m_left--) {\ + sgemm_skinny1_##cpu##_m1n##n_pack1(a_ptr, b_scr, c_ptr1, k_inc, LDC,\ + c_rowmajor, beta);\ + sgemm_skinny1_##cpu##_m1n##n_pack2(a_ptr, b_scr2, c_ptr2, k_inc, LDC,\ + c_rowmajor, beta);\ + a_ptr += 
LDA;\ + c_ptr1 += c_m_inc;\ + c_ptr2 += c_m_inc;\ + }\ + }\ + }\ + }\ +} + +#define DRIVER_MIX3_PACK_SERIAL(cpu, ndim, K_BATCH, pack1, pack2, pack3, n_pack1, n_pack2, n_pack3, unroll_m) \ +void sgemm_skinny1_arowmajor_n##ndim##_##cpu(const float * __restrict__ A,\ + const float * __restrict__ B, float * __restrict__ C,\ + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,\ + uint8_t b_c_order, float beta_inp) {\ +\ + const uint8_t b_rowmajor = b_c_order & 1;\ + const uint8_t c_rowmajor = b_c_order & 2;\ +\ + __attribute__((aligned(4096))) float b_scr[ndim * K_BATCH];\ + float * const b_scr2 = b_scr + n_pack1 * K_BATCH;\ + float * const b_scr3 = b_scr2 + n_pack2 * K_BATCH;\ +\ + uint32_t k_pos, k_inc;\ + for (k_pos = 0; k_pos < K; k_pos += k_inc) {\ + k_inc = K - k_pos;\ + if (k_inc >= K_BATCH * 2) k_inc = K_BATCH;\ + else if (k_inc > K_BATCH) k_inc >>= 1;\ + if (b_rowmajor == 0) {\ + pack_##pack1##_from_cm(b_scr, B + k_pos, LDB, k_inc, n_pack1);\ + pack_##pack2##_from_cm(b_scr2, B + k_pos + n_pack1 * LDB,\ + LDB, k_inc, n_pack2);\ + pack_##pack3##_from_cm(b_scr3, B + k_pos + (n_pack1 + n_pack2) * LDB,\ + LDB, k_inc, n_pack3);\ + } else {\ + pack_##pack1##_from_rm(b_scr, B + k_pos * LDB, LDB, k_inc, n_pack1);\ + pack_##pack2##_from_rm(b_scr2, B + k_pos * LDB + n_pack1,\ + LDB, k_inc, n_pack2);\ + pack_##pack3##_from_rm(b_scr3, B + k_pos * LDB + n_pack1 + n_pack2,\ + LDB, k_inc, n_pack3);\ + }\ + uint32_t m_pos = M;\ + const float *a_ptr = A + k_pos;\ + float *c_ptr1 = C;\ + float *c_ptr2 = (c_rowmajor == 0) ? C + n_pack1 * LDC : C + n_pack1;\ + float *c_ptr3 = (c_rowmajor == 0) ? C + (n_pack1 + n_pack2) * LDC :\ + C + n_pack1 + n_pack2;\ + const uint32_t c_incr = (c_rowmajor == 0) ? 1 : LDC;\ + const float beta = (k_pos == 0) ? beta_inp : 1.0f;\ + for (; m_pos >= unroll_m; m_pos -= unroll_m) {\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack1(a_ptr, b_scr, c_ptr1,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack2(a_ptr, b_scr2, c_ptr2,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack3(a_ptr, b_scr3, c_ptr3,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + a_ptr += LDA * unroll_m;\ + c_ptr1 += c_incr * unroll_m;\ + c_ptr2 += c_incr * unroll_m;\ + c_ptr3 += c_incr * unroll_m;\ + }\ + for (; m_pos > 0; m_pos--) {\ + sgemm_skinny1_##cpu##_m1n##n_pack1(a_ptr, b_scr, c_ptr1, k_inc, LDC,\ + c_rowmajor, beta);\ + sgemm_skinny1_##cpu##_m1n##n_pack2(a_ptr, b_scr2, c_ptr2, k_inc, LDC,\ + c_rowmajor, beta);\ + sgemm_skinny1_##cpu##_m1n##n_pack3(a_ptr, b_scr3, c_ptr3, k_inc, LDC,\ + c_rowmajor, beta);\ + a_ptr += LDA;\ + c_ptr1 += c_incr;\ + c_ptr2 += c_incr;\ + c_ptr3 += c_incr;\ + }\ + }\ +} + +#define DRIVER_MIX3_PACK_OMP(cpu, ndim, K_BATCH, pack1, pack2, pack3, n_pack1, n_pack2, n_pack3, unroll_m) \ +void sgemm_skinny1_arowmajor_n##ndim##_##cpu##_omp(\ + const float * __restrict__ A,\ + const float * __restrict__ B, float * __restrict__ C,\ + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,\ + uint8_t b_c_order, float beta_inp, uint32_t num_threads) {\ +\ + if (num_threads <= 1) {\ + sgemm_skinny1_arowmajor_n##ndim##_##cpu(A, B, C, M, K,\ + LDA, LDB, LDC, b_c_order, beta_inp);\ + return;\ + }\ +\ + const uint8_t b_rowmajor = b_c_order & 1;\ + const uint8_t c_rowmajor = b_c_order & 2;\ + const uint32_t c_m_inc = (c_rowmajor == 0) ? 
1 : LDC;\ +\ + __attribute__((aligned(4096))) float b_scr[ndim * K_BATCH];\ + float * const b_scr2 = b_scr + n_pack1 * K_BATCH;\ + float * const b_scr3 = b_scr2 + n_pack2 * K_BATCH;\ +\ + uint32_t k_pos, k_inc;\ + for (k_pos = 0; k_pos < K; k_pos += k_inc) {\ + k_inc = K - k_pos;\ + if (k_inc >= K_BATCH * 2) k_inc = K_BATCH;\ + else if (k_inc > K_BATCH) k_inc >>= 1;\ + const float beta = (k_pos == 0) ? beta_inp : 1.0f;\ +\ + uint32_t k_copy_left = k_inc;\ + uint32_t m_calc_done = 0;\ + _Pragma("omp parallel")\ + {\ + uint32_t k_copy_start, k_copy_end;\ + while(get_copy_task(&k_copy_left, 64, &k_copy_start, &k_copy_end)) {\ + if (b_rowmajor == 0) {\ + pack_##pack1##_from_cm(b_scr + k_copy_start * n_pack1,\ + B + (k_pos + k_copy_start), LDB,\ + k_copy_end - k_copy_start, n_pack1);\ + pack_##pack2##_from_cm(b_scr2 + k_copy_start * n_pack2,\ + B + (k_pos + k_copy_start) + n_pack1 * LDB, LDB,\ + k_copy_end - k_copy_start, n_pack2);\ + pack_##pack3##_from_cm(b_scr3 + k_copy_start * n_pack3,\ + B + (k_pos + k_copy_start) + (n_pack1 + n_pack2) * LDB, LDB,\ + k_copy_end - k_copy_start, n_pack3);\ + } else {\ + pack_##pack1##_from_rm(b_scr + k_copy_start * n_pack1,\ + B + (k_pos + k_copy_start) * LDB, LDB,\ + k_copy_end - k_copy_start, n_pack1);\ + pack_##pack2##_from_rm(b_scr2 + k_copy_start * n_pack2,\ + B + (k_pos + k_copy_start) * LDB + n_pack1, LDB,\ + k_copy_end - k_copy_start, n_pack2);\ + pack_##pack3##_from_rm(b_scr3 + k_copy_start * n_pack3,\ + B + (k_pos + k_copy_start) * LDB + n_pack1 + n_pack2, LDB,\ + k_copy_end - k_copy_start, n_pack3);\ + }\ + }\ + _Pragma("omp barrier")\ + uint32_t m_calc_start, m_calc_end;\ + while(get_irreg_task(&m_calc_done, &m_calc_start, &m_calc_end,\ + unroll_m << 2, M)) {\ + const float *a_ptr = A + m_calc_start * LDA + k_pos;\ + float *c_ptr1 = C + m_calc_start * c_m_inc;\ + float *c_ptr2 = (c_rowmajor == 0) ?\ + c_ptr1 + n_pack1 * LDC : c_ptr1 + n_pack1;\ + float *c_ptr3 = (c_rowmajor == 0) ?\ + c_ptr1 + (n_pack1 + n_pack2) * LDC : c_ptr1 + n_pack1 + n_pack2;\ + uint32_t sub_m_left = m_calc_end - m_calc_start;\ + for (; sub_m_left >= unroll_m; sub_m_left -= unroll_m) {\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack1(a_ptr, b_scr, c_ptr1,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack2(a_ptr, b_scr2, c_ptr2,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + sgemm_skinny1_##cpu##_m##unroll_m##n##n_pack3(a_ptr, b_scr3, c_ptr3,\ + k_inc, LDA, LDC, c_rowmajor, &beta);\ + a_ptr += LDA * unroll_m;\ + c_ptr1 += c_m_inc * unroll_m;\ + c_ptr2 += c_m_inc * unroll_m;\ + c_ptr3 += c_m_inc * unroll_m;\ + }\ + for (; sub_m_left > 0; sub_m_left--) {\ + sgemm_skinny1_##cpu##_m1n##n_pack1(a_ptr, b_scr, c_ptr1, k_inc, LDC,\ + c_rowmajor, beta);\ + sgemm_skinny1_##cpu##_m1n##n_pack2(a_ptr, b_scr2, c_ptr2, k_inc, LDC,\ + c_rowmajor, beta);\ + sgemm_skinny1_##cpu##_m1n##n_pack3(a_ptr, b_scr3, c_ptr3, k_inc, LDC,\ + c_rowmajor, beta);\ + a_ptr += LDA;\ + c_ptr1 += c_m_inc;\ + c_ptr2 += c_m_inc;\ + c_ptr3 += c_m_inc;\ + }\ + }\ + }\ + }\ +} + +#ifdef EMLL_SERIAL_ONLY + +#define DRIVER_PURE_PACK(cpu, ndim, K_BATCH, pack_type, unroll_m) \ + DRIVER_PURE_PACK_SERIAL(cpu, ndim, K_BATCH, pack_type, unroll_m)\ +void sgemm_skinny1_arowmajor_n##ndim##_##cpu##_omp(\ + const float * __restrict__ A,\ + const float * __restrict__ B, float * __restrict__ C,\ + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,\ + uint8_t b_c_order, float beta_inp, uint32_t num_threads) {\ +\ + sgemm_skinny1_arowmajor_n##ndim##_##cpu(A, B, C, M, K,\ + LDA, LDB, 
LDC, b_c_order, beta_inp);\ +} + +#define DRIVER_MIX2_PACK(cpu, ndim, K_BATCH, pack1, pack2, n_pack1, n_pack2, unroll_m) \ + DRIVER_MIX2_PACK_SERIAL(cpu, ndim, K_BATCH, pack1, pack2, n_pack1, n_pack2, unroll_m)\ +void sgemm_skinny1_arowmajor_n##ndim##_##cpu##_omp(\ + const float * __restrict__ A,\ + const float * __restrict__ B, float * __restrict__ C,\ + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,\ + uint8_t b_c_order, float beta_inp, uint32_t num_threads) {\ +\ + sgemm_skinny1_arowmajor_n##ndim##_##cpu(A, B, C, M, K,\ + LDA, LDB, LDC, b_c_order, beta_inp);\ +} + +#define DRIVER_MIX3_PACK(cpu, ndim, K_BATCH, pack1, pack2, pack3, n_pack1, n_pack2, n_pack3, unroll_m) \ + DRIVER_MIX3_PACK_SERIAL(cpu, ndim, K_BATCH, pack1, pack2, pack3, n_pack1, n_pack2, n_pack3, unroll_m)\ +void sgemm_skinny1_arowmajor_n##ndim##_##cpu##_omp(\ + const float * __restrict__ A,\ + const float * __restrict__ B, float * __restrict__ C,\ + uint32_t M, uint32_t K, uint32_t LDA, uint32_t LDB, uint32_t LDC,\ + uint8_t b_c_order, float beta_inp, uint32_t num_threads) {\ +\ + sgemm_skinny1_arowmajor_n##ndim##_##cpu(A, B, C, M, K,\ + LDA, LDB, LDC, b_c_order, beta_inp);\ +} + +#else + +#define DRIVER_PURE_PACK(cpu, ndim, K_BATCH, pack_type, unroll_m) \ + DRIVER_PURE_PACK_SERIAL(cpu, ndim, K_BATCH, pack_type, unroll_m)\ + DRIVER_PURE_PACK_OMP(cpu, ndim, K_BATCH, pack_type, unroll_m) + +#define DRIVER_MIX2_PACK(cpu, ndim, K_BATCH, pack1, pack2, n_pack1, n_pack2, unroll_m) \ + DRIVER_MIX2_PACK_SERIAL(cpu, ndim, K_BATCH, pack1, pack2, n_pack1, n_pack2, unroll_m)\ + DRIVER_MIX2_PACK_OMP(cpu, ndim, K_BATCH, pack1, pack2, n_pack1, n_pack2, unroll_m) + +#define DRIVER_MIX3_PACK(cpu, ndim, K_BATCH, pack1, pack2, pack3, n_pack1, n_pack2, n_pack3, unroll_m) \ + DRIVER_MIX3_PACK_SERIAL(cpu, ndim, K_BATCH, pack1, pack2, pack3, n_pack1, n_pack2, n_pack3, unroll_m)\ + DRIVER_MIX3_PACK_OMP(cpu, ndim, K_BATCH, pack1, pack2, pack3, n_pack1, n_pack2, n_pack3, unroll_m) + +#endif +#endif + diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA35.h b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA35.h new file mode 100644 index 0000000..5cb2de5 --- /dev/null +++ b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA35.h @@ -0,0 +1,2439 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/
+/*****************************************************************************/
+
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#ifndef INCLUDE_A35_KERNEL
+#define INCLUDE_A35_KERNEL
+
+/* for cortex-a35, fp32 NEON operations on q regs are not recommended;
+ * using d regs without broadcast is better */
+
+/* in a cortex-a35 fp32 fma instruction sequence,
+ * it's recommended to put 3 neighboring fma instructions together */
+#define FMA_3V(c1, c2, c3, a1, a2, a3, b1, b2, b3) \
+ "fmla v"#c1".2s,v"#a1".2s,v"#b1".2s\n\t"\
+ "fmla v"#c2".2s,v"#a2".2s,v"#b2".2s\n\t"\
+ "fmla v"#c3".2s,v"#a3".2s,v"#b3".2s\n\t"
+
+#define INIT_3V(c1, c2, c3) \
+ "movi v"#c1".8b,#0; movi v"#c2".8b,#0; movi v"#c3".8b,#0\n\t"
+
+#define INIT_4V(c1, c2, c3, c4) INIT_3V(c1, c2, c3)\
+ "movi v"#c4".8b,#0\n\t"
+
+/* x12 - x15 for c_tmp pointers */
+/* v0 always holds beta during the store stage */
+
+#define INIT_SAVE_M3_CR \
+ "ld1r {v0.2s},[%[beta_addr]]\n\t"\
+ "mov x12,%[c_ptr]; add x13,%[c_ptr],%w[LDC],UXTW #2\n\t"\
+ "add x14,%[c_ptr],%w[LDC],UXTW #3\n\t"
+
+#define INIT_SAVE_M4_CR INIT_SAVE_M3_CR \
+ "add x15,x13,%w[LDC],UXTW #3\n\t"
+
+#define INIT_SAVE_CC \
+ "ld1r {v0.2s},[%[beta_addr]]; mov x12,%[c_ptr]\n\t"\
+ "add x13,%[c_ptr],%w[LDC],UXTW #2\n\t"
+
+/* c1[0], c1[1] */
+/* c2[0], c2[1] */
+/* c3[0], c3[1] */
+/* c4[0], c4[1] */
+/* clobber: x12 - x13, v0 - v4 */
+#define UNIT_SAVE_M4N2_CC(c1, c2, c3, c4) \
+ "ldr d1,[x12]; ldr d2,[x12,#8]\n\t"\
+ "trn1 v3.2s,v"#c1".2s,v"#c2".2s; trn1 v4.2s,v"#c3".2s,v"#c4".2s\n\t"\
+ "trn2 v"#c2".2s,v"#c1".2s,v"#c2".2s; trn2 v"#c4".2s,v"#c3".2s,v"#c4".2s\n\t"\
+ "ldr d"#c1",[x13]; ldr d"#c3",[x13,#8]\n\t"\
+ "fmla v3.2s,v1.2s,v0.2s\n\t"\
+ "fmla v4.2s,v2.2s,v0.2s\n\t"\
+ "fmla v"#c2".2s,v"#c1".2s,v0.2s\n\t"\
+ "fmla v"#c4".2s,v"#c3".2s,v0.2s\n\t"\
+ "str d3,[x12]; str d4,[x12,#8]\n\t"\
+ "prfm pstl2keep,[x12,#32]; add x12,x12,%w[LDC],UXTW #3\n\t"\
+ "str d"#c2",[x13]; str d"#c4",[x13,#8]\n\t"\
+ "prfm pstl2keep,[x13,#32]; add x13,x13,%w[LDC],UXTW #3\n\t"
+
+/* clobber: x12 - x15, v0 - v4 */
+#define UNIT_SAVE_M4N2_CR(c1, c2, c3, c4) \
+ "ldr d1,[x12]; ldr d2,[x13]\n\t"\
+ "ldr d3,[x14]; ldr d4,[x15]\n\t"\
+ "fmla v"#c1".2s,v1.2s,v0.2s\n\t"\
+ "fmla v"#c2".2s,v2.2s,v0.2s\n\t"\
+ "fmla v"#c3".2s,v3.2s,v0.2s\n\t"\
+ "fmla v"#c4".2s,v4.2s,v0.2s\n\t"\
+ "str d"#c1",[x12],#8; str d"#c2",[x13],#8\n\t"\
+ "str d"#c3",[x14],#8; str d"#c4",[x15],#8\n\t"
+
+/* c1[0], c1[1] */
+/* c2[0], c2[1] */
+/* c3[0], c3[1] */
+/* clobber: x12 - x13, v0 - v3 */
+#define UNIT_SAVE_M3N2_CC(c1, c2, c3) \
+ "ldr d1,[x12]\n\t"\
+ "trn1 v2.2s,v"#c1".2s,v"#c2".2s\n\t"\
+ "trn2 v"#c2".2s,v"#c1".2s,v"#c2".2s\n\t"\
+ "ldr d"#c1",[x13]; ldr s3,[x12,#8]\n\t"\
+ "fmla v2.2s,v1.2s,v0.2s\n\t"\
+ "fmla v"#c2".2s,v"#c1".2s,v0.2s\n\t"\
+ "ldr s1,[x13,#8]\n\t"\
+ "str d2,[x12]; ins v2.s[0],v"#c3".s[1]\n\t"\
+ "str d"#c2",[x13]\n\t"\
+ "fmla s"#c3",s3,v0.s[0]; fmla s2,s1,v0.s[0]\n\t"\
+ "str s"#c3",[x12,#8]; prfm pstl2keep,[x12,#24]\n\t"\
+ "add x12,x12,%w[LDC],UXTW #3\n\t"\
+ "str s2,[x13,#8]; prfm pstl2keep,[x13,#24]\n\t"\
+ "add x13,x13,%w[LDC],UXTW #3\n\t"
+
+/* clobber: x12 - x14, v0 - v3 */
+#define UNIT_SAVE_M3N2_CR(c1, c2, c3) \
+ "ldr d1,[x12]; ldr d2,[x13]\n\t"\
+ "ldr d3,[x14]\n\t"\
+ "fmla v"#c1".2s,v1.2s,v0.2s\n\t"\
+ "fmla v"#c2".2s,v2.2s,v0.2s\n\t"\
+ "fmla v"#c3".2s,v3.2s,v0.2s\n\t"\
+ "str d"#c1",[x12],#8; str d"#c2",[x13],#8\n\t"\
+ "str d"#c3",[x14],#8\n\t"
+
+/* c1[0] + c1[1] */
+/* c2[0] + c2[1] */
+/* c3[0] + c3[1] */
+/* c4[0] + c4[1] */
+/* clobber: x12, v0 - v4 */
+#define UNIT_SAVE_M4N1_CC(c1, c2, c3, c4) \
+ "ldr d3,[x12]; 
ldr d4,[x12,#8]\n\t"\ + "faddp v1.2s,v"#c1".2s,v"#c2".2s\n\t"\ + "faddp v2.2s,v"#c3".2s,v"#c4".2s\n\t"\ + "fmla v1.2s,v3.2s,v0.2s\n\t"\ + "fmla v2.2s,v4.2s,v0.2s\n\t"\ + "str d1,[x12]; str d2,[x12,#8]\n\t"\ + "prfm pstl2keep,[x12,#32]\n\t"\ + "add x12,x12,%w[LDC],UXTW #2\n\t" + +/* clobber: x12 - x15, v0 - v4 */ +#define UNIT_SAVE_M4N1_CR(c1, c2, c3, c4) \ + "ldr s1,[x12]; ldr s2,[x13]\n\t"\ + "faddp v"#c1".2s,v"#c1".2s,v"#c3".2s\n\t"\ + "ld1 {v1.s}[1],[x14]; ld1 {v2.s}[1],[x15]\n\t"\ + "faddp v"#c2".2s,v"#c2".2s,v"#c4".2s\n\t"\ + "fmla v"#c1".2s,v1.2s,v0.2s\n\t"\ + "fmla v"#c2".2s,v2.2s,v0.2s\n\t"\ + "str s"#c1",[x12],#4; str s"#c2",[x13],#4\n\t"\ + "st1 {v"#c1".s}[1],[x14],#4; st1 {v"#c2".s}[1],[x15],#4\n\t" + +/* c1[0] + c1[1] */ +/* c2[0] + c2[1] */ +/* c3[0] + c3[1] */ +/* clobber: x12, v0 - v3 */ +#define UNIT_SAVE_M3N1_CC(c1, c2, c3) \ + "ldr d1,[x12]; ldr s2,[x12,#8]\n\t"\ + "faddp v"#c1".2s,v"#c1".2s,v"#c2".2s\n\t"\ + "faddp s"#c3",v"#c3".2s\n\t"\ + "fmla v"#c1".2s,v1.2s,v0.2s\n\t"\ + "fmla s"#c3",s2,v0.s[0]\n\t"\ + "str d"#c1",[x12]; str s"#c3",[x12,#8]\n\t"\ + "prfm pstl2keep,[x12,#24]\n\t"\ + "add x12,x12,%w[LDC],UXTW #2\n\t" + +/* clobber: x12 - x14, v0 - v3 */ +#define UNIT_SAVE_M3N1_CR(c1, c2, c3) \ + "ldr s1,[x12]; ldr s2,[x13]; ldr s3,[x14]\n\t"\ + "faddp s"#c1",v"#c1".2s; faddp s"#c2",v"#c2".2s; faddp s"#c3",v"#c3".2s\n\t"\ + "fmla s"#c1",s1,v0.s[0]\n\t"\ + "fmla s"#c2",s2,v0.s[0]\n\t"\ + "fmla s"#c3",s3,v0.s[0]\n\t"\ + "str s"#c1",[x12],#4\n\t"\ + "str s"#c2",[x13],#4\n\t"\ + "str s"#c3",[x14],#4\n\t" + +/* x0 = a_ptr1 (top) */ +/* x1 = a_ptr2 */ +/* x2 = a_ptr3 */ +/* x3 = a_ptr4 (or pref_head when M == 3) */ +/* x4 = b_ptr */ +/* w5 = k_left */ +/* x8 - x11 for pref head */ +/* x12 - x15 for c_tmp1 - c_tmp4 */ + +/* macro for GEMM with packing pattern NO.#3 */ +/* mdim = 3, 4; ndim = 5 - 8 */ +#define FUNC_PACK3(mdim, ndim) \ +static inline void sgemm_skinny1_a35_m##mdim##n##ndim(\ + const float * __restrict__ a_ptr, const float * __restrict__ b_scr,\ + float * __restrict__ c_ptr, uint32_t K, uint32_t LDA, uint32_t LDC,\ + uint8_t c_rowmajor, const float * __restrict__ beta_addr) {\ + __asm__ __volatile__(\ + "mov x4,%[b_scr]\n\t"\ + "mov x0,%[a_ptr]; add x1,%[a_ptr],%w[LDA],UXTW #2\n\t"\ + "add x2,%[a_ptr],%w[LDA],UXTW #3; add x3,x1,%w[LDA],UXTW #3\n\t"\ + "add x8,x0,%w[LDA],UXTW #4; add x9,x1,%w[LDA],UXTW #4\n\t"\ + "add x10,x2,%w[LDA],UXTW #4; add x11,x3,%w[LDA],UXTW #4\n\t"\ + "mov w5,%w[K]\n\t"\ + INIT_M##mdim##N##ndim\ + "cmp w5,#2; b.lt 4f\n\t"\ + KERNEL_M##mdim##N##ndim##_PRELOAD2\ + "cmp w5,#10; b.lt 7f\n\t"\ + ".balign 16; 8:\n\t"\ + KERNEL_M##mdim##N##ndim##_MAIN8 "b.ge 8b\n\t"\ + "7:\n\t"\ + "cmp w5,#6; b.lt 1f\n\t"\ + KERNEL_M##mdim##N##ndim##_MAIN4\ + "1:\n\t"\ + "cmp w5,#4; b.lt 2f\n\t"\ + KERNEL_M##mdim##N##ndim##_TAIL4 "b 4f\n\t"\ + "2:\n\t"\ + KERNEL_M##mdim##N##ndim##_TAIL2\ + "4:\n\t"\ + "cmp w5,#1; b.lt 5f\n\t"\ + KERNEL_M##mdim##N##ndim##_FIN1\ + "5:\n\t"\ + "cmp %w[c_rowmajor],#0; b.eq 6f\n\t"\ + INIT_SAVE_M##mdim##_CR SAVE_M##mdim##N##ndim(CR) "b 7f\n\t"\ + "6:\n\t"\ + INIT_SAVE_CC SAVE_M##mdim##N##ndim(CC)\ + "7:\n\t"\ + ::[a_ptr]"r"(a_ptr), [b_scr]"r"(b_scr), [c_ptr]"r"(c_ptr),\ + [LDA]"r"(LDA), [LDC]"r"(LDC), [K]"r"(K),\ + [beta_addr]"r"(beta_addr), [c_rowmajor]"r"(c_rowmajor)\ + :"cc","memory","x0","x1","x2","x3","x4","x5",\ + "x8","x9","x10","x11","x12","x13","x14","x15",\ + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11",\ + "v12","v13","v14","v15","v16","v17","v18","v19","v20","v21",\ + 
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31");\ +} + +/* acc layout for m4n4 kernel */ +/* m0n0 v16 v17 v18 v19 m0n4 */ +/* m1n0 v20 v21 v22 v23 m1n4 */ +/* m2n0 v24 v25 v26 v27 m2n4 */ +/* m3n0 v28 v29 v30 v31 m3n4 */ +/* b-holder layout for m4n4 kernel */ +/* n0 v4 v5 v6 v7 */ +/* a-holder layout for m4n4 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2, a_ptr4->v3 */ + +#define INIT_M4N4 \ + INIT_4V(16, 20, 24, 28) INIT_4V(17, 21, 25, 29)\ + INIT_4V(18, 22, 26, 30) INIT_4V(19, 23, 27, 31) + +#define SAVE_M4N4(mode) \ + UNIT_SAVE_M4N1_##mode(16, 20, 24, 28) UNIT_SAVE_M4N1_##mode(17, 21, 25, 29)\ + UNIT_SAVE_M4N1_##mode(18, 22, 26, 30) UNIT_SAVE_M4N1_##mode(19, 23, 27, 31) + +#define KERNEL_M4N4_PRELOAD2 \ + "ldr d0,[x0],#8\n\t"\ + "ldr d4,[x4]; ldr d5,[x4,#8]; ldr d6,[x4,#16]; add x4,x4,#32\n\t" + +#define KERNEL_M4N4_MAIN8 \ + "ldr d1,[x1],#8\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-8]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "ldr d4,[x4]\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; ldr d6,[x4,#16]\n\t"\ + "prfm pldl1keep,[x0,#64]; fmla v31.2s,v3.2s,v7.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#24]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "ldr d4,[x4,#32]\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#40]; ldr d6,[x4,#48]\n\t"\ + "prfm pldl1keep,[x1,#64]; fmla v31.2s,v3.2s,v7.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#56]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "ldr d4,[x4,#64]\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#72]; ldr d6,[x4,#80]\n\t"\ + "prfm pldl1keep,[x2,#64]; fmla v31.2s,v3.2s,v7.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#88]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "ldr d4,[x4,#96]\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "ldr d0,[x0],#8; sub w5,w5,#8\n\t"\ + "ldr d5,[x4,#104]; ldr d6,[x4,#112]; cmp w5,#10\n\t"\ + "prfm pldl1keep,[x3,#64]; fmla v31.2s,v3.2s,v7.2s; add x4,x4,#128\n\t" + +#define KERNEL_M4N4_MAIN4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-8]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "ldr d4,[x4]\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; ldr d6,[x4,#16]\n\t"\ + "sub w5,w5,#4; fmla v31.2s,v3.2s,v7.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#24]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "ldr d4,[x4,#32]\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#40]; ldr d6,[x4,#48]\n\t"\ + "add x4,x4,#64; fmla v31.2s,v3.2s,v7.2s\n\t" + +#define KERNEL_M4N4_TAIL4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr 
d2,[x2],#8\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-8]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "ldr d4,[x4]\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; ldr d6,[x4,#16]\n\t"\ + "prfm pldl1keep,[x8]; sub w5,w5,#4\n\t"\ + "prfm pldl1keep,[x9]; fmla v31.2s,v3.2s,v7.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#24]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "prfm pldl1keep,[x10]\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "add x4,x4,#32\n\t"\ + "prfm pldl1keep,[x11]; fmla v31.2s,v3.2s,v7.2s\n\t" + +#define KERNEL_M4N4_TAIL2 \ + "ldr d1,[x1],#8\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-8]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "prfm pldl1keep,[x9]\n\t"\ + "prfm pldl1keep,[x10]; sub w5,w5,#2\n\t"\ + "prfm pldl1keep,[x11]; fmla v31.2s,v3.2s,v7.2s\n\t" + +#define KERNEL_M4N4_FIN1 \ + "ldr s0,[x0],#4; ldr s4,[x4]; ldr s5,[x4,#4]; ldr s6,[x4,#8]\n\t"\ + "ldr s1,[x1],#4\n\t" FMA_3V(16, 17, 18, 0, 0, 0, 4, 5, 6)\ + "ldr s2,[x2],#4\n\t" FMA_3V(20, 21, 22, 1, 1, 1, 4, 5, 6)\ + "ldr s3,[x3],#4\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 4, 5, 6)\ + "ldr s7,[x4,#12]\n\t" FMA_3V(28, 29, 30, 3, 3, 3, 4, 5, 6)\ + "add x4,x4,#16\n\t" FMA_3V(19, 23, 27, 0, 1, 2, 7, 7, 7)\ + "fmla v31.2s,v3.2s,v7.2s\n\t" + + +/* acc layout for m4n5 kernel */ +/* m0n0 v12 v13 v14 v15 v16 m0n5 */ +/* m1n0 v17 v18 v19 v20 v21 m1n5 */ +/* m2n0 v22 v23 v24 v25 v26 m2n5 */ +/* m3n0 v27 v28 v29 v30 v31 m3n5 */ +/* b-holder layout for m4n5 kernel */ +/* n0 v4 v5 v6 v7 v8 */ +/* a-holder layout for m4n5 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2, a_ptr4->v3 */ + +#define INIT_M4N5 \ + INIT_4V(12, 17, 22, 27) INIT_4V(13, 18, 23, 28) INIT_4V(14, 19, 24, 29)\ + INIT_4V(15, 20, 25, 30) INIT_4V(16, 21, 26, 31) + +#define SAVE_M4N5(mode) \ + UNIT_SAVE_M4N1_##mode(12, 17, 22, 27) UNIT_SAVE_M4N1_##mode(13, 18, 23, 28)\ + UNIT_SAVE_M4N1_##mode(14, 19, 24, 29) UNIT_SAVE_M4N1_##mode(15, 20, 25, 30)\ + UNIT_SAVE_M4N1_##mode(16, 21, 26, 31) + +#define KERNEL_M4N5_PRELOAD2 \ + "ldr d0,[x0],#8\n\t"\ + "ldr d4,[x4]; ldr d5,[x4,#8]; ldr d6,[x4,#16]; add x4,x4,#40\n\t" + +#define KERNEL_M4N5_MAIN8 \ + "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-16]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr d8,[x4,#-8]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "ldr d4,[x4]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; ldr d6,[x4,#16]\n\t"\ + "prfm pldl1keep,[x0,#64]; fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#24]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr d8,[x4,#32]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "ldr d4,[x4,#40]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\ + "ldr d0,[x0],#8; ldr d5,[x4,#48]; ldr d6,[x4,#56]; sub w5,w5,#8\n\t"\ + "prfm pldl1keep,[x1,#64]; fmla 
v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#64]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr d8,[x4,#72]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "ldr d4,[x4,#80]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\ + "ldr d0,[x0],#8; ldr d5,[x4,#88]; ldr d6,[x4,#96]; cmp w5,#10\n\t"\ + "prfm pldl1keep,[x2,#64]; fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#104]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr d8,[x4,#112]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "ldr d4,[x4,#120]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\ + "ldr d0,[x0],#8; ldr d5,[x4,#128]; ldr d6,[x4,#136]; add x4,x4,#160\n\t"\ + "prfm pldl1keep,[x3,#64]; fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t" + +#define KERNEL_M4N5_MAIN4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-16]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr d8,[x4,#-8]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "ldr d4,[x4]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; ldr d6,[x4,#16]; sub w5,w5,#4\n\t"\ + "fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#24]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr d8,[x4,#32]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "ldr d4,[x4,#40]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\ + "ldr d0,[x0],#8; ldr d5,[x4,#48]; ldr d6,[x4,#56]; add x4,x4,#80\n\t"\ + "fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t" + +#define KERNEL_M4N5_TAIL4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-16]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr d8,[x4,#-8]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "ldr d4,[x4]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; ldr d6,[x4,#16]\n\t"\ + "prfm pldl1keep,[x8]\n\t"\ + "prfm pldl1keep,[x9]; fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#24]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr d8,[x4,#32]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "prfm pldl1keep,[x10]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\ + "add x4,x4,#40; sub w5,w5,#4\n\t"\ + "prfm pldl1keep,[x11]; fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t" + +#define KERNEL_M4N5_TAIL2 \ + "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-16]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr d8,[x4,#-8]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "prfm pldl1keep,[x8]\n\t" 
FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\
+ "prfm pldl1keep,[x9]; prfm pldl1keep,[x10]; prfm pldl1keep,[x11]\n\t"\
+ "sub w5,w5,#2; fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t"
+
+#define KERNEL_M4N5_FIN1 \
+ "ldr s0,[x0],#4; ldr s4,[x4]; ldr s5,[x4,#4]; ldr s6,[x4,#8]\n\t"\
+ "ldr s1,[x1],#4\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\
+ "ldr s2,[x2],#4\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\
+ "ldr s3,[x3],#4\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\
+ "ldr s7,[x4,#12]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\
+ "ldr s8,[x4,#16]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\
+ "add x4,x4,#20\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\
+ "fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t"
+
+
+/* acc layout for m4n6 kernel */
+/* m0n0 v8 v9 v10 v11 v12 v13 m0n6 */
+/* m1n0 v14 v15 v16 v17 v18 v19 m1n6 */
+/* m2n0 v20 v21 v22 v23 v24 v25 m2n6 */
+/* m3n0 v26 v27 v28 v29 v30 v31 m3n6 */
+/* b-holder layout for m4n6 kernel */
+/* n0 v4 v5 v6 v7 */
+/* a-holder layout for m4n6 kernel */
+/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2, a_ptr4->v3 */
+
+#define INIT_M4N6 \
+ INIT_4V(8, 14, 20, 26) INIT_4V(9, 15, 21, 27) INIT_4V(10, 16, 22, 28)\
+ INIT_4V(11, 17, 23, 29) INIT_4V(12, 18, 24, 30) INIT_4V(13, 19, 25, 31)
+
+#define SAVE_M4N6(mode) \
+ UNIT_SAVE_M4N1_##mode(8, 14, 20, 26) UNIT_SAVE_M4N1_##mode(9, 15, 21, 27)\
+ UNIT_SAVE_M4N1_##mode(10, 16, 22, 28) UNIT_SAVE_M4N1_##mode(11, 17, 23, 29)\
+ UNIT_SAVE_M4N1_##mode(12, 18, 24, 30) UNIT_SAVE_M4N1_##mode(13, 19, 25, 31)
+
+#define KERNEL_M4N6_PRELOAD2 \
+ "ldr d0,[x0],#8\n\t"\
+ "ldr d4,[x4]; ldr d5,[x4,#8]; ldr d6,[x4,#16]; add x4,x4,#48\n\t"
+
+#define KERNEL_M4N6_MAIN8 \
+ "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\
+ "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\
+ "ldr d7,[x4,#-24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\
+ "ldr d5,[x4,#-16]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\
+ "ldr d6,[x4,#-8]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\
+ "ldr d4,[x4]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\
+ "ldr d0,[x0],#8; ldr d5,[x4,#8]; ldr d7,[x4,#16]; sub w5,w5,#8\n\t"\
+ "prfm pldl1keep,[x0,#64]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\
+ "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\
+ "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\
+ "ldr d6,[x4,#24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\
+ "ldr d5,[x4,#32]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\
+ "ldr d7,[x4,#40]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\
+ "ldr d4,[x4,#48]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 6, 5, 7)\
+ "ldr d0,[x0],#8; ldr d5,[x4,#56]; ldr d6,[x4,#64]; cmp w5,#10\n\t"\
+ "prfm pldl1keep,[x1,#64]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 7, 7, 7)\
+ "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\
+ "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\
+ "ldr d7,[x4,#72]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\
+ "ldr d5,[x4,#80]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\
+ "ldr d6,[x4,#88]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\
+ "ldr d4,[x4,#96]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\
+ "ldr d0,[x0],#8; ldr d5,[x4,#104]; ldr d7,[x4,#112]\n\t"\
+ "prfm pldl1keep,[x2,#64]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\
+ "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\
+ "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\
+ "ldr d6,[x4,#120]\n\t" 
FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d5,[x4,#128]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d7,[x4,#136]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#144]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 6, 5, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#152]; ldr d6,[x4,#160]; add x4,x4,#192\n\t"\ + "prfm pldl1keep,[x3,#64]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 7, 7, 7) + +#define KERNEL_M4N6_MAIN4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-16]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-8]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; ldr d7,[x4,#16]; sub w5,w5,#4\n\t"\ + FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d5,[x4,#32]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d7,[x4,#40]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#48]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 6, 5, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#56]; ldr d6,[x4,#64]; add x4,x4,#96\n\t"\ + FMA_3V(19, 25, 31, 1, 2, 3, 7, 7, 7) + +#define KERNEL_M4N6_TAIL4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-16]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-8]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; ldr d7,[x4,#16]; sub w5,w5,#4\n\t"\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d5,[x4,#32]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d7,[x4,#40]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 6, 5, 7)\ + "prfm pldl1keep,[x10]; add x4,x4,#48\n\t"\ + "prfm pldl1keep,[x11]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 7, 7, 7) + +#define KERNEL_M4N6_TAIL2 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-16]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-8]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "prfm pldl1keep,[x9]; prfm pldl1keep,[x10]; sub w5,w5,#2\n\t"\ + "prfm pldl1keep,[x11]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6) + +#define KERNEL_M4N6_FIN1 \ + "ldr s0,[x0],#4; ldr s4,[x4]; ldr s5,[x4,#4]; ldr s6,[x4,#8]\n\t"\ + "ldr s1,[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ldr s2,[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ldr s3,[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr s7,[x4,#12]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 
+/* acc layout for m3n7 kernel */
+/* m0n0 v11 v12 v13 v14 v15 v16 v17 m0n7 */
+/* m1n0 v18 v19 v20 v21 v22 v23 v24 m1n7 */
+/* m2n0 v25 v26 v27 v28 v29 v30 v31 m2n7 */
+/* b-holder layout for m3n7 kernel */
+/* n0 v3 v4 v5 v6 v7 v8 v9 */
+/* a-holder layout for m3n7 kernel */
+/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2 */
+
+#define INIT_M3N7 \
+ INIT_3V(11, 18, 25) INIT_3V(12, 19, 26) INIT_3V(13, 20, 27)\
+ INIT_3V(14, 21, 28) INIT_3V(15, 22, 29) INIT_3V(16, 23, 30)\
+ INIT_3V(17, 24, 31)
+
+#define SAVE_M3N7(mode) \
+ UNIT_SAVE_M3N1_##mode(11, 18, 25) UNIT_SAVE_M3N1_##mode(12, 19, 26)\
+ UNIT_SAVE_M3N1_##mode(13, 20, 27) UNIT_SAVE_M3N1_##mode(14, 21, 28)\
+ UNIT_SAVE_M3N1_##mode(15, 22, 29) UNIT_SAVE_M3N1_##mode(16, 23, 30)\
+ UNIT_SAVE_M3N1_##mode(17, 24, 31)
+
+#define KERNEL_M3N7_PRELOAD2 \
+ "ldr d0,[x0],#8\n\t"\
+ "ldr d3,[x4]; ldr d4,[x4,#8]; ldr d5,[x4,#16]; add x4,x4,#56\n\t"
+
+#define KERNEL_M3N7_MAIN8 \
+ "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#-32]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#-24]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr d8,[x4,#-16]; ldr d9,[x4,#-8]; ldr d3,[x4]\n\t"\
+ "prfm pldl1keep,[x0,#64]\n\t"\
+ "ldr d4,[x4,#8]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "ldr d5,[x4,#16]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\
+ "ldr d0,[x0],#8\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\
+ "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#32]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr d8,[x4,#40]; ldr d9,[x4,#48]; ldr d3,[x4,#56]; sub w5,w5,#8\n\t"\
+ "prfm pldl1keep,[x1,#64]\n\t"\
+ "ldr d4,[x4,#64]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "ldr d5,[x4,#72]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\
+ "ldr d0,[x0],#8\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\
+ "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#80]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#88]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr d8,[x4,#96]; ldr d9,[x4,#104]; ldr d3,[x4,#112]; cmp w5,#10\n\t"\
+ "prfm pldl1keep,[x2,#64]\n\t"\
+ "ldr d4,[x4,#120]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "ldr d5,[x4,#128]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\
+ "ldr d0,[x0],#8\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\
+ "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#136]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#144]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr d8,[x4,#152]; ldr d9,[x4,#160]; ldr d3,[x4,#168]; add x4,x4,#224\n\t"\
+ "prfm pldl1keep,[x3,#64]\n\t"\
+ "ldr d4,[x4,#-48]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "ldr d5,[x4,#-40]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\
+ "ldr d0,[x0],#8\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)
+
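A note on the pointer bookkeeping visible above: KERNEL_M3N7_PRELOAD2 advances the packed-B pointer x4 one 2xk group (2*7*4 = 56 bytes) past the data about to be consumed, which is why the steady-state bodies address B at negative offsets; each MAIN body then adds exactly (k-steps consumed) * ndim * sizeof(float) bytes, while the TAIL bodies add correspondingly less because they drain the pre-advanced group, and FIN1 adds a single ndim stride (28 bytes). A hypothetical compile-time check of those hard-coded strides, for illustration only and not part of the patch:

    enum { M3N7_NDIM = 7 };  /* hypothetical name, for illustration */
    _Static_assert(2 * M3N7_NDIM * sizeof(float) ==  56, "PRELOAD2: add x4,x4,#56");
    _Static_assert(8 * M3N7_NDIM * sizeof(float) == 224, "MAIN8: add x4,x4,#224");
    _Static_assert(4 * M3N7_NDIM * sizeof(float) == 112, "MAIN4: add x4,x4,#112");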
+#define KERNEL_M3N7_MAIN4 \
+ "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#-32]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#-24]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr d8,[x4,#-16]; ldr d9,[x4,#-8]; ldr d3,[x4]; sub w5,w5,#4\n\t"\
+ "ldr d4,[x4,#8]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "ldr d5,[x4,#16]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\
+ "ldr d0,[x0],#8\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\
+ "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#32]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr d8,[x4,#40]; ldr d9,[x4,#48]; ldr d3,[x4,#56]; add x4,x4,#112\n\t"\
+ "ldr d4,[x4,#-48]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "ldr d5,[x4,#-40]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\
+ "ldr d0,[x0],#8\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)
+
+#define KERNEL_M3N7_TAIL4 \
+ "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#-32]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#-24]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr d8,[x4,#-16]; ldr d9,[x4,#-8]; ldr d3,[x4]; sub w5,w5,#4\n\t"\
+ "ldr d4,[x4,#8]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "ldr d5,[x4,#16]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\
+ "ldr d0,[x0],#8\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\
+ "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#32]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr d8,[x4,#40]; ldr d9,[x4,#48]; add x4,x4,#56\n\t"\
+ "prfm pldl1keep,[x3]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "prfm pldl1keep,[x8]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\
+ "prfm pldl1keep,[x9]\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)
+
+#define KERNEL_M3N7_TAIL2 \
+ "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#-32]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#-24]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr d8,[x4,#-16]; ldr d9,[x4,#-8]; sub w5,w5,#2\n\t"\
+ "prfm pldl1keep,[x3]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "prfm pldl1keep,[x8]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\
+ "prfm pldl1keep,[x9]\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)
+
+#define KERNEL_M3N7_FIN1 \
+ "ldr s0,[x0],#4; ldr s3,[x4]; ldr s4,[x4,#4]; ldr s5,[x4,#8]\n\t"\
+ "ldr s1,[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\
+ "ldr s2,[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\
+ "ldr s6,[x4,#12]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\
+ "ldr s7,[x4,#16]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\
+ "ldr s8,[x4,#20]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\
+ "ldr s9,[x4,#24]\n\t" FMA_3V(16, 23, 30, 0, 1, 2, 8, 8, 8)\
+ "add x4,x4,#28\n\t" FMA_3V(17, 24, 31, 0, 1, 2, 9, 9, 9)
+
+
+/* acc layout for m3n8 kernel */
+/* m0n0 v8 v9 v10 v11 v12 v13 v14 v15 m0n8 */
+/* m1n0 v16 v17 v18 v19 v20 v21 v22 v23 m1n8 */
+/* m2n0 v24 v25 v26 v27 v28 v29 v30 v31 m2n8 */
+/* b-holder layout for m3n8 kernel */
+/* n0 v3 v4 v5 v6 v7 */
+/* a-holder layout for m3n8 kernel */
+/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2 */
+
+#define INIT_M3N8 \
+ INIT_3V(8, 16, 24) INIT_3V(9, 17, 25) INIT_3V(10, 18, 26)\
+ INIT_3V(11, 19, 27) INIT_3V(12, 20, 28) INIT_3V(13, 21, 29)\
+ INIT_3V(14, 22, 30) INIT_3V(15, 23, 31)
+
+#define SAVE_M3N8(mode) \
+ UNIT_SAVE_M3N1_##mode(8, 16, 24) UNIT_SAVE_M3N1_##mode(9, 17, 25)\
+ UNIT_SAVE_M3N1_##mode(10, 18, 26)
UNIT_SAVE_M3N1_##mode(11, 19, 27)\ + UNIT_SAVE_M3N1_##mode(12, 20, 28) UNIT_SAVE_M3N1_##mode(13, 21, 29)\ + UNIT_SAVE_M3N1_##mode(14, 22, 30) UNIT_SAVE_M3N1_##mode(15, 23, 31) + +#define KERNEL_M3N8_PRELOAD2 \ + "ldr d0,[x0],#8\n\t"\ + "ldr d3,[x4]; ldr d4,[x4,#8]; ldr d5,[x4,#16]; add x4,x4,#64\n\t" + +#define KERNEL_M3N8_MAIN8 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-40]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; ldr d3,[x4]\n\t"\ + "prfm pldl1keep,[x0,#64]\n\t"\ + "ldr d4,[x4,#8]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#48]; ldr d7,[x4,#56]; ldr d3,[x4,#64]; sub w5,w5,#8\n\t"\ + "prfm pldl1keep,[x1,#64]\n\t"\ + "ldr d4,[x4,#72]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#80]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#88]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#96]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#104]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#112]; ldr d7,[x4,#120]; ldr d3,[x4,#128]; cmp w5,#10\n\t"\ + "prfm pldl1keep,[x2,#64]\n\t"\ + "ldr d4,[x4,#136]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#144]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#152]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#160]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#168]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#176]; ldr d7,[x4,#184]; ldr d3,[x4,#192]; add x4,x4,#256\n\t"\ + "prfm pldl1keep,[x3,#64]\n\t"\ + "ldr d4,[x4,#-56]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#-48]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7) + +#define KERNEL_M3N8_MAIN4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-40]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; ldr d3,[x4]; sub w5,w5,#4\n\t"\ + "ldr d4,[x4,#8]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 
6)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#48]; ldr d7,[x4,#56]; ldr d3,[x4,#64]; add x4,x4,#128\n\t"\ + "ldr d4,[x4,#-56]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#-48]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7) + +#define KERNEL_M3N8_TAIL4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-40]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; ldr d3,[x4]; sub w5,w5,#4\n\t"\ + "ldr d4,[x4,#8]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#48]; ldr d7,[x4,#56]; add x4,x4,#64\n\t"\ + "prfm pldl1keep,[x3]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7) + +#define KERNEL_M3N8_TAIL2 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-40]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; sub w5,w5,#2\n\t"\ + "prfm pldl1keep,[x3]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7) + +#define KERNEL_M3N8_FIN1 \ + "ldr s0,[x0],#4; ldr s3,[x4]; ldr s4,[x4,#4]; ldr s5,[x4,#8]\n\t"\ + "ldr s1,[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr s2,[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr s6,[x4,#12]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr s7,[x4,#16]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr s5,[x4,#20]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr s6,[x4,#24]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr s7,[x4,#28]\n\t" FMA_3V(14, 22, 30, 0, 1, 2, 6, 6, 6)\ + "add x4,x4,#32\n\t" FMA_3V(15, 23, 31, 0, 1, 2, 7, 7, 7) + +FUNC_PACK3(4, 4) + +FUNC_PACK3(4, 5) + +FUNC_PACK3(4, 6) + +FUNC_PACK3(3, 7) + +FUNC_PACK3(3, 8) + +/* macro for GEMM with packing pattern NO.#0 */ +/* mdim = 3, 4; ndim = 10, 12, 14, 16 */ +#define FUNC_PACK0(mdim, ndim) \ +static inline void sgemm_skinny1_a35_m##mdim##n##ndim(\ + const float * __restrict__ a_ptr, const float * __restrict__ b_scr,\ + float * __restrict__ c_ptr, uint32_t K, uint32_t LDA, uint32_t LDC,\ + uint8_t c_rowmajor, const float * __restrict__ beta_addr) {\ + __asm__ __volatile__(\ + "mov x4,%[b_scr]\n\t"\ + "mov x0,%[a_ptr]; add x1,%[a_ptr],%w[LDA],UXTW #2\n\t"\ + "add x2,%[a_ptr],%w[LDA],UXTW #3; add x3,x1,%w[LDA],UXTW #3\n\t"\ + "add x8,x0,%w[LDA],UXTW #4; add x9,x1,%w[LDA],UXTW #4\n\t"\ + "add x10,x2,%w[LDA],UXTW #4; add x11,x3,%w[LDA],UXTW #4\n\t"\ + "mov w5,%w[K]\n\t"\ + INIT_M##mdim##N##ndim\ + "cmp w5,#1; b.lt 4f\n\t"\ + 
KERNEL_M##mdim##N##ndim##_PRELOAD1\ + "cmp w5,#5; b.lt 1f\n\t"\ + ".balign 16; 8:\n\t"\ + KERNEL_M##mdim##N##ndim##_MAIN4 "b.ge 8b\n\t"\ + "1:\n\t"\ + "cmp w5,#3; b.lt 2f\n\t"\ + KERNEL_M##mdim##N##ndim##_MAIN2\ + "2:\n\t"\ + "cmp w5,#2; b.ne 3f\n\t"\ + KERNEL_M##mdim##N##ndim##_TAIL2 "b 4f\n\t"\ + "3:\n\t"\ + KERNEL_M##mdim##N##ndim##_TAIL1\ + "4:\n\t"\ + "cmp %w[c_rowmajor],#0; b.eq 6f\n\t"\ + INIT_SAVE_M##mdim##_CR SAVE_M##mdim##N##ndim(CR) "b 7f\n\t"\ + "6:\n\t"\ + INIT_SAVE_CC SAVE_M##mdim##N##ndim(CC)\ + "7:\n\t"\ + ::[a_ptr]"r"(a_ptr), [b_scr]"r"(b_scr), [c_ptr]"r"(c_ptr),\ + [LDA]"r"(LDA), [LDC]"r"(LDC), [K]"r"(K),\ + [beta_addr]"r"(beta_addr), [c_rowmajor]"r"(c_rowmajor)\ + :"cc","memory","x0","x1","x2","x3","x4","x5",\ + "x8","x9","x10","x11","x12","x13","x14","x15",\ + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11",\ + "v12","v13","v14","v15","v16","v17","v18","v19","v20","v21",\ + "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31");\ +} + +/* acc layout for m4n10 kernel */ +/* m0n0 v10 v11 v12 v13 v14 m0n10 */ +/* m1n0 v15 v16 v17 v18 v19 m1n10 */ +/* m2n0 v20 v21 v22 v23 v24 m2n10 */ +/* m3n0 v25 v26 v27 v28 v29 m3n10 */ +/* b-holder layout for m4n10 kernel */ +/* n0 v5 v6 v7 v8 v9 n10 */ +/* a-holder layout for m4n10 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2, a_ptr4->v3 */ + +#define INIT_M4N10 \ + INIT_4V(10, 15, 20, 25) INIT_4V(11, 16, 21, 26)\ + INIT_4V(12, 17, 22, 27) INIT_4V(13, 18, 23, 28)\ + INIT_4V(14, 19, 24, 29) + +#define SAVE_M4N10(mode) \ + UNIT_SAVE_M4N2_##mode(10, 15, 20, 25) UNIT_SAVE_M4N2_##mode(11, 16, 21, 26)\ + UNIT_SAVE_M4N2_##mode(12, 17, 22, 27) UNIT_SAVE_M4N2_##mode(13, 18, 23, 28)\ + UNIT_SAVE_M4N2_##mode(14, 19, 24, 29) + +#define KERNEL_M4N10_PRELOAD1 \ + "ld1r {v0.2s},[x0],#4\n\t"\ + "ldr d5,[x4]; ldr d6,[x4,#8]; ldr d7,[x4,#16]; add x4,x4,#40\n\t" + +#define KERNEL_M4N10_MAIN4 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(10, 11, 12, 0, 0, 0, 5, 6, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(15, 16, 17, 1, 1, 1, 5, 6, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 5, 6, 7)\ + "ldr d8,[x4,#-16]\n\t" FMA_3V(25, 26, 27, 3, 3, 3, 5, 6, 7)\ + "ldr d9,[x4,#-8]\n\t" FMA_3V(13, 18, 23, 0, 1, 2, 8, 8, 8)\ + "ldr d5,[x4]\n\t" FMA_3V(14, 19, 24, 0, 1, 2, 9, 9, 9)\ + "ld1r {v0.2s},[x0],#4; ldr d6,[x4,#8]; ldr d7,[x4,#16]\n\t"\ + "prfm pldl1keep,[x0,#64]\n\t"\ + "fmla v28.2s,v3.2s,v8.2s; fmla v29.2s,v3.2s,v9.2s\n\t"\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(10, 11, 12, 0, 0, 0, 5, 6, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(15, 16, 17, 1, 1, 1, 5, 6, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 5, 6, 7)\ + "ldr d8,[x4,#24]\n\t" FMA_3V(25, 26, 27, 3, 3, 3, 5, 6, 7)\ + "ldr d9,[x4,#32]\n\t" FMA_3V(13, 18, 23, 0, 1, 2, 8, 8, 8)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(14, 19, 24, 0, 1, 2, 9, 9, 9)\ + "ld1r {v0.2s},[x0],#4; ldr d6,[x4,#48]; ldr d7,[x4,#56]\n\t"\ + "prfm pldl1keep,[x1,#64]\n\t"\ + "fmla v28.2s,v3.2s,v8.2s; fmla v29.2s,v3.2s,v9.2s\n\t"\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(10, 11, 12, 0, 0, 0, 5, 6, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(15, 16, 17, 1, 1, 1, 5, 6, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 5, 6, 7)\ + "ldr d8,[x4,#64]\n\t" FMA_3V(25, 26, 27, 3, 3, 3, 5, 6, 7)\ + "ldr d9,[x4,#72]\n\t" FMA_3V(13, 18, 23, 0, 1, 2, 8, 8, 8)\ + "ldr d5,[x4,#80]\n\t" FMA_3V(14, 19, 24, 0, 1, 2, 9, 9, 9)\ + "ld1r {v0.2s},[x0],#4; ldr d6,[x4,#88]; ldr d7,[x4,#96]\n\t"\ + "prfm pldl1keep,[x2,#64]\n\t"\ + "fmla v28.2s,v3.2s,v8.2s; fmla v29.2s,v3.2s,v9.2s\n\t"\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(10, 11, 12, 
0, 0, 0, 5, 6, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(15, 16, 17, 1, 1, 1, 5, 6, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 5, 6, 7)\ + "ldr d8,[x4,#104]\n\t" FMA_3V(25, 26, 27, 3, 3, 3, 5, 6, 7)\ + "ldr d9,[x4,#112]\n\t" FMA_3V(13, 18, 23, 0, 1, 2, 8, 8, 8)\ + "ldr d5,[x4,#120]\n\t" FMA_3V(14, 19, 24, 0, 1, 2, 9, 9, 9)\ + "ld1r {v0.2s},[x0],#4; ldr d6,[x4,#128]; ldr d7,[x4,#136]; add x4,x4,#160\n\t"\ + "prfm pldl1keep,[x3,#64]; sub w5,w5,#4\n\t"\ + "fmla v28.2s,v3.2s,v8.2s; cmp w5,#5; fmla v29.2s,v3.2s,v9.2s\n\t" + +#define KERNEL_M4N10_MAIN2 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(10, 11, 12, 0, 0, 0, 5, 6, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(15, 16, 17, 1, 1, 1, 5, 6, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 5, 6, 7)\ + "ldr d8,[x4,#-16]\n\t" FMA_3V(25, 26, 27, 3, 3, 3, 5, 6, 7)\ + "ldr d9,[x4,#-8]\n\t" FMA_3V(13, 18, 23, 0, 1, 2, 8, 8, 8)\ + "ldr d5,[x4]\n\t" FMA_3V(14, 19, 24, 0, 1, 2, 9, 9, 9)\ + "ld1r {v0.2s},[x0],#4; ldr d6,[x4,#8]; ldr d7,[x4,#16]\n\t"\ + "fmla v28.2s,v3.2s,v8.2s; fmla v29.2s,v3.2s,v9.2s\n\t"\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(10, 11, 12, 0, 0, 0, 5, 6, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(15, 16, 17, 1, 1, 1, 5, 6, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 5, 6, 7)\ + "ldr d8,[x4,#24]\n\t" FMA_3V(25, 26, 27, 3, 3, 3, 5, 6, 7)\ + "ldr d9,[x4,#32]\n\t" FMA_3V(13, 18, 23, 0, 1, 2, 8, 8, 8)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(14, 19, 24, 0, 1, 2, 9, 9, 9)\ + "ld1r {v0.2s},[x0],#4; ldr d6,[x4,#48]; ldr d7,[x4,#56]; add x4,x4,#80\n\t"\ + "sub w5,w5,#2\n\t"\ + "fmla v28.2s,v3.2s,v8.2s; fmla v29.2s,v3.2s,v9.2s\n\t" + +#define KERNEL_M4N10_TAIL2 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(10, 11, 12, 0, 0, 0, 5, 6, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(15, 16, 17, 1, 1, 1, 5, 6, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 5, 6, 7)\ + "ldr d8,[x4,#-16]\n\t" FMA_3V(25, 26, 27, 3, 3, 3, 5, 6, 7)\ + "ldr d9,[x4,#-8]\n\t" FMA_3V(13, 18, 23, 0, 1, 2, 8, 8, 8)\ + "ldr d5,[x4]\n\t" FMA_3V(14, 19, 24, 0, 1, 2, 9, 9, 9)\ + "ld1r {v0.2s},[x0],#4; ldr d6,[x4,#8]; ldr d7,[x4,#16]\n\t"\ + "prfm pldl1keep,[x8]; prfm pldl1keep,[x9]\n\t"\ + "fmla v28.2s,v3.2s,v8.2s; fmla v29.2s,v3.2s,v9.2s\n\t"\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(10, 11, 12, 0, 0, 0, 5, 6, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(15, 16, 17, 1, 1, 1, 5, 6, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 5, 6, 7)\ + "ldr d8,[x4,#24]\n\t" FMA_3V(25, 26, 27, 3, 3, 3, 5, 6, 7)\ + "ldr d9,[x4,#32]\n\t" FMA_3V(13, 18, 23, 0, 1, 2, 8, 8, 8)\ + "prfm pldl1keep,[x10]; sub w5,w5,#2\n\t" FMA_3V(14, 19, 24, 0, 1, 2, 9, 9, 9)\ + "prfm pldl1keep,[x11]; add x4,x4,#40\n\t"\ + "fmla v28.2s,v3.2s,v8.2s; fmla v29.2s,v3.2s,v9.2s\n\t" + +#define KERNEL_M4N10_TAIL1 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(10, 11, 12, 0, 0, 0, 5, 6, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(15, 16, 17, 1, 1, 1, 5, 6, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 5, 6, 7)\ + "ldr d8,[x4,#-16]\n\t" FMA_3V(25, 26, 27, 3, 3, 3, 5, 6, 7)\ + "ldr d9,[x4,#-8]\n\t" FMA_3V(13, 18, 23, 0, 1, 2, 8, 8, 8)\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(14, 19, 24, 0, 1, 2, 9, 9, 9)\ + "prfm pldl1keep,[x9]; prfm pldl1keep,[x10]; prfm pldl1keep,[x11]\n\t"\ + "sub w5,w5,#1\n\t"\ + "fmla v28.2s,v3.2s,v8.2s; fmla v29.2s,v3.2s,v9.2s\n\t" + + +/* acc layout for m4n12 kernel */ +/* m0n0 v8 v9 v10 v11 v12 v13 m0n12 */ +/* m1n0 v14 v15 v16 v17 v18 v19 m1n12 */ +/* m2n0 v20 v21 v22 v23 v24 v25 m2n12 */ +/* m3n0 v26 v27 v28 v29 v30 v31 m3n12 */ +/* b-holder layout for m4n12 kernel */ +/* n0 v4 v5 
v6/v7 v7/v6 v5 v6/v7 n12 */ +/* a-holder layout for m4n12 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2, a_ptr4->v3 */ + +#define INIT_M4N12 \ + INIT_4V(8, 14, 20, 26) INIT_4V(9, 15, 21, 27)\ + INIT_4V(10, 16, 22, 28) INIT_4V(11, 17, 23, 29)\ + INIT_4V(12, 18, 24, 30) INIT_4V(13, 19, 25, 31) + +#define SAVE_M4N12(mode) \ + UNIT_SAVE_M4N2_##mode(8, 14, 20, 26) UNIT_SAVE_M4N2_##mode(9, 15, 21, 27)\ + UNIT_SAVE_M4N2_##mode(10, 16, 22, 28) UNIT_SAVE_M4N2_##mode(11, 17, 23, 29)\ + UNIT_SAVE_M4N2_##mode(12, 18, 24, 30) UNIT_SAVE_M4N2_##mode(13, 19, 25, 31) + +#define KERNEL_M4N12_PRELOAD1 \ + "ld1r {v0.2s},[x0],#4\n\t"\ + "ldr d4,[x4]; ldr d5,[x4,#8]; ldr d6,[x4,#16]; add x4,x4,#48\n\t" + +#define KERNEL_M4N12_MAIN4 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-16]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-8]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ld1r {v0.2s},[x0],#4; ldr d5,[x4,#8]; ldr d7,[x4,#16]\n\t"\ + "prfm pldl1keep,[x0,#64]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d5,[x4,#32]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d7,[x4,#40]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#48]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 6, 5, 7)\ + "ld1r {v0.2s},[x0],#4; ldr d5,[x4,#56]; ldr d6,[x4,#64]\n\t"\ + "prfm pldl1keep,[x1,#64]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 7, 7, 7)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#72]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#80]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#88]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#96]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ld1r {v0.2s},[x0],#4; ldr d5,[x4,#104]; ldr d7,[x4,#112]\n\t"\ + "prfm pldl1keep,[x2,#64]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#120]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d5,[x4,#128]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d7,[x4,#136]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#144]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 6, 5, 7)\ + "ld1r {v0.2s},[x0],#4; sub w5,w5,#4; ldr d5,[x4,#152]; cmp w5,#5\n\t"\ + "ldr d6,[x4,#160]; add x4,x4,#192\n\t"\ + "prfm pldl1keep,[x3,#64]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 7, 7, 7) + +#define KERNEL_M4N12_MAIN2 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-16]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-8]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4]\n\t" FMA_3V(11, 
12, 13, 0, 0, 0, 7, 5, 6)\ + "ld1r {v0.2s},[x0],#4; ldr d5,[x4,#8]; ldr d7,[x4,#16]\n\t"\ + FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d5,[x4,#32]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d7,[x4,#40]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#48]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 6, 5, 7)\ + "ld1r {v0.2s},[x0],#4; ldr d5,[x4,#56]; ldr d6,[x4,#64]; add x4,x4,#96\n\t"\ + "sub w5,w5,#2\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 7, 7, 7) + +#define KERNEL_M4N12_TAIL2 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-16]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-8]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ld1r {v0.2s},[x0],#4; ldr d5,[x4,#8]; ldr d7,[x4,#16]\n\t"\ + "prfm pldl1keep,[x8]; prfm pldl1keep,[x9]\n\t"\ + FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d5,[x4,#32]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d7,[x4,#40]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "prfm pldl1keep,[x10]; add x4,x4,#48\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 6, 5, 7)\ + "prfm pldl1keep,[x11]; sub w5,w5,#2\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 7, 7, 7) + +#define KERNEL_M4N12_TAIL1 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-16]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-8]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "prfm pldl1keep,[x8]; prfm pldl1keep,[x9]; prfm pldl1keep,[x10]\n\t"\ + "sub w5,w5,#1\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "prfm pldl1keep,[x11]\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6) + + +/* acc layout for m3n14 kernel */ +/* m0n0 v11 v12 v13 v14 v15 v16 v17 m0n14 */ +/* m1n0 v18 v19 v20 v21 v22 v23 v24 m1n14 */ +/* m2n0 v25 v26 v27 v28 v29 v30 v31 m2n14 */ +/* b-holder layout for m3n14 kernel */ +/* n0 v3 v4 v5 v6 v7 v8 v9 n14 */ +/* a-holder layout for m3n14 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2 */ + +#define INIT_M3N14 \ + INIT_3V(11, 18, 25) INIT_3V(12, 19, 26) INIT_3V(13, 20, 27)\ + INIT_3V(14, 21, 28) INIT_3V(15, 22, 29) INIT_3V(16, 23, 30)\ + INIT_3V(17, 24, 31) + +#define SAVE_M3N14(mode) \ + UNIT_SAVE_M3N2_##mode(11, 18, 25) UNIT_SAVE_M3N2_##mode(12, 19, 26)\ + UNIT_SAVE_M3N2_##mode(13, 20, 27) UNIT_SAVE_M3N2_##mode(14, 21, 28)\ + UNIT_SAVE_M3N2_##mode(15, 22, 29) UNIT_SAVE_M3N2_##mode(16, 23, 30)\ + UNIT_SAVE_M3N2_##mode(17, 24, 31) + +#define KERNEL_M3N14_PRELOAD1 \ + "ld1r {v0.2s},[x0],#4\n\t"\ + "ldr d3,[x4]; ldr d4,[x4,#8]; ldr d5,[x4,#16]; add x4,x4,#56\n\t" + +#define KERNEL_M3N14_MAIN4 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" 
FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-32]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#-16]; ldr d9,[x4,#-8]; ldr d3,[x4]; ldr d4,[x4,#8]\n\t"\ + FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(16, 17, 23, 0, 0, 1, 8, 9, 8)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(24, 30, 31, 1, 2, 2, 9, 8, 9)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#40]; ldr d9,[x4,#48]; ldr d3,[x4,#56]; ldr d4,[x4,#64]\n\t"\ + "prfm pldl1keep,[x0,#64]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#72]\n\t" FMA_3V(16, 17, 23, 0, 0, 1, 8, 9, 8)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(24, 30, 31, 1, 2, 2, 9, 8, 9)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#80]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#88]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#96]; ldr d9,[x4,#104]; ldr d3,[x4,#112]; ldr d4,[x4,#120]\n\t"\ + "prfm pldl1keep,[x1,#64]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#128]\n\t" FMA_3V(16, 17, 23, 0, 0, 1, 8, 9, 8)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(24, 30, 31, 1, 2, 2, 9, 8, 9)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#136]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#144]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#152]; ldr d9,[x4,#160]; ldr d3,[x4,#168]; ldr d4,[x4,#176]\n\t"\ + "add x4,x4,#224; prfm pldl1keep,[x2,#64]; sub w5,w5,#4\n\t"\ + FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#-40]\n\t" FMA_3V(16, 17, 23, 0, 0, 1, 8, 9, 8)\ + "ld1r {v0.2s},[x0],#4; cmp w5,#5\n\t"\ + FMA_3V(24, 30, 31, 1, 2, 2, 9, 8, 9) + +#define KERNEL_M3N14_MAIN2 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-32]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#-16]; ldr d9,[x4,#-8]; ldr d3,[x4]; ldr d4,[x4,#8]\n\t"\ + FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(16, 17, 23, 0, 0, 1, 8, 9, 8)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(24, 30, 31, 1, 2, 2, 9, 8, 9)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#40]; ldr d9,[x4,#48]; ldr d3,[x4,#56]\n\t"\ + "ldr d4,[x4,#64]; add x4,x4,#112\n\t"\ + "sub w5,w5,#2\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#-40]\n\t" FMA_3V(16, 17, 23, 0, 0, 1, 8, 9, 8)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(24, 30, 31, 1, 2, 2, 9, 8, 9) + +#define KERNEL_M3N14_TAIL2 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-32]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#-16]; ldr d9,[x4,#-8]; ldr d3,[x4]; ldr d4,[x4,#8]\n\t"\ + "prfm pldl1keep,[x3]\n\t" FMA_3V(15, 22, 29, 0, 
1, 2, 7, 7, 7)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(16, 17, 23, 0, 0, 1, 8, 9, 8)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(24, 30, 31, 1, 2, 2, 9, 8, 9)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#40]; ldr d9,[x4,#48]; add x4,x4,#56\n\t"\ + "sub w5,w5,#2\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(16, 17, 23, 0, 0, 1, 8, 9, 8)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(24, 30, 31, 1, 2, 2, 9, 8, 9) + +#define KERNEL_M3N14_TAIL1 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-32]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-24]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#-16]; ldr d9,[x4,#-8]; sub w5,w5,#1\n\t"\ + "prfm pldl1keep,[x3]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(16, 17, 23, 0, 0, 1, 8, 9, 8)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(24, 30, 31, 1, 2, 2, 9, 8, 9) + + +/* acc layout for m3n16 kernel */ +/* m0n0 v8 v9 v10 v11 v12 v13 v14 v15 m0n16 */ +/* m1n0 v16 v17 v18 v19 v20 v21 v22 v23 m1n16 */ +/* m2n0 v24 v25 v26 v27 v28 v29 v30 v31 m2n16 */ +/* b-holder layout for m3n16 kernel */ +/* n0 v3 v4 v5 v6 v7 v5 v6 v7 n16 */ +/* a-holder layout for m3n16 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2 */ + +#define INIT_M3N16 \ + INIT_3V(8, 16, 24) INIT_3V(9, 17, 25) INIT_3V(10, 18, 26)\ + INIT_3V(11, 19, 27) INIT_3V(12, 20, 28) INIT_3V(13, 21, 29)\ + INIT_3V(14, 22, 30) INIT_3V(15, 23, 31) + +#define SAVE_M3N16(mode) \ + UNIT_SAVE_M3N2_##mode(8, 16, 24) UNIT_SAVE_M3N2_##mode(9, 17, 25)\ + UNIT_SAVE_M3N2_##mode(10, 18, 26) UNIT_SAVE_M3N2_##mode(11, 19, 27)\ + UNIT_SAVE_M3N2_##mode(12, 20, 28) UNIT_SAVE_M3N2_##mode(13, 21, 29)\ + UNIT_SAVE_M3N2_##mode(14, 22, 30) UNIT_SAVE_M3N2_##mode(15, 23, 31) + +#define KERNEL_M3N16_PRELOAD1 \ + "ld1r {v0.2s},[x0],#4\n\t"\ + "ldr d3,[x4]; ldr d4,[x4,#8]; ldr d5,[x4,#16]; add x4,x4,#64\n\t" + +#define KERNEL_M3N16_MAIN4 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-40]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; ldr d3,[x4]; ldr d4,[x4,#8]\n\t"\ + FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(14, 15, 22, 0, 0, 1, 6, 7, 6)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(23, 30, 31, 1, 2, 2, 7, 6, 7)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#48]; ldr d7,[x4,#56]; ldr d3,[x4,#64]; ldr d4,[x4,#72]\n\t"\ + "prfm pldl1keep,[x0,#64]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#80]\n\t" FMA_3V(14, 15, 22, 0, 0, 1, 6, 7, 6)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(23, 30, 31, 1, 2, 2, 7, 6, 7)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#88]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 
5)\ + "ldr d7,[x4,#96]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#104]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#112]; ldr d7,[x4,#120]; ldr d3,[x4,#128]; ldr d4,[x4,#136]\n\t"\ + "prfm pldl1keep,[x1,#64]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#144]\n\t" FMA_3V(14, 15, 22, 0, 0, 1, 6, 7, 6)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(23, 30, 31, 1, 2, 2, 7, 6, 7)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#152]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#160]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#168]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#176]; ldr d7,[x4,#184]; ldr d3,[x4,#192]; ldr d4,[x4,#200]\n\t"\ + "prfm pldl1keep,[x2,#64]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#208]; add x4,x4,#256\n\t" FMA_3V(14, 15, 22, 0, 0, 1, 6, 7, 6)\ + "sub w5,w5,#4; ld1r {v0.2s},[x0],#4; cmp w5,#5\n\t"\ + FMA_3V(23, 30, 31, 1, 2, 2, 7, 6, 7) + +#define KERNEL_M3N16_MAIN2 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-40]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; ldr d3,[x4]; ldr d4,[x4,#8]\n\t"\ + FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(14, 15, 22, 0, 0, 1, 6, 7, 6)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(23, 30, 31, 1, 2, 2, 7, 6, 7)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#48]; ldr d7,[x4,#56]; ldr d3,[x4,#64]; ldr d4,[x4,#72]\n\t"\ + FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#80]; add x4,x4,#128\n\t" FMA_3V(14, 15, 22, 0, 0, 1, 6, 7, 6)\ + "ld1r {v0.2s},[x0],#4; sub w5,w5,#2\n\t" FMA_3V(23, 30, 31, 1, 2, 2, 7, 6, 7) + +#define KERNEL_M3N16_TAIL2 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-40]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; ldr d3,[x4]; ldr d4,[x4,#8]\n\t"\ + FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(14, 15, 22, 0, 0, 1, 6, 7, 6)\ + "ld1r {v0.2s},[x0],#4\n\t" FMA_3V(23, 30, 31, 1, 2, 2, 7, 6, 7)\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#48]; ldr d7,[x4,#56]\n\t"\ + "prfm pldl1keep,[x3]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "add x4,x4,#64; prfm pldl1keep,[x8]\n\t" FMA_3V(14, 15, 22, 0, 0, 1, 6, 7, 6)\ + "sub w5,w5,#2; prfm pldl1keep,[x9]\n\t" FMA_3V(23, 30, 31, 1, 2, 2, 7, 6, 7) + +#define KERNEL_M3N16_TAIL1 \ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" 
FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\
+ "ldr d6,[x4,#-40]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\
+ "ldr d7,[x4,#-32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\
+ "ldr d5,[x4,#-24]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\
+ "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; sub w5,w5,#1\n\t"\
+ "prfm pldl1keep,[x3]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\
+ "prfm pldl1keep,[x8]\n\t" FMA_3V(14, 15, 22, 0, 0, 1, 6, 7, 6)\
+ "prfm pldl1keep,[x9]\n\t" FMA_3V(23, 30, 31, 1, 2, 2, 7, 6, 7)
+
+FUNC_PACK0(4, 10)
+
+FUNC_PACK0(4, 12)
+
+FUNC_PACK0(3, 14)
+
+FUNC_PACK0(3, 16)
+
+/* macro for GEMM with packing pattern NO.#4 */
+/* mdim = 3, 4; ndim = 9, 11, 13, 15, 17, 18 */
+#define FUNC_PACK4(mdim, ndim) \
+static inline void sgemm_skinny1_a35_m##mdim##n##ndim(\
+ const float * __restrict__ a_ptr, const float * __restrict__ b_scr,\
+ float * __restrict__ c_ptr, uint32_t K, uint32_t LDA, uint32_t LDC,\
+ uint8_t c_rowmajor, const float * __restrict__ beta_addr) {\
+ __asm__ __volatile__(\
+ "mov x4,%[b_scr]\n\t"\
+ "mov x0,%[a_ptr]; add x1,%[a_ptr],%w[LDA],UXTW #2\n\t"\
+ "add x2,%[a_ptr],%w[LDA],UXTW #3; add x3,x1,%w[LDA],UXTW #3\n\t"\
+ "add x8,x0,%w[LDA],UXTW #4; add x9,x1,%w[LDA],UXTW #4\n\t"\
+ "add x10,x2,%w[LDA],UXTW #4; add x11,x3,%w[LDA],UXTW #4\n\t"\
+ "mov w5,%w[K]\n\t"\
+ INIT_M##mdim##N##ndim\
+ "cmp w5,#2; b.lt 4f\n\t"\
+ KERNEL_M##mdim##N##ndim##_PRELOAD2\
+ "cmp w5,#6; b.lt 1f\n\t"\
+ ".balign 16; 8:\n\t"\
+ KERNEL_M##mdim##N##ndim##_MAIN4 "b.ge 8b\n\t"\
+ "1:\n\t"\
+ "cmp w5,#4; b.lt 2f\n\t"\
+ KERNEL_M##mdim##N##ndim##_TAIL4 "b 4f\n\t"\
+ "2:\n\t"\
+ KERNEL_M##mdim##N##ndim##_TAIL2\
+ "4:\n\t"\
+ "cmp w5,#1; b.lt 5f\n\t"\
+ KERNEL_M##mdim##N##ndim##_FIN1\
+ "5:\n\t"\
+ "cmp %w[c_rowmajor],#0; b.eq 6f\n\t"\
+ INIT_SAVE_M##mdim##_CR SAVE_M##mdim##N##ndim(CR) "b 7f\n\t"\
+ "6:\n\t"\
+ INIT_SAVE_CC SAVE_M##mdim##N##ndim(CC)\
+ "7:\n\t"\
+ ::[a_ptr]"r"(a_ptr), [b_scr]"r"(b_scr), [c_ptr]"r"(c_ptr),\
+ [LDA]"r"(LDA), [LDC]"r"(LDC), [K]"r"(K),\
+ [beta_addr]"r"(beta_addr), [c_rowmajor]"r"(c_rowmajor)\
+ :"cc","memory","x0","x1","x2","x3","x4","x5",\
+ "x8","x9","x10","x11","x12","x13","x14","x15",\
+ "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11",\
+ "v12","v13","v14","v15","v16","v17","v18","v19","v20","v21",\
+ "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31");\
+}
+
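For orientation, the label structure FUNC_PACK4 emits corresponds to the following plain-C control flow. This is a reading aid with a hypothetical function name, not code from the patch; w5 holds the remaining K and each MAIN/TAIL body performs its own "sub w5,...":

    #include <stdint.h>

    /* minimal C mirror of the FUNC_PACK4 label structure (sketch) */
    static void pack4_k_loop_shape(uint32_t K) {
        uint32_t w5 = K;
        if (w5 >= 2) {               /* "cmp w5,#2; b.lt 4f" */
            /* PRELOAD2: prime the a/b holders, pre-advance x4 */
            while (w5 >= 6) w5 -= 4; /* MAIN4 loop, "b.ge 8b" */
            if (w5 >= 4) w5 -= 4;    /* TAIL4, then "b 4f"    */
            else         w5 -= 2;    /* TAIL2 falls through   */
        }
        if (w5 >= 1) {               /* "cmp w5,#1; b.lt 5f"  */
            /* FIN1: one scalar k-step for an odd K (the asm
               does not bother decrementing w5 here) */
        }
        /* then: c_rowmajor ? SAVE_...(CR) : SAVE_...(CC) */
    }

FUNC_PACK0 above differs only in its remainder ladder (PRELOAD1, MAIN4 while w5 >= 5, then MAIN2/TAIL2/TAIL1), since its bodies consume A and B one k-step at a time.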
+/* acc layout for m4n9 kernel */
+/* m0n0 v12 v13 v14 v15 v16_h m0n9 */
+/* m1n0 v17 v18 v19 v20 v21_h m1n9 */
+/* m2n0 v22 v23 v24 v25 v26_h m2n9 */
+/* m3n0 v27 v28 v29 v30 v31_h m3n9 */
+/* b-holder layout for m4n9 kernel */
+/* n0 v4 v5 v6 v7 v8(s) n9 */
+/* a-holder layout for m4n9 kernel */
+/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2, a_ptr4->v3 */
+
+#define INIT_M4N9 \
+ INIT_4V(12, 17, 22, 27) INIT_4V(13, 18, 23, 28)\
+ INIT_4V(14, 19, 24, 29) INIT_4V(15, 20, 25, 30)\
+ INIT_4V(16, 21, 26, 31)
+
+#define KERNEL_M4N9_PRELOAD2 \
+ "ldr d0,[x0],#8\n\t"\
+ "ldr d4,[x4]; ldr d5,[x4,#8]; ldr d6,[x4,#16]; add x4,x4,#72\n\t"
+
+#define KERNEL_M4N9_MAIN4 \
+ "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\
+ "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\
+ "ldr d7,[x4,#-48]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\
+ "ldr d8,[x4,#-40]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\
+ "ldr d4,[x4,#-32]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\
+ "rev64 v0.2s,v0.2s; ldr d5,[x4,#-24]; ldr d6,[x4,#-16]; sub w5,w5,#4\n\t"\
+ "prfm pldl1keep,[x0,#64]\n\t" FMA_3V(30, 31, 12, 3, 3, 0, 7, 8, 4)\
+ "rev64 v1.2s,v1.2s\n\t" FMA_3V(13, 14, 17, 0, 0, 1, 5, 6, 4)\
+ "rev64 v2.2s,v2.2s\n\t" FMA_3V(18, 19, 22, 1, 1, 2, 5, 6, 4)\
+ "rev64 v3.2s,v3.2s\n\t" FMA_3V(23, 24, 27, 2, 2, 3, 5, 6, 4)\
+ "ldr d7,[x4,#-8]\n\t" FMA_3V(28, 29, 15, 3, 3, 0, 5, 6, 7)\
+ "ldr d0,[x0],#8; ldr d4,[x4]; ldr d5,[x4,#8]; ldr d6,[x4,#16]; cmp w5,#6\n\t"\
+ "prfm pldl1keep,[x1,#64]\n\t" FMA_3V(20, 25, 30, 1, 2, 3, 7, 7, 7)\
+ "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\
+ "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\
+ "ldr d7,[x4,#24]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\
+ "ldr d8,[x4,#32]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\
+ "ldr d4,[x4,#40]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\
+ "rev64 v0.2s,v0.2s; ldr d5,[x4,#48]; ldr d6,[x4,#56]; add x4,x4,#144\n\t"\
+ "prfm pldl1keep,[x2,#64]\n\t" FMA_3V(30, 31, 12, 3, 3, 0, 7, 8, 4)\
+ "rev64 v1.2s,v1.2s\n\t" FMA_3V(13, 14, 17, 0, 0, 1, 5, 6, 4)\
+ "rev64 v2.2s,v2.2s\n\t" FMA_3V(18, 19, 22, 1, 1, 2, 5, 6, 4)\
+ "rev64 v3.2s,v3.2s\n\t" FMA_3V(23, 24, 27, 2, 2, 3, 5, 6, 4)\
+ "ldr d7,[x4,#-80]\n\t" FMA_3V(28, 29, 15, 3, 3, 0, 5, 6, 7)\
+ "ldr d0,[x0],#8; ldr d4,[x4,#-72]; ldr d5,[x4,#-64]; ldr d6,[x4,#-56]\n\t"\
+ "prfm pldl1keep,[x3,#64]\n\t" FMA_3V(20, 25, 30, 1, 2, 3, 7, 7, 7)
+
+#define KERNEL_M4N9_TAIL4 \
+ "ldr d1,[x1],#8\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\
+ "ldr d2,[x2],#8\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\
+ "ldr d3,[x3],#8\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\
+ "ldr d7,[x4,#-48]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\
+ "ldr d8,[x4,#-40]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\
+ "ldr d4,[x4,#-32]\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\
+ "rev64 v0.2s,v0.2s; ldr d5,[x4,#-24]; ldr d6,[x4,#-16]; sub w5,w5,#4\n\t"\
+ "prfm pldl1keep,[x8]\n\t"
FMA_3V(30, 31, 12, 3, 3, 0, 7, 8, 4)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(13, 14, 17, 0, 0, 1, 5, 6, 4)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(18, 19, 22, 1, 1, 2, 5, 6, 4)\ + "rev64 v3.2s,v3.2s\n\t" FMA_3V(23, 24, 27, 2, 2, 3, 5, 6, 4)\ + "ldr d7,[x4,#-8]\n\t" FMA_3V(28, 29, 15, 3, 3, 0, 5, 6, 7)\ + "prfm pldl1keep,[x9]; prfm pldl1keep,[x10]\n\t"\ + "prfm pldl1keep,[x11]\n\t" FMA_3V(20, 25, 30, 1, 2, 3, 7, 7, 7) + +#define KERNEL_M4N9_FIN1 \ + "ld1r {v0.2s},[x0],#4; ldr d4,[x4]; ldr d5,[x4,#8]; ldr d6,[x4,#16]\n\t"\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(12, 13, 14, 0, 0, 0, 4, 5, 6)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(17, 18, 19, 1, 1, 1, 4, 5, 6)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(22, 23, 24, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#24]\n\t" FMA_3V(27, 28, 29, 3, 3, 3, 4, 5, 6)\ + "ldr s8,[x4,#32]\n\t" FMA_3V(15, 20, 25, 0, 1, 2, 7, 7, 7)\ + "add x4,x4,#36\n\t" FMA_3V(16, 21, 26, 0, 1, 2, 8, 8, 8)\ + "fmla v30.2s,v3.2s,v7.2s; fmla v31.2s,v3.2s,v8.2s\n\t" + +#define SAVE_M4N9(mode) \ + UNIT_SAVE_M4N2_##mode(12, 17, 22, 27) UNIT_SAVE_M4N2_##mode(13, 18, 23, 28)\ + UNIT_SAVE_M4N2_##mode(14, 19, 24, 29) UNIT_SAVE_M4N2_##mode(15, 20, 25, 30)\ + UNIT_SAVE_M4N1_##mode(16, 21, 26, 31) + + +/* acc layout for m4n11 kernel */ +/* m0n0 v8 v9 v10 v11 v12 v13_h m0n11 */ +/* m1n0 v14 v15 v16 v17 v18 v19_h m1n11 */ +/* m2n0 v20 v21 v22 v23 v24 v25_h m2n11 */ +/* m3n0 v26 v27 v28 v29 v30 v31_h m3n11 */ +/* b-holder layout for m4n11 kernel */ +/* n0 v4 v5 v6/v7 v7/v6 v5/v7 v6(s) n11 */ +/* a-holder layout for m4n11 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2, a_ptr4->v3 */ + +#define INIT_M4N11 \ + INIT_4V(8, 14, 20, 26) INIT_4V(9, 15, 21, 27)\ + INIT_4V(10, 16, 22, 28) INIT_4V(11, 17, 23, 29)\ + INIT_4V(12, 18, 24, 30) INIT_4V(13, 19, 25, 31) + +#define KERNEL_M4N11_PRELOAD2 \ + "ldr d0,[x0],#8\n\t"\ + "ldr d4,[x4]; ldr d5,[x4,#8]; ldr d6,[x4,#16]; add x4,x4,#88\n\t" + +#define KERNEL_M4N11_MAIN4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-64]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-56]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-48]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ldr d5,[x4,#-32]; ldr d7,[x4,#-24]; prfm pldl1keep,[x0,#64]\n\t"\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "rev64 v3.2s,v3.2s\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#-16]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d7,[x4,#-8]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d4,[x4]\n\t" FMA_3V(11, 12, 18, 0, 0, 1, 6, 7, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; prfm pldl1keep,[x1,#64]; sub w5,w5,#4\n\t"\ + "ldr d6,[x4,#16]; fmla v24.2s,v2.2s,v7.2s; fmla v30.2s,v3.2s,v7.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#32]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#40]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#48]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ldr d5,[x4,#56]; ldr d7,[x4,#64]; prfm pldl1keep,[x2,#64]\n\t"\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 
6, 6)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "rev64 v3.2s,v3.2s\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#72]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d7,[x4,#80]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d4,[x4,#88]\n\t" FMA_3V(11, 12, 18, 0, 0, 1, 6, 7, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#96]; prfm pldl1keep,[x3,#64]; cmp w5,#6\n\t"\ + "ldr d6,[x4,#104]; add x4,x4,#176\n\t"\ + "fmla v24.2s,v2.2s,v7.2s; fmla v30.2s,v3.2s,v7.2s\n\t" + +#define KERNEL_M4N11_TAIL4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-64]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-56]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-48]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ldr d5,[x4,#-32]; ldr d7,[x4,#-24]; prfm pldl1keep,[x8]\n\t"\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "rev64 v3.2s,v3.2s\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#-16]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d7,[x4,#-8]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "ldr d4,[x4]\n\t" FMA_3V(11, 12, 18, 0, 0, 1, 6, 7, 7)\ + "ldr d0,[x0],#8; ldr d5,[x4,#8]; prfm pldl1keep,[x9]; sub w5,w5,#4\n\t"\ + "ldr d6,[x4,#16]; fmla v24.2s,v2.2s,v7.2s; fmla v30.2s,v3.2s,v7.2s\n\t"\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#24]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#32]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#40]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#48]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ldr d5,[x4,#56]; ldr d7,[x4,#64]; prfm pldl1keep,[x10]\n\t"\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "rev64 v3.2s,v3.2s\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#72]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr d7,[x4,#80]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "prfm pldl1keep,[x11]\n\t" FMA_3V(11, 12, 18, 0, 0, 1, 6, 7, 7)\ + "add x4,x4,#88; fmla v24.2s,v2.2s,v7.2s; fmla v30.2s,v3.2s,v7.2s\n\t" + +#define KERNEL_M4N11_TAIL2 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ldr d2,[x2],#8\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ldr d3,[x3],#8\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-64]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-56]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr d6,[x4,#-48]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + "ldr d5,[x4,#-32]; ldr d7,[x4,#-24]; prfm pldl1keep,[x8]\n\t"\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 7)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 7)\ + "rev64 v3.2s,v3.2s\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 7)\ + "ldr d6,[x4,#-16]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 7)\ + "ldr 
d7,[x4,#-8]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 6, 6, 6)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(11, 12, 18, 0, 0, 1, 6, 7, 7)\ + "prfm pldl1keep,[x10]; sub w5,w5,#2\n\t"\ + "fmla v24.2s,v2.2s,v7.2s; fmla v30.2s,v3.2s,v7.2s\n\t"\ + "prfm pldl1keep,[x11]\n\t" + +#define KERNEL_M4N11_FIN1 \ + "ld1r {v0.2s},[x0],#4; ldr d4,[x4]; ldr d5,[x4,#8]\n\t"\ + "ldr d6,[x4,#16]; add x4,x4,#44\n\t"\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 4, 5, 6)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(14, 15, 16, 1, 1, 1, 4, 5, 6)\ + "ld1r {v3.2s},[x3],#4\n\t" FMA_3V(20, 21, 22, 2, 2, 2, 4, 5, 6)\ + "ldr d7,[x4,#-20]\n\t" FMA_3V(26, 27, 28, 3, 3, 3, 4, 5, 6)\ + "ldr d5,[x4,#-12]\n\t" FMA_3V(17, 23, 29, 1, 2, 3, 7, 7, 7)\ + "ldr s6,[x4,#-4]\n\t" FMA_3V(18, 24, 30, 1, 2, 3, 5, 5, 5)\ + FMA_3V(11, 12, 13, 0, 0, 0, 7, 5, 6)\ + FMA_3V(19, 25, 31, 1, 2, 3, 6, 6, 6) + +#define SAVE_M4N11(mode) \ + UNIT_SAVE_M4N2_##mode(8, 14, 20, 26) UNIT_SAVE_M4N2_##mode(9, 15, 21, 27)\ + UNIT_SAVE_M4N2_##mode(10, 16, 22, 28) UNIT_SAVE_M4N2_##mode(11, 17, 23, 29)\ + UNIT_SAVE_M4N2_##mode(12, 18, 24, 30) UNIT_SAVE_M4N1_##mode(13, 19, 25, 31) + + +/* acc layout for m3n13 kernel */ +/* m0n0 v11 v12 v13 v14 v15 v16 v17_h m0n13 */ +/* m1n0 v18 v19 v20 v21 v22 v23 v24_h m1n13 */ +/* m2n0 v25 v26 v27 v28 v29 v30 v31_h m2n13 */ +/* b-holder layout for m3n13 kernel */ +/* n0 v3 v4 v5 v6 v7 v8 v9(s) n13 */ +/* a-holder layout for m3n13 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2 */ + +#define INIT_M3N13 \ + INIT_3V(11, 18, 25) INIT_3V(12, 19, 26) INIT_3V(13, 20, 27)\ + INIT_3V(14, 21, 28) INIT_3V(15, 22, 29) INIT_3V(16, 23, 30)\ + INIT_3V(17, 24, 31) + +#define KERNEL_M3N13_PRELOAD2 \ + "ldr d0,[x0],#8\n\t"\ + "ldr d3,[x4]; ldr d4,[x4,#8]; ldr d5,[x4,#16]; add x4,x4,#104\n\t" + +#define KERNEL_M3N13_MAIN4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-80]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-72]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#-64]; ldr d9,[x4,#-56]; prfm pldl1keep,[x0,#64]\n\t"\ + "ldr d3,[x4,#-48]; sub w5,w5,#4\n\t"\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#-32]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-16]; ldr d8,[x4,#-8]; prfm pldl1keep,[x1,#64]\n\t"\ + "ldr d3,[x4]; cmp w5,#6\n\t"\ + "ldr d4,[x4,#8]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(15, 22, 16, 0, 1, 0, 7, 7, 8)\ + "ldr d0,[x0],#8\n\t" FMA_3V(29, 23, 30, 2, 1, 2, 7, 8, 8)\ + "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#40]; ldr d9,[x4,#48]; ldr d3,[x4,#56]; ldr d4,[x4,#64]\n\t"\ + "prfm pldl1keep,[x2,#64]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#72]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#80]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#88]; ldr d8,[x4,#96]; 
ldr d3,[x4,#104]; ldr d4,[x4,#112]\n\t"\ + "add x4,x4,#208\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-88]\n\t" FMA_3V(15, 22, 16, 0, 1, 0, 7, 7, 8)\ + "ldr d0,[x0],#8\n\t" FMA_3V(29, 23, 30, 2, 1, 2, 7, 8, 8) + +#define KERNEL_M3N13_TAIL4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-80]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-72]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#-64]; ldr d9,[x4,#-56]; prfm pldl1keep,[x3]\n\t"\ + "ldr d3,[x4,#-48]; sub w5,w5,#4\n\t"\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#-32]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-16]; ldr d8,[x4,#-8]; prfm pldl1keep,[x8]\n\t"\ + "ldr d3,[x4]\n\t"\ + "ldr d4,[x4,#8]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(15, 22, 16, 0, 1, 0, 7, 7, 8)\ + "ldr d0,[x0],#8\n\t" FMA_3V(29, 23, 30, 2, 1, 2, 7, 8, 8)\ + "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#40]; ldr d9,[x4,#48]; ldr d3,[x4,#56]; ldr d4,[x4,#64]\n\t"\ + FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#72]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#80]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#88]; ldr d8,[x4,#96]\n\t"\ + "add x4,x4,#104\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(15, 22, 16, 0, 1, 0, 7, 7, 8)\ + FMA_3V(29, 23, 30, 2, 1, 2, 7, 8, 8) + +#define KERNEL_M3N13_TAIL2 \ + "ldr d1,[x1],#8\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-80]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-72]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#-64]; ldr d9,[x4,#-56]\n\t"\ + "ldr d3,[x4,#-48]; sub w5,w5,#2\n\t"\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr d5,[x4,#-32]\n\t" FMA_3V(16, 23, 17, 0, 1, 0, 8, 8, 9)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 24, 31, 2, 1, 2, 8, 9, 9)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-24]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-16]; ldr d8,[x4,#-8]\n\t"\ + "prfm pldl1keep,[x3]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(15, 22, 16, 0, 1, 0, 7, 7, 8)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(29, 23, 30, 2, 1, 2, 7, 8, 8) + +#define KERNEL_M3N13_FIN1 \ + "ld1r {v0.2s},[x0],#4; ldr d3,[x4]\n\t"\ + "ldr d4,[x4,#8]; ldr d5,[x4,#16]; add x4,x4,#52\n\t"\ + "ld1r {v1.2s},[x1],#4\n\t" FMA_3V(11, 12, 13, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(18, 19, 20, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-28]\n\t" FMA_3V(25, 26, 27, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-20]\n\t" FMA_3V(14, 21, 28, 0, 1, 2, 6, 6, 6)\ + "ldr d8,[x4,#-12]\n\t" 
FMA_3V(15, 22, 29, 0, 1, 2, 7, 7, 7)\ + "ldr s9,[x4,#-4]\n\t" FMA_3V(16, 23, 30, 0, 1, 2, 8, 8, 8)\ + FMA_3V(17, 24, 31, 0, 1, 2, 9, 9, 9) + +#define SAVE_M3N13(mode) \ + UNIT_SAVE_M3N2_##mode(11, 18, 25) UNIT_SAVE_M3N2_##mode(12, 19, 26)\ + UNIT_SAVE_M3N2_##mode(13, 20, 27) UNIT_SAVE_M3N2_##mode(14, 21, 28)\ + UNIT_SAVE_M3N2_##mode(15, 22, 29) UNIT_SAVE_M3N2_##mode(16, 23, 30)\ + UNIT_SAVE_M3N1_##mode(17, 24, 31) + + +/* acc layout for m3n15 kernel */ +/* m0n0 v8 v9 v10 v11 v12 v13 v14 v15_h m0n15 */ +/* m1n0 v16 v17 v18 v19 v20 v21 v22 v23_h m1n15 */ +/* m2n0 v24 v25 v26 v27 v28 v29 v30 v31_h m2n15 */ +/* b-holder layout for m3n15 kernel */ +/* n0 v3 v4 v5 v6 v7/v5 v5/v6 v6/v7 v7(s) n15 */ +/* a-holder layout for m3n15 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2 */ + +#define INIT_M3N15 \ + INIT_3V(8, 16, 24) INIT_3V(9, 17, 25) INIT_3V(10, 18, 26)\ + INIT_3V(11, 19, 27) INIT_3V(12, 20, 28) INIT_3V(13, 21, 29)\ + INIT_3V(14, 22, 30) INIT_3V(15, 23, 31) + +#define KERNEL_M3N15_PRELOAD2 \ + "ldr d0,[x0],#8\n\t"\ + "ldr d3,[x4]; ldr d4,[x4,#8]; ldr d5,[x4,#16]; add x4,x4,#120\n\t" + +#define KERNEL_M3N15_MAIN4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-96]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-88]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-80]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-72]; ldr d7,[x4,#-64]; ldr d3,[x4,#-56]; ldr d4,[x4,#-48]\n\t"\ + "prfm pldl1keep,[x0,#64]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#-40]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-32]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; ldr d3,[x4]; ldr d4,[x4,#8]\n\t"\ + "prfm pldl1keep,[x1,#64]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(13, 21, 14, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8\n\t" FMA_3V(29, 22, 30, 2, 1, 2, 6, 7, 7)\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#48]; ldr d7,[x4,#56]; ldr d3,[x4,#64]; ldr d4,[x4,#72]\n\t"\ + "prfm pldl1keep,[x2,#64]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#80]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#88]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d5,[x4,#96]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d6,[x4,#104]; ldr d7,[x4,#112]; ldr d3,[x4,#120]\n\t"\ + "ldr d4,[x4,#128]; sub w5,w5,#4\n\t"\ + FMA_3V(12, 20, 28, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#136]; add x4,x4,#240\n\t"\ + FMA_3V(13, 21, 14, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8; cmp w5,#6\n\t"\ + FMA_3V(29, 22, 30, 2, 1, 2, 6, 7, 7) + +#define KERNEL_M3N15_TAIL4 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-96]\n\t" FMA_3V(24, 25, 
26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-88]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-80]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-72]; ldr d7,[x4,#-64]; ldr d3,[x4,#-56]; ldr d4,[x4,#-48]\n\t"\ + "prfm pldl1keep,[x3]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#-40]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-32]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]; ldr d3,[x4]; ldr d4,[x4,#8]\n\t"\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#16]\n\t" FMA_3V(13, 21, 14, 0, 1, 0, 6, 6, 7)\ + "ldr d0,[x0],#8\n\t" FMA_3V(29, 22, 30, 2, 1, 2, 6, 7, 7)\ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#24]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#32]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#40]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#48]; ldr d7,[x4,#56]; ldr d3,[x4,#64]; ldr d4,[x4,#72]\n\t"\ + FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#80]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#88]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d5,[x4,#96]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d6,[x4,#104]; ldr d7,[x4,#112]\n\t"\ + "sub w5,w5,#4\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 5, 5, 5)\ + "prfm pldl1keep,[x9]; add x4,x4,#120\n\t"\ + FMA_3V(13, 21, 14, 0, 1, 0, 6, 6, 7)\ + FMA_3V(29, 22, 30, 2, 1, 2, 6, 7, 7) + +#define KERNEL_M3N15_TAIL2 \ + "ldr d1,[x1],#8\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ldr d2,[x2],#8\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-96]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-88]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-80]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-72]; ldr d7,[x4,#-64]; ldr d3,[x4,#-56]; ldr d4,[x4,#-48]\n\t"\ + "prfm pldl1keep,[x3]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 5, 5, 5)\ + "ldr d5,[x4,#-40]\n\t" FMA_3V(14, 22, 15, 0, 1, 0, 6, 6, 7)\ + "rev64 v0.2s,v0.2s\n\t" FMA_3V(30, 23, 31, 2, 1, 2, 6, 7, 7)\ + "rev64 v1.2s,v1.2s\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "rev64 v2.2s,v2.2s\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-32]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d5,[x4,#-24]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d6,[x4,#-16]; ldr d7,[x4,#-8]\n\t"\ + "prfm pldl1keep,[x8]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 5, 5, 5)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(13, 21, 14, 0, 1, 0, 6, 6, 7)\ + "sub w5,w5,#2\n\t" FMA_3V(29, 22, 30, 2, 1, 2, 6, 7, 7) + +#define KERNEL_M3N15_FIN1 \ + "ld1r {v0.2s},[x0],#4; ldr d3,[x4]; ldr d4,[x4,#8]; ldr d5,[x4,#16]\n\t"\ + "ld1r {v1.2s},[x1],#4; add x4,x4,#60\n\t" FMA_3V(8, 9, 10, 0, 0, 0, 3, 4, 5)\ + "ld1r {v2.2s},[x2],#4\n\t" FMA_3V(16, 17, 18, 1, 1, 1, 3, 4, 5)\ + "ldr d6,[x4,#-36]\n\t" FMA_3V(24, 25, 26, 2, 2, 2, 3, 4, 5)\ + "ldr d7,[x4,#-28]\n\t" FMA_3V(11, 19, 27, 0, 1, 2, 6, 6, 6)\ + "ldr d5,[x4,#-20]\n\t" FMA_3V(12, 20, 28, 0, 1, 2, 7, 7, 7)\ + "ldr d6,[x4,#-12]\n\t" FMA_3V(13, 21, 29, 0, 1, 2, 
5, 5, 5)\ + "ldr s7,[x4,#-4]\n\t" FMA_3V(14, 22, 30, 0, 1, 2, 6, 6, 6)\ + FMA_3V(15, 23, 31, 0, 1, 2, 7, 7, 7) + +#define SAVE_M3N15(mode) \ + UNIT_SAVE_M3N2_##mode(8, 16, 24) UNIT_SAVE_M3N2_##mode(9, 17, 25)\ + UNIT_SAVE_M3N2_##mode(10, 18, 26) UNIT_SAVE_M3N2_##mode(11, 19, 27)\ + UNIT_SAVE_M3N2_##mode(12, 20, 28) UNIT_SAVE_M3N2_##mode(13, 21, 29)\ + UNIT_SAVE_M3N2_##mode(14, 22, 30) UNIT_SAVE_M3N1_##mode(15, 23, 31) + + +/* acc layout for m3n17 kernel */ +/* m0n0 v5 v6 v7 v8 v9 v10 v11 v12 v13_h m0n17 */ +/* m1n0 v14 v15 v16 v17 v18 v19 v20 v21 v22_h m1n17 */ +/* m2n0 v23 v24 v25 v26 v27 v28 v29 v30 v31_h m2n17 */ +/* b-holder layout for m3n17 kernel */ +/* n0 v3-4 alt n17 */ +/* a-holder layout for m3n17 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2 */ + +#define INIT_M3N17 \ + INIT_3V(5, 14, 23) INIT_3V(6, 15, 24) INIT_3V(7, 16, 25)\ + INIT_3V(8, 17, 26) INIT_3V(9, 18, 27) INIT_3V(10, 19, 28)\ + INIT_3V(11, 20, 29) INIT_3V(12, 21, 30) INIT_3V(13, 22, 31) + +#define KERNEL_M3N17_PRELOAD2 \ + "ldr d3,[x4],#136\n\t" + +#define KERNEL_M3N17_MAIN4 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "prfm pldl1keep,[x0,#64]\n\t"\ + "ldr d4,[x4,#-128]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-120]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-112]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-104]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-96]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-88]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-80]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-72]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-64]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3)\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "prfm pldl1keep,[x1,#64]\n\t"\ + "ldr d3,[x4,#-56]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-48]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-40]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-32]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-24]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-16]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-8]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3)\ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "ldr d3,[x4,#8]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#16]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#24]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#32]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#40]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#48]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#56]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#64]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#72]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 4, 4, 4)\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "prfm pldl1keep,[x2,#64]\n\t"\ + "ldr d4,[x4,#80]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#88]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#96]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#104]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#112]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#120]; sub w5,w5,#4\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#128]; cmp w5,#6\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#136]; add x4,x4,#272\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 
4) + +#define KERNEL_M3N17_TAIL4 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "ldr d4,[x4,#-128]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-120]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-112]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-104]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-96]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-88]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-80]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-72]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-64]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3)\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "prfm pldl1keep,[x3]\n\t"\ + "ldr d3,[x4,#-56]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-48]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-40]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-32]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-24]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-16]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-8]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3)\ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "ldr d3,[x4,#8]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#16]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#24]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#32]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#40]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#48]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#56]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#64]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#72]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 4, 4, 4)\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "prfm pldl1keep,[x8]\n\t"\ + "ldr d4,[x4,#80]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#88]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#96]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#104]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#112]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#120]; sub w5,w5,#4\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#128]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "prfm pldl1keep,[x9]; add x4,x4,#136\n\t"\ + FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4) + +#define KERNEL_M3N17_TAIL2 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "prfm pldl1keep,[x3]\n\t"\ + "ldr d4,[x4,#-128]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-120]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-112]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-104]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-96]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-88]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-80]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-72]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-64]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3)\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "sub w5,w5,#2; prfm pldl1keep,[x8]\n\t"\ + "ldr d3,[x4,#-56]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-48]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-40]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-32]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-24]\n\t" FMA_3V(9, 18, 
27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-16]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-8]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3) + +#define KERNEL_M3N17_FIN1 \ + "ldr d3,[x4],#68\n\t"\ + "ld1r {v0.2s},[x0],#4; ld1r {v1.2s},[x1],#4; ld1r {v2.2s},[x2],#4\n\t"\ + "ldr d4,[x4,#-60]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-52]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-44]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-36]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-28]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-20]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-12]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr s3,[x4,#-4]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3) + +#define SAVE_M3N17(mode) \ + UNIT_SAVE_M3N2_##mode(5, 14, 23) UNIT_SAVE_M3N2_##mode(6, 15, 24)\ + UNIT_SAVE_M3N2_##mode(7, 16, 25) UNIT_SAVE_M3N2_##mode(8, 17, 26)\ + UNIT_SAVE_M3N2_##mode(9, 18, 27) UNIT_SAVE_M3N2_##mode(10, 19, 28)\ + UNIT_SAVE_M3N2_##mode(11, 20, 29) UNIT_SAVE_M3N2_##mode(12, 21, 30)\ + UNIT_SAVE_M3N1_##mode(13, 22, 31) + + +/* acc layout for m3n18 kernel */ +/* m0n0 v5 v6 v7 v8 v9 v10 v11 v12 v13 m0n18 */ +/* m1n0 v14 v15 v16 v17 v18 v19 v20 v21 v22 m1n18 */ +/* m2n0 v23 v24 v25 v26 v27 v28 v29 v30 v31 m2n18 */ +/* b-holder layout for m3n18 kernel */ +/* n0 v3-4 alt n18 */ +/* a-holder layout for m3n18 kernel */ +/* a_ptr1->v0, a_ptr2->v1, a_ptr3->v2 */ + +#define INIT_M3N18 \ + INIT_3V(5, 14, 23) INIT_3V(6, 15, 24) INIT_3V(7, 16, 25)\ + INIT_3V(8, 17, 26) INIT_3V(9, 18, 27) INIT_3V(10, 19, 28)\ + INIT_3V(11, 20, 29) INIT_3V(12, 21, 30) INIT_3V(13, 22, 31) + +#define KERNEL_M3N18_PRELOAD2 \ + "ldr d3,[x4],#144\n\t" + +#define KERNEL_M3N18_MAIN4 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "prfm pldl1keep,[x0,#64]; sub w5,w5,#4\n\t"\ + "ldr d4,[x4,#-136]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-128]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-120]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-112]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-104]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-96]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-88]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-80]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-72]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3)\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "prfm pldl1keep,[x1,#64]; cmp w5,#6\n\t"\ + "ldr d3,[x4,#-64]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-56]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-48]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-32]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-24]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-16]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-8]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 4, 4, 4)\ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "prfm pldl1keep,[x2,#64]\n\t"\ + "ldr d4,[x4,#8]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#16]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#24]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#32]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr 
d4,[x4,#40]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#48]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#56]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#64]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#72]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3)\ + "add x4,x4,#288\n\t"\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "ldr d3,[x4,#-208]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-200]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-192]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-184]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-176]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-168]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-160]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-152]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-144]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 4, 4, 4) + +#define KERNEL_M3N18_TAIL4 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "prfm pldl1keep,[x3]; sub w5,w5,#4\n\t"\ + "ldr d4,[x4,#-136]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-128]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-120]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-112]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-104]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-96]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-88]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-80]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-72]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3)\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "prfm pldl1keep,[x8]\n\t"\ + "ldr d3,[x4,#-64]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-56]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-48]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-32]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-24]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-16]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-8]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 4, 4, 4)\ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "ldr d4,[x4,#8]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#16]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#24]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#32]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#40]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#48]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#56]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#64]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#72]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3)\ + "add x4,x4,#144\n\t"\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "ldr d3,[x4,#-64]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-56]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-48]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-32]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-24]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-16]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-8]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(13, 22, 31, 0, 
1, 2, 4, 4, 4) + +#define KERNEL_M3N18_TAIL2 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr d2,[x2],#8\n\t"\ + "prfm pldl1keep,[x3]; sub w5,w5,#2\n\t"\ + "ldr d4,[x4,#-136]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-128]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-120]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-112]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-104]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-96]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-88]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-80]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-72]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3)\ + "rev64 v0.2s,v0.2s; rev64 v1.2s,v1.2s; rev64 v2.2s,v2.2s\n\t"\ + "prfm pldl1keep,[x8]\n\t"\ + "ldr d3,[x4,#-64]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-56]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-48]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-40]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-32]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-24]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-16]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-8]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 3, 3, 3)\ + "prfm pldl1keep,[x9]\n\t" FMA_3V(13, 22, 31, 0, 1, 2, 4, 4, 4) + +#define KERNEL_M3N18_FIN1 \ + "ldr d3,[x4],#72\n\t"\ + "ld1r {v0.2s},[x0],#4; ld1r {v1.2s},[x1],#4; ld1r {v2.2s},[x2],#4\n\t"\ + "ldr d4,[x4,#-64]\n\t" FMA_3V(5, 14, 23, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-56]\n\t" FMA_3V(6, 15, 24, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-48]\n\t" FMA_3V(7, 16, 25, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-40]\n\t" FMA_3V(8, 17, 26, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-32]\n\t" FMA_3V(9, 18, 27, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-24]\n\t" FMA_3V(10, 19, 28, 0, 1, 2, 4, 4, 4)\ + "ldr d4,[x4,#-16]\n\t" FMA_3V(11, 20, 29, 0, 1, 2, 3, 3, 3)\ + "ldr d3,[x4,#-8]\n\t" FMA_3V(12, 21, 30, 0, 1, 2, 4, 4, 4)\ + FMA_3V(13, 22, 31, 0, 1, 2, 3, 3, 3) + +#define SAVE_M3N18(mode) \ + UNIT_SAVE_M3N2_##mode(5, 14, 23) UNIT_SAVE_M3N2_##mode(6, 15, 24)\ + UNIT_SAVE_M3N2_##mode(7, 16, 25) UNIT_SAVE_M3N2_##mode(8, 17, 26)\ + UNIT_SAVE_M3N2_##mode(9, 18, 27) UNIT_SAVE_M3N2_##mode(10, 19, 28)\ + UNIT_SAVE_M3N2_##mode(11, 20, 29) UNIT_SAVE_M3N2_##mode(12, 21, 30)\ + UNIT_SAVE_M3N2_##mode(13, 22, 31) + + +FUNC_PACK4(4, 9) + +FUNC_PACK4(4, 11) + +FUNC_PACK4(3, 13) + +FUNC_PACK4(3, 15) + +FUNC_PACK4(3, 17) + +FUNC_PACK4(3, 18) + + +#define INIT_M1N4 \ + float32x2_t cd1, cd2, cd3, cd4;\ + cd1 = cd2 = cd3 = cd4 = vdup_n_f32(0.0f); + +#define INIT_M1N5 INIT_M1N4 float32x2_t cd5 = vdup_n_f32(0.0f); + +#define INIT_M1N6 INIT_M1N5 float32x2_t cd6 = vdup_n_f32(0.0f); + +#define INIT_M1N7 INIT_M1N6 float32x2_t cd7 = vdup_n_f32(0.0f); + +#define INIT_M1N8 INIT_M1N7 float32x2_t cd8 = vdup_n_f32(0.0f); + +#define INIT_M1N10 INIT_M1N5 + +#define INIT_M1N12 INIT_M1N6 + +#define INIT_M1N14 INIT_M1N7 + +#define INIT_M1N16 INIT_M1N8 + +#define INIT_M1N9 \ + float32x2_t cd1, cd2, cd3, cd4;\ + cd1 = cd2 = cd3 = cd4 = vdup_n_f32(0.0f);\ + float32x2_t cd0 = vdup_n_f32(0.0f); + +#define INIT_M1N11 INIT_M1N10 float32x2_t cd0 = vdup_n_f32(0.0f); + +#define INIT_M1N13 INIT_M1N12 float32x2_t cd0 = vdup_n_f32(0.0f); + +#define INIT_M1N15 INIT_M1N14 float32x2_t cd0 = vdup_n_f32(0.0f); + +#define INIT_M1N17 INIT_M1N16 float32x2_t cd0 = vdup_n_f32(0.0f); + +#define INIT_M1N18 INIT_M1N16 float32x2_t cd9 = vdup_n_f32(0.0f); + +#define LOAD_4D_B \ + float32x2_t bd1 = 
vld1_f32(b_ptr);\ + float32x2_t bd2 = vld1_f32(b_ptr + 2);\ + float32x2_t bd3 = vld1_f32(b_ptr + 4);\ + float32x2_t bd4 = vld1_f32(b_ptr + 6); + +#define LOAD_5D_B LOAD_4D_B float32x2_t bd5 = vld1_f32(b_ptr + 8); + +#define LOAD_6D_B LOAD_5D_B float32x2_t bd6 = vld1_f32(b_ptr + 10); + +#define LOAD_7D_B LOAD_6D_B float32x2_t bd7 = vld1_f32(b_ptr + 12); + +#define LOAD_8D_B LOAD_7D_B float32x2_t bd8 = vld1_f32(b_ptr + 14); + +#define LOAD_9D_B LOAD_8D_B float32x2_t bd9 = vld1_f32(b_ptr + 16); + +#define ACC_4D \ + cd1 = vfma_f32(cd1, ad1, bd1);\ + cd2 = vfma_f32(cd2, ad1, bd2);\ + cd3 = vfma_f32(cd3, ad1, bd3);\ + cd4 = vfma_f32(cd4, ad1, bd4); + +#define ACC_5D ACC_4D cd5 = vfma_f32(cd5, ad1, bd5); + +#define ACC_6D ACC_5D cd6 = vfma_f32(cd6, ad1, bd6); + +#define ACC_7D ACC_6D cd7 = vfma_f32(cd7, ad1, bd7); + +#define ACC_8D ACC_7D cd8 = vfma_f32(cd8, ad1, bd8); + +#define ACC_9D ACC_8D cd9 = vfma_f32(cd9, ad1, bd9); + +#define REDUC_4D \ + float cs1 = vpadds_f32(cd1); float cs2 = vpadds_f32(cd2);\ + float cs3 = vpadds_f32(cd3); float cs4 = vpadds_f32(cd4);\ + +#define REDUC_5D REDUC_4D float cs5 = vpadds_f32(cd5); + +#define REDUC_6D REDUC_5D float cs6 = vpadds_f32(cd6); + +#define REDUC_7D REDUC_6D float cs7 = vpadds_f32(cd7); + +#define REDUC_8D REDUC_7D float cs8 = vpadds_f32(cd8); + +#define ACC_4S \ + cs1 += as1 * b_ptr[0]; cs2 += as1 * b_ptr[1];\ + cs3 += as1 * b_ptr[2]; cs4 += as1 * b_ptr[3];\ + +#define ACC_5S ACC_4S cs5 += as1 * b_ptr[4]; + +#define ACC_6S ACC_5S cs6 += as1 * b_ptr[5]; + +#define ACC_7S ACC_6S cs7 += as1 * b_ptr[6]; + +#define ACC_8S ACC_7S cs8 += as1 * b_ptr[7]; + +#define UNIT_SAVE_M1N1_CC(cs1) \ + c_ptr[0] = c_ptr[0] * beta + cs1; c_ptr += LDC; + +#define UNIT_SAVE_M1N1_CR(cs1) \ + c_ptr[0] = c_ptr[0] * beta + cs1; c_ptr++; + +#define UNIT_SAVE_M1N2_CC(cd1) \ + c_ptr[0] = c_ptr[0] * beta + vget_lane_f32(cd1, 0);\ + c_ptr[LDC] = c_ptr[LDC] * beta + vget_lane_f32(cd1, 1);\ + c_ptr += LDC * 2; + +#define UNIT_SAVE_M1N2_CR(cd1) \ + cd1 = vfma_n_f32(cd1, vld1_f32(c_ptr), beta);\ + vst1_f32(c_ptr, cd1); c_ptr += 2; + +#define SAVE_M1N4(mode) \ + UNIT_SAVE_M1N1_##mode(cs1) UNIT_SAVE_M1N1_##mode(cs2)\ + UNIT_SAVE_M1N1_##mode(cs3) UNIT_SAVE_M1N1_##mode(cs4)\ + +#define SAVE_M1N5(mode) SAVE_M1N4(mode) UNIT_SAVE_M1N1_##mode(cs5) + +#define SAVE_M1N6(mode) SAVE_M1N5(mode) UNIT_SAVE_M1N1_##mode(cs6) + +#define SAVE_M1N7(mode) SAVE_M1N6(mode) UNIT_SAVE_M1N1_##mode(cs7) + +#define SAVE_M1N8(mode) SAVE_M1N7(mode) UNIT_SAVE_M1N1_##mode(cs8) + +#define SAVE_M1N10(mode) \ + UNIT_SAVE_M1N2_##mode(cd1) UNIT_SAVE_M1N2_##mode(cd2)\ + UNIT_SAVE_M1N2_##mode(cd3) UNIT_SAVE_M1N2_##mode(cd4)\ + UNIT_SAVE_M1N2_##mode(cd5) + +#define SAVE_M1N12(mode) SAVE_M1N10(mode) UNIT_SAVE_M1N2_##mode(cd6) + +#define SAVE_M1N14(mode) SAVE_M1N12(mode) UNIT_SAVE_M1N2_##mode(cd7) + +#define SAVE_M1N16(mode) SAVE_M1N14(mode) UNIT_SAVE_M1N2_##mode(cd8) + +#define SAVE_M1N18(mode) SAVE_M1N16(mode) UNIT_SAVE_M1N2_##mode(cd9) + +#define SAVE_M1N9(mode) \ + UNIT_SAVE_M1N2_##mode(cd1) UNIT_SAVE_M1N2_##mode(cd2)\ + UNIT_SAVE_M1N2_##mode(cd3) UNIT_SAVE_M1N2_##mode(cd4)\ + UNIT_SAVE_M1N1_##mode(cs0) + +#define SAVE_M1N11(mode) SAVE_M1N10(mode) UNIT_SAVE_M1N1_##mode(cs0) + +#define SAVE_M1N13(mode) SAVE_M1N12(mode) UNIT_SAVE_M1N1_##mode(cs0) + +#define SAVE_M1N15(mode) SAVE_M1N14(mode) UNIT_SAVE_M1N1_##mode(cs0) + +#define SAVE_M1N17(mode) SAVE_M1N16(mode) UNIT_SAVE_M1N1_##mode(cs0) + +#define COMPUTE_M1_PACK3(ndim) \ + for (; k_left > 1; k_left -= 2) {\ + float32x2_t ad1 = vld1_f32(a_ptr); a_ptr += 2;\ + 
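/* note: ad1 carries a[k] and a[k+1] in one 2-lane vector, so each cdX below holds two k-partial sums for column X; REDUC_##ndim##D folds them with vpadds_f32 after the loop */\ +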
LOAD_##ndim##D_B\ + ACC_##ndim##D\ + b_ptr += 2 * ndim;\ + }\ + REDUC_##ndim##D\ + if (k_left > 0) {\ + float as1 = *a_ptr;\ + ACC_##ndim##S\ + } + +#define COMPUTE_M1_PACK0_BASE(ndiv2) \ + for (; k_left > 0; k_left--) {\ + float32x2_t ad1 = vld1_dup_f32(a_ptr); a_ptr++;\ + LOAD_##ndiv2##D_B\ + ACC_##ndiv2##D\ + b_ptr += ndiv2 * 2;\ + } + +#define COMPUTE_M1_PACK0_N10 COMPUTE_M1_PACK0_BASE(5) +#define COMPUTE_M1_PACK0_N12 COMPUTE_M1_PACK0_BASE(6) +#define COMPUTE_M1_PACK0_N14 COMPUTE_M1_PACK0_BASE(7) +#define COMPUTE_M1_PACK0_N16 COMPUTE_M1_PACK0_BASE(8) +#define COMPUTE_M1_PACK0(ndim) COMPUTE_M1_PACK0_N##ndim + +#define COMPUTE_M1_PACK4_EVEN(ndiv2) \ + for (; k_left > 1; k_left -= 2) {\ + float32x2_t ad1 = vld1_f32(a_ptr); a_ptr += 2;\ + {\ + LOAD_##ndiv2##D_B b_ptr += ndiv2 * 2;\ + ACC_##ndiv2##D\ + }\ + ad1 = vrev64_f32(ad1);\ + LOAD_##ndiv2##D_B b_ptr += ndiv2 * 2;\ + ACC_##ndiv2##D\ + }\ + if (k_left > 0) {\ + float32x2_t ad1 = vld1_dup_f32(a_ptr);\ + LOAD_##ndiv2##D_B\ + ACC_##ndiv2##D\ + } + +#define COMPUTE_M1_PACK4_N18 COMPUTE_M1_PACK4_EVEN(9) + +#define COMPUTE_M1_PACK4_ODD(ndiv2) \ + for (; k_left > 1; k_left -= 2) {\ + float32x2_t ad1 = vld1_f32(a_ptr); a_ptr += 2;\ + {\ + LOAD_##ndiv2##D_B\ + float32x2_t bd0 = vld1_f32(b_ptr + ndiv2 * 2);\ + b_ptr += ndiv2 * 2 + 2;\ + ACC_##ndiv2##D\ + cd0 = vfma_f32(cd0, ad1, bd0);\ + }\ + ad1 = vrev64_f32(ad1);\ + LOAD_##ndiv2##D_B b_ptr += ndiv2 * 2;\ + ACC_##ndiv2##D\ + }\ + float cs0 = vpadds_f32(cd0);\ + if (k_left > 0) {\ + float32x2_t ad1 = vld1_dup_f32(a_ptr);\ + LOAD_##ndiv2##D_B\ + float bs0 = b_ptr[ndiv2 * 2];\ + ACC_##ndiv2##D\ + cs0 += bs0 * vget_lane_f32(ad1, 0);\ + } + +#define COMPUTE_M1_PACK4_N9 COMPUTE_M1_PACK4_ODD(4) +#define COMPUTE_M1_PACK4_N11 COMPUTE_M1_PACK4_ODD(5) +#define COMPUTE_M1_PACK4_N13 COMPUTE_M1_PACK4_ODD(6) +#define COMPUTE_M1_PACK4_N15 COMPUTE_M1_PACK4_ODD(7) +#define COMPUTE_M1_PACK4_N17 COMPUTE_M1_PACK4_ODD(8) + +#define COMPUTE_M1_PACK4(ndim) COMPUTE_M1_PACK4_N##ndim + +#define FUNC_EDGE(ndim, pack) \ +static inline void sgemm_skinny1_a35_m1n##ndim(\ + const float * __restrict__ a_ptr, const float * __restrict__ b_ptr,\ + float * __restrict__ c_ptr, uint32_t k_left, uint32_t LDC,\ + uint8_t c_rowmajor, float beta) {\ + INIT_M1N##ndim\ + COMPUTE_M1_PACK##pack(ndim)\ + if (c_rowmajor == 0) {\ + SAVE_M1N##ndim(CC)\ + } else {\ + SAVE_M1N##ndim(CR)\ + }\ +} + +FUNC_EDGE(4, 3) + +FUNC_EDGE(5, 3) + +FUNC_EDGE(6, 3) + +FUNC_EDGE(7, 3) + +FUNC_EDGE(8, 3) + +FUNC_EDGE(10, 0) + +FUNC_EDGE(12, 0) + +FUNC_EDGE(14, 0) + +FUNC_EDGE(16, 0) + +FUNC_EDGE(9, 4) + +FUNC_EDGE(11, 4) + +FUNC_EDGE(13, 4) + +FUNC_EDGE(15, 4) + +FUNC_EDGE(17, 4) + +FUNC_EDGE(18, 4) + +#endif diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA53.h b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA53.h new file mode 100644 index 0000000..f0f9370 --- /dev/null +++ b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA53.h @@ -0,0 +1,4306 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include <arm_neon.h> +#include <stdint.h> + +#ifndef INCLUDE_A53_KERNEL +#define INCLUDE_A53_KERNEL + +/* x0 - x3 for a_ptrs */ +/* x4 for b_ptr, x5 for k_left */ +/* x6 - x9 for a_pref */ +/* x10 - x11, x16 - x17 and x19 - x20 for vec_fill */ +/* x12 - x15 for c_tmp */ + +#define INIT_SAVE \ + "ldr s0,[%[beta_addr]]; mov x12,%[c_ptr]\n\t"\ + "add x13,%[c_ptr],%w[LDC],UXTW #2; add x14,%[c_ptr],%w[LDC],UXTW #3\n\t"\ + "add x15,x13,%w[LDC],UXTW #3\n\t" + +#define UNIT_SAVE_M4N4_VR_CC(c1, c2, c3, c4) \ + "ldr q1,[x12]; ldr q2,[x13]; ldr q3,[x14]; ldr q4,[x15]\n\t"\ + "zip1 v5.4s,v"#c1".4s,v"#c2".4s; zip1 v6.4s,v"#c3".4s,v"#c4".4s\n\t"\ + "zip2 v7.4s,v"#c1".4s,v"#c2".4s; zip2 v"#c4".4s,v"#c3".4s,v"#c4".4s\n\t"\ + "zip1 v"#c1".2d,v5.2d,v6.2d; zip1 v"#c3".2d,v7.2d,v"#c4".2d\n\t"\ + "zip2 v"#c2".2d,v5.2d,v6.2d; zip2 v"#c4".2d,v7.2d,v"#c4".2d\n\t"\ + "fmla v"#c1".4s,v1.4s,v0.s[0]; fmla v"#c2".4s,v2.4s,v0.s[0]\n\t"\ + "fmla v"#c3".4s,v3.4s,v0.s[0]; fmla v"#c4".4s,v4.4s,v0.s[0]\n\t"\ + "str q"#c1",[x12]; prfm pldl2keep,[x12,#32]\n\t"\ + "add x12,x12,%w[LDC],UXTW #4; prfm pstl1keep,[x12,#8]\n\t"\ + "str q"#c2",[x13]; prfm pldl2keep,[x13,#32]\n\t"\ + "add x13,x13,%w[LDC],UXTW #4; prfm pstl1keep,[x13,#8]\n\t"\ + "str q"#c3",[x14]; prfm pldl2keep,[x14,#32]\n\t"\ + "add x14,x14,%w[LDC],UXTW #4; prfm pstl1keep,[x14,#8]\n\t"\ + "str q"#c4",[x15]; prfm pldl2keep,[x15,#32]\n\t"\ + "add x15,x15,%w[LDC],UXTW #4; prfm pstl1keep,[x15,#8]\n\t" + +#define UNIT_SAVE_M4N4_VR_CR(c1, c2, c3, c4) \ + "ldr q1,[x12]; ldr q2,[x13]; ldr q3,[x14]; ldr q4,[x15]\n\t"\ + "fmla v"#c1".4s,v1.4s,v0.s[0]; fmla v"#c2".4s,v2.4s,v0.s[0]\n\t"\ + "fmla v"#c3".4s,v3.4s,v0.s[0]; fmla v"#c4".4s,v4.4s,v0.s[0]\n\t"\ + "str q"#c1",[x12],#16; str q"#c2",[x13],#16\n\t"\ + "str q"#c3",[x14],#16; str q"#c4",[x15],#16\n\t" + +#define UNIT_SAVE_M4N4_VC_CC(c1, c2, c3, c4) \ + "ldr q1,[x12]; ldr q2,[x13]; ldr q3,[x14]; ldr q4,[x15]\n\t"\ + "fmla v"#c1".4s,v1.4s,v0.s[0]; fmla v"#c2".4s,v2.4s,v0.s[0]\n\t"\ + "fmla v"#c3".4s,v3.4s,v0.s[0]; fmla v"#c4".4s,v4.4s,v0.s[0]\n\t"\ + "str q"#c1",[x12]; prfm pldl2keep,[x12,#32]\n\t"\ + "add x12,x12,%w[LDC],UXTW #4; prfm pstl1keep,[x12,#8]\n\t"\ + "str q"#c2",[x13]; prfm pldl2keep,[x13,#32]\n\t"\ + "add x13,x13,%w[LDC],UXTW #4; prfm pstl1keep,[x13,#8]\n\t"\ + "str q"#c3",[x14]; prfm pldl2keep,[x14,#32]\n\t"\ + "add x14,x14,%w[LDC],UXTW #4; prfm pstl1keep,[x14,#8]\n\t"\ + "str q"#c4",[x15]; prfm pldl2keep,[x15,#32]\n\t"\ + "add x15,x15,%w[LDC],UXTW #4; prfm pstl1keep,[x15,#8]\n\t" + +#define UNIT_SAVE_M4N4_VC_CR(c1, c2, c3, c4) \ + "zip1 v1.4s,v"#c1".4s,v"#c2".4s; zip1 v2.4s,v"#c3".4s,v"#c4".4s\n\t"\ + "zip2 v3.4s,v"#c1".4s,v"#c2".4s; zip2 v4.4s,v"#c3".4s,v"#c4".4s\n\t"\ + "zip1 v"#c1".2d,v1.2d,v2.2d; zip2 v"#c2".2d,v1.2d,v2.2d\n\t"\ + "ldr q1,[x12]; ldr q2,[x13]\n\t"\ + "zip1 v"#c3".2d,v3.2d,v4.2d; zip2 v"#c4".2d,v3.2d,v4.2d\n\t"\ + "ldr q3,[x14]; ldr q4,[x15]\n\t"\ + "fmla v"#c1".4s,v1.4s,v0.s[0]; fmla v"#c2".4s,v2.4s,v0.s[0]\n\t"\ + "fmla v"#c3".4s,v3.4s,v0.s[0]; fmla v"#c4".4s,v4.4s,v0.s[0]\n\t"\ +
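/* note: v0.s[0] holds beta, loaded once by INIT_SAVE, so every save path computes C_tile = acc + beta * C_tile before storing */\ +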
"str q"#c1",[x12],#16; str q"#c2",[x13],#16\n\t"\ + "str q"#c3",[x14],#16; str q"#c4",[x15],#16\n\t" + +#define EDGE_SAVE_M4N1K4_CC(c1, c2, c3, c4) \ + "ldr q1,[x12]\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c2".4s\n\t"\ + "faddp v"#c3".4s,v"#c3".4s,v"#c4".4s\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c3".4s\n\t"\ + "fmla v"#c1".4s,v1.4s,v0.s[0]; str q"#c1",[x12]\n\t"\ + "prfm pldl1keep,[x12,#32]; add x12,x12,%w[LDC],UXTW #2\n\t" + +#define EDGE_SAVE_M4N1K4_CR(c1, c2, c3, c4) \ + "ldr s1,[x12]; ldr s2,[x13]; ldr s3,[x14]; ldr s4,[x15]\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c2".4s\n\t"\ + "ins v1.s[1],v2.s[0]; ins v3.s[1],v4.s[0]\n\t"\ + "faddp v"#c3".4s,v"#c3".4s,v"#c4".4s\n\t"\ + "ins v1.d[1],v3.d[0]\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c3".4s\n\t"\ + "fmla v"#c1".4s,v1.4s,v0.s[0]\n\t"\ + "st1 {v"#c1".s}[0],[x12],#4; st1 {v"#c1".s}[1],[x13],#4\n\t"\ + "st1 {v"#c1".s}[2],[x14],#4; st1 {v"#c1".s}[3],[x15],#4\n\t" + +#define EDGE_SAVE_M4N1K2_CC(c1, c2) \ + "ldr q1,[x12]\n\t"\ + "trn1 v2.4s,v"#c1".4s,v"#c2".4s; trn2 v3.4s,v"#c1".4s,v"#c2".4s\n\t"\ + "fadd v2.4s,v2.4s,v3.4s; fmla v2.4s,v1.4s,v0.s[0]\n\t"\ + "str q2,[x12]; prfm pstl2keep,[x12,#32]; add x12,x12,%w[LDC],UXTW #2\n\t" + +#define EDGE_SAVE_M4N1K2_CR(c1, c2) \ + "ldr s1,[x12]; ldr s2,[x13]; ldr s3,[x14]; ldr s4,[x15]\n\t"\ + "dup d5,v"#c1".d[1]; ins v1.s[1],v2.s[0]\n\t"\ + "dup d6,v"#c2".d[1]; ins v3.s[1],v4.s[0]\n\t"\ + "faddp v"#c1".2s,v"#c1".2s,v"#c2".2s; faddp v"#c2".2s,v5.2s,v6.2s\n\t"\ + "fmla v"#c1".2s,v1.2s,v0.s[0]; fmla v"#c2".2s,v3.2s,v0.s[0]\n\t"\ + "st1 {v"#c1".s}[0],[x12],#4; st1 {v"#c1".s}[1],[x13],#4\n\t"\ + "st1 {v"#c2".s}[0],[x14],#4; st1 {v"#c2".s}[1],[x15],#4\n\t" + +#define EDGE_SAVE_M4N1K1_CC(c1) \ + "ldr q1,[x12]; fmla v"#c1".4s,v1.4s,v0.s[0]\n\t"\ + "str q"#c1",[x12]; prfm pstl2keep,[x12,#32]\n\t"\ + "add x12,x12,%w[LDC],UXTW #2\n\t" + +#define EDGE_SAVE_M4N1K1_CR(c1) \ + "ldr s1,[x12]; ldr s2,[x13]; ldr s3,[x14]; ldr s4,[x15]\n\t"\ + "ins v1.s[1],v2.s[0]; ins v3.s[1],v4.s[0]; ins v1.d[1],v3.d[0]\n\t"\ + "fmla v"#c1".4s,v1.4s,v0.s[0]\n\t"\ + "st1 {v"#c1".s}[0],[x12],#4; st1 {v"#c1".s}[1],[x13],#4\n\t"\ + "st1 {v"#c1".s}[2],[x14],#4; st1 {v"#c1".s}[3],[x15],#4\n\t" + +#define INIT_1V(c1) "movi v"#c1".16b,#0\n\t" + +#define INIT_2V(c1, c2) \ + "movi v"#c1".16b,#0; movi v"#c2".16b,#0\n\t"\ + +#define INIT_4V(c1, c2, c3, c4) INIT_2V(c1, c2) INIT_2V(c3, c4) + +/* m4n4 c_vec */ +/* v28(v24) */ +/* v29(v25) */ +/* v30(v26) */ +/* v31(v27) */ +#define INIT_M4N4 \ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N4(mode) \ + "fadd v28.4s,v28.4s,v24.4s; fadd v29.4s,v29.4s,v25.4s\n\t"\ + "fadd v30.4s,v30.4s,v26.4s; fadd v31.4s,v31.4s,v27.4s\n\t"\ + UNIT_SAVE_M4N4_VR_##mode(28, 29, 30, 31) + +#define KERNEL_M4N4_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\ + "ldr q8,[x4],#64; ldr d9,[x4,#-48]; ldr x10,[x4,#-40]; ldr x11,[x3,#-8]\n\t" + +#define KERNEL_M4N4_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\ + "fmla v24.4s,v8.4s,v"#ac1".s[0]; prfm pldl1keep,[x1,#80]\n\t"\ + "fmla v25.4s,v8.4s,v"#ac2".s[0]; ldr x16,[x0,#-8]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-32]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac3".s[0]; ldr x10,[x4,#-24]\n\t"\ + "fmla v27.4s,v8.4s,v"#ac4".s[0]; prfm pldl1keep,[x0,#64]\n\t"\ + "fmov v10.d[1],x10; ldr d"#an2",[x1],#16\n\t"\ + "fmla v28.4s,v9.4s,v"#ac1".s[1]; prfm pldl1keep,[x4,#96]\n\t"\ + "fmla v29.4s,v9.4s,v"#ac2".s[1]; ldr x11,[x1,#-8]\n\t"\ + "fmov v"#an1".d[1],x16; ldr d11,[x4,#-16]\n\t"\ + "fmla 
v30.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-8]\n\t"\ + "fmla v31.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x2,#80]\n\t"\ + "fmov v11.d[1],x10; ldr d"#an3",[x2],#16\n\t"\ + "fmla v24.4s,v10.4s,v"#ac1".s[2]; prfm pldl1keep,[x3,#80]\n\t"\ + "fmla v25.4s,v10.4s,v"#ac2".s[2]; ldr x16,[x2,#-8]\n\t"\ + "fmov v"#an2".d[1],x11; ldr d8,[x4]\n\t"\ + "fmla v26.4s,v10.4s,v"#ac3".s[2]; ldr x10,[x4,#8]\n\t"\ + "fmla v27.4s,v10.4s,v"#ac4".s[2]; sub w5,w5,#4\n\t"\ + "fmov v8.d[1],x10; ldr d"#an4",[x3],#16\n\t"\ + "fmla v28.4s,v11.4s,v"#ac1".s[3]; cmp w5,#12\n\t"\ + "fmla v29.4s,v11.4s,v"#ac2".s[3]; ldr x11,[x3,#-8]\n\t"\ + "fmov v"#an3".d[1],x16; ldr d9,[x4,#16]\n\t"\ + "fmla v30.4s,v11.4s,v"#ac3".s[3]; ldr x10,[x4,#24]\n\t"\ + "fmla v31.4s,v11.4s,v"#ac4".s[3]; add x4,x4,#64\n\t" + +#define KERNEL_M4N4_K8_T4(ac1, ac2, ac3, ac4) \ + "fmov v9.d[1],x10\n\t"\ + "fmla v24.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v25.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-32]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac3".s[0]; ldr x10,[x4,#-24]\n\t"\ + "fmla v27.4s,v8.4s,v"#ac4".s[0]; prfm pldl1keep,[x6]\n\t"\ + "fmov v10.d[1],x10\n\t"\ + "fmla v28.4s,v9.4s,v"#ac1".s[1]\n\t"\ + "fmla v29.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "ldr d11,[x4,#-16]\n\t"\ + "fmla v30.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-8]\n\t"\ + "fmla v31.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x7]\n\t"\ + "fmov v11.d[1],x10\n\t"\ + "fmla v24.4s,v10.4s,v"#ac1".s[2]\n\t"\ + "fmla v25.4s,v10.4s,v"#ac2".s[2]; prfm pldl1keep,[x8]\n\t"\ + "fmla v26.4s,v10.4s,v"#ac3".s[2]\n\t"\ + "fmla v27.4s,v10.4s,v"#ac4".s[2]; sub w5,w5,#4\n\t"\ + "fmla v28.4s,v11.4s,v"#ac1".s[3]; prfm pldl1keep,[x9]\n\t"\ + "fmla v29.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "fmla v30.4s,v11.4s,v"#ac3".s[3]\n\t"\ + "fmla v31.4s,v11.4s,v"#ac4".s[3]\n\t" + +#define KERNEL_M4N4_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4\n\t"\ + "ldr q8,[x4],#16\n\t"\ + "ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "fmla v28.4s,v8.4s,v0.s[0]; sub w5,w5,#1\n\t"\ + "fmla v29.4s,v8.4s,v1.s[0]; cmp w5,#1\n\t"\ + "fmla v30.4s,v8.4s,v2.s[0]\n\t"\ + "fmla v31.4s,v8.4s,v3.s[0]\n\t" + +/* m4n5 c_vec */ +/* v21(v20) v22_comp */ +/* v24(v23) v25_comp */ +/* v27(v26) v28_comp */ +/* v30(v29) v31_comp */ + +#define INIT_M4N5 \ + INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N5(mode) \ + "fadd v21.4s,v21.4s,v20.4s; fadd v24.4s,v24.4s,v23.4s\n\t"\ + "fadd v27.4s,v27.4s,v26.4s; fadd v30.4s,v30.4s,v29.4s\n\t"\ + UNIT_SAVE_M4N4_VR_##mode(21, 24, 27, 30) EDGE_SAVE_M4N1K4_##mode(22, 25, 28, 31) + +#define KERNEL_M4N5_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\ + "ldr q8,[x4],#80; ldr d9,[x4,#-64]; ldr x10,[x4,#-56]; ldr x11,[x3,#-8]\n\t" + +#define KERNEL_M4N5_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\ + "fmla v20.4s,v8.4s,v"#ac1".s[0]; prfm pldl1keep,[x1,#80]\n\t"\ + "fmla v23.4s,v8.4s,v"#ac2".s[0]; ldr x16,[x0,#-8]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-48]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac3".s[0]; ldr x10,[x4,#-40]\n\t"\ + "fmla v29.4s,v8.4s,v"#ac4".s[0]; prfm pldl1keep,[x0,#64]\n\t"\ + "fmov v10.d[1],x10; ldr d"#an2",[x1],#16\n\t"\ + "fmla v21.4s,v9.4s,v"#ac1".s[1]; prfm pldl1keep,[x4,#96]\n\t"\ + "fmla v24.4s,v9.4s,v"#ac2".s[1]; ldr x11,[x1,#-8]\n\t"\ + "fmov v"#an1".d[1],x16; ldr d11,[x4,#-32]\n\t"\ + "fmla v27.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-24]\n\t"\ + "fmla v30.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x2,#80]\n\t"\ + "fmov v11.d[1],x10; ldr d"#an3",[x2],#16\n\t"\ + "fmla v20.4s,v10.4s,v"#ac1".s[2]\n\t"\ + 
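/* note: the A53-friendly load idiom here fills each q operand via a 64-bit "ldr d" plus an "ldr x"/"fmov v.d[1]" pair, which the in-order core can issue alongside fmla more readily than a full 128-bit ldr q */\ +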
"fmla v23.4s,v10.4s,v"#ac2".s[2]; ldr x16,[x2,#-8]\n\t"\ + "fmov v"#an2".d[1],x11; ldr d12,[x4,#-16]\n\t"\ + "fmla v26.4s,v10.4s,v"#ac3".s[2]; ldr x10,[x4,#-8]\n\t"\ + "fmla v29.4s,v10.4s,v"#ac4".s[2]; sub w5,w5,#4\n\t"\ + "fmov v12.d[1],x10; ldr d"#an4",[x3],#16\n\t"\ + "fmla v21.4s,v11.4s,v"#ac1".s[3]; cmp w5,#12\n\t"\ + "fmla v24.4s,v11.4s,v"#ac2".s[3]; ldr x11,[x3,#-8]\n\t"\ + "fmov v"#an3".d[1],x16; ldr d8,[x4]\n\t"\ + "fmla v27.4s,v11.4s,v"#ac3".s[3]; ldr x10,[x4,#8]\n\t"\ + "fmla v30.4s,v11.4s,v"#ac4".s[3]; add x4,x4,#80\n\t"\ + "fmla v22.4s,v12.4s,v"#ac1".4s; prfm pldl1keep,[x3,#64]\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#-64]\n\t"\ + "fmla v25.4s,v12.4s,v"#ac2".4s; ldr x10,[x4,#-56]\n\t"\ + "fmla v28.4s,v12.4s,v"#ac3".4s; prfm pldl1keep,[x4,#48]\n\t"\ + "fmla v31.4s,v12.4s,v"#ac4".4s\n\t" + +#define KERNEL_M4N5_K8_T4(ac1, ac2, ac3, ac4) \ + "fmov v9.d[1],x10\n\t"\ + "fmla v20.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v23.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-48]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac3".s[0]; ldr x10,[x4,#-40]\n\t"\ + "fmla v29.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "fmov v10.d[1],x10\n\t"\ + "fmla v21.4s,v9.4s,v"#ac1".s[1]; prfm pldl1keep,[x6]\n\t"\ + "fmla v24.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "ldr d11,[x4,#-32]\n\t"\ + "fmla v27.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-24]\n\t"\ + "fmla v30.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x7]\n\t"\ + "fmov v11.d[1],x10\n\t"\ + "fmla v20.4s,v10.4s,v"#ac1".s[2]\n\t"\ + "fmla v23.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "ldr d12,[x4,#-16]\n\t"\ + "fmla v26.4s,v10.4s,v"#ac3".s[2]; ldr x10,[x4,#-8]\n\t"\ + "fmla v29.4s,v10.4s,v"#ac4".s[2]; sub w5,w5,#4\n\t"\ + "fmov v12.d[1],x10\n\t"\ + "fmla v21.4s,v11.4s,v"#ac1".s[3]; prfm pldl1keep,[x8]\n\t"\ + "fmla v24.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "fmla v27.4s,v11.4s,v"#ac3".s[3]\n\t"\ + "fmla v30.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "fmla v22.4s,v12.4s,v"#ac1".4s\n\t"\ + "fmla v25.4s,v12.4s,v"#ac2".4s\n\t"\ + "fmla v28.4s,v12.4s,v"#ac3".4s; prfm pldl1keep,[x9]\n\t"\ + "fmla v31.4s,v12.4s,v"#ac4".4s\n\t" + +#define KERNEL_M4N5_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4\n\t"\ + "ldr q8,[x4],#16\n\t"\ + "ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "fmla v21.4s,v8.4s,v0.s[0]; sub w5,w5,#1\n\t"\ + "fmla v24.4s,v8.4s,v1.s[0]; cmp w5,#1\n\t"\ + "fmla v27.4s,v8.4s,v2.s[0]\n\t"\ + "ldr s9,[x4],#4\n\t"\ + "fmla v30.4s,v8.4s,v3.s[0]\n\t"\ + "fmla v22.4s,v0.4s,v9.s[0]\n\t"\ + "fmla v25.4s,v1.4s,v9.s[0]\n\t"\ + "fmla v28.4s,v2.4s,v9.s[0]\n\t"\ + "fmla v31.4s,v3.4s,v9.s[0]\n\t" + +/* m4n6 c_vec */ +/* v17(v16) v18_comp v19_comp */ +/* v21(v20) v22_comp v23_comp */ +/* v25(v24) v26_comp v27_comp */ +/* v29(v28) v30_comp v31_comp */ + +#define INIT_M4N6 \ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N6(mode) \ + "fadd v17.4s,v17.4s,v16.4s; fadd v21.4s,v21.4s,v20.4s\n\t"\ + "fadd v25.4s,v25.4s,v24.4s; fadd v29.4s,v29.4s,v28.4s\n\t"\ + UNIT_SAVE_M4N4_VR_##mode(17, 21, 25, 29) EDGE_SAVE_M4N1K4_##mode(18, 22, 26, 30)\ + EDGE_SAVE_M4N1K4_##mode(19, 23, 27, 31) + +#define KERNEL_M4N6_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\ + "ldr q8,[x4],#96; ldr d9,[x4,#-80]; ldr x10,[x4,#-72]; ldr x11,[x3,#-8]\n\t" + +#define KERNEL_M4N6_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[0]; prfm pldl1keep,[x1,#80]\n\t"\ + "fmla v20.4s,v8.4s,v"#ac2".s[0]; ldr x16,[x0,#-8]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-64]\n\t"\ + "fmla 
v24.4s,v8.4s,v"#ac3".s[0]; ldr x10,[x4,#-56]\n\t"\ + "fmla v28.4s,v8.4s,v"#ac4".s[0]; prfm pldl1keep,[x0,#64]\n\t"\ + "fmov v10.d[1],x10; ldr d"#an2",[x1],#16\n\t"\ + "fmla v17.4s,v9.4s,v"#ac1".s[1]; prfm pldl1keep,[x4,#96]\n\t"\ + "fmla v21.4s,v9.4s,v"#ac2".s[1]; ldr x11,[x1,#-8]\n\t"\ + "fmov v"#an1".d[1],x16; ldr d8,[x4,#-48]\n\t"\ + "fmla v25.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla v29.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x2,#80]\n\t"\ + "fmov v8.d[1],x10; ldr d"#an3",[x2],#16\n\t"\ + "fmla v16.4s,v10.4s,v"#ac1".s[2]\n\t"\ + "fmla v20.4s,v10.4s,v"#ac2".s[2]; ldr x16,[x2,#-8]\n\t"\ + "fmov v"#an2".d[1],x11; ldr d9,[x4,#-32]\n\t"\ + "fmla v24.4s,v10.4s,v"#ac3".s[2]; ldr x10,[x4,#-24]\n\t"\ + "fmla v28.4s,v10.4s,v"#ac4".s[2]; sub w5,w5,#4\n\t"\ + "fmov v9.d[1],x10; ldr d10,[x4,#-16]\n\t"\ + "fmla v17.4s,v8.4s,v"#ac1".s[3]; ldr x10,[x4,#-8]\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[3]\n\t"\ + "fmla v25.4s,v8.4s,v"#ac3".s[3]\n\t"\ + "fmov v10.d[1],x10; ldr d"#an4",[x3],#16\n\t"\ + "fmla v29.4s,v8.4s,v"#ac4".s[3]; cmp w5,#12\n\t"\ + "fmla v18.4s,v9.4s,v"#ac1".4s; ldr x11,[x3,#-8]\n\t"\ + "fmla v22.4s,v9.4s,v"#ac2".4s; prfm pldl1keep,[x3,#64]\n\t"\ + "fmov v"#an3".d[1],x16; ldr d8,[x4]\n\t"\ + "fmla v26.4s,v9.4s,v"#ac3".4s; ldr x10,[x4,#8]\n\t"\ + "fmla v30.4s,v9.4s,v"#ac4".4s; prfm pldl1keep,[x4,#144]\n\t"\ + "fmla v19.4s,v10.4s,v"#ac1".4s\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#16]\n\t"\ + "fmla v23.4s,v10.4s,v"#ac2".4s; ldr x10,[x4,#24]\n\t"\ + "fmla v27.4s,v10.4s,v"#ac3".4s; add x4,x4,#96\n\t"\ + "fmla v31.4s,v10.4s,v"#ac4".4s\n\t" + +#define KERNEL_M4N6_K8_T4(ac1, ac2, ac3, ac4) \ + "fmov v9.d[1],x10\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v20.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-64]\n\t"\ + "fmla v24.4s,v8.4s,v"#ac3".s[0]; ldr x10,[x4,#-56]\n\t"\ + "fmla v28.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "fmov v10.d[1],x10\n\t"\ + "fmla v17.4s,v9.4s,v"#ac1".s[1]; prfm pldl1keep,[x6]\n\t"\ + "fmla v21.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "ldr d8,[x4,#-48]\n\t"\ + "fmla v25.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla v29.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x7]\n\t"\ + "fmov v8.d[1],x10\n\t"\ + "fmla v16.4s,v10.4s,v"#ac1".s[2]\n\t"\ + "fmla v20.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "ldr d9,[x4,#-32]\n\t"\ + "fmla v24.4s,v10.4s,v"#ac3".s[2]; ldr x10,[x4,#-24]\n\t"\ + "fmla v28.4s,v10.4s,v"#ac4".s[2]; sub w5,w5,#4\n\t"\ + "fmov v9.d[1],x10; ldr d10,[x4,#-16]\n\t"\ + "fmla v17.4s,v8.4s,v"#ac1".s[3]; ldr x10,[x4,#-8]\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[3]\n\t"\ + "fmla v25.4s,v8.4s,v"#ac3".s[3]\n\t"\ + "fmov v10.d[1],x10\n\t"\ + "fmla v29.4s,v8.4s,v"#ac4".s[3]; prfm pldl1keep,[x8]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac1".4s\n\t"\ + "fmla v22.4s,v9.4s,v"#ac2".4s\n\t"\ + "fmla v26.4s,v9.4s,v"#ac3".4s\n\t"\ + "fmla v30.4s,v9.4s,v"#ac4".4s; prfm pldl1keep,[x9]\n\t"\ + "fmla v19.4s,v10.4s,v"#ac1".4s\n\t"\ + "fmla v23.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmla v27.4s,v10.4s,v"#ac3".4s\n\t"\ + "fmla v31.4s,v10.4s,v"#ac4".4s\n\t" + +#define KERNEL_M4N6_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4\n\t"\ + "ldr q8,[x4],#16\n\t"\ + "ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "fmla v17.4s,v8.4s,v0.s[0]; sub w5,w5,#1\n\t"\ + "fmla v21.4s,v8.4s,v1.s[0]; cmp w5,#1\n\t"\ + "fmla v25.4s,v8.4s,v2.s[0]\n\t"\ + "ldr d9,[x4],#8\n\t"\ + "fmla v29.4s,v8.4s,v3.s[0]\n\t"\ + "fmla v18.4s,v0.4s,v9.s[0]\n\t"\ + "fmla v22.4s,v1.4s,v9.s[0]\n\t"\ + "fmla v26.4s,v2.4s,v9.s[0]\n\t"\ + "fmla v30.4s,v3.4s,v9.s[0]\n\t"\ + "fmla v19.4s,v0.4s,v9.s[1]\n\t"\ + "fmla v23.4s,v1.4s,v9.s[1]\n\t"\ + "fmla 
v27.4s,v2.4s,v9.s[1]\n\t"\ + "fmla v31.4s,v3.4s,v9.s[1]\n\t" + + +/* m4n7 c_vec */ +/* v13(v12) v14_comp v15_comp v16_comp */ +/* v18(v17) v19_comp v20_comp v21_comp */ +/* v23(v22) v24_comp v25_comp v26_comp */ +/* v28(v27) v29_comp v30_comp v31_comp */ + +#define INIT_M4N7 \ + INIT_4V(12, 13, 14, 15)\ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N7(mode) \ + "fadd v13.4s,v13.4s,v12.4s; fadd v18.4s,v18.4s,v17.4s\n\t"\ + "fadd v23.4s,v23.4s,v22.4s; fadd v28.4s,v28.4s,v27.4s\n\t"\ + UNIT_SAVE_M4N4_VR_##mode(13, 18, 23, 28) EDGE_SAVE_M4N1K4_##mode(14, 19, 24, 29)\ + EDGE_SAVE_M4N1K4_##mode(15, 20, 25, 30) EDGE_SAVE_M4N1K4_##mode(16, 21, 26, 31) + +#define KERNEL_M4N7_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\ + "ldr q8,[x4],#112; ldr d9,[x4,#-96]; ldr x10,[x4,#-88]; ldr x11,[x3,#-8]\n\t" + +#define KERNEL_M4N7_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\ + "fmla v12.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v17.4s,v8.4s,v"#ac2".s[0]; ldr x16,[x0,#-8]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-80]\n\t"\ + "fmla v22.4s,v8.4s,v"#ac3".s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v27.4s,v8.4s,v"#ac4".s[0]; prfm pldl1keep,[x0,#64]\n\t"\ + "fmov v10.d[1],x10; ldr d"#an2",[x1],#16\n\t"\ + "fmla v13.4s,v9.4s,v"#ac1".s[1]; prfm pldl1keep,[x4,#56]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac2".s[1]; ldr x11,[x1,#-8]\n\t"\ + "fmov v"#an1".d[1],x16; ldr d8,[x4,#-64]\n\t"\ + "fmla v23.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-56]\n\t"\ + "fmla v28.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x1,#64]\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#-48]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; ldr x10,[x4,#-40]\n\t"\ + "fmla v17.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "fmov v9.d[1],x10; ldr d"#an3",[x2],#16\n\t"\ + "fmla v22.4s,v10.4s,v"#ac3".s[2]\n\t"\ + "fmla v27.4s,v10.4s,v"#ac4".s[2]; sub w5,w5,#4\n\t"\ + "fmla v13.4s,v8.4s,v"#ac1".s[3]\n\t"\ + "fmov v"#an2".d[1],x11; ldr d10,[x4,#-32]\n\t"\ + "fmla v18.4s,v8.4s,v"#ac2".s[3]; ldr x10,[x4,#-24]\n\t"\ + "fmla v23.4s,v8.4s,v"#ac3".s[3]; prfm pldl1keep,[x2,#64]\n\t"\ + "fmla v28.4s,v8.4s,v"#ac4".s[3]\n\t"\ + "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\ + "fmla v14.4s,v9.4s,v"#ac1".4s; ldr x10,[x4,#-8]\n\t"\ + "fmla v19.4s,v9.4s,v"#ac2".4s; ldr x16,[x2,#-8]\n\t"\ + "fmla v24.4s,v9.4s,v"#ac3".4s\n\t"\ + "fmov v11.d[1],x10; ldr d"#an4",[x3],#16\n\t"\ + "fmla v29.4s,v9.4s,v"#ac4".4s; cmp w5,#12\n\t"\ + "fmla v15.4s,v10.4s,v"#ac1".4s; prfm pldl1keep,[x3,#64]\n\t"\ + "fmla v20.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmov v"#an3".d[1],x16; ldr d8,[x4]\n\t"\ + "fmla v25.4s,v10.4s,v"#ac3".4s; ldr x10,[x4,#8]\n\t"\ + "fmla v30.4s,v10.4s,v"#ac4".4s; prfm pldl1keep,[x4,#120]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".4s; ldr x11,[x3,#-8]\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#16]\n\t"\ + "fmla v21.4s,v11.4s,v"#ac2".4s; ldr x10,[x4,#24]\n\t"\ + "fmla v26.4s,v11.4s,v"#ac3".4s; add x4,x4,#112\n\t"\ + "fmla v31.4s,v11.4s,v"#ac4".4s\n\t" + +#define KERNEL_M4N7_K8_T4(ac1, ac2, ac3, ac4) \ + "fmov v9.d[1],x10\n\t"\ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; prfm pldl1keep,[x6]\n\t"\ + "fmla v17.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-80]\n\t"\ + "fmla v22.4s,v8.4s,v"#ac3".s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v27.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "fmov v10.d[1],x10\n\t"\ + "fmla v13.4s,v9.4s,v"#ac1".s[1]; prfm pldl1keep,[x7]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "ldr d8,[x4,#-64]\n\t"\ + "fmla v23.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-56]\n\t"\ + "fmla 
v28.4s,v9.4s,v"#ac4".s[1]\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#-48]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; ldr x10,[x4,#-40]\n\t"\ + "fmla v17.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "fmov v9.d[1],x10\n\t"\ + "fmla v22.4s,v10.4s,v"#ac3".s[2]\n\t"\ + "fmla v27.4s,v10.4s,v"#ac4".s[2]; sub w5,w5,#4\n\t"\ + "fmla v13.4s,v8.4s,v"#ac1".s[3]\n\t"\ + "ldr d10,[x4,#-32]\n\t"\ + "fmla v18.4s,v8.4s,v"#ac2".s[3]; ldr x10,[x4,#-24]\n\t"\ + "fmla v23.4s,v8.4s,v"#ac3".s[3]; prfm pldl1keep,[x8]\n\t"\ + "fmla v28.4s,v8.4s,v"#ac4".s[3]\n\t"\ + "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\ + "fmla v14.4s,v9.4s,v"#ac1".4s; ldr x10,[x4,#-8]\n\t"\ + "fmla v19.4s,v9.4s,v"#ac2".4s\n\t"\ + "fmla v24.4s,v9.4s,v"#ac3".4s\n\t"\ + "fmov v11.d[1],x10\n\t"\ + "fmla v29.4s,v9.4s,v"#ac4".4s\n\t"\ + "fmla v15.4s,v10.4s,v"#ac1".4s\n\t"\ + "fmla v20.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmla v25.4s,v10.4s,v"#ac3".4s\n\t"\ + "fmla v30.4s,v10.4s,v"#ac4".4s; prfm pldl1keep,[x9]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".4s\n\t"\ + "fmla v21.4s,v11.4s,v"#ac2".4s\n\t"\ + "fmla v26.4s,v11.4s,v"#ac3".4s\n\t"\ + "fmla v31.4s,v11.4s,v"#ac4".4s\n\t" + +#define KERNEL_M4N7_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4\n\t"\ + "ldr q8,[x4],#16\n\t"\ + "ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "fmla v13.4s,v8.4s,v0.s[0]; sub w5,w5,#1\n\t"\ + "fmla v18.4s,v8.4s,v1.s[0]; cmp w5,#1\n\t"\ + "fmla v23.4s,v8.4s,v2.s[0]\n\t"\ + "ldr d9,[x4],#8\n\t"\ + "fmla v28.4s,v8.4s,v3.s[0]\n\t"\ + "fmla v14.4s,v0.4s,v9.s[0]\n\t"\ + "fmla v19.4s,v1.4s,v9.s[0]\n\t"\ + "ldr s10,[x4],#4\n\t"\ + "fmla v24.4s,v2.4s,v9.s[0]\n\t"\ + "fmla v29.4s,v3.4s,v9.s[0]\n\t"\ + "fmla v15.4s,v0.4s,v9.s[1]\n\t"\ + "fmla v20.4s,v1.4s,v9.s[1]\n\t"\ + "fmla v25.4s,v2.4s,v9.s[1]\n\t"\ + "fmla v30.4s,v3.4s,v9.s[1]\n\t"\ + "fmla v16.4s,v0.4s,v10.s[0]\n\t"\ + "fmla v21.4s,v1.4s,v10.s[0]\n\t"\ + "fmla v26.4s,v2.4s,v10.s[0]\n\t"\ + "fmla v31.4s,v3.4s,v10.s[0]\n\t" + + +/* m4n8 c_vec */ +/* v24 - v25 */ +/* v26 - v27 */ +/* v28 - v29 */ +/* v30 - v31 */ + +#define INIT_M4N8 \ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N8(mode) \ + UNIT_SAVE_M4N4_VR_##mode(24, 26, 28, 30) UNIT_SAVE_M4N4_VR_##mode(25, 27, 29, 31) + +#define KERNEL_M4N8_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\ + "ldr q8,[x4],#128; ldr d9,[x4,#-112]; ldr x10,[x4,#-104]; ldr x11,[x3,#-8]\n\t" + +#define KERNEL_M4N8_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\ + "fmla v24.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac2".s[0]; prfm pldl1keep,[x0,#64]\n\t"\ + "fmla v28.4s,v8.4s,v"#ac3".s[0]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-96]\n\t"\ + "fmla v27.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-88]\n\t"\ + "fmla v25.4s,v9.4s,v"#ac1".s[0]; prfm pldl1keep,[x4,#40]\n\t"\ + "fmla v30.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "fmov v10.d[1],x10; ldr d11,[x4,#-80]\n\t"\ + "fmla v29.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v31.4s,v9.4s,v"#ac4".s[0]; ldr x16,[x0,#-8]\n\t"\ + "fmla v24.4s,v10.4s,v"#ac1".s[1]\n\t"\ + "fmov v11.d[1],x10; ldr d8,[x4,#-64]\n\t"\ + "fmla v26.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-56]\n\t"\ + "fmla v28.4s,v10.4s,v"#ac3".s[1]; sub w5,w5,#4\n\t"\ + "fmla v30.4s,v10.4s,v"#ac4".s[1]\n\t"\ + "fmov v8.d[1],x10; ldr d"#an2",[x1],#16\n\t"\ + "fmla v25.4s,v11.4s,v"#ac1".s[1]\n\t"\ + "fmla v27.4s,v11.4s,v"#ac2".s[1]; prfm pldl1keep,[x1,#64]\n\t"\ + "fmla v29.4s,v11.4s,v"#ac3".s[1]; cmp w5,#12\n\t"\ + "fmov v"#an1".d[1],x16; ldr d9,[x4,#-48]\n\t"\ + "fmla v31.4s,v11.4s,v"#ac4".s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla 
v24.4s,v8.4s,v"#ac1".s[2]; ldr x11,[x1,#-8]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac2".s[2]\n\t"\ + "fmov v9.d[1],x10; ldr d10,[x4,#-32]\n\t"\ + "fmla v28.4s,v8.4s,v"#ac3".s[2]; ldr x10,[x4,#-24]\n\t"\ + "fmla v30.4s,v8.4s,v"#ac4".s[2]; prfm pldl1keep,[x4,#104]\n\t"\ + "fmla v25.4s,v9.4s,v"#ac1".s[2]; prfm pldl1keep,[x3,#80]\n\t"\ + "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\ + "fmla v27.4s,v9.4s,v"#ac2".s[2]; ldr x10,[x4,#-8]\n\t"\ + "fmla v29.4s,v9.4s,v"#ac3".s[2]; add x4,x4,#128\n\t"\ + "fmla v31.4s,v9.4s,v"#ac4".s[2]\n\t"\ + "fmov v11.d[1],x10; ldr d"#an3",[x2],#16\n\t"\ + "fmla v24.4s,v10.4s,v"#ac1".s[3]\n\t"\ + "fmla v26.4s,v10.4s,v"#ac2".s[3]; prfm pldl1keep,[x2,#64]\n\t"\ + "fmla v28.4s,v10.4s,v"#ac3".s[3]; ldr x16,[x2,#-8]\n\t"\ + "fmov v"#an2".d[1],x11; ldr d8,[x4,#-128]\n\t"\ + "fmla v30.4s,v10.4s,v"#ac4".s[3]; ldr x10,[x4,#-120]\n\t"\ + "fmla v25.4s,v11.4s,v"#ac1".s[3]; ldr x11,[x3],#16\n\t"\ + "fmla v27.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#-112]\n\t"\ + "fmov v"#an3".d[1],x16; fmov d"#an4",x11\n\t"\ + "fmla v29.4s,v11.4s,v"#ac3".s[3]; ldr x10,[x4,#-104]\n\t"\ + "fmla v31.4s,v11.4s,v"#ac4".s[3]; ldr x11,[x3,#-8]\n\t" + +#define KERNEL_M4N8_K8_T4(ac1, ac2, ac3, ac4) \ + "fmov v9.d[1],x10\n\t"\ + "fmla v24.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmla v28.4s,v8.4s,v"#ac3".s[0]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-96]\n\t"\ + "fmla v27.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-88]\n\t"\ + "fmla v25.4s,v9.4s,v"#ac1".s[0]\n\t"\ + "fmla v30.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "fmov v10.d[1],x10; ldr d11,[x4,#-80]\n\t"\ + "fmla v29.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v31.4s,v9.4s,v"#ac4".s[0]; prfm pldl1keep,[x6]\n\t"\ + "fmla v24.4s,v10.4s,v"#ac1".s[1]\n\t"\ + "fmov v11.d[1],x10; ldr d8,[x4,#-64]\n\t"\ + "fmla v26.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-56]\n\t"\ + "fmla v28.4s,v10.4s,v"#ac3".s[1]; sub w5,w5,#4\n\t"\ + "fmla v30.4s,v10.4s,v"#ac4".s[1]\n\t"\ + "fmov v8.d[1],x10\n\t"\ + "fmla v25.4s,v11.4s,v"#ac1".s[1]\n\t"\ + "fmla v27.4s,v11.4s,v"#ac2".s[1]\n\t"\ + "fmla v29.4s,v11.4s,v"#ac3".s[1]\n\t"\ + "ldr d9,[x4,#-48]\n\t"\ + "fmla v31.4s,v11.4s,v"#ac4".s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla v24.4s,v8.4s,v"#ac1".s[2]; prfm pldl1keep,[x7]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac2".s[2]\n\t"\ + "fmov v9.d[1],x10; ldr d10,[x4,#-32]\n\t"\ + "fmla v28.4s,v8.4s,v"#ac3".s[2]; ldr x10,[x4,#-24]\n\t"\ + "fmla v30.4s,v8.4s,v"#ac4".s[2]; prfm pldl1keep,[x8]\n\t"\ + "fmla v25.4s,v9.4s,v"#ac1".s[2]\n\t"\ + "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\ + "fmla v27.4s,v9.4s,v"#ac2".s[2]; ldr x10,[x4,#-8]\n\t"\ + "fmla v29.4s,v9.4s,v"#ac3".s[2]\n\t"\ + "fmla v31.4s,v9.4s,v"#ac4".s[2]\n\t"\ + "fmov v11.d[1],x10\n\t"\ + "fmla v24.4s,v10.4s,v"#ac1".s[3]; prfm pldl1keep,[x9]\n\t"\ + "fmla v26.4s,v10.4s,v"#ac2".s[3]\n\t"\ + "fmla v28.4s,v10.4s,v"#ac3".s[3]\n\t"\ + "fmla v30.4s,v10.4s,v"#ac4".s[3]\n\t"\ + "fmla v25.4s,v11.4s,v"#ac1".s[3]\n\t"\ + "fmla v27.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "fmla v29.4s,v11.4s,v"#ac3".s[3]\n\t"\ + "fmla v31.4s,v11.4s,v"#ac4".s[3]\n\t" + +#define KERNEL_M4N8_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4\n\t"\ + "ldr q8,[x4]; ldr q9,[x4,#16]; add x4,x4,#32\n\t"\ + "ldr s2,[x2],#4\n\t"\ + "fmla v24.4s,v8.4s,v0.s[0]\n\t"\ + "fmla v25.4s,v9.4s,v0.s[0]\n\t"\ + "fmla v26.4s,v8.4s,v1.s[0]\n\t"\ + "ldr s3,[x3],#4\n\t"\ + "fmla v27.4s,v9.4s,v1.s[0]\n\t"\ + "fmla v28.4s,v8.4s,v2.s[0]\n\t"\ + "fmla v29.4s,v9.4s,v2.s[0]; sub w5,w5,#1\n\t"\ + "fmla v30.4s,v8.4s,v3.s[0]; cmp w5,#1\n\t"\ + "fmla v31.4s,v9.4s,v3.s[0]\n\t" + + +/* m4n9 c_vec */ 
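+/* (notation: each vA - vB pair below holds eight packed columns of a C row; the *_comp vector keeps four k-partial sums for the odd ninth column, reduced by EDGE_SAVE_M4N1K4 at save time) */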
+/* m4n9 c_vec */
+/* v20 - v21 v22_comp */
+/* v23 - v24 v25_comp */
+/* v26 - v27 v28_comp */
+/* v29 - v30 v31_comp */
+
+#define INIT_M4N9 \
+ INIT_4V(20, 21, 22, 23) INIT_4V(24, 25, 26, 27)\
+ INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M4N9(mode) \
+ UNIT_SAVE_M4N4_VR_##mode(20, 23, 26, 29) UNIT_SAVE_M4N4_VR_##mode(21, 24, 27, 30)\
+ EDGE_SAVE_M4N1K4_##mode(22, 25, 28, 31)
+
+#define KERNEL_M4N9_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\
+ "ldr q8,[x4],#144; ldr d9,[x4,#-128]; ldr x10,[x4,#-120]; ldr x11,[x3,#-8]\n\t"
+
+#define KERNEL_M4N9_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \
+ "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[0]; prfm pldl1keep,[x0,#64]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-112]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[0]; prfm pldl1keep,[x4,#24]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-96]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[0]; ldr x16,[x0,#-8]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-80]\n\t"\
+ "fmla v23.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".s[1]; sub w5,w5,#4\n\t"\
+ "fmla v29.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an2",[x1],#16\n\t"\
+ "fmla v21.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac2".s[1]; prfm pldl1keep,[x1,#64]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "fmov v"#an1".d[1],x16; ldr d10,[x4,#-64]\n\t"\
+ "fmla v30.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v20.4s,v9.4s,v"#ac1".s[2]; ldr x11,[x1,#-8]\n\t"\
+ "fmla v23.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-48]\n\t"\
+ "fmla v26.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[2]; prfm pldl1keep,[x4,#88]\n\t"\
+ "fmla v21.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-32]\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v27.4s,v10.4s,v"#ac3".s[2]; add x4,x4,#144\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an3",[x2],#16\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[3]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[3]; prfm pldl1keep,[x2,#64]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[3]; cmp w5,#12\n\t"\
+ "fmov v"#an2".d[1],x11; ldr d10,[x4,#-160]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-152]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[3]; prfm pldl1keep,[x4,#8]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[3]; ldr x16,[x2,#-8]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-144]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[3]; ldr x10,[x4,#-136]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[3]; ldr x11,[x3],#16\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac1".4s\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-128]\n\t"\
+ "fmov v"#an3".d[1],x16; fmov d"#an4",x11\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac2".4s; ldr x10,[x4,#-120]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac3".4s; ldr x11,[x3,#-8]\n\t"\
+ "fmla v31.4s,v10.4s,v"#ac4".4s; prfm pldl1keep,[x3,#64]\n\t"
+
+#define KERNEL_M4N9_K8_T4(ac1, ac2, ac3, ac4) \
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-112]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[0]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[0]\n\t"\
"fmov v10.d[1],x10; ldr d8,[x4,#-96]\n\t"\ + "fmla v27.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-88]\n\t"\ + "fmla v30.4s,v9.4s,v"#ac4".s[0]; prfm pldl1keep,[x6]\n\t"\ + "fmla v20.4s,v10.4s,v"#ac1".s[1]\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#-80]\n\t"\ + "fmla v23.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-72]\n\t"\ + "fmla v26.4s,v10.4s,v"#ac3".s[1]; sub w5,w5,#4\n\t"\ + "fmla v29.4s,v10.4s,v"#ac4".s[1]\n\t"\ + "fmov v9.d[1],x10\n\t"\ + "fmla v21.4s,v8.4s,v"#ac1".s[1]\n\t"\ + "fmla v24.4s,v8.4s,v"#ac2".s[1]\n\t"\ + "fmla v27.4s,v8.4s,v"#ac3".s[1]\n\t"\ + "ldr d10,[x4,#-64]\n\t"\ + "fmla v30.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-56]\n\t"\ + "fmla v20.4s,v9.4s,v"#ac1".s[2]; prfm pldl1keep,[x7]\n\t"\ + "fmla v23.4s,v9.4s,v"#ac2".s[2]\n\t"\ + "fmov v10.d[1],x10; ldr d8,[x4,#-48]\n\t"\ + "fmla v26.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-40]\n\t"\ + "fmla v29.4s,v9.4s,v"#ac4".s[2]; prfm pldl1keep,[x8]\n\t"\ + "fmla v21.4s,v10.4s,v"#ac1".s[2]\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#-32]\n\t"\ + "fmla v24.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-24]\n\t"\ + "fmla v27.4s,v10.4s,v"#ac3".s[2]\n\t"\ + "fmla v30.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "fmov v9.d[1],x10\n\t"\ + "fmla v20.4s,v8.4s,v"#ac1".s[3]\n\t"\ + "fmla v23.4s,v8.4s,v"#ac2".s[3]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac3".s[3]\n\t"\ + "ldr d10,[x4,#-16]\n\t"\ + "fmla v29.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-8]\n\t"\ + "fmla v21.4s,v9.4s,v"#ac1".s[3]; prfm pldl1keep,[x9]\n\t"\ + "fmla v24.4s,v9.4s,v"#ac2".s[3]\n\t"\ + "fmov v10.d[1],x10\n\t"\ + "fmla v27.4s,v9.4s,v"#ac3".s[3]\n\t"\ + "fmla v30.4s,v9.4s,v"#ac4".s[3]\n\t"\ + "fmla v22.4s,v10.4s,v"#ac1".4s\n\t"\ + "fmla v25.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmla v28.4s,v10.4s,v"#ac3".4s\n\t"\ + "fmla v31.4s,v10.4s,v"#ac4".4s\n\t" + +#define KERNEL_M4N9_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4\n\t"\ + "ldr q8,[x4]; ldr q9,[x4,#16]; add x4,x4,#36\n\t"\ + "ldr s2,[x2],#4\n\t"\ + "fmla v20.4s,v8.4s,v0.s[0]\n\t"\ + "fmla v21.4s,v9.4s,v0.s[0]\n\t"\ + "fmla v23.4s,v8.4s,v1.s[0]\n\t"\ + "ldr s3,[x3],#4\n\t"\ + "fmla v24.4s,v9.4s,v1.s[0]\n\t"\ + "fmla v26.4s,v8.4s,v2.s[0]\n\t"\ + "fmla v27.4s,v9.4s,v2.s[0]; sub w5,w5,#1\n\t"\ + "ldr s10,[x4,#-4]\n\t"\ + "fmla v29.4s,v8.4s,v3.s[0]; cmp w5,#1\n\t"\ + "fmla v30.4s,v9.4s,v3.s[0]\n\t"\ + "fmla v22.4s,v10.4s,v0.4s\n\t"\ + "fmla v25.4s,v10.4s,v1.4s\n\t"\ + "fmla v28.4s,v10.4s,v2.4s\n\t"\ + "fmla v31.4s,v10.4s,v3.4s\n\t" + + +/* m4n10 c_vec */ +/* v16 - v17 v18_comp v19_comp */ +/* v20 - v21 v22_comp v23_comp */ +/* v24 - v25 v26_comp v27_comp */ +/* v28 - v29 v30_comp v31_comp */ + +#define INIT_M4N10 \ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N10(mode) \ + UNIT_SAVE_M4N4_VR_##mode(16, 20, 24, 28) UNIT_SAVE_M4N4_VR_##mode(17, 21, 25, 29)\ + EDGE_SAVE_M4N1K4_##mode(18, 22, 26, 30) EDGE_SAVE_M4N1K4_##mode(19, 23, 27, 31) + +#define KERNEL_M4N10_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\ + "ldr q8,[x4],#160; ldr d9,[x4,#-144]; ldr x10,[x4,#-136]; ldr x11,[x3,#-8]\n\t" + +#define KERNEL_M4N10_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v20.4s,v8.4s,v"#ac2".s[0]; prfm pldl1keep,[x0,#64]\n\t"\ + "fmla v24.4s,v8.4s,v"#ac3".s[0]\n\t"\ + "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-128]\n\t"\ + "fmla v21.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-120]\n\t"\ + "fmla v17.4s,v9.4s,v"#ac1".s[0]; prfm pldl1keep,[x4,#32]\n\t"\ + "fmla v28.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "fmov v10.d[1],x10; ldr 
+/* m4n10 c_vec */
+/* v16 - v17 v18_comp v19_comp */
+/* v20 - v21 v22_comp v23_comp */
+/* v24 - v25 v26_comp v27_comp */
+/* v28 - v29 v30_comp v31_comp */
+
+#define INIT_M4N10 \
+ INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\
+ INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M4N10(mode) \
+ UNIT_SAVE_M4N4_VR_##mode(16, 20, 24, 28) UNIT_SAVE_M4N4_VR_##mode(17, 21, 25, 29)\
+ EDGE_SAVE_M4N1K4_##mode(18, 22, 26, 30) EDGE_SAVE_M4N1K4_##mode(19, 23, 27, 31)
+
+#define KERNEL_M4N10_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\
+ "ldr q8,[x4],#160; ldr d9,[x4,#-144]; ldr x10,[x4,#-136]; ldr x11,[x3,#-8]\n\t"
+
+#define KERNEL_M4N10_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \
+ "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[0]; prfm pldl1keep,[x0,#64]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-128]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[0]; prfm pldl1keep,[x4,#32]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-112]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[0]; ldr x16,[x0,#-8]\n\t"\
+ "fmla v16.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-96]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac3".s[1]; sub w5,w5,#4\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an2",[x1],#16\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v21.4s,v8.4s,v"#ac2".s[1]; prfm pldl1keep,[x1,#64]\n\t"\
+ "fmla v25.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "fmov v"#an1".d[1],x16; ldr d10,[x4,#-80]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[2]; ldr x11,[x1,#-8]\n\t"\
+ "fmla v20.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-64]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[2]; prfm pldl1keep,[x4,#96]\n\t"\
+ "fmla v17.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-48]\n\t"\
+ "fmla v21.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac3".s[2]; add x4,x4,#160\n\t"\
+ "fmla v29.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an3",[x2],#16\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[3]; cmp w5,#12\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[3]; prfm pldl1keep,[x2,#64]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[3]\n\t"\
+ "fmov v"#an2".d[1],x11; ldr d10,[x4,#-192]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-184]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[3]; ldr x16,[x2,#-8]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[3]\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-176]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[3]; ldr x10,[x4,#-168]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[3]; ldr x11,[x3],#16\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".4s\n\t"\
+ "fmov v11.d[1],x10; ldr d8,[x4,#-160]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".4s; ldr x10,[x4,#-152]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".4s; prfm pldl1keep,[x4]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".4s\n\t"\
+ "fmov v"#an3".d[1],x16; fmov d"#an4",x11\n\t"\
+ "fmla v19.4s,v11.4s,v"#ac1".4s; ldr x11,[x3,#-8]\n\t"\
+ "fmla v23.4s,v11.4s,v"#ac2".4s; prfm pldl1keep,[x3,#64]\n\t"\
+ "fmla v27.4s,v11.4s,v"#ac3".4s\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-144]\n\t"\
+ "fmla v31.4s,v11.4s,v"#ac4".4s; ldr x10,[x4,#-136]\n\t"
+
+#define KERNEL_M4N10_K8_T4(ac1, ac2, ac3, ac4) \
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-128]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[0]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-112]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[0]; prfm pldl1keep,[x6]\n\t"\
+ "fmla v16.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-96]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac3".s[1]; sub w5,w5,#4\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v21.4s,v8.4s,v"#ac2".s[1]\n\t"\
+ "fmla v25.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "ldr d10,[x4,#-80]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[2]; prfm pldl1keep,[x7]\n\t"\
+ "fmla v20.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-64]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[2]; prfm pldl1keep,[x8]\n\t"\
+ "fmla v17.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-48]\n\t"\
+ "fmla v21.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac3".s[2]\n\t"\
+ "fmla v29.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[3]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[3]\n\t"\
+ "ldr d10,[x4,#-32]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[3]; prfm pldl1keep,[x9]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[3]\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[3]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[3]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".4s\n\t"\
+ "fmov v11.d[1],x10\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".4s\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".4s\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".4s\n\t"\
+ "fmla v19.4s,v11.4s,v"#ac1".4s\n\t"\
+ "fmla v23.4s,v11.4s,v"#ac2".4s\n\t"\
+ "fmla v27.4s,v11.4s,v"#ac3".4s\n\t"\
+ "fmla v31.4s,v11.4s,v"#ac4".4s\n\t"
+
+#define KERNEL_M4N10_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4\n\t"\
+ "ldr q8,[x4]; ldr q9,[x4,#16]; add x4,x4,#40\n\t"\
+ "ldr s2,[x2],#4\n\t"\
+ "fmla v16.4s,v8.4s,v0.s[0]\n\t"\
+ "fmla v17.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v20.4s,v8.4s,v1.s[0]\n\t"\
+ "ldr s3,[x3],#4\n\t"\
+ "fmla v21.4s,v9.4s,v1.s[0]\n\t"\
+ "fmla v24.4s,v8.4s,v2.s[0]\n\t"\
+ "fmla v25.4s,v9.4s,v2.s[0]; sub w5,w5,#1\n\t"\
+ "ldr d10,[x4,#-8]\n\t"\
+ "fmla v28.4s,v8.4s,v3.s[0]; cmp w5,#1\n\t"\
+ "fmla v29.4s,v9.4s,v3.s[0]\n\t"\
+ "fmla v18.4s,v0.4s,v10.s[0]\n\t"\
+ "fmla v22.4s,v1.4s,v10.s[0]\n\t"\
+ "fmla v26.4s,v2.4s,v10.s[0]\n\t"\
+ "fmla v30.4s,v3.4s,v10.s[0]\n\t"\
+ "fmla v19.4s,v0.4s,v10.s[1]\n\t"\
+ "fmla v23.4s,v1.4s,v10.s[1]\n\t"\
+ "fmla v27.4s,v2.4s,v10.s[1]\n\t"\
+ "fmla v31.4s,v3.4s,v10.s[1]\n\t"
+
+
+/* m4n11 c_vec */
+/* v12 - v13 v14_comp v15_comp v16_comp */
+/* v17 - v18 v19_comp v20_comp v21_comp */
+/* v22 - v23 v24_comp v25_comp v26_comp */
+/* v27 - v28 v29_comp v30_comp v31_comp */
+
+#define INIT_M4N11 \
+ INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\
+ INIT_4V(20, 21, 22, 23) INIT_4V(24, 25, 26, 27)\
+ INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M4N11(mode) \
+ UNIT_SAVE_M4N4_VR_##mode(12, 17, 22, 27) UNIT_SAVE_M4N4_VR_##mode(13, 18, 23, 28)\
+ EDGE_SAVE_M4N1K4_##mode(14, 19, 24, 29) EDGE_SAVE_M4N1K4_##mode(15, 20, 25, 30)\
+ EDGE_SAVE_M4N1K4_##mode(16, 21, 26, 31)
+
+#define KERNEL_M4N11_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\
+ "ldr q8,[x4],#176; ldr d9,[x4,#-160]; ldr x10,[x4,#-152]; ldr x11,[x3,#-8]\n\t"
+
+#define KERNEL_M4N11_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \
+ "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[0]; prfm pldl1keep,[x0,#64]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-144]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-136]\n\t"\
+ "fmla v13.4s,v9.4s,v"#ac1".s[0]; prfm pldl1keep,[x4,#48]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-128]\n\t"\
+ "fmla v23.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[0]; ldr x16,[x0,#-8]\n\t"\
+ "fmla v12.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-112]\n\t"\
+ "fmla v17.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[1]; sub w5,w5,#4\n\t"\
+ "fmla v27.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an2",[x1],#16\n\t"\
+ "fmla v13.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v18.4s,v8.4s,v"#ac2".s[1]; prfm pldl1keep,[x1,#64]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "fmov v"#an1".d[1],x16; ldr d10,[x4,#-96]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v12.4s,v9.4s,v"#ac1".s[2]; ldr x11,[x1,#-8]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-80]\n\t"\
+ "fmla v22.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac4".s[2]; prfm pldl1keep,[x4,#112]\n\t"\
+ "fmla v13.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v11.d[1],x10; ldr d8,[x4,#-64]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v23.4s,v10.4s,v"#ac3".s[2]; add x4,x4,#176\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d"#an3",[x2],#16\n\t"\
+ "fmla v12.4s,v11.4s,v"#ac1".s[3]; cmp w5,#12\n\t"\
+ "fmla v17.4s,v11.4s,v"#ac2".s[3]; prfm pldl1keep,[x2,#64]\n\t"\
+ "fmla v22.4s,v11.4s,v"#ac3".s[3]\n\t"\
+ "fmov v"#an2".d[1],x11; ldr d9,[x4,#-224]\n\t"\
+ "fmla v27.4s,v11.4s,v"#ac4".s[3]; ldr x10,[x4,#-216]\n\t"\
+ "fmla v13.4s,v8.4s,v"#ac1".s[3]; prfm pldl1keep,[x4]\n\t"\
+ "fmla v18.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "fmov v9.d[1],x10; ldr d10,[x4,#-208]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac3".s[3]; ldr x10,[x4,#-200]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[3]; ldr x16,[x2,#-8]\n\t"\
+ "fmla v14.4s,v9.4s,v"#ac1".4s\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-192]\n\t"\
+ "fmla v19.4s,v9.4s,v"#ac2".4s; ldr x10,[x4,#-184]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac3".4s\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".4s\n\t"\
+ "fmov v11.d[1],x10; ldr d8,[x4,#-176]\n\t"\
+ "fmla v15.4s,v10.4s,v"#ac1".4s; ldr x10,[x4,#-168]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac2".4s\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac3".4s\n\t"\
+ "fmov v"#an3".d[1],x16; ldr d"#an4",[x3],#16\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".4s\n\t"\
+ "fmla v16.4s,v11.4s,v"#ac1".4s; prfm pldl1keep,[x3,#64]\n\t"\
+ "fmla v21.4s,v11.4s,v"#ac2".4s\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-160]\n\t"\
+ "fmla v26.4s,v11.4s,v"#ac3".4s; ldr x10,[x4,#-152]\n\t"\
+ "fmla v31.4s,v11.4s,v"#ac4".4s; ldr x11,[x3,#-8]\n\t"
+
+#define KERNEL_M4N11_K8_T4(ac1, ac2, ac3, ac4) \
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-144]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-136]\n\t"\
+ "fmla v13.4s,v9.4s,v"#ac1".s[0]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-128]\n\t"\
+ "fmla v23.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[0]; prfm pldl1keep,[x6]\n\t"\
+ "fmla v12.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-112]\n\t"\
+ "fmla v17.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[1]; sub w5,w5,#4\n\t"\
+ "fmla v27.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v13.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v18.4s,v8.4s,v"#ac2".s[1]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "ldr d10,[x4,#-96]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v12.4s,v9.4s,v"#ac1".s[2]; prfm pldl1keep,[x7]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-80]\n\t"\
+ "fmla v22.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac4".s[2]; prfm pldl1keep,[x8]\n\t"\
+ "fmla v13.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v11.d[1],x10; ldr d8,[x4,#-64]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v23.4s,v10.4s,v"#ac3".s[2]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v8.d[1],x10\n\t"\
+ "fmla v12.4s,v11.4s,v"#ac1".s[3]\n\t"\
+ "fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\
+ "fmla v22.4s,v11.4s,v"#ac3".s[3]\n\t"\
+ "ldr d9,[x4,#-48]\n\t"\
+ "fmla v27.4s,v11.4s,v"#ac4".s[3]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v13.4s,v8.4s,v"#ac1".s[3]; prfm pldl1keep,[x9]\n\t"\
+ "fmla v18.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "fmov v9.d[1],x10; ldr d10,[x4,#-32]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac3".s[3]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[3]\n\t"\
+ "fmla v14.4s,v9.4s,v"#ac1".4s\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\
+ "fmla v19.4s,v9.4s,v"#ac2".4s; ldr x10,[x4,#-8]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac3".4s\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".4s\n\t"\
+ "fmov v11.d[1],x10\n\t"\
+ "fmla v15.4s,v10.4s,v"#ac1".4s\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac2".4s\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac3".4s\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".4s\n\t"\
+ "fmla v16.4s,v11.4s,v"#ac1".4s\n\t"\
+ "fmla v21.4s,v11.4s,v"#ac2".4s\n\t"\
+ "fmla v26.4s,v11.4s,v"#ac3".4s\n\t"\
+ "fmla v31.4s,v11.4s,v"#ac4".4s\n\t"
+
+#define KERNEL_M4N11_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4\n\t"\
+ "ldr q8,[x4]; ldr q9,[x4,#16]; add x4,x4,#44\n\t"\
+ "ldr s2,[x2],#4\n\t"\
+ "fmla v12.4s,v8.4s,v0.s[0]\n\t"\
+ "fmla v13.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v17.4s,v8.4s,v1.s[0]\n\t"\
+ "ldr s3,[x3],#4\n\t"\
+ "fmla v18.4s,v9.4s,v1.s[0]\n\t"\
+ "fmla v22.4s,v8.4s,v2.s[0]\n\t"\
+ "fmla v23.4s,v9.4s,v2.s[0]; sub w5,w5,#1\n\t"\
+ "ldr d10,[x4,#-12]\n\t"\
+ "fmla v27.4s,v8.4s,v3.s[0]; cmp w5,#1\n\t"\
+ "fmla v28.4s,v9.4s,v3.s[0]\n\t"\
+ "fmla v14.4s,v0.4s,v10.s[0]\n\t"\
+ "ldr s11,[x4,#-4]\n\t"\
+ "fmla v19.4s,v1.4s,v10.s[0]\n\t"\
+ "fmla v24.4s,v2.4s,v10.s[0]\n\t"\
+ "fmla v29.4s,v3.4s,v10.s[0]\n\t"\
+ "fmla v15.4s,v0.4s,v10.s[1]\n\t"\
+ "fmla v20.4s,v1.4s,v10.s[1]\n\t"\
+ "fmla v25.4s,v2.4s,v10.s[1]\n\t"\
+ "fmla v30.4s,v3.4s,v10.s[1]\n\t"\
+ "fmla v16.4s,v0.4s,v11.s[0]\n\t"\
+ "fmla v21.4s,v1.4s,v11.s[0]\n\t"\
+ "fmla v26.4s,v2.4s,v11.s[0]\n\t"\
+ "fmla v31.4s,v3.4s,v11.s[0]\n\t"
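+
+/*
+ * Editor's note: every m4n<N> kernel in this family accumulates the
+ * same 4xN panel.  B (b_scr) is pre-packed so the N values of one k
+ * step are contiguous: x4 advances N*4 bytes per k (e.g. "add x4,x4,
+ * #44" in the n11 TL1 above), and the A rows sit at a_ptr + m*LDA
+ * (registers x0..x3), advancing 4 bytes per k.  A plain-C reference of
+ * the accumulation, under those assumptions:
+ */
+#if 0 /* illustration only */
+static void ref_m4nN(const float *a, uint32_t LDA, const float *b_scr,
+                     float *acc, uint32_t N, uint32_t K) {
+    for (uint32_t k = 0; k < K; ++k)          /* k-major packed B      */
+        for (uint32_t m = 0; m < 4; ++m)      /* rows x0..x3 = a+m*LDA */
+            for (uint32_t n = 0; n < N; ++n)
+                acc[m * N + n] += a[m * LDA + k] * b_scr[k * N + n];
+}
+#endif
+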
+/* m4n12 c_vec */
+/* v20 - v22 */
+/* v23 - v25 */
+/* v26 - v28 */
+/* v29 - v31 */
+
+#define INIT_M4N12 \
+ INIT_4V(20, 21, 22, 23) INIT_4V(24, 25, 26, 27)\
+ INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M4N12(mode) \
+ UNIT_SAVE_M4N4_VR_##mode(20, 23, 26, 29) UNIT_SAVE_M4N4_VR_##mode(21, 24, 27, 30)\
+ UNIT_SAVE_M4N4_VR_##mode(22, 25, 28, 31)
+
+#define KERNEL_M4N12_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\
+ "ldr q8,[x4],#192; ldr d9,[x4,#-176]; ldr x10,[x4,#-168]; ldr x11,[x3,#-8]\n\t"
+
+#define KERNEL_M4N12_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \
+ "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[0]; prfm pldl1keep,[x0,#64]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-160]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-152]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[0]; prfm pldl1keep,[x4,#8]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-144]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-136]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[0]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac1".s[0]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-128]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac2".s[0]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac3".s[0]; ldr x16,[x0,#-8]\n\t"\
+ "fmla v31.4s,v10.4s,v"#ac4".s[0]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an2",[x1],#16\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[1]; prfm pldl1keep,[x1,#64]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "fmov v"#an1".d[1],x16; ldr d10,[x4,#-112]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[1]; ldr x11,[x1,#-8]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[1]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-96]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x4,#72]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-80]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac3".s[1]\n\t"\
+ "fmla v31.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an3",[x2],#16\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[2]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[2]; prfm pldl1keep,[x2,#64]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[2]\n\t"\
+ "fmov v"#an2".d[1],x11; ldr d10,[x4,#-64]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[2]; ldr x16,[x2,#-8]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-48]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[2]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-32]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac3".s[2]; prfm pldl1keep,[x4,#136]\n\t"\
+ "fmla v31.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an4",[x3],#16\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[3]; sub w5,w5,#4\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[3]; prfm pldl1keep,[x3,#64]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[3]\n\t"\
+ "fmov v"#an3".d[1],x16; ldr d10,[x4,#-16]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[3]; ldr x11,[x3,#-8]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[3]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[3]; ldr x10,[x4,#8]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[3]; cmp w5,#12\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac1".s[3]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#16]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac2".s[3]; ldr x10,[x4,#24]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac3".s[3]; add x4,x4,#192\n\t"\
+ "fmla v31.4s,v10.4s,v"#ac4".s[3]\n\t"
+
+#define KERNEL_M4N12_K8_T4(ac1, ac2, ac3, ac4) \
+ "fmov v"#ac4".d[1],x11; fmov v9.d[1],x10\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "ldr d10,[x4,#-160]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[0]; ldr x10,[x4,#-152]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[0]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-144]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-136]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[0]; prfm pldl1keep,[x6]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac1".s[0]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-128]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac2".s[0]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac3".s[0]\n\t"\
+ "fmla v31.4s,v10.4s,v"#ac4".s[0]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[1]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "ldr d10,[x4,#-112]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[1]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[1]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-96]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x7]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-80]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac3".s[1]\n\t"\
+ "fmla v31.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[2]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[2]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[2]\n\t"\
+ "ldr d10,[x4,#-64]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[2]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-48]\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[2]; prfm pldl1keep,[x8]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-32]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac3".s[2]\n\t"\
+ "fmla v31.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac1".s[3]\n\t"\
+ "fmla v23.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac3".s[3]\n\t"\
+ "ldr d10,[x4,#-16]\n\t"\
+ "fmla v29.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac1".s[3]; sub w5,w5,#4\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac2".s[3]\n\t"\
+ "fmov v10.d[1],x10\n\t"\
+ "fmla v27.4s,v9.4s,v"#ac3".s[3]\n\t"\
+ "fmla v30.4s,v9.4s,v"#ac4".s[3]; prfm pldl1keep,[x9]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac1".s[3]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac2".s[3]\n\t"\
+ "fmla v28.4s,v10.4s,v"#ac3".s[3]\n\t"\
+ "fmla v31.4s,v10.4s,v"#ac4".s[3]\n\t"
+
+#define KERNEL_M4N12_TL1 \
+ "ldr s0,[x0],#4; ldr q8,[x4]; ldr q9,[x4,#16]\n\t"\
+ "ldr q10,[x4,#32]; add x4,x4,#48\n\t"\
+ "ldr s1,[x1],#4\n\t"\
+ "fmla v20.4s,v8.4s,v0.s[0]\n\t"\
+ "fmla v21.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v22.4s,v10.4s,v0.s[0]\n\t"\
+ "ldr s2,[x2],#4\n\t"\
+ "fmla v23.4s,v8.4s,v1.s[0]\n\t"\
+ "fmla v24.4s,v9.4s,v1.s[0]\n\t"\
+ "fmla v25.4s,v10.4s,v1.s[0]\n\t"\
+ "ldr s3,[x3],#4\n\t"\
+ "fmla v26.4s,v8.4s,v2.s[0]; sub w5,w5,#1\n\t"\
+ "fmla v27.4s,v9.4s,v2.s[0]\n\t"\
+ "fmla v28.4s,v10.4s,v2.s[0]\n\t"\
+ "cmp w5,#1\n\t"\
+ "fmla v29.4s,v8.4s,v3.s[0]\n\t"\
+ "fmla v30.4s,v9.4s,v3.s[0]\n\t"\
+ "fmla v31.4s,v10.4s,v3.s[0]\n\t"
+
+
+/* m4n13 c_vec */
+/* v16 - v18 v19_comp */
+/* v20 - v22 v23_comp */
+/* v24 - v26 v27_comp */
+/* v28 - v30 v31_comp */
+
+#define INIT_M4N13 \
+ INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\
+ INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M4N13(mode) \
+ UNIT_SAVE_M4N4_VR_##mode(16, 20, 24, 28) UNIT_SAVE_M4N4_VR_##mode(17, 21, 25, 29)\
+ UNIT_SAVE_M4N4_VR_##mode(18, 22, 26, 30) EDGE_SAVE_M4N1K4_##mode(19, 23, 27, 31)
+
+#define KERNEL_M4N13_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\
+ "ldr q8,[x4],#208; ldr d9,[x4,#-192]; ldr x10,[x4,#-184]; ldr x11,[x3,#-8]\n\t"
+
+#define KERNEL_M4N13_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \
+ "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[0]; prfm pldl1keep,[x0,#64]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-176]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-168]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[0]; prfm pldl1keep,[x4,#24]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-160]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-152]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[0]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".s[0]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-144]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".s[0]; ldr x10,[x4,#-136]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".s[0]; ldr x16,[x0,#-8]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".s[0]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an2",[x1],#16\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[1]; prfm pldl1keep,[x1,#64]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "fmov v"#an1".d[1],x16; ldr d10,[x4,#-128]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[1]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[1]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-112]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x4,#88]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-96]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".s[1]; ldr x11,[x1,#-8]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an3",[x2],#16\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[2]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[2]; prfm pldl1keep,[x2,#64]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[2]\n\t"\
+ "fmov v"#an2".d[1],x11; ldr d10,[x4,#-80]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[2]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[2]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-64]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[2]; ldr x16,[x2,#-8]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-48]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".s[2]; prfm pldl1keep,[x4,#152]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an4",[x3],#16\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[3]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[3]; prfm pldl1keep,[x3,#64]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[3]\n\t"\
+ "fmov v"#an3".d[1],x16; ldr d10,[x4,#-32]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[3]; sub w5,w5,#4\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[3]\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[3]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[3]; cmp w5,#12\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".s[3]\n\t"\
+ "fmov v11.d[1],x10; ldr d8,[x4]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".s[3]; ldr x16,[x4,#8]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".s[3]; ldr x11,[x3,#-8]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".s[3]\n\t"\
+ "ldr d9,[x4,#16]\n\t"\
+ "fmla v19.4s,v11.4s,v"#ac1".4s; ldr x10,[x4,#24]\n\t"\
+ "fmla v23.4s,v11.4s,v"#ac2".4s\n\t"\
+ "fmov v8.d[1],x16\n\t"\
+ "fmla v27.4s,v11.4s,v"#ac3".4s; add x4,x4,#208\n\t"\
+ "fmla v31.4s,v11.4s,v"#ac4".4s\n\t"
+
+#define KERNEL_M4N13_K8_T4(ac1, ac2, ac3, ac4) \
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-176]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-168]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[0]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-160]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-152]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[0]; prfm pldl1keep,[x6]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".s[0]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-144]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".s[0]; ldr x10,[x4,#-136]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".s[0]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".s[0]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[1]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "ldr d10,[x4,#-128]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[1]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[1]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-112]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x7]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-96]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".s[1]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[2]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[2]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[2]\n\t"\
+ "ldr d10,[x4,#-80]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[2]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[2]\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-64]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[2]; prfm pldl1keep,[x8]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-48]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".s[2]; prfm pldl1keep,[x9]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[3]\n\t"\
+ "fmla v20.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac3".s[3]\n\t"\
+ "ldr d10,[x4,#-32]\n\t"\
+ "fmla v28.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v17.4s,v9.4s,v"#ac1".s[3]; sub w5,w5,#4\n\t"\
+ "fmla v21.4s,v9.4s,v"#ac2".s[3]\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\
+ "fmla v25.4s,v9.4s,v"#ac3".s[3]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[3]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac1".s[3]\n\t"\
+ "fmov v11.d[1],x10\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac2".s[3]\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".s[3]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".s[3]\n\t"\
+ "fmla v19.4s,v11.4s,v"#ac1".4s\n\t"\
+ "fmla v23.4s,v11.4s,v"#ac2".4s\n\t"\
+ "fmla v27.4s,v11.4s,v"#ac3".4s\n\t"\
+ "fmla v31.4s,v11.4s,v"#ac4".4s\n\t"
+
+#define KERNEL_M4N13_TL1 \
+ "ldr s0,[x0],#4; ldr q8,[x4]; ldr q9,[x4,#16]\n\t"\
+ "ldr q10,[x4,#32]; add x4,x4,#52\n\t"\
+ "ldr s1,[x1],#4\n\t"\
+ "fmla v16.4s,v8.4s,v0.s[0]\n\t"\
+ "fmla v17.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v18.4s,v10.4s,v0.s[0]\n\t"\
+ "ldr s2,[x2],#4\n\t"\
+ "fmla v20.4s,v8.4s,v1.s[0]\n\t"\
+ "fmla v21.4s,v9.4s,v1.s[0]\n\t"\
+ "fmla v22.4s,v10.4s,v1.s[0]\n\t"\
+ "ldr s3,[x3],#4\n\t"\
+ "fmla v24.4s,v8.4s,v2.s[0]; sub w5,w5,#1\n\t"\
+ "fmla v25.4s,v9.4s,v2.s[0]\n\t"\
+ "fmla v26.4s,v10.4s,v2.s[0]\n\t"\
+ "ldr s11,[x4,#-4]\n\t"\
+ "fmla v28.4s,v8.4s,v3.s[0]; cmp w5,#1\n\t"\
+ "fmla v29.4s,v9.4s,v3.s[0]\n\t"\
+ "fmla v30.4s,v10.4s,v3.s[0]\n\t"\
+ "fmla v19.4s,v0.4s,v11.4s\n\t"\
+ "fmla v23.4s,v1.4s,v11.4s\n\t"\
+ "fmla v27.4s,v2.4s,v11.4s\n\t"\
+ "fmla v31.4s,v3.4s,v11.4s\n\t"
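+
+/*
+ * Editor's note: the *_K8_T4 tails above prefetch [x6]..[x9], which
+ * FUNC_K4 (below) points at a_ptr + 4*LDA .. a_ptr + 7*LDA, i.e. the
+ * first elements of the next four A rows, so the following panel
+ * starts warm in L1.  The same hint via a hypothetical C helper:
+ */
+#if 0 /* illustration only */
+static inline void prefetch_next_panel(const float *a, uint32_t LDA) {
+    for (uint32_t m = 4; m < 8; ++m)
+        __builtin_prefetch(a + m * LDA, 0, 3); /* ~ prfm pldl1keep */
+}
+#endif
+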
+/* m4n14 c_vec */
+/* v12 - v14 v15_comp v16_comp */
+/* v17 - v19 v20_comp v21_comp */
+/* v22 - v24 v25_comp v26_comp */
+/* v27 - v29 v30_comp v31_comp */
+
+#define INIT_M4N14 \
+ INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\
+ INIT_4V(20, 21, 22, 23) INIT_4V(24, 25, 26, 27)\
+ INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M4N14(mode) \
+ UNIT_SAVE_M4N4_VR_##mode(12, 17, 22, 27) UNIT_SAVE_M4N4_VR_##mode(13, 18, 23, 28)\
+ UNIT_SAVE_M4N4_VR_##mode(14, 19, 24, 29) EDGE_SAVE_M4N1K4_##mode(15, 20, 25, 30)\
+ EDGE_SAVE_M4N1K4_##mode(16, 21, 26, 31)
+
+#define KERNEL_M4N14_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr d3,[x3],#16\n\t"\
+ "ldr q8,[x4],#224; ldr d9,[x4,#-208]; ldr x10,[x4,#-200]; ldr x11,[x3,#-8]\n\t"
+
+#define KERNEL_M4N14_K8_L4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \
+ "fmov v9.d[1],x10; ldr d"#an1",[x0],#16\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[0]; prfm pldl1keep,[x0,#64]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-192]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-184]\n\t"\
+ "fmla v13.4s,v9.4s,v"#ac1".s[0]; prfm pldl1keep,[x4,#8]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-176]\n\t"\
+ "fmla v23.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-168]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[0]\n\t"\
+ "fmla v14.4s,v10.4s,v"#ac1".s[0]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-160]\n\t"\
+ "fmla v19.4s,v10.4s,v"#ac2".s[0]; ldr x10,[x4,#-152]\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac3".s[0]; ldr x16,[x0,#-8]\n\t"\
+ "fmla v29.4s,v10.4s,v"#ac4".s[0]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an2",[x1],#16\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[1]; prfm pldl1keep,[x1,#64]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "fmov v"#an1".d[1],x16; ldr d10,[x4,#-144]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-136]\n\t"\
+ "fmla v13.4s,v9.4s,v"#ac1".s[1]; ldr x11,[x1,#-8]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac2".s[1]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-128]\n\t"\
+ "fmla v23.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x4,#72]\n\t"\
+ "fmla v14.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-112]\n\t"\
+ "fmla v19.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac3".s[1]\n\t"\
+ "fmla v29.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10; ldr d"#an3",[x2],#16\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[2]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[2]; prfm pldl1keep,[x2,#64]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[2]\n\t"\
+ "fmov v"#an2".d[1],x11; ldr d10,[x4,#-96]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[2]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v13.4s,v9.4s,v"#ac1".s[2]; ldr x16,[x2,#-8]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-80]\n\t"\
+ "fmla v23.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[2]\n\t"\
+ "fmla v14.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d11,[x4,#-64]\n\t"\
+ "fmla v19.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac3".s[2]; prfm pldl1keep,[x4,#136]\n\t"\
+ "fmla v29.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v11.d[1],x10; ldr d"#an4",[x3],#16\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[3]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[3]; prfm pldl1keep,[x3,#64]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[3]\n\t"\
+ "fmov v"#an3".d[1],x16; ldr d9,[x4,#-48]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v13.4s,v11.4s,v"#ac1".s[3]; sub w5,w5,#4\n\t"\
+ "fmla v18.4s,v11.4s,v"#ac2".s[3]\n\t"\
+ "fmov v9.d[1],x10; ldr d10,[x4,#-32]\n\t"\
+ "fmla v23.4s,v11.4s,v"#ac3".s[3]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v28.4s,v11.4s,v"#ac4".s[3]; cmp w5,#12\n\t"\
+ "fmla v14.4s,v9.4s,v"#ac1".s[3]\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\
+ "fmla v19.4s,v9.4s,v"#ac2".s[3]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac3".s[3]; ldr x11,[x3,#-8]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[3]\n\t"\
+ "fmov v11.d[1],x10; ldr d8,[x4]\n\t"\
+ "fmla v15.4s,v10.4s,v"#ac1".4s; ldr x16,[x4,#8]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac2".4s\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac3".4s\n\t"\
+ "ldr d9,[x4,#16]\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".4s; ldr x10,[x4,#24]\n\t"\
+ "fmla v16.4s,v11.4s,v"#ac1".4s; add x4,x4,#224\n\t"\
+ "fmla v21.4s,v11.4s,v"#ac2".4s\n\t"\
+ "fmov v8.d[1],x16\n\t"\
+ "fmla v26.4s,v11.4s,v"#ac3".4s\n\t"\
+ "fmla v31.4s,v11.4s,v"#ac4".4s\n\t"
+
+#define KERNEL_M4N14_K8_T4(ac1, ac2, ac3, ac4) \
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[0]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[0]\n\t"\
+ "fmov v"#ac4".d[1],x11; ldr d10,[x4,#-192]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac2".s[0]; ldr x10,[x4,#-184]\n\t"\
+ "fmla v13.4s,v9.4s,v"#ac1".s[0]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-176]\n\t"\
+ "fmla v23.4s,v9.4s,v"#ac3".s[0]; ldr x10,[x4,#-168]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[0]; prfm pldl1keep,[x6]\n\t"\
+ "fmla v14.4s,v10.4s,v"#ac1".s[0]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-160]\n\t"\
+ "fmla v19.4s,v10.4s,v"#ac2".s[0]; ldr x10,[x4,#-152]\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac3".s[0]\n\t"\
+ "fmla v29.4s,v10.4s,v"#ac4".s[0]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[1]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[1]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[1]\n\t"\
+ "ldr d10,[x4,#-144]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[1]; ldr x10,[x4,#-136]\n\t"\
+ "fmla v13.4s,v9.4s,v"#ac1".s[1]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac2".s[1]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-128]\n\t"\
+ "fmla v23.4s,v9.4s,v"#ac3".s[1]; ldr x10,[x4,#-120]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[1]; prfm pldl1keep,[x7]\n\t"\
+ "fmla v14.4s,v10.4s,v"#ac1".s[1]\n\t"\
+ "fmov v8.d[1],x10; ldr d9,[x4,#-112]\n\t"\
+ "fmla v19.4s,v10.4s,v"#ac2".s[1]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac3".s[1]\n\t"\
+ "fmla v29.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "fmov v9.d[1],x10\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[2]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[2]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[2]\n\t"\
+ "ldr d10,[x4,#-96]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[2]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v13.4s,v9.4s,v"#ac1".s[2]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmov v10.d[1],x10; ldr d8,[x4,#-80]\n\t"\
+ "fmla v23.4s,v9.4s,v"#ac3".s[2]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v28.4s,v9.4s,v"#ac4".s[2]\n\t"\
+ "fmla v14.4s,v10.4s,v"#ac1".s[2]\n\t"\
+ "fmov v8.d[1],x10; ldr d11,[x4,#-64]\n\t"\
+ "fmla v19.4s,v10.4s,v"#ac2".s[2]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac3".s[2]; prfm pldl1keep,[x8]\n\t"\
+ "fmla v29.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "fmov v11.d[1],x10\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[3]\n\t"\
+ "fmla v17.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "fmla v22.4s,v8.4s,v"#ac3".s[3]\n\t"\
+ "ldr d9,[x4,#-48]\n\t"\
+ "fmla v27.4s,v8.4s,v"#ac4".s[3]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v13.4s,v11.4s,v"#ac1".s[3]; sub w5,w5,#4\n\t"\
+ "fmla v18.4s,v11.4s,v"#ac2".s[3]\n\t"\
+ "fmov v9.d[1],x10; ldr d10,[x4,#-32]\n\t"\
+ "fmla v23.4s,v11.4s,v"#ac3".s[3]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v28.4s,v11.4s,v"#ac4".s[3]\n\t"\
+ "fmla v14.4s,v9.4s,v"#ac1".s[3]\n\t"\
+ "fmov v10.d[1],x10; ldr d11,[x4,#-16]\n\t"\
+ "fmla v19.4s,v9.4s,v"#ac2".s[3]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v24.4s,v9.4s,v"#ac3".s[3]\n\t"\
+ "fmla v29.4s,v9.4s,v"#ac4".s[3]\n\t"\
+ "fmov v11.d[1],x10\n\t"\
+ "fmla v15.4s,v10.4s,v"#ac1".4s\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac2".4s; prfm pldl1keep,[x9]\n\t"\
+ "fmla v25.4s,v10.4s,v"#ac3".4s\n\t"\
+ "fmla v30.4s,v10.4s,v"#ac4".4s\n\t"\
+ "fmla v16.4s,v11.4s,v"#ac1".4s\n\t"\
+ "fmla v21.4s,v11.4s,v"#ac2".4s\n\t"\
+ "fmla v26.4s,v11.4s,v"#ac3".4s\n\t"\
+ "fmla v31.4s,v11.4s,v"#ac4".4s\n\t"
+
+#define KERNEL_M4N14_TL1 \
+ "ldr s0,[x0],#4; ldr q8,[x4]; ldr q9,[x4,#16]\n\t"\
+ "ldr q10,[x4,#32]; add x4,x4,#56\n\t"\
+ "ldr s1,[x1],#4\n\t"\
+ "fmla v12.4s,v8.4s,v0.s[0]\n\t"\
+ "fmla v13.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v14.4s,v10.4s,v0.s[0]\n\t"\
+ "ldr s2,[x2],#4\n\t"\
+ "fmla v17.4s,v8.4s,v1.s[0]\n\t"\
+ "fmla v18.4s,v9.4s,v1.s[0]\n\t"\
+ "fmla v19.4s,v10.4s,v1.s[0]\n\t"\
+ "ldr s3,[x3],#4\n\t"\
+ "fmla v22.4s,v8.4s,v2.s[0]; sub w5,w5,#1\n\t"\
+ "fmla v23.4s,v9.4s,v2.s[0]\n\t"\
+ "fmla v24.4s,v10.4s,v2.s[0]\n\t"\
+ "ldr d11,[x4,#-8]\n\t"\
+ "fmla v27.4s,v8.4s,v3.s[0]; cmp w5,#1\n\t"\
+ "fmla v28.4s,v9.4s,v3.s[0]\n\t"\
+ "fmla v29.4s,v10.4s,v3.s[0]\n\t"\
+ "fmla v15.4s,v0.4s,v11.s[0]\n\t"\
+ "fmla v20.4s,v1.4s,v11.s[0]\n\t"\
+ "fmla v25.4s,v2.4s,v11.s[0]\n\t"\
+ "fmla v30.4s,v3.4s,v11.s[0]\n\t"\
+ "fmla v16.4s,v0.4s,v11.s[1]\n\t"\
+ "fmla v21.4s,v1.4s,v11.s[1]\n\t"\
+ "fmla v26.4s,v2.4s,v11.s[1]\n\t"\
+ "fmla v31.4s,v3.4s,v11.s[1]\n\t"
+
+#define FUNC_K4(ndim) \
+static inline void sgemm_skinny1_a53_m4n##ndim(\
+ const float * __restrict__ a_ptr, const float * __restrict__ b_scr,\
+ float * __restrict__ c_ptr, uint32_t K, uint32_t LDA, uint32_t LDC,\
+ uint8_t c_rowmajor, const float * __restrict__ beta_addr) {\
+ __asm__ __volatile__ (\
+ "mov x0,%[a_ptr]; add x1,%[a_ptr],%w[LDA],UXTW #2\n\t"\
+ "add x2,%[a_ptr],%w[LDA],UXTW #3; add x3,x1,%w[LDA],UXTW #3\n\t"\
+ "add x6,x0,%w[LDA],UXTW #4; add x7,x1,%w[LDA],UXTW #4\n\t"\
+ "add x8,x2,%w[LDA],UXTW #4; add x9,x3,%w[LDA],UXTW #4\n\t"\
+ "mov x4,%[b_scr]; mov w5,%w[K]\n\t"\
+ INIT_M4N##ndim\
+ "cmp w5,#4; b.lt 4f\n\t"\
+ KERNEL_M4N##ndim##_PRELOAD4\
+ "cmp w5,#12; b.lt 2f\n\t"\
+ ".balign 16; 1:\n\t"\
+ KERNEL_M4N##ndim##_K8_L4(0, 1, 2, 3, 4, 5, 6, 7)\
+ KERNEL_M4N##ndim##_K8_L4(4, 5, 6, 7, 0, 1, 2, 3)\
+ "b.ge 1b; 2:\n\t"\
+ "cmp w5,#8; b.lt 3f\n\t"\
+ KERNEL_M4N##ndim##_K8_L4(0, 1, 2, 3, 4, 5, 6, 7)\
+ KERNEL_M4N##ndim##_K8_T4(4, 5, 6, 7)\
+ "b 4f; 3:\n\t"\
+ KERNEL_M4N##ndim##_K8_T4(0, 1, 2, 3)\
+ "4:\n\t"\
+ "cmp w5,#1; b.lt 6f\n\t"\
+ "5:\n\t"\
+ KERNEL_M4N##ndim##_TL1\
+ "b.ge 5b; 6:\n\t"\
+ INIT_SAVE\
+ "cmp %w[c_rowmajor],#0; b.eq 7f\n\t"\
+ SAVE_M4N##ndim(CR) "b 8f\n\t"\
+ "7:\n\t"\
+ SAVE_M4N##ndim(CC)\
+ "8:\n\t"\
+ ::[a_ptr]"r"(a_ptr), [c_ptr]"r"(c_ptr), [b_scr]"r"(b_scr),\
+ [K]"r"(K), [LDA]"r"(LDA), [LDC]"r"(LDC),\
+ [beta_addr]"r"(beta_addr), [c_rowmajor]"r"(c_rowmajor)\
+ :"cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x9",\
+ "x10","x11","x12","x13","x14","x15","x16",\
+ "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13",\
+ "v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25",\
+ "v26","v27","v28","v29","v30","v31");\
+}
+
+FUNC_K4(4)
+FUNC_K4(5)
+FUNC_K4(6)
+FUNC_K4(7)
+FUNC_K4(8)
+FUNC_K4(9)
+FUNC_K4(10)
+FUNC_K4(11)
+FUNC_K4(12)
+FUNC_K4(13)
+FUNC_K4(14)
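+
+/*
+ * Editor's note: FUNC_K4(n) stamps out sgemm_skinny1_a53_m4n4 ..
+ * _m4n14, one fully unrolled kernel per n, each computing a 4xn panel
+ * of C from a 4-row strip of A and the packed B, with beta passed by
+ * address and c_rowmajor selecting the CR/CC save path.  A hypothetical
+ * caller (the real driver lives elsewhere in this patch):
+ */
+#if 0 /* illustration only */
+static void demo_call_m4n8(const float *a, const float *b_packed,
+                           float *c, uint32_t K, uint32_t LDA,
+                           uint32_t LDC) {
+    const float beta = 0.0f;  /* assumed: SAVE scales C by *beta_addr */
+    sgemm_skinny1_a53_m4n8(a, b_packed, c, K, LDA, LDC,
+                           0 /* 0 = column-major C (CC path) */, &beta);
+}
+#endif
+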
+/* m4n15 c_vec */
+/* v14 - v16 v23_comp v24_comp v25_comp */
+/* v17 - v19 v29_comp v30_comp v31_comp */
+/* v20 - v22 v23_comp v24_comp v25_comp */
+/* v26 - v28 v29_comp v30_comp v31_comp */
+
+#define INIT_M4N15 \
+ INIT_4V(14, 15, 16, 17) INIT_4V(18, 19, 20, 21) INIT_4V(22, 23, 24, 25)\
+ INIT_4V(26, 27, 28, 29) INIT_2V(30, 31)
+
+#define SAVE_M4N15(mode) \
+ UNIT_SAVE_M4N4_VR_##mode(14, 17, 20, 26) UNIT_SAVE_M4N4_VR_##mode(15, 18, 21, 27)\
+ UNIT_SAVE_M4N4_VR_##mode(16, 19, 22, 28) EDGE_SAVE_M4N1K2_##mode(23, 29)\
+ EDGE_SAVE_M4N1K2_##mode(24, 30) EDGE_SAVE_M4N1K2_##mode(25, 31)
+
+#define KERNEL_M4N15_PRELOAD2 \
+ "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr x16,[x2],#8; ldr x11,[x3],#8\n\t"\
+ "ldr q4,[x4]; ldr d5,[x4,#16]; ldr x10,[x4,#24]\n\t"\
+ "add x4,x4,#120; fmov v0.d[1],x16\n\t"
+
+#define KERNEL_M4N15_MAIN2(ac1, ac2, an1, an2, ap1, ap2) \
+ "fmov v5.d[1],x10; ldr d"#an1",[x0],#8\n\t"\
+ "fmla v14.4s,v4.4s,v"#ac1".s[0]\n\t"\
+ "fmla v17.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap1",#64]\n\t"\
+ "fmla v20.4s,v4.4s,v"#ac1".s[2]\n\t"\
+ "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-88]\n\t"\
+ "fmla v15.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-80]\n\t"\
+ "fmla v18.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#64]\n\t"\
+ "fmla v26.4s,v4.4s,v"#ac2".s[2]\n\t"\
+ "fmov v6.d[1],x10; ldr d4,[x4,#-72]\n\t"\
+ "fmla v21.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-64]\n\t"\
+ "fmla v27.4s,v5.4s,v"#ac2".s[2]; ldr x16,[x2],#8\n\t"\
+ "fmla v16.4s,v6.4s,v"#ac1".s[0]\n\t"\
+ "fmov v4.d[1],x10; ldr d5,[x4,#-56]\n\t"\
+ "fmla v19.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-48]\n\t"\
+ "fmla v22.4s,v6.4s,v"#ac1".s[2]; prfm pldl1keep,[x"#ap2",#64]\n\t"\
+ "fmla v28.4s,v6.4s,v"#ac2".s[2]\n\t"\
+ "fmov v5.d[1],x10; ldr d"#an2",[x1],#8\n\t"\
+ "fmla v14.4s,v4.4s,v"#ac1".s[1]\n\t"\
+ "fmla v17.4s,v4.4s,v"#ac2".s[1]; prfm pldl1keep,[x4,#128]\n\t"\
+ "fmla v20.4s,v4.4s,v"#ac1".s[3]\n\t"\
+ "fmov v"#an1".d[1],x16; ldr d6,[x4,#-40]\n\t"\
+ "fmla v26.4s,v4.4s,v"#ac2".s[3]; ldr x10,[x4,#-32]\n\t"\
+ "fmla v15.4s,v5.4s,v"#ac1".s[1]\n\t"\
+ "fmla v18.4s,v5.4s,v"#ac2".s[1]\n\t"\
+ "fmov v6.d[1],x10; ldr d7,[x4,#-24]\n\t"\
+ "fmla v21.4s,v5.4s,v"#ac1".s[3]; ldr x10,[x4,#-16]\n\t"\
+ "fmla v27.4s,v5.4s,v"#ac2".s[3]; ldr x11,[x3],#8\n\t"\
+ "fmla v16.4s,v6.4s,v"#ac1".s[1]\n\t"\
+ "ins v7.d[1],v7.d[0]; dup v8.2d,x10\n\t"\
+ "fmla v19.4s,v6.4s,v"#ac2".s[1]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v22.4s,v6.4s,v"#ac1".s[3]; add x4,x4,#120\n\t"\
+ "fmla v28.4s,v6.4s,v"#ac2".s[3]\n\t"\
+ "dup v6.2d,x10; ldr d4,[x4,#-120]\n\t"\
+ "fmla v23.4s,v7.4s,v"#ac1".4s; ldr x10,[x4,#-112]\n\t"\
+ "fmla v29.4s,v7.4s,v"#ac2".4s; sub w5,w5,#2\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac1".4s\n\t"\
+ "fmov v4.d[1],x10; ldr d5,[x4,#-104]\n\t"\
+ "fmla v30.4s,v8.4s,v"#ac2".4s; ldr x10,[x4,#-96]\n\t"\
+ "fmla v25.4s,v6.4s,v"#ac1".4s; cmp w5,#6\n\t"\
+ "fmla v31.4s,v6.4s,v"#ac2".4s\n\t"
+
+#define KERNEL_M4N15_TAIL2(ac1, ac2) \
+ "fmov v5.d[1],x10\n\t"\
+ "fmla v14.4s,v4.4s,v"#ac1".s[0]\n\t"\
+ "fmla v17.4s,v4.4s,v"#ac2".s[0]\n\t"\
+ "fmla v20.4s,v4.4s,v"#ac1".s[2]\n\t"\
+ "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-88]\n\t"\
+ "fmla v15.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-80]\n\t"\
+ "fmla v18.4s,v5.4s,v"#ac2".s[0]\n\t"\
+ "fmla v26.4s,v4.4s,v"#ac2".s[2]\n\t"\
+ "fmov v6.d[1],x10; ldr d4,[x4,#-72]\n\t"\
+ "fmla v21.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-64]\n\t"\
+ "fmla v27.4s,v5.4s,v"#ac2".s[2]\n\t"\
+ "fmla v16.4s,v6.4s,v"#ac1".s[0]\n\t"\
+ "fmov v4.d[1],x10; ldr d5,[x4,#-56]\n\t"\
+ "fmla v19.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-48]\n\t"\
+ "fmla v22.4s,v6.4s,v"#ac1".s[2]; prfm pldl1keep,[x6]\n\t"\
+ "fmla v28.4s,v6.4s,v"#ac2".s[2]\n\t"\
+ "fmov v5.d[1],x10\n\t"\
+ "fmla v14.4s,v4.4s,v"#ac1".s[1]\n\t"\
+ "fmla v17.4s,v4.4s,v"#ac2".s[1]; prfm pldl1keep,[x7]\n\t"\
+ "fmla v20.4s,v4.4s,v"#ac1".s[3]\n\t"\
+ "ldr d6,[x4,#-40]\n\t"\
+ "fmla v26.4s,v4.4s,v"#ac2".s[3]; ldr x10,[x4,#-32]\n\t"\
+ "fmla v15.4s,v5.4s,v"#ac1".s[1]\n\t"\
+ "fmla v18.4s,v5.4s,v"#ac2".s[1]\n\t"\
+ "fmov v6.d[1],x10; ldr d7,[x4,#-24]\n\t"\
+ "fmla v21.4s,v5.4s,v"#ac1".s[3]; ldr x10,[x4,#-16]\n\t"\
+ "fmla v27.4s,v5.4s,v"#ac2".s[3]\n\t"\
+ "fmla v16.4s,v6.4s,v"#ac1".s[1]\n\t"\
+ "ins v7.d[1],v7.d[0]; dup v8.2d,x10\n\t"\
+ "fmla v19.4s,v6.4s,v"#ac2".s[1]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v22.4s,v6.4s,v"#ac1".s[3]\n\t"\
+ "fmla v28.4s,v6.4s,v"#ac2".s[3]\n\t"\
+ "dup v6.2d,x10\n\t"\
+ "fmla v23.4s,v7.4s,v"#ac1".4s; prfm pldl1keep,[x8]\n\t"\
+ "fmla v29.4s,v7.4s,v"#ac2".4s; sub w5,w5,#2\n\t"\
+ "fmla v24.4s,v8.4s,v"#ac1".4s\n\t"\
+ "fmla v30.4s,v8.4s,v"#ac2".4s; prfm pldl1keep,[x9]\n\t"\
+ "fmla v25.4s,v6.4s,v"#ac1".4s\n\t"\
+ "fmla v31.4s,v6.4s,v"#ac2".4s\n\t"
+
+#define KERNEL_M4N15_FIN1 \
+ "ldr s0,[x0],#4; ldr q4,[x4]; ldr q5,[x4,#16]\n\t"\
+ "ldr q6,[x4,#32]; add x4,x4,#60\n\t"\
+ "ldr s1,[x1],#4\n\t"\
+ "fmla v14.4s,v4.4s,v0.s[0]\n\t"\
+ "fmla v15.4s,v5.4s,v0.s[0]\n\t"\
+ "fmla v16.4s,v6.4s,v0.s[0]\n\t"\
+ "ldr s2,[x2],#4\n\t"\
+ "fmla v17.4s,v4.4s,v1.s[0]; ldr w10,[x4,#-12]\n\t"\
+ "fmla v18.4s,v5.4s,v1.s[0]\n\t"\
+ "fmla v19.4s,v6.4s,v1.s[0]\n\t"\
+ "ldr s3,[x3],#4; dup v7.2d,x10\n\t"\
+ "fmla v20.4s,v4.4s,v2.s[0]; ldr w11,[x4,#-8]\n\t"\
+ "fmla v21.4s,v5.4s,v2.s[0]\n\t"\
+ "fmla v22.4s,v6.4s,v2.s[0]\n\t"\
+ "ins v0.d[1],v2.d[0]; dup v8.2d,x11\n\t"\
+ "fmla v26.4s,v4.4s,v3.s[0]; ldr w16,[x4,#-4]\n\t"\
+ "fmla v27.4s,v5.4s,v3.s[0]\n\t"\
+ "fmla v28.4s,v6.4s,v3.s[0]\n\t"\
+ "ins v1.d[1],v3.d[0]; dup v6.2d,x16\n\t"\
+ "fmla v23.4s,v7.4s,v0.4s\n\t"\
+ "fmla v24.4s,v8.4s,v0.4s\n\t"\
+ "fmla v29.4s,v7.4s,v1.4s\n\t"\
+ "fmla v30.4s,v8.4s,v1.4s\n\t"\
+ "fmla v25.4s,v6.4s,v0.4s\n\t"\
+ "fmla v31.4s,v6.4s,v1.4s\n\t"
+
+
+/* m4n16 c_vec */
+/* v16 - v19 */
+/* v20 - v23 */
+/* v24 - v27 */
+/* v28 - v31 */
+
+#define INIT_M4N16 \
+ INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\
+ INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M4N16(mode) \
+ UNIT_SAVE_M4N4_VR_##mode(16, 20, 24, 28) UNIT_SAVE_M4N4_VR_##mode(17, 21, 25, 29)\
+ UNIT_SAVE_M4N4_VR_##mode(18, 22, 26, 30) UNIT_SAVE_M4N4_VR_##mode(19, 23, 27, 31)
+
+#define KERNEL_M4N16_PRELOAD2 \
+ "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr x16,[x2],#8; ldr x11,[x3],#8\n\t"\
+ "ldr q4,[x4]; ldr d5,[x4,#16]; ldr x10,[x4,#24]\n\t"\
+ "add x4,x4,#128; fmov v0.d[1],x16\n\t"
+
+#define KERNEL_M4N16_MAIN2(ac1, ac2, an1, an2, ap1, ap2) \
+ "fmov v5.d[1],x10; ldr d"#an1",[x0],#8\n\t"\
+ "fmla v16.4s,v4.4s,v"#ac1".s[0]\n\t"\
+ "fmla v20.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap1",#64]\n\t"\
+ "fmla v24.4s,v4.4s,v"#ac1".s[2]\n\t"\
+ "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-96]\n\t"\
+ "fmla v17.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v21.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#64]\n\t"\
+ "fmla v28.4s,v4.4s,v"#ac2".s[2]\n\t"\
+ "fmov v6.d[1],x10; ldr d7,[x4,#-80]\n\t"\
+ "fmla v25.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v29.4s,v5.4s,v"#ac2".s[2]; ldr x16,[x2],#8\n\t"\
+ "fmla v18.4s,v6.4s,v"#ac1".s[0]\n\t"\
+ "fmov v7.d[1],x10; ldr d4,[x4,#-64]\n\t"\
+ "fmla v22.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v26.4s,v6.4s,v"#ac1".s[2]\n\t"\
+ "fmla v30.4s,v6.4s,v"#ac2".s[2]\n\t"\
+ "fmov v4.d[1],x10; ldr d"#an2",[x1],#8\n\t"\
+ "fmla v19.4s,v7.4s,v"#ac1".s[0]\n\t"\
+ "fmla v23.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap2",#64]\n\t"\
+ "fmla v27.4s,v7.4s,v"#ac1".s[2]\n\t"\
+ "fmov v"#an1".d[1],x16; ldr d5,[x4,#-48]\n\t"\
+ "fmla v31.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v16.4s,v4.4s,v"#ac1".s[1]\n\t"\
+ "fmla v20.4s,v4.4s,v"#ac2".s[1]\n\t"\
+ "fmov v5.d[1],x10; ldr d6,[x4,#-32]\n\t"\
+ "fmla v24.4s,v4.4s,v"#ac1".s[3]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v28.4s,v4.4s,v"#ac2".s[3]; ldr x11,[x3],#8\n\t"\
+ "fmla v17.4s,v5.4s,v"#ac1".s[1]\n\t"\
+ "fmov v6.d[1],x10; ldr d7,[x4,#-16]\n\t"\
+ "fmla v21.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v25.4s,v5.4s,v"#ac1".s[3]; add x4,x4,#128\n\t"\
+ "fmla v29.4s,v5.4s,v"#ac2".s[3]\n\t"\
+ "fmov v7.d[1],x10; ldr d4,[x4,#-128]\n\t"\
+ "fmla v18.4s,v6.4s,v"#ac1".s[1]; ldr x16,[x4,#-120]\n\t"\
+ "fmla v22.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x4]\n\t"\
+ "fmla v26.4s,v6.4s,v"#ac1".s[3]\n\t"\
+ "ldr d5,[x4,#-112]\n\t"\
+ "fmla v30.4s,v6.4s,v"#ac2".s[3]; ldr x10,[x4,#-104]\n\t"\
+ "fmla v19.4s,v7.4s,v"#ac1".s[1]; sub w5,w5,#2\n\t"\
+ "fmla v23.4s,v7.4s,v"#ac2".s[1]\n\t"\
+ "fmov v4.d[1],x16\n\t"\
+ "fmla v27.4s,v7.4s,v"#ac1".s[3]; cmp w5,#6\n\t"\
+ "fmla v31.4s,v7.4s,v"#ac2".s[3]\n\t"
+
+#define KERNEL_M4N16_TAIL2(ac1, ac2) \
+ "fmov v5.d[1],x10\n\t"\
+ "fmla v16.4s,v4.4s,v"#ac1".s[0]\n\t"\
+ "fmla v20.4s,v4.4s,v"#ac2".s[0]\n\t"\
+ "fmla v24.4s,v4.4s,v"#ac1".s[2]\n\t"\
+ "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-96]\n\t"\
+ "fmla v17.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-88]\n\t"\
+ "fmla v21.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x6]\n\t"\
+ "fmla v28.4s,v4.4s,v"#ac2".s[2]\n\t"\
+ "fmov v6.d[1],x10; ldr d7,[x4,#-80]\n\t"\
+ "fmla v25.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-72]\n\t"\
+ "fmla v29.4s,v5.4s,v"#ac2".s[2]\n\t"\
+ "fmla v18.4s,v6.4s,v"#ac1".s[0]\n\t"\
+ "fmov v7.d[1],x10; ldr d4,[x4,#-64]\n\t"\
+ "fmla v22.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-56]\n\t"\
+ "fmla v26.4s,v6.4s,v"#ac1".s[2]; prfm pldl1keep,[x7]\n\t"\
+ "fmla v30.4s,v6.4s,v"#ac2".s[2]\n\t"\
+ "fmov v4.d[1],x10\n\t"\
+ "fmla v19.4s,v7.4s,v"#ac1".s[0]\n\t"\
+ "fmla v23.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x8]\n\t"\
+ "fmla v27.4s,v7.4s,v"#ac1".s[2]\n\t"\
+ "ldr d5,[x4,#-48]\n\t"\
+ "fmla v31.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-40]\n\t"\
+ "fmla v16.4s,v4.4s,v"#ac1".s[1]\n\t"\
+ "fmla v20.4s,v4.4s,v"#ac2".s[1]\n\t"\
+ "fmov v5.d[1],x10; ldr d6,[x4,#-32]\n\t"\
+ "fmla v24.4s,v4.4s,v"#ac1".s[3]; ldr x10,[x4,#-24]\n\t"\
+ "fmla v28.4s,v4.4s,v"#ac2".s[3]\n\t"\
+ "fmla v17.4s,v5.4s,v"#ac1".s[1]\n\t"\
+ "fmov v6.d[1],x10; ldr d7,[x4,#-16]\n\t"\
+ "fmla v21.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v25.4s,v5.4s,v"#ac1".s[3]\n\t"\
+ "fmla v29.4s,v5.4s,v"#ac2".s[3]\n\t"\
+ "fmov v7.d[1],x10\n\t"\
+ "fmla v18.4s,v6.4s,v"#ac1".s[1]\n\t"\
+ "fmla v22.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x9]\n\t"\
+ "fmla v26.4s,v6.4s,v"#ac1".s[3]\n\t"\
+ "fmla v30.4s,v6.4s,v"#ac2".s[3]\n\t"\
+ "fmla v19.4s,v7.4s,v"#ac1".s[1]; sub w5,w5,#2\n\t"\
+ "fmla v23.4s,v7.4s,v"#ac2".s[1]\n\t"\
+ "fmla v27.4s,v7.4s,v"#ac1".s[3]\n\t"\
+ "fmla v31.4s,v7.4s,v"#ac2".s[3]\n\t"
+
+#define KERNEL_M4N16_FIN1 \
+ "ldr s0,[x0],#4; ldr q4,[x4]; ldr q5,[x4,#16]\n\t"\
+ "ldr q6,[x4,#32]; add x4,x4,#64\n\t"\
+ "ldr s1,[x1],#4\n\t"\
+ "fmla v16.4s,v4.4s,v0.s[0]\n\t"\
+ "fmla v17.4s,v5.4s,v0.s[0]\n\t"\
+ "fmla v18.4s,v6.4s,v0.s[0]\n\t"\
+ "ldr s2,[x2],#4\n\t"\
+ "fmla v20.4s,v4.4s,v1.s[0]\n\t"\
+ "fmla v21.4s,v5.4s,v1.s[0]\n\t"\
+ "fmla v22.4s,v6.4s,v1.s[0]\n\t"\
+ "ldr s3,[x3],#4; ldr d7,[x4,#-16]\n\t"\
+ "fmla v24.4s,v4.4s,v2.s[0]; ldr x10,[x4,#-8]\n\t"\
+ "fmla v25.4s,v5.4s,v2.s[0]\n\t"\
+ "fmla v26.4s,v6.4s,v2.s[0]\n\t"\
+ "fmov v7.d[1],x10\n\t"\
+ "fmla v28.4s,v4.4s,v3.s[0]\n\t"\
+ "fmla v29.4s,v5.4s,v3.s[0]\n\t"\
+ "fmla v30.4s,v6.4s,v3.s[0]\n\t"\
+ "fmla v19.4s,v7.4s,v0.s[0]\n\t"\
+ "fmla v23.4s,v7.4s,v1.s[0]\n\t"\
+ "fmla v27.4s,v7.4s,v2.s[0]\n\t"\
+ "fmla v31.4s,v7.4s,v3.s[0]\n\t"
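+
+/*
+ * Editor's note: the n>=15 kernels step K by two and pack two A rows
+ * per q-register.  After "ldr d0,[x0]; ldr x16,[x2]; fmov v0.d[1],x16"
+ * in the PRELOAD2 macros, v0 = { a0[k], a0[k+1], a2[k], a2[k+1] }, so
+ * lanes .s[0]/.s[1] feed row 0 and .s[2]/.s[3] feed row 2, halving the
+ * registers spent on A.  The same packing with intrinsics:
+ */
+#if 0 /* illustration only */
+#include <arm_neon.h>
+static inline float32x4_t pack_rows02_k2(const float *a_row0,
+                                         const float *a_row2) {
+    float32x2_t lo = vld1_f32(a_row0);   /* ldr d0,[x0],#8   */
+    float32x2_t hi = vld1_f32(a_row2);   /* ldr x16,[x2],#8  */
+    return vcombine_f32(lo, hi);         /* fmov v0.d[1],x16 */
+}
+#endif
+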
+/* m4n17 c_vec */
+/* v14 - v17 v26_comp */
+/* v18 - v21 v31_comp */
+/* v22 - v25 v26_comp */
+/* v27 - v30 v31_comp */
+
+#define INIT_M4N17 INIT_2V(14, 15)\
+ INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\
+ INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M4N17(mode) \
+ UNIT_SAVE_M4N4_VR_##mode(14, 18, 22, 27) UNIT_SAVE_M4N4_VR_##mode(15, 19, 23, 28)\
+ UNIT_SAVE_M4N4_VR_##mode(16, 20, 24, 29) UNIT_SAVE_M4N4_VR_##mode(17, 21, 25, 30)\
+ EDGE_SAVE_M4N1K2_##mode(26, 31)
+
+#define KERNEL_M4N17_PRELOAD2 \
+ "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr x16,[x2],#8; ldr x11,[x3],#8\n\t"\
+ "ldr q4,[x4]; ldr d5,[x4,#16]; ldr x10,[x4,#24]\n\t"\
+ "add x4,x4,#136; fmov v0.d[1],x16\n\t"
+
+#define KERNEL_M4N17_MAIN2(ac1, ac2, an1, an2, ap1, ap2) \
+ "fmov v5.d[1],x10; ldr d"#an1",[x0],#8\n\t"\
+ "fmla v14.4s,v4.4s,v"#ac1".s[0]\n\t"\
+ "fmla v18.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap1",#64]\n\t"\
+ "fmla v22.4s,v4.4s,v"#ac1".s[2]\n\t"\
+ "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-104]\n\t"\
+ "fmla v15.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-96]\n\t"\
+ "fmla v19.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#64]\n\t"\
+ "fmla v27.4s,v4.4s,v"#ac2".s[2]\n\t"\
+ "fmov v6.d[1],x10; ldr d7,[x4,#-88]\n\t"\
+ "fmla v23.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-80]\n\t"\
+ "fmla v28.4s,v5.4s,v"#ac2".s[2]; ldr x16,[x2],#8\n\t"\
+ "fmla v16.4s,v6.4s,v"#ac1".s[0]\n\t"\
+ "fmov v7.d[1],x10; ldr d4,[x4,#-72]\n\t"\
+ "fmla v20.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-64]\n\t"\
+ "fmla v24.4s,v6.4s,v"#ac1".s[2]\n\t"\
+ "fmla v29.4s,v6.4s,v"#ac2".s[2]\n\t"\
+ "fmov v4.d[1],x10; ldr d"#an2",[x1],#8\n\t"\
+ "fmla v17.4s,v7.4s,v"#ac1".s[0]\n\t"\
+ "fmla v21.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap2",#64]\n\t"\
+ "fmla v25.4s,v7.4s,v"#ac1".s[2]\n\t"\
+ "fmov v"#an1".d[1],x16; ldr d5,[x4,#-56]\n\t"\
+ "fmla v30.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-48]\n\t"\
+ "fmla v14.4s,v4.4s,v"#ac1".s[1]\n\t"\
+ "fmla v18.4s,v4.4s,v"#ac2".s[1]\n\t"\
+ "fmov v5.d[1],x10; ldr d6,[x4,#-40]\n\t"\
+ "fmla v22.4s,v4.4s,v"#ac1".s[3]; ldr x10,[x4,#-32]\n\t"\
+ "fmla v27.4s,v4.4s,v"#ac2".s[3]; prfm pldl1keep,[x4,#112]\n\t"\
+ "fmla v15.4s,v5.4s,v"#ac1".s[1]\n\t"\
+ "fmov v6.d[1],x10; ldr d7,[x4,#-24]\n\t"\
+ "fmla v19.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-16]\n\t"\
+ "fmla v23.4s,v5.4s,v"#ac1".s[3]; ldr x11,[x3],#8\n\t"\
+ "fmla v28.4s,v5.4s,v"#ac2".s[3]\n\t"\
+ "fmov v7.d[1],x10; ldr d8,[x4,#-8]\n\t"\
+ "fmla v16.4s,v6.4s,v"#ac1".s[1]\n\t"\
+ "fmla v20.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x4,#160]\n\t"\
+ "fmla v24.4s,v6.4s,v"#ac1".s[3]\n\t"\
+ "ins v8.d[1],v8.d[0]; ldr d4,[x4]\n\t"\
+ "fmla v29.4s,v6.4s,v"#ac2".s[3]; ldr x16,[x4,#8]\n\t"\
+ "fmla v17.4s,v7.4s,v"#ac1".s[1]; add x4,x4,#136\n\t"\
+ "fmla v21.4s,v7.4s,v"#ac2".s[1]\n\t"\
+ "ldr d5,[x4,#-120]\n\t"\
+ "fmla v25.4s,v7.4s,v"#ac1".s[3]; ldr x10,[x4,#-112]\n\t"\
+ "fmla v30.4s,v7.4s,v"#ac2".s[3]; sub w5,w5,#2\n\t"\
+ "fmov v4.d[1],x16\n\t"\
+ "fmla v26.4s,v8.4s,v"#ac1".4s; cmp w5,#6\n\t"\
+ "fmla v31.4s,v8.4s,v"#ac2".4s\n\t"
+
+#define KERNEL_M4N17_TAIL2(ac1, ac2) \
+ "fmov v5.d[1],x10\n\t"\
+ "fmla v14.4s,v4.4s,v"#ac1".s[0]\n\t"\
+ "fmla v18.4s,v4.4s,v"#ac2".s[0]\n\t"\
+ "fmla v22.4s,v4.4s,v"#ac1".s[2]\n\t"\
+ "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-104]\n\t"\
+ "fmla v15.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-96]\n\t"\
+ "fmla v19.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x6]\n\t"\
+ "fmla v27.4s,v4.4s,v"#ac2".s[2]\n\t"\
+ "fmov v6.d[1],x10; ldr d7,[x4,#-88]\n\t"\
+ "fmla v23.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-80]\n\t"\
v28.4s,v5.4s,v"#ac2".s[2]\n\t"\ + "fmla v16.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d4,[x4,#-72]\n\t"\ + "fmla v20.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-64]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[2]\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v17.4s,v7.4s,v"#ac1".s[0]\n\t"\ + "fmla v21.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x7]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "ldr d5,[x4,#-56]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-48]\n\t"\ + "fmla v14.4s,v4.4s,v"#ac1".s[1]\n\t"\ + "fmla v18.4s,v4.4s,v"#ac2".s[1]\n\t"\ + "fmov v5.d[1],x10; ldr d6,[x4,#-40]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[3]; ldr x10,[x4,#-32]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[3]; prfm pldl1keep,[x8]\n\t"\ + "fmla v15.4s,v5.4s,v"#ac1".s[1]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-24]\n\t"\ + "fmla v19.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-16]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-8]\n\t"\ + "fmla v16.4s,v6.4s,v"#ac1".s[1]\n\t"\ + "fmla v20.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x9]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[3]\n\t"\ + "ins v8.d[1],v8.d[0]\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[3]\n\t"\ + "fmla v17.4s,v7.4s,v"#ac1".s[1]\n\t"\ + "fmla v21.4s,v7.4s,v"#ac2".s[1]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[3]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[3]; sub w5,w5,#2\n\t"\ + "fmla v26.4s,v8.4s,v"#ac1".4s\n\t"\ + "fmla v31.4s,v8.4s,v"#ac2".4s\n\t" + +#define KERNEL_M4N17_FIN1 \ + "ldr s0,[x0],#4; ldr q4,[x4]; ldr q5,[x4,#16]\n\t"\ + "ldr q6,[x4,#32]; add x4,x4,#68\n\t"\ + "ldr s1,[x1],#4\n\t"\ + "fmla v14.4s,v4.4s,v0.s[0]\n\t"\ + "fmla v15.4s,v5.4s,v0.s[0]\n\t"\ + "fmla v16.4s,v6.4s,v0.s[0]\n\t"\ + "ldr s2,[x2],#4\n\t"\ + "fmla v18.4s,v4.4s,v1.s[0]\n\t"\ + "fmla v19.4s,v5.4s,v1.s[0]\n\t"\ + "fmla v20.4s,v6.4s,v1.s[0]\n\t"\ + "ldr s3,[x3],#4; ldr d7,[x4,#-20]\n\t"\ + "fmla v22.4s,v4.4s,v2.s[0]; ldr x10,[x4,#-12]\n\t"\ + "fmla v23.4s,v5.4s,v2.s[0]\n\t"\ + "fmla v24.4s,v6.4s,v2.s[0]\n\t"\ + "fmov v7.d[1],x10; ldr s8,[x4,#-4]\n\t"\ + "fmla v27.4s,v4.4s,v3.s[0]\n\t"\ + "fmla v28.4s,v5.4s,v3.s[0]\n\t"\ + "fmla v29.4s,v6.4s,v3.s[0]\n\t"\ + "ins v8.d[1],v8.d[0]\n\t"\ + "fmla v17.4s,v7.4s,v0.s[0]\n\t"\ + "fmla v21.4s,v7.4s,v1.s[0]\n\t"\ + "fmla v25.4s,v7.4s,v2.s[0]\n\t"\ + "ins v0.d[1],v2.d[0]; ins v1.d[1],v3.d[0]\n\t"\ + "fmla v30.4s,v7.4s,v3.s[0]\n\t"\ + "fmla v26.4s,v8.4s,v0.4s\n\t"\ + "fmla v31.4s,v8.4s,v1.4s\n\t" + + +/* m4n18 c_vec */ +/* v12 - v15 v24_comp v25_comp */ +/* v16 - v19 v30_comp v31_comp */ +/* v20 - v23 v24_comp v25_comp */ +/* v26 - v29 v30_comp v31_comp */ + +#define INIT_M4N18 INIT_4V(12, 13, 14, 15)\ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N18(mode) \ + UNIT_SAVE_M4N4_VR_##mode(12, 16, 20, 26) UNIT_SAVE_M4N4_VR_##mode(13, 17, 21, 27)\ + UNIT_SAVE_M4N4_VR_##mode(14, 18, 22, 28) UNIT_SAVE_M4N4_VR_##mode(15, 19, 23, 29)\ + EDGE_SAVE_M4N1K2_##mode(24, 30) EDGE_SAVE_M4N1K2_##mode(25, 31) + +#define KERNEL_M4N18_PRELOAD2 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr x16,[x2],#8; ldr x11,[x3],#8\n\t"\ + "ldr q4,[x4]; ldr d5,[x4,#16]; ldr x10,[x4,#24]\n\t"\ + "add x4,x4,#144; fmov v0.d[1],x16\n\t" + +#define KERNEL_M4N18_MAIN2(ac1, ac2, an1, an2, ap1, ap2) \ + "fmov v5.d[1],x10; ldr d"#an1",[x0],#8\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v16.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap1",#64]\n\t"\ + "fmla v20.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-112]\n\t"\ + 
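/* Cortex-A53 note: this in-order core cannot dual-issue a 128-bit */\
+ /* "ldr q" with NEON fmla, so each 16-byte load of the packed B    */\
+ /* panel is split into "ldr d" (low half) plus "ldr x" and "fmov   */\
+ /* v.d[1],x" (high half), interleaved between fmla ops so a load   */\
+ /* or move can pair with a multiply-accumulate nearly every cycle. */\ + 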
"fmla v13.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-104]\n\t"\ + "fmla v17.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#64]\n\t"\ + "fmla v26.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-96]\n\t"\ + "fmla v21.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-88]\n\t"\ + "fmla v27.4s,v5.4s,v"#ac2".s[2]; ldr x16,[x2],#8\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d4,[x4,#-80]\n\t"\ + "fmla v18.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v22.4s,v6.4s,v"#ac1".s[2]\n\t"\ + "fmla v28.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v4.d[1],x10; ldr d"#an2",[x1],#8\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[0]\n\t"\ + "fmla v19.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap2",#64]\n\t"\ + "fmla v23.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#an1".d[1],x16; ldr d5,[x4,#-64]\n\t"\ + "fmla v29.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-56]\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[1]\n\t"\ + "fmla v16.4s,v4.4s,v"#ac2".s[1]\n\t"\ + "fmov v5.d[1],x10; ldr d6,[x4,#-48]\n\t"\ + "fmla v20.4s,v4.4s,v"#ac1".s[3]; ldr x10,[x4,#-40]\n\t"\ + "fmla v26.4s,v4.4s,v"#ac2".s[3]; prfm pldl1keep,[x4,#112]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[1]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-32]\n\t"\ + "fmla v17.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-24]\n\t"\ + "fmla v21.4s,v5.4s,v"#ac1".s[3]; ldr x11,[x3],#8\n\t"\ + "fmla v27.4s,v5.4s,v"#ac2".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-16]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[1]\n\t"\ + "fmla v18.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x4,#160]\n\t"\ + "fmla v22.4s,v6.4s,v"#ac1".s[3]\n\t"\ + "ins v8.d[1],v8.d[0]; ldr d9,[x4,#-8]\n\t"\ + "fmla v28.4s,v6.4s,v"#ac2".s[3]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[1]; add x4,x4,#144\n\t"\ + "fmla v19.4s,v7.4s,v"#ac2".s[1]\n\t"\ + "ins v9.d[1],v9.d[0]; ldr d4,[x4,#-144]\n\t"\ + "fmla v23.4s,v7.4s,v"#ac1".s[3]; ldr x10,[x4,#-136]\n\t"\ + "fmla v29.4s,v7.4s,v"#ac2".s[3]; sub w5,w5,#2\n\t"\ + "fmla v24.4s,v8.4s,v"#ac1".4s\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-128]\n\t"\ + "fmla v30.4s,v8.4s,v"#ac2".4s; ldr x10,[x4,#-120]\n\t"\ + "fmla v25.4s,v9.4s,v"#ac1".4s; cmp w5,#6\n\t"\ + "fmla v31.4s,v9.4s,v"#ac2".4s\n\t" + +#define KERNEL_M4N18_TAIL2(ac1, ac2) \ + "fmov v5.d[1],x10\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v16.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x6]\n\t"\ + "fmla v20.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-112]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-104]\n\t"\ + "fmla v17.4s,v5.4s,v"#ac2".s[0]\n\t"\ + "fmla v26.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-96]\n\t"\ + "fmla v21.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-88]\n\t"\ + "fmla v27.4s,v5.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d4,[x4,#-80]\n\t"\ + "fmla v18.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v22.4s,v6.4s,v"#ac1".s[2]\n\t"\ + "fmla v28.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[0]\n\t"\ + "fmla v19.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x7]\n\t"\ + "fmla v23.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "ldr d5,[x4,#-64]\n\t"\ + "fmla v29.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-56]\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[1]\n\t"\ + "fmla v16.4s,v4.4s,v"#ac2".s[1]\n\t"\ + "fmov v5.d[1],x10; ldr d6,[x4,#-48]\n\t"\ + "fmla v20.4s,v4.4s,v"#ac1".s[3]; ldr x10,[x4,#-40]\n\t"\ + "fmla v26.4s,v4.4s,v"#ac2".s[3]; prfm pldl1keep,[x8]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[1]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-32]\n\t"\ + "fmla v17.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-24]\n\t"\ + "fmla v21.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmla 
v27.4s,v5.4s,v"#ac2".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-16]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[1]\n\t"\ + "fmla v18.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x9]\n\t"\ + "fmla v22.4s,v6.4s,v"#ac1".s[3]\n\t"\ + "ins v8.d[1],v8.d[0]; ldr d9,[x4,#-8]\n\t"\ + "fmla v28.4s,v6.4s,v"#ac2".s[3]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[1]\n\t"\ + "fmla v19.4s,v7.4s,v"#ac2".s[1]\n\t"\ + "ins v9.d[1],v9.d[0]\n\t"\ + "fmla v23.4s,v7.4s,v"#ac1".s[3]\n\t"\ + "fmla v29.4s,v7.4s,v"#ac2".s[3]; sub w5,w5,#2\n\t"\ + "fmla v24.4s,v8.4s,v"#ac1".4s\n\t"\ + "fmla v30.4s,v8.4s,v"#ac2".4s\n\t"\ + "fmla v25.4s,v9.4s,v"#ac1".4s\n\t"\ + "fmla v31.4s,v9.4s,v"#ac2".4s\n\t" + +#define KERNEL_M4N18_FIN1 \ + "ldr s0,[x0],#4; ldr q4,[x4]; ldr q5,[x4,#16]\n\t"\ + "ldr q6,[x4,#32]; add x4,x4,#72\n\t"\ + "ldr s1,[x1],#4\n\t"\ + "fmla v12.4s,v4.4s,v0.s[0]\n\t"\ + "fmla v13.4s,v5.4s,v0.s[0]\n\t"\ + "fmla v14.4s,v6.4s,v0.s[0]\n\t"\ + "ldr s2,[x2],#4\n\t"\ + "fmla v16.4s,v4.4s,v1.s[0]\n\t"\ + "fmla v17.4s,v5.4s,v1.s[0]\n\t"\ + "fmla v18.4s,v6.4s,v1.s[0]\n\t"\ + "ldr s3,[x3],#4; ldr d7,[x4,#-24]\n\t"\ + "fmla v20.4s,v4.4s,v2.s[0]; ldr x10,[x4,#-16]\n\t"\ + "fmla v21.4s,v5.4s,v2.s[0]\n\t"\ + "fmla v22.4s,v6.4s,v2.s[0]\n\t"\ + "fmov v7.d[1],x10; ldr s8,[x4,#-8]\n\t"\ + "fmla v26.4s,v4.4s,v3.s[0]; ldr w10,[x4,#-4]\n\t"\ + "fmla v27.4s,v5.4s,v3.s[0]\n\t"\ + "fmla v28.4s,v6.4s,v3.s[0]\n\t"\ + "ins v8.d[1],v8.d[0]; dup v9.2d,x10\n\t"\ + "fmla v15.4s,v7.4s,v0.s[0]\n\t"\ + "fmla v19.4s,v7.4s,v1.s[0]\n\t"\ + "fmla v23.4s,v7.4s,v2.s[0]\n\t"\ + "ins v0.d[1],v2.d[0]; ins v1.d[1],v3.d[0]\n\t"\ + "fmla v29.4s,v7.4s,v3.s[0]\n\t"\ + "fmla v24.4s,v8.4s,v0.4s\n\t"\ + "fmla v30.4s,v8.4s,v1.4s\n\t"\ + "fmla v25.4s,v9.4s,v0.4s\n\t"\ + "fmla v31.4s,v9.4s,v1.4s\n\t" + + +/* m4n19 c_vec */ +/* v10 - v13 v22_comp v23_comp v24_comp */ +/* v14 - v17 v29_comp v30_comp v31_comp */ +/* v18 - v21 v22_comp v23_comp v24_comp */ +/* v25 - v28 v29_comp v30_comp v31_comp */ + +#define INIT_M4N19 \ + INIT_2V(10, 11) INIT_4V(12, 13, 14, 15)\ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N19(mode) \ + UNIT_SAVE_M4N4_VR_##mode(10, 14, 18, 25) UNIT_SAVE_M4N4_VR_##mode(11, 15, 19, 26)\ + UNIT_SAVE_M4N4_VR_##mode(12, 16, 20, 27) UNIT_SAVE_M4N4_VR_##mode(13, 17, 21, 28)\ + EDGE_SAVE_M4N1K2_##mode(22, 29) EDGE_SAVE_M4N1K2_##mode(23, 30) EDGE_SAVE_M4N1K2_##mode(24, 31) + +#define KERNEL_M4N19_PRELOAD2 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr x16,[x2],#8; ldr x11,[x3],#8\n\t"\ + "ldr q4,[x4]; ldr d5,[x4,#16]; ldr x10,[x4,#24]\n\t"\ + "add x4,x4,#152; fmov v0.d[1],x16\n\t" + +#define KERNEL_M4N19_MAIN2(ac1, ac2, an1, an2, ap1, ap2) \ + "fmov v5.d[1],x10; ldr d"#an1",[x0],#8\n\t"\ + "fmla v10.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v14.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap1",#64]\n\t"\ + "fmla v18.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-120]\n\t"\ + "fmla v11.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-112]\n\t"\ + "fmla v15.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#64]\n\t"\ + "fmla v25.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-104]\n\t"\ + "fmla v19.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-96]\n\t"\ + "fmla v26.4s,v5.4s,v"#ac2".s[2]; ldr x16,[x2],#8\n\t"\ + "fmla v12.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d4,[x4,#-88]\n\t"\ + "fmla v16.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-80]\n\t"\ + "fmla v20.4s,v6.4s,v"#ac1".s[2]\n\t"\ + "fmla v27.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-72]\n\t"\ + "fmla v13.4s,v7.4s,v"#ac1".s[0]; ldr 
x10,[x4,#-64]\n\t"\ + "fmla v17.4s,v7.4s,v"#ac2".s[0]\n\t"\ + "fmla v21.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "fmov v5.d[1],x10; ldr d"#an2",[x1],#8\n\t"\ + "fmla v28.4s,v7.4s,v"#ac2".s[2]\n\t"\ + "fmla v10.4s,v4.4s,v"#ac1".s[1]; prfm pldl1keep,[x"#ap2",#64]\n\t"\ + "fmla v14.4s,v4.4s,v"#ac2".s[1]\n\t"\ + "fmov v"#an1".d[1],x16; ldr d6,[x4,#-56]\n\t"\ + "fmla v18.4s,v4.4s,v"#ac1".s[3]; ldr x10,[x4,#-48]\n\t"\ + "fmla v25.4s,v4.4s,v"#ac2".s[3]; prfm pldl1keep,[x4,#112]\n\t"\ + "fmla v11.4s,v5.4s,v"#ac1".s[1]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-40]\n\t"\ + "fmla v15.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-32]\n\t"\ + "fmla v19.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmla v26.4s,v5.4s,v"#ac2".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-24]\n\t"\ + "fmla v12.4s,v6.4s,v"#ac1".s[1]\n\t"\ + "fmla v16.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x4,#160]\n\t"\ + "fmla v20.4s,v6.4s,v"#ac1".s[3]\n\t"\ + "ins v8.d[1],v8.d[0]; ldr d9,[x4,#-16]\n\t"\ + "fmla v27.4s,v6.4s,v"#ac2".s[3]\n\t"\ + "fmla v13.4s,v7.4s,v"#ac1".s[1]; ldr x11,[x3],#8\n\t"\ + "fmla v17.4s,v7.4s,v"#ac2".s[1]\n\t"\ + "ins v9.d[1],v9.d[0]; ldr d6,[x4,#-8]\n\t"\ + "fmla v21.4s,v7.4s,v"#ac1".s[3]; add x4,x4,#152\n\t"\ + "fmla v28.4s,v7.4s,v"#ac2".s[3]; sub w5,w5,#2\n\t"\ + "fmla v22.4s,v8.4s,v"#ac1".4s\n\t"\ + "ins v6.d[1],v6.d[0]; ldr d4,[x4,#-152]\n\t"\ + "fmla v29.4s,v8.4s,v"#ac2".4s; ldr x10,[x4,#-144]\n\t"\ + "fmla v23.4s,v9.4s,v"#ac1".4s; cmp w5,#6\n\t"\ + "fmla v30.4s,v9.4s,v"#ac2".4s\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-136]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".4s; ldr x10,[x4,#-128]\n\t"\ + "fmla v31.4s,v6.4s,v"#ac2".4s\n\t" + +#define KERNEL_M4N19_TAIL2(ac1, ac2) \ + "fmov v5.d[1],x10\n\t"\ + "fmla v10.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v14.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x6]\n\t"\ + "fmla v18.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-120]\n\t"\ + "fmla v11.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-112]\n\t"\ + "fmla v15.4s,v5.4s,v"#ac2".s[0]\n\t"\ + "fmla v25.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-104]\n\t"\ + "fmla v19.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-96]\n\t"\ + "fmla v26.4s,v5.4s,v"#ac2".s[2]\n\t"\ + "fmla v12.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d4,[x4,#-88]\n\t"\ + "fmla v16.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-80]\n\t"\ + "fmla v20.4s,v6.4s,v"#ac1".s[2]\n\t"\ + "fmla v27.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-72]\n\t"\ + "fmla v13.4s,v7.4s,v"#ac1".s[0]; ldr x10,[x4,#-64]\n\t"\ + "fmla v17.4s,v7.4s,v"#ac2".s[0]\n\t"\ + "fmla v21.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "fmov v5.d[1],x10\n\t"\ + "fmla v28.4s,v7.4s,v"#ac2".s[2]\n\t"\ + "fmla v10.4s,v4.4s,v"#ac1".s[1]; prfm pldl1keep,[x7]\n\t"\ + "fmla v14.4s,v4.4s,v"#ac2".s[1]\n\t"\ + "ldr d6,[x4,#-56]\n\t"\ + "fmla v18.4s,v4.4s,v"#ac1".s[3]; ldr x10,[x4,#-48]\n\t"\ + "fmla v25.4s,v4.4s,v"#ac2".s[3]; prfm pldl1keep,[x8]\n\t"\ + "fmla v11.4s,v5.4s,v"#ac1".s[1]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-40]\n\t"\ + "fmla v15.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-32]\n\t"\ + "fmla v19.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmla v26.4s,v5.4s,v"#ac2".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-24]\n\t"\ + "fmla v12.4s,v6.4s,v"#ac1".s[1]\n\t"\ + "fmla v16.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x9]\n\t"\ + "fmla v20.4s,v6.4s,v"#ac1".s[3]\n\t"\ + "ins v8.d[1],v8.d[0]; ldr d9,[x4,#-16]\n\t"\ + "fmla v27.4s,v6.4s,v"#ac2".s[3]\n\t"\ + "fmla v13.4s,v7.4s,v"#ac1".s[1]\n\t"\ + "fmla v17.4s,v7.4s,v"#ac2".s[1]\n\t"\ + "ins v9.d[1],v9.d[0]; ldr d6,[x4,#-8]\n\t"\ + "fmla v21.4s,v7.4s,v"#ac1".s[3]\n\t"\ + "fmla v28.4s,v7.4s,v"#ac2".s[3]; 
sub w5,w5,#2\n\t"\ + "fmla v22.4s,v8.4s,v"#ac1".4s\n\t"\ + "ins v6.d[1],v6.d[0]\n\t"\ + "fmla v29.4s,v8.4s,v"#ac2".4s\n\t"\ + "fmla v23.4s,v9.4s,v"#ac1".4s\n\t"\ + "fmla v30.4s,v9.4s,v"#ac2".4s\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".4s\n\t"\ + "fmla v31.4s,v6.4s,v"#ac2".4s\n\t" + +#define KERNEL_M4N19_FIN1 \ + "ldr s0,[x0],#4; ldr q4,[x4]; ldr q5,[x4,#16]\n\t"\ + "ldr q6,[x4,#32]; add x4,x4,#76\n\t"\ + "ldr s1,[x1],#4\n\t"\ + "fmla v10.4s,v4.4s,v0.s[0]\n\t"\ + "fmla v11.4s,v5.4s,v0.s[0]\n\t"\ + "fmla v12.4s,v6.4s,v0.s[0]\n\t"\ + "ldr s2,[x2],#4\n\t"\ + "fmla v14.4s,v4.4s,v1.s[0]\n\t"\ + "fmla v15.4s,v5.4s,v1.s[0]\n\t"\ + "fmla v16.4s,v6.4s,v1.s[0]\n\t"\ + "ldr s3,[x3],#4; ldr d7,[x4,#-28]\n\t"\ + "fmla v18.4s,v4.4s,v2.s[0]; ldr x10,[x4,#-20]\n\t"\ + "fmla v19.4s,v5.4s,v2.s[0]\n\t"\ + "fmla v20.4s,v6.4s,v2.s[0]\n\t"\ + "fmov v7.d[1],x10; ldr s8,[x4,#-12]\n\t"\ + "fmla v25.4s,v4.4s,v3.s[0]; ldr w10,[x4,#-8]\n\t"\ + "fmla v26.4s,v5.4s,v3.s[0]\n\t"\ + "fmla v27.4s,v6.4s,v3.s[0]\n\t"\ + "ins v8.d[1],v8.d[0]; dup v9.2d,x10\n\t"\ + "fmla v13.4s,v7.4s,v0.s[0]; ldr w10,[x4,#-4]\n\t"\ + "fmla v17.4s,v7.4s,v1.s[0]\n\t"\ + "fmla v21.4s,v7.4s,v2.s[0]\n\t"\ + "ins v0.d[1],v2.d[0]; ins v1.d[1],v3.d[0]\n\t"\ + "fmla v28.4s,v7.4s,v3.s[0]\n\t"\ + "fmla v22.4s,v8.4s,v0.4s\n\t"\ + "fmla v29.4s,v8.4s,v1.4s\n\t"\ + "dup v6.2d,x10\n\t"\ + "fmla v23.4s,v9.4s,v0.4s\n\t"\ + "fmla v30.4s,v9.4s,v1.4s\n\t"\ + "fmla v24.4s,v6.4s,v0.4s\n\t"\ + "fmla v31.4s,v6.4s,v1.4s\n\t" + + +/* m4n20 c_vec */ +/* v12 - v16 */ +/* v17 - v21 */ +/* v22 - v26 */ +/* v27 - v31 */ + +#define INIT_M4N20 INIT_4V(12, 13, 14, 15)\ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N20(mode) UNIT_SAVE_M4N4_VR_##mode(12, 17, 22, 27)\ + UNIT_SAVE_M4N4_VR_##mode(13, 18, 23, 28) UNIT_SAVE_M4N4_VR_##mode(14, 19, 24, 29)\ + UNIT_SAVE_M4N4_VR_##mode(15, 20, 25, 30) UNIT_SAVE_M4N4_VR_##mode(16, 21, 26, 31) + +#define KERNEL_M4N20_PRELOAD2 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr x16,[x2],#8; ldr x11,[x3],#8\n\t"\ + "ldr q4,[x4]; ldr d5,[x4,#16]; ldr x10,[x4,#24]\n\t"\ + "add x4,x4,#160; fmov v0.d[1],x16\n\t" + +#define KERNEL_M4N20_MAIN2(ac1, ac2, an1, an2, ap1, ap2) \ + "fmov v5.d[1],x10; ldr d"#an1",[x0],#8\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap1",#64]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-128]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-120]\n\t"\ + "fmla v18.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#72]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-112]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-104]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[2]; ldr x16,[x2],#8\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-96]\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-88]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[2]\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v8.d[1],x10; ldr d4,[x4,#-80]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#120]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-64]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-56]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmov v5.d[1],x10; ldr d"#an2",[x1],#8\n\t"\ + "fmla v26.4s,v8.4s,v"#ac1".s[2]\n\t"\ + "fmla v31.4s,v8.4s,v"#ac2".s[2]; prfm pldl1keep,[x"#ap2",#64]\n\t"\ 
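+ /* pipelining note: v0-v3 double-buffer two k-steps of the four    */\
+ /* A-row streams in x0-x3; this unrolled-by-2 body consumes the    */\
+ /* (ac1,ac2) slice while loading (an1,an2) for the next iteration, */\
+ /* and prfm warms the upcoming A and B cache lines; w5 tracks the  */\
+ /* remaining depth so the driver loops while >= 6 k-steps remain.  */\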
+ "fmla v12.4s,v4.4s,v"#ac1".s[1]\n\t"\ + "fmov v"#an1".d[1],x16; ldr d6,[x4,#-48]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[3]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[3]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-32]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[1]; ldr x10,[x4,#-24]\n\t"\ + "fmla v18.4s,v5.4s,v"#ac2".s[1]; prfm pldl1keep,[x4,#168]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-16]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[3]; ldr x10,[x4,#-8]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[1]; add x4,x4,#160\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[1]\n\t"\ + "fmov v8.d[1],x10\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[3]; sub w5,w5,#2\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[3]; ldr x11,[x3],#8\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[1]\n\t"\ + "ldr d4,[x4,#-160]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[1]; ldr x10,[x4,#-152]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[3]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[3]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[1]; cmp w5,#6\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[1]\n\t"\ + "ldr d5,[x4,#-144]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac1".s[3]; ldr x10,[x4,#-136]\n\t"\ + "fmla v31.4s,v8.4s,v"#ac2".s[3]\n\t" + +#define KERNEL_M4N20_TAIL2(ac1, ac2) \ + "fmov v5.d[1],x10\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x6]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-128]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-120]\n\t"\ + "fmla v18.4s,v5.4s,v"#ac2".s[0]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-112]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-104]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-96]\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-88]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[2]; prfm pldl1keep,[x7]\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v8.d[1],x10; ldr d4,[x4,#-80]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[0]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-64]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-56]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmov v5.d[1],x10\n\t"\ + "fmla v26.4s,v8.4s,v"#ac1".s[2]\n\t"\ + "fmla v31.4s,v8.4s,v"#ac2".s[2]; prfm pldl1keep,[x8]\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[1]\n\t"\ + "ldr d6,[x4,#-48]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[3]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[3]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-32]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[1]; ldr x10,[x4,#-24]\n\t"\ + "fmla v18.4s,v5.4s,v"#ac2".s[1]; prfm pldl1keep,[x9]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-16]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[3]; ldr x10,[x4,#-8]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[1]\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[1]\n\t"\ + "fmov v8.d[1],x10\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[3]; sub w5,w5,#2\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[3]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[1]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[1]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[3]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[3]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[1]\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[1]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac1".s[3]\n\t"\ + "fmla v31.4s,v8.4s,v"#ac2".s[3]\n\t" + +#define 
KERNEL_M4N20_FIN1 \ + "ldr s0,[x0],#4; ldr q4,[x4]; ldr q5,[x4,#16]\n\t"\ + "ldr q6,[x4,#32]; add x4,x4,#80\n\t"\ + "ldr s1,[x1],#4\n\t"\ + "fmla v12.4s,v4.4s,v0.s[0]\n\t"\ + "fmla v13.4s,v5.4s,v0.s[0]\n\t"\ + "fmla v14.4s,v6.4s,v0.s[0]\n\t"\ + "ldr s2,[x2],#4; ldr d7,[x4,#-32]\n\t"\ + "fmla v17.4s,v4.4s,v1.s[0]; ldr x10,[x4,#-24]\n\t"\ + "fmla v18.4s,v5.4s,v1.s[0]\n\t"\ + "fmla v19.4s,v6.4s,v1.s[0]\n\t"\ + "ldr s3,[x3],#4; fmov v7.d[1],x10\n\t"\ + "fmla v22.4s,v4.4s,v2.s[0]\n\t"\ + "fmla v23.4s,v5.4s,v2.s[0]\n\t"\ + "fmla v24.4s,v6.4s,v2.s[0]\n\t"\ + "ldr d8,[x4,#-16]\n\t"\ + "fmla v27.4s,v4.4s,v3.s[0]; ldr x10,[x4,#-8]\n\t"\ + "fmla v28.4s,v5.4s,v3.s[0]\n\t"\ + "fmla v29.4s,v6.4s,v3.s[0]\n\t"\ + "fmov v8.d[1],x10\n\t"\ + "fmla v15.4s,v7.4s,v0.s[0]\n\t"\ + "fmla v20.4s,v7.4s,v1.s[0]\n\t"\ + "fmla v25.4s,v7.4s,v2.s[0]\n\t"\ + "fmla v30.4s,v7.4s,v3.s[0]\n\t"\ + "fmla v16.4s,v8.4s,v0.s[0]\n\t"\ + "fmla v21.4s,v8.4s,v1.s[0]\n\t"\ + "fmla v26.4s,v8.4s,v2.s[0]\n\t"\ + "fmla v31.4s,v8.4s,v3.s[0]\n\t" + + +/* m4n21 c_vec */ +/* v12 - v16 v10_comp */ +/* v17 - v21 v11_comp */ +/* v22 - v26 v10_comp */ +/* v27 - v31 v11_comp */ + +#define INIT_M4N21 \ + INIT_2V(10, 11) INIT_4V(12, 13, 14, 15)\ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N21(mode) UNIT_SAVE_M4N4_VR_##mode(12, 17, 22, 27)\ + UNIT_SAVE_M4N4_VR_##mode(13, 18, 23, 28) UNIT_SAVE_M4N4_VR_##mode(14, 19, 24, 29)\ + UNIT_SAVE_M4N4_VR_##mode(15, 20, 25, 30) UNIT_SAVE_M4N4_VR_##mode(16, 21, 26, 31)\ + EDGE_SAVE_M4N1K2_##mode(10, 11) + +#define KERNEL_M4N21_PRELOAD2 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr x16,[x2],#8; ldr x11,[x3],#8\n\t"\ + "ldr q4,[x4]; ldr d5,[x4,#16]; ldr x10,[x4,#24]\n\t"\ + "add x4,x4,#168; fmov v0.d[1],x16\n\t" + +#define KERNEL_M4N21_MAIN2(ac1, ac2, an1, an2, ap1, ap2) \ + "fmov v5.d[1],x10; ldr d"#an1",[x0],#8\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap1",#64]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-136]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-128]\n\t"\ + "fmla v18.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#64]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-120]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-112]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-104]\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-96]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[2]; ldr x16,[x2],#8\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v8.d[1],x10; ldr d4,[x4,#-88]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[0]; ldr x10,[x4,#-80]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#120]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-72]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-64]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmov v5.d[1],x10; ldr d"#an2",[x1],#8\n\t"\ + "fmla v26.4s,v8.4s,v"#ac1".s[2]\n\t"\ + "fmla v31.4s,v8.4s,v"#ac2".s[2]; prfm pldl1keep,[x"#ap2",#64]\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[1]\n\t"\ + "fmov v"#an1".d[1],x16; ldr d6,[x4,#-56]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[1]; ldr x10,[x4,#-48]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[3]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[3]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-40]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[1]; ldr x10,[x4,#-32]\n\t"\ + "fmla 
v18.4s,v5.4s,v"#ac2".s[1]; prfm pldl1keep,[x4,#176]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-24]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[3]; ldr x10,[x4,#-16]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[1]\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[1]\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#-8]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[3]; add x4,x4,#168\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[3]; ldr x11,[x3],#8\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[1]\n\t"\ + "ins v9.d[1],v9.d[0]; ldr d4,[x4,#-168]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[1]; ldr x10,[x4,#-160]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[3]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[3]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[1]; sub w5,w5,#2\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[1]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac1".s[3]\n\t"\ + "ldr d5,[x4,#-152]\n\t"\ + "fmla v31.4s,v8.4s,v"#ac2".s[3]; ldr x10,[x4,#-144]\n\t"\ + "fmla v10.4s,v9.4s,v"#ac1".4s; cmp w5,#6\n\t"\ + "fmla v11.4s,v9.4s,v"#ac2".4s\n\t" + +#define KERNEL_M4N21_TAIL2(ac1, ac2) \ + "fmov v5.d[1],x10\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[0]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-136]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-128]\n\t"\ + "fmla v18.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x6]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-120]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-112]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-104]\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-96]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[2]\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v8.d[1],x10; ldr d4,[x4,#-88]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[0]; ldr x10,[x4,#-80]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x7]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-72]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-64]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[0]\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmov v5.d[1],x10\n\t"\ + "fmla v26.4s,v8.4s,v"#ac1".s[2]\n\t"\ + "fmla v31.4s,v8.4s,v"#ac2".s[2]; prfm pldl1keep,[x8]\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[1]\n\t"\ + "ldr d6,[x4,#-56]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[1]; ldr x10,[x4,#-48]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[3]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[3]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-40]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[1]; ldr x10,[x4,#-32]\n\t"\ + "fmla v18.4s,v5.4s,v"#ac2".s[1]; prfm pldl1keep,[x9]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d8,[x4,#-24]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[3]; ldr x10,[x4,#-16]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[1]\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[1]\n\t"\ + "fmov v8.d[1],x10; ldr d9,[x4,#-8]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[3]\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[3]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[1]\n\t"\ + "ins v9.d[1],v9.d[0]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[1]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[3]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[3]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[1]; sub w5,w5,#2\n\t"\ + "fmla v21.4s,v8.4s,v"#ac2".s[1]\n\t"\ + "fmla v26.4s,v8.4s,v"#ac1".s[3]\n\t"\ + "fmla v31.4s,v8.4s,v"#ac2".s[3]\n\t"\ + "fmla v10.4s,v9.4s,v"#ac1".4s\n\t"\ + "fmla v11.4s,v9.4s,v"#ac2".4s\n\t" + +#define KERNEL_M4N21_FIN1 \ + "ldr s0,[x0],#4; ldr q4,[x4]; ldr q5,[x4,#16]\n\t"\ + "ldr q6,[x4,#32]; add 
x4,x4,#84\n\t"\ + "ldr s1,[x1],#4\n\t"\ + "fmla v12.4s,v4.4s,v0.s[0]\n\t"\ + "fmla v13.4s,v5.4s,v0.s[0]\n\t"\ + "fmla v14.4s,v6.4s,v0.s[0]\n\t"\ + "ldr s2,[x2],#4; ldr d7,[x4,#-36]\n\t"\ + "fmla v17.4s,v4.4s,v1.s[0]; ldr x10,[x4,#-28]\n\t"\ + "fmla v18.4s,v5.4s,v1.s[0]\n\t"\ + "fmla v19.4s,v6.4s,v1.s[0]\n\t"\ + "ldr s3,[x3],#4; fmov v7.d[1],x10\n\t"\ + "fmla v22.4s,v4.4s,v2.s[0]\n\t"\ + "fmla v23.4s,v5.4s,v2.s[0]\n\t"\ + "fmla v24.4s,v6.4s,v2.s[0]\n\t"\ + "ldr d8,[x4,#-20]\n\t"\ + "fmla v27.4s,v4.4s,v3.s[0]; ldr x10,[x4,#-12]\n\t"\ + "fmla v28.4s,v5.4s,v3.s[0]\n\t"\ + "fmla v29.4s,v6.4s,v3.s[0]\n\t"\ + "fmov v8.d[1],x10; ldr s9,[x4,#-4]\n\t"\ + "fmla v15.4s,v7.4s,v0.s[0]\n\t"\ + "fmla v20.4s,v7.4s,v1.s[0]\n\t"\ + "fmla v25.4s,v7.4s,v2.s[0]\n\t"\ + "ins v9.d[1],v9.d[0]\n\t"\ + "fmla v30.4s,v7.4s,v3.s[0]\n\t"\ + "fmla v16.4s,v8.4s,v0.s[0]\n\t"\ + "fmla v21.4s,v8.4s,v1.s[0]\n\t"\ + "ins v0.d[1],v2.d[0]; ins v1.d[1],v3.d[0]\n\t"\ + "fmla v26.4s,v8.4s,v2.s[0]\n\t"\ + "fmla v31.4s,v8.4s,v3.s[0]\n\t"\ + "fmla v10.4s,v9.4s,v0.4s\n\t"\ + "fmla v11.4s,v9.4s,v1.4s\n\t" + + +/* m4n22 c_vec */ +/* v12 - v16 v10_comp v8_comp */ +/* v17 - v21 v11_comp v9_comp */ +/* v22 - v26 v10_comp v8_comp */ +/* v27 - v31 v11_comp v9_comp */ + +#define INIT_M4N22 \ + INIT_4V(8, 9, 10, 11) INIT_4V(12, 13, 14, 15)\ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N22(mode) UNIT_SAVE_M4N4_VR_##mode(12, 17, 22, 27)\ + UNIT_SAVE_M4N4_VR_##mode(13, 18, 23, 28) UNIT_SAVE_M4N4_VR_##mode(14, 19, 24, 29)\ + UNIT_SAVE_M4N4_VR_##mode(15, 20, 25, 30) UNIT_SAVE_M4N4_VR_##mode(16, 21, 26, 31)\ + EDGE_SAVE_M4N1K2_##mode(10, 11) EDGE_SAVE_M4N1K2_##mode(8, 9) + +#define KERNEL_M4N22_PRELOAD2 \ + "ldr d0,[x0],#8; ldr d1,[x1],#8; ldr x16,[x2],#8; ldr x11,[x3],#8\n\t"\ + "ldr q4,[x4]; ldr d5,[x4,#16]; ldr x10,[x4,#24]\n\t"\ + "add x4,x4,#176; fmov v0.d[1],x16\n\t" + +#define KERNEL_M4N22_MAIN2(ac1, ac2, an1, an2, ap1, ap2) \ + "fmov v5.d[1],x10; ldr d"#an1",[x0],#8\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x"#ap1",#64]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-144]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-136]\n\t"\ + "fmla v18.4s,v5.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#64]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-128]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-120]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d4,[x4,#-112]\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-104]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[2]; ldr x16,[x2],#8\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-96]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[0]; ldr x10,[x4,#-88]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x4,#120]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "fmov v5.d[1],x10; ldr d6,[x4,#-80]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-72]\n\t"\ + "fmla v16.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v21.4s,v4.4s,v"#ac2".s[0]\n\t"\ + "fmov v6.d[1],x10; ldr d"#an2",[x1],#8\n\t"\ + "fmla v26.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmla v31.4s,v4.4s,v"#ac2".s[2]; prfm pldl1keep,[x"#ap2",#64]\n\t"\ + "fmla v12.4s,v5.4s,v"#ac1".s[1]\n\t"\ + "fmov v"#an1".d[1],x16; ldr d7,[x4,#-64]\n\t"\ + "fmla v17.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-56]\n\t"\ + "fmla v22.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmla 
v27.4s,v5.4s,v"#ac2".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d4,[x4,#-48]\n\t"\ + "fmla v13.4s,v6.4s,v"#ac1".s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla v18.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x4,#176]\n\t"\ + "fmla v23.4s,v6.4s,v"#ac1".s[3]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-32]\n\t"\ + "fmla v28.4s,v6.4s,v"#ac2".s[3]; ldr x10,[x4,#-24]\n\t"\ + "fmla v14.4s,v7.4s,v"#ac1".s[1]\n\t"\ + "fmla v19.4s,v7.4s,v"#ac2".s[1]\n\t"\ + "fmov v5.d[1],x10; ldr d6,[x4,#-16]\n\t"\ + "fmla v24.4s,v7.4s,v"#ac1".s[3]\n\t"\ + "fmla v29.4s,v7.4s,v"#ac2".s[3]; ldr x11,[x3],#8\n\t"\ + "fmla v15.4s,v4.4s,v"#ac1".s[1]\n\t"\ + "ins v6.d[1],v6.d[0]; ldr d7,[x4,#-8]\n\t"\ + "fmla v20.4s,v4.4s,v"#ac2".s[1]; add x4,x4,#176\n\t"\ + "fmla v25.4s,v4.4s,v"#ac1".s[3]\n\t"\ + "fmla v30.4s,v4.4s,v"#ac2".s[3]\n\t"\ + "ins v7.d[1],v7.d[0]; ldr d4,[x4,#-176]\n\t"\ + "fmla v16.4s,v5.4s,v"#ac1".s[1]; ldr x10,[x4,#-168]\n\t"\ + "fmla v21.4s,v5.4s,v"#ac2".s[1]; sub w5,w5,#2\n\t"\ + "fmla v26.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v31.4s,v5.4s,v"#ac2".s[3]\n\t"\ + "fmla v10.4s,v6.4s,v"#ac1".4s; cmp w5,#6\n\t"\ + "fmla v11.4s,v6.4s,v"#ac2".4s\n\t"\ + "ldr d5,[x4,#-160]\n\t"\ + "fmla v8.4s,v7.4s,v"#ac1".4s; ldr x10,[x4,#-152]\n\t"\ + "fmla v9.4s,v7.4s,v"#ac2".4s\n\t" + +#define KERNEL_M4N22_TAIL2(ac1, ac2) \ + "fmov v5.d[1],x10\n\t"\ + "fmla v12.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v17.4s,v4.4s,v"#ac2".s[0]; prfm pldl1keep,[x6]\n\t"\ + "fmla v22.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmov v"#ac2".d[1],x11; ldr d6,[x4,#-144]\n\t"\ + "fmla v13.4s,v5.4s,v"#ac1".s[0]; ldr x10,[x4,#-136]\n\t"\ + "fmla v18.4s,v5.4s,v"#ac2".s[0]\n\t"\ + "fmla v27.4s,v4.4s,v"#ac2".s[2]\n\t"\ + "fmov v6.d[1],x10; ldr d7,[x4,#-128]\n\t"\ + "fmla v23.4s,v5.4s,v"#ac1".s[2]; ldr x10,[x4,#-120]\n\t"\ + "fmla v28.4s,v5.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v6.4s,v"#ac1".s[0]\n\t"\ + "fmov v7.d[1],x10; ldr d4,[x4,#-112]\n\t"\ + "fmla v19.4s,v6.4s,v"#ac2".s[0]; ldr x10,[x4,#-104]\n\t"\ + "fmla v24.4s,v6.4s,v"#ac1".s[2]\n\t"\ + "fmla v29.4s,v6.4s,v"#ac2".s[2]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-96]\n\t"\ + "fmla v15.4s,v7.4s,v"#ac1".s[0]; ldr x10,[x4,#-88]\n\t"\ + "fmla v20.4s,v7.4s,v"#ac2".s[0]; prfm pldl1keep,[x7]\n\t"\ + "fmla v25.4s,v7.4s,v"#ac1".s[2]\n\t"\ + "fmov v5.d[1],x10; ldr d6,[x4,#-80]\n\t"\ + "fmla v30.4s,v7.4s,v"#ac2".s[2]; ldr x10,[x4,#-72]\n\t"\ + "fmla v16.4s,v4.4s,v"#ac1".s[0]\n\t"\ + "fmla v21.4s,v4.4s,v"#ac2".s[0]\n\t"\ + "fmov v6.d[1],x10\n\t"\ + "fmla v26.4s,v4.4s,v"#ac1".s[2]\n\t"\ + "fmla v31.4s,v4.4s,v"#ac2".s[2]; prfm pldl1keep,[x8]\n\t"\ + "fmla v12.4s,v5.4s,v"#ac1".s[1]\n\t"\ + "ldr d7,[x4,#-64]\n\t"\ + "fmla v17.4s,v5.4s,v"#ac2".s[1]; ldr x10,[x4,#-56]\n\t"\ + "fmla v22.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmla v27.4s,v5.4s,v"#ac2".s[3]\n\t"\ + "fmov v7.d[1],x10; ldr d4,[x4,#-48]\n\t"\ + "fmla v13.4s,v6.4s,v"#ac1".s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla v18.4s,v6.4s,v"#ac2".s[1]; prfm pldl1keep,[x9]\n\t"\ + "fmla v23.4s,v6.4s,v"#ac1".s[3]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-32]\n\t"\ + "fmla v28.4s,v6.4s,v"#ac2".s[3]; ldr x10,[x4,#-24]\n\t"\ + "fmla v14.4s,v7.4s,v"#ac1".s[1]\n\t"\ + "fmla v19.4s,v7.4s,v"#ac2".s[1]\n\t"\ + "fmov v5.d[1],x10; ldr d6,[x4,#-16]\n\t"\ + "fmla v24.4s,v7.4s,v"#ac1".s[3]\n\t"\ + "fmla v29.4s,v7.4s,v"#ac2".s[3]\n\t"\ + "fmla v15.4s,v4.4s,v"#ac1".s[1]\n\t"\ + "ins v6.d[1],v6.d[0]; ldr d7,[x4,#-8]\n\t"\ + "fmla v20.4s,v4.4s,v"#ac2".s[1]\n\t"\ + "fmla v25.4s,v4.4s,v"#ac1".s[3]\n\t"\ + "fmla v30.4s,v4.4s,v"#ac2".s[3]\n\t"\ + "ins v7.d[1],v7.d[0]\n\t"\ + "fmla v16.4s,v5.4s,v"#ac1".s[1]\n\t"\ + "fmla 
v21.4s,v5.4s,v"#ac2".s[1]; sub w5,w5,#2\n\t"\ + "fmla v26.4s,v5.4s,v"#ac1".s[3]\n\t"\ + "fmla v31.4s,v5.4s,v"#ac2".s[3]\n\t"\ + "fmla v10.4s,v6.4s,v"#ac1".4s\n\t"\ + "fmla v11.4s,v6.4s,v"#ac2".4s\n\t"\ + "fmla v8.4s,v7.4s,v"#ac1".4s\n\t"\ + "fmla v9.4s,v7.4s,v"#ac2".4s\n\t" + +#define KERNEL_M4N22_FIN1 \ + "ldr s0,[x0],#4; ldr q4,[x4]; ldr q5,[x4,#16]\n\t"\ + "ldr q6,[x4,#32]; add x4,x4,#88\n\t"\ + "ldr s1,[x1],#4\n\t"\ + "fmla v12.4s,v4.4s,v0.s[0]\n\t"\ + "fmla v13.4s,v5.4s,v0.s[0]\n\t"\ + "fmla v14.4s,v6.4s,v0.s[0]\n\t"\ + "ldr s2,[x2],#4; ldr d7,[x4,#-40]\n\t"\ + "fmla v17.4s,v4.4s,v1.s[0]; ldr x10,[x4,#-32]\n\t"\ + "fmla v18.4s,v5.4s,v1.s[0]\n\t"\ + "fmla v19.4s,v6.4s,v1.s[0]\n\t"\ + "ldr s3,[x3],#4; fmov v7.d[1],x10\n\t"\ + "fmla v22.4s,v4.4s,v2.s[0]\n\t"\ + "fmla v23.4s,v5.4s,v2.s[0]\n\t"\ + "fmla v27.4s,v4.4s,v3.s[0]\n\t"\ + "ldr d4,[x4,#-24]\n\t"\ + "fmla v24.4s,v6.4s,v2.s[0]; ldr x10,[x4,#-16]\n\t"\ + "fmla v28.4s,v5.4s,v3.s[0]\n\t"\ + "fmla v29.4s,v6.4s,v3.s[0]\n\t"\ + "fmov v4.d[1],x10; ldr s5,[x4,#-8]\n\t"\ + "fmla v15.4s,v7.4s,v0.s[0]; ldr w11,[x4,#-4]\n\t"\ + "fmla v20.4s,v7.4s,v1.s[0]\n\t"\ + "fmla v25.4s,v7.4s,v2.s[0]\n\t"\ + "ins v5.d[1],v5.d[0]; dup v6.2d,x11\n\t"\ + "fmla v30.4s,v7.4s,v3.s[0]\n\t"\ + "fmla v16.4s,v4.4s,v0.s[0]\n\t"\ + "fmla v21.4s,v4.4s,v1.s[0]\n\t"\ + "ins v0.d[1],v2.d[0]; ins v1.d[1],v3.d[0]\n\t"\ + "fmla v26.4s,v4.4s,v2.s[0]\n\t"\ + "fmla v31.4s,v4.4s,v3.s[0]\n\t"\ + "fmla v10.4s,v5.4s,v0.4s\n\t"\ + "fmla v11.4s,v5.4s,v1.4s\n\t"\ + "fmla v8.4s,v6.4s,v0.4s\n\t"\ + "fmla v9.4s,v6.4s,v1.4s\n\t" + + +#define FUNC_K2(ndim) \ +static inline void sgemm_skinny1_a53_m4n##ndim(\ + const float * __restrict__ a_ptr, const float * __restrict__ b_scr,\ + float * __restrict__ c_ptr, uint32_t K, uint32_t LDA, uint32_t LDC,\ + uint8_t c_rowmajor, const float * __restrict__ beta_addr) {\ + __asm__ __volatile__ (\ + "mov x0,%[a_ptr]; add x1,%[a_ptr],%w[LDA],UXTW #2\n\t"\ + "add x2,%[a_ptr],%w[LDA],UXTW #3; add x3,x1,%w[LDA],UXTW #3\n\t"\ + "add x6,x0,%w[LDA],UXTW #4; add x7,x1,%w[LDA],UXTW #4\n\t"\ + "add x8,x2,%w[LDA],UXTW #4; add x9,x3,%w[LDA],UXTW #4\n\t"\ + "mov x4,%[b_scr]; mov w5,%w[K]\n\t"\ + INIT_M4N##ndim\ + "cmp w5,#2; b.lt 4f\n\t"\ + KERNEL_M4N##ndim##_PRELOAD2\ + "cmp w5,#6; b.lt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + KERNEL_M4N##ndim##_MAIN2(0, 1, 2, 3, 0, 1)\ + KERNEL_M4N##ndim##_MAIN2(2, 3, 0, 1, 2, 3)\ + "b.ge 1b; 2:\n\t"\ + "cmp w5,#4; b.lt 3f\n\t"\ + KERNEL_M4N##ndim##_MAIN2(0, 1, 2, 3, 0, 1)\ + KERNEL_M4N##ndim##_TAIL2(2, 3)\ + "b 4f; 3:\n\t"\ + KERNEL_M4N##ndim##_TAIL2(0, 1)\ + "4:\n\t"\ + "cmp w5,#1; b.lt 6f\n\t"\ + "5:\n\t"\ + KERNEL_M4N##ndim##_FIN1\ + "6:\n\t"\ + INIT_SAVE\ + "cmp %w[c_rowmajor],#0; b.eq 7f\n\t"\ + SAVE_M4N##ndim(CR) "b 8f\n\t"\ + "7:\n\t"\ + SAVE_M4N##ndim(CC)\ + "8:\n\t"\ + ::[a_ptr]"r"(a_ptr), [c_ptr]"r"(c_ptr), [b_scr]"r"(b_scr),\ + [K]"r"(K), [LDA]"r"(LDA), [LDC]"r"(LDC),\ + [beta_addr]"r"(beta_addr), [c_rowmajor]"r"(c_rowmajor)\ + :"cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x9",\ + "x10","x11","x12","x13","x14","x15","x16",\ + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13",\ + "v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25",\ + "v26","v27","v28","v29","v30","v31");\ +} + +FUNC_K2(15) +FUNC_K2(16) +FUNC_K2(17) +FUNC_K2(18) +FUNC_K2(19) +FUNC_K2(20) +FUNC_K2(21) +FUNC_K2(22) + +#define INIT_M4N23 INIT_4V(6, 7, 8, 9) \ + INIT_4V(10, 11, 12, 13) INIT_4V(14, 15, 16, 17)\ + INIT_4V(18, 19, 20, 21) INIT_4V(22, 23, 24, 25)\ + INIT_2V(26, 27) INIT_1V(28) + +#define 
SAVE_M4N23(mode) \ + UNIT_SAVE_M4N4_VC_##mode(6, 7, 8, 9) UNIT_SAVE_M4N4_VC_##mode(10, 11, 12, 13)\ + UNIT_SAVE_M4N4_VC_##mode(14, 15, 16, 17) UNIT_SAVE_M4N4_VC_##mode(18, 19, 20, 21)\ + UNIT_SAVE_M4N4_VC_##mode(22, 23, 24, 25) EDGE_SAVE_M4N1K1_##mode(26)\ + EDGE_SAVE_M4N1K1_##mode(27) EDGE_SAVE_M4N1K1_##mode(28) + +#define KERNEL_M4N23_PRELOAD2 \ + "ldr x16,[x0],#8; ldr x17,[x1],#8; ldr x19,[x2],#8; ldr x20,[x3],#8\n\t"\ + "ldr q2,[x4]; ldr q3,[x4,#16]; ldr x10,[x4,#24]; add x4,x4,#184\n\t"\ + "mov w11,w16; bfi x11,x17,#32,#32; fmov d0,x11\n\t"\ + "mov w11,w19; bfi x11,x20,#32,#32; fmov v0.d[1],x11\n\t" + +#define KERNEL_M4N23_MAIN2(ap1, ap2) \ + "fmov v3.d[1],x10; ldr d4,[x4,#-152]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-144]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]; bfxil x17,x16,#32,#32\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d1,x17\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]; ldr x16,[x0],#8\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]; bfxil x20,x19,#32,#32\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "fmov v1.d[1],x20; ldr d2,[x4,#-136]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-128]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x4,#48]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-120]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-112]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]; ldr x17,[x1],#8\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-104]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-96]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]; mov w11,w16\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]; bfi x11,x17,#32,#32\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]; ldr x19,[x2],#8\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-88]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-80]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x"#ap1",#64]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-72]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-64]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v6.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-56]\n\t"\ + "fmla v7.4s,v1.4s,v2.s[0]; ldr x10,[x4,#-48]\n\t"\ + "fmla v8.4s,v1.4s,v2.s[1]; prfm pldl1keep,[x4,#112]\n\t"\ + "fmla v9.4s,v1.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d0,x11\n\t"\ + "fmla v10.4s,v1.4s,v2.s[3]; mov w11,w19\n\t"\ + "fmla v11.4s,v1.4s,v3.s[0]; ldr x20,[x3],#8\n\t"\ + "fmla v12.4s,v1.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-40]\n\t"\ + "fmla v13.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-32]\n\t"\ + "fmla v14.4s,v1.4s,v3.s[3]; prfm pldl1keep,[x"#ap2",#64]\n\t"\ + "fmla v15.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-24]\n\t"\ + "fmla v16.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-16]\n\t"\ + "fmla v17.4s,v1.4s,v4.s[2]; bfi x11,x20,#32,#32\n\t"\ + "fmla v18.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; fmov v0.d[1],x11\n\t"\ + "fmla v19.4s,v1.4s,v2.s[0]; sub w5,w5,#2\n\t"\ + "fmla v20.4s,v1.4s,v2.s[1]; cmp w5,#6\n\t"\ + "fmla v21.4s,v1.4s,v2.s[2]\n\t"\ + "ldr d4,[x4,#-8]\n\t"\ + "fmla v22.4s,v1.4s,v2.s[3]; prfm pldl1keep,[x4,#176]\n\t"\ + "fmla v23.4s,v1.4s,v3.s[0]; add x4,x4,#184\n\t"\ + "fmla v24.4s,v1.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-184]\n\t"\ + "fmla v25.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-176]\n\t"\ + "fmla v26.4s,v1.4s,v3.s[3]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-168]\n\t"\ + "fmla v27.4s,v1.4s,v4.s[0]; ldr x10,[x4,#-160]\n\t"\ + "fmla v28.4s,v1.4s,v4.s[1]\n\t" + +#define KERNEL_M4N23_TAIL2 \ + "fmov v3.d[1],x10; ldr d4,[x4,#-152]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr 
x10,[x4,#-144]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]; bfxil x17,x16,#32,#32\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d1,x17\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]; bfxil x20,x19,#32,#32\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "fmov v1.d[1],x20; ldr d2,[x4,#-136]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-128]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-120]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-112]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-104]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-96]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-88]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-80]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x6]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-72]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-64]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v6.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10\n\t"\ + "fmla v7.4s,v1.4s,v2.s[0]\n\t"\ + "fmla v8.4s,v1.4s,v2.s[1]\n\t"\ + "fmla v9.4s,v1.4s,v2.s[2]\n\t"\ + "ldr d4,[x4,#-56]\n\t"\ + "fmla v10.4s,v1.4s,v2.s[3]; ldr x10,[x4,#-48]\n\t"\ + "fmla v11.4s,v1.4s,v3.s[0]; prfm pldl1keep,[x7]\n\t"\ + "fmla v12.4s,v1.4s,v3.s[1]\n\t"\ + "fmov v4.d[1],x10; ldr d2,[x4,#-40]\n\t"\ + "fmla v13.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-32]\n\t"\ + "fmla v14.4s,v1.4s,v3.s[3]; prfm pldl1keep,[x8]\n\t"\ + "fmla v15.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-24]\n\t"\ + "fmla v16.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-16]\n\t"\ + "fmla v17.4s,v1.4s,v4.s[2]; prfm pldl1keep,[x9]\n\t"\ + "fmla v18.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10\n\t"\ + "fmla v19.4s,v1.4s,v2.s[0]; sub w5,w5,#2\n\t"\ + "fmla v20.4s,v1.4s,v2.s[1]\n\t"\ + "fmla v21.4s,v1.4s,v2.s[2]\n\t"\ + "ldr d4,[x4,#-8]\n\t"\ + "fmla v22.4s,v1.4s,v2.s[3]\n\t"\ + "fmla v23.4s,v1.4s,v3.s[0]\n\t"\ + "fmla v24.4s,v1.4s,v3.s[1]\n\t"\ + "fmla v25.4s,v1.4s,v3.s[2]\n\t"\ + "fmla v26.4s,v1.4s,v3.s[3]\n\t"\ + "fmla v27.4s,v1.4s,v4.s[0]\n\t"\ + "fmla v28.4s,v1.4s,v4.s[1]\n\t" + +#define KERNEL_M4N23_FIN1 \ + "ldr w16,[x0],#4; ldr q2,[x4]\n\t"\ + "ldr w17,[x1],#4; ldr d3,[x4,#16]\n\t"\ + "ldr w19,[x2],#4; ldr x10,[x4,#24]\n\t"\ + "ldr w20,[x3],#4; orr x16,x16,x17,LSL #32\n\t"\ + "fmov d0,x16; orr x19,x19,x20,LSL #32; fmov v0.d[1],x19\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#32]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#40]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#48]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#56]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#64]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#72]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#80]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr w10,[x4,#88]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]; add x4,x4,#92\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "fmla 
v24.4s,v0.4s,v3.s[2]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t" + + +#define INIT_M4N24 INIT_4V(6, 7, 8, 9) \ + INIT_4V(10, 11, 12, 13) INIT_4V(14, 15, 16, 17)\ + INIT_4V(18, 19, 20, 21) INIT_4V(22, 23, 24, 25)\ + INIT_4V(26, 27, 28, 29) + +#define SAVE_M4N24(mode) \ + UNIT_SAVE_M4N4_VC_##mode(6, 7, 8, 9) UNIT_SAVE_M4N4_VC_##mode(10, 11, 12, 13)\ + UNIT_SAVE_M4N4_VC_##mode(14, 15, 16, 17) UNIT_SAVE_M4N4_VC_##mode(18, 19, 20, 21)\ + UNIT_SAVE_M4N4_VC_##mode(22, 23, 24, 25) UNIT_SAVE_M4N4_VC_##mode(26, 27, 28, 29) + +#define KERNEL_M4N24_PRELOAD2 \ + "ldr x16,[x0],#8; ldr x17,[x1],#8; ldr x19,[x2],#8; ldr x20,[x3],#8\n\t"\ + "ldr q2,[x4]; ldr q3,[x4,#16]; ldr x10,[x4,#24]; add x4,x4,#192\n\t"\ + "mov w11,w16; bfi x11,x17,#32,#32; fmov d0,x11\n\t"\ + "mov w11,w19; bfi x11,x20,#32,#32; fmov v0.d[1],x11\n\t" + +#define KERNEL_M4N24_MAIN2(ap1, ap2) \ + "fmov v3.d[1],x10; ldr d4,[x4,#-160]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-152]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]; bfxil x17,x16,#32,#32\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d1,x17\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]; ldr x16,[x0],#8\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]; bfxil x20,x19,#32,#32\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "fmov v1.d[1],x20; ldr d2,[x4,#-144]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-136]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x4,#48]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-128]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-120]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]; ldr x17,[x1],#8\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-112]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-104]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]; mov w11,w16\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]; bfi x11,x17,#32,#32\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]; ldr x19,[x2],#8\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-96]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-88]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x"#ap1",#64]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-80]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-72]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v29.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-64]\n\t"\ + "fmla v6.4s,v1.4s,v2.s[0]; ldr x10,[x4,#-56]\n\t"\ + "fmla v7.4s,v1.4s,v2.s[1]; prfm pldl1keep,[x4,#112]\n\t"\ + "fmla v8.4s,v1.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d0,x11\n\t"\ + "fmla v9.4s,v1.4s,v2.s[3]; mov w11,w19\n\t"\ + "fmla v10.4s,v1.4s,v3.s[0]; ldr x20,[x3],#8\n\t"\ + "fmla v11.4s,v1.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-48]\n\t"\ + "fmla v12.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-40]\n\t"\ + "fmla v13.4s,v1.4s,v3.s[3]; prfm pldl1keep,[x"#ap2",#64]\n\t"\ + "fmla v14.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-32]\n\t"\ + "fmla v15.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-24]\n\t"\ + "fmla v16.4s,v1.4s,v4.s[2]; bfi x11,x20,#32,#32\n\t"\ + "fmla v17.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; fmov v0.d[1],x11\n\t"\ + "fmla v18.4s,v1.4s,v2.s[0]; sub w5,w5,#2\n\t"\ + "fmla v19.4s,v1.4s,v2.s[1]; cmp w5,#6\n\t"\ + "fmla v20.4s,v1.4s,v2.s[2]\n\t"\ + "ldr d4,[x4,#-16]\n\t"\ + "fmla v21.4s,v1.4s,v2.s[3]; ldr x10,[x4,#-8]\n\t"\ + "fmla v22.4s,v1.4s,v3.s[0]; prfm pldl1keep,[x4,#176]\n\t"\ + "fmla v23.4s,v1.4s,v3.s[1]\n\t"\ + "fmov v4.d[1],x10; ldr d2,[x4]\n\t"\ + "fmla 
v24.4s,v1.4s,v3.s[2]; ldr x10,[x4,#8]\n\t"\ + "fmla v25.4s,v1.4s,v3.s[3]; add x4,x4,#192\n\t"\ + "fmla v26.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-176]\n\t"\ + "fmla v27.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-168]\n\t"\ + "fmla v28.4s,v1.4s,v4.s[2]\n\t"\ + "fmla v29.4s,v1.4s,v4.s[3]\n\t" + +#define KERNEL_M4N24_TAIL2 \ + "fmov v3.d[1],x10; ldr d4,[x4,#-160]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-152]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]; bfxil x17,x16,#32,#32\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d1,x17\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]; bfxil x20,x19,#32,#32\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "fmov v1.d[1],x20; ldr d2,[x4,#-144]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-136]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-128]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-120]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-112]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-104]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-96]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-88]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x6]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-80]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-72]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v29.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-64]\n\t"\ + "fmla v6.4s,v1.4s,v2.s[0]; ldr x10,[x4,#-56]\n\t"\ + "fmla v7.4s,v1.4s,v2.s[1]; prfm pldl1keep,[x7]\n\t"\ + "fmla v8.4s,v1.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v9.4s,v1.4s,v2.s[3]\n\t"\ + "fmla v10.4s,v1.4s,v3.s[0]\n\t"\ + "fmla v11.4s,v1.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-48]\n\t"\ + "fmla v12.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-40]\n\t"\ + "fmla v13.4s,v1.4s,v3.s[3]; prfm pldl1keep,[x8]\n\t"\ + "fmla v14.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-32]\n\t"\ + "fmla v15.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-24]\n\t"\ + "fmla v16.4s,v1.4s,v4.s[2]\n\t"\ + "fmla v17.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10\n\t"\ + "fmla v18.4s,v1.4s,v2.s[0]; sub w5,w5,#2\n\t"\ + "fmla v19.4s,v1.4s,v2.s[1]\n\t"\ + "fmla v20.4s,v1.4s,v2.s[2]\n\t"\ + "ldr d4,[x4,#-16]\n\t"\ + "fmla v21.4s,v1.4s,v2.s[3]; ldr x10,[x4,#-8]\n\t"\ + "fmla v22.4s,v1.4s,v3.s[0]; prfm pldl1keep,[x9]\n\t"\ + "fmla v23.4s,v1.4s,v3.s[1]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v24.4s,v1.4s,v3.s[2]\n\t"\ + "fmla v25.4s,v1.4s,v3.s[3]\n\t"\ + "fmla v26.4s,v1.4s,v4.s[0]\n\t"\ + "fmla v27.4s,v1.4s,v4.s[1]\n\t"\ + "fmla v28.4s,v1.4s,v4.s[2]\n\t"\ + "fmla v29.4s,v1.4s,v4.s[3]\n\t" + +#define KERNEL_M4N24_FIN1 \ + "ldr w16,[x0],#4; ldr q2,[x4]\n\t"\ + "ldr w17,[x1],#4; ldr d3,[x4,#16]\n\t"\ + "ldr w19,[x2],#4; ldr x10,[x4,#24]\n\t"\ + "ldr w20,[x3],#4; orr x16,x16,x17,LSL #32\n\t"\ + "fmov d0,x16; orr x19,x19,x20,LSL #32; fmov v0.d[1],x19\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#32]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#40]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#48]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#56]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]\n\t"\ + "fmla 
v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#64]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#72]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#80]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#88]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]; add x4,x4,#96\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v29.4s,v0.4s,v4.s[3]\n\t" + + +#define INIT_M4N25 INIT_4V(6, 7, 8, 9) \ + INIT_4V(10, 11, 12, 13) INIT_4V(14, 15, 16, 17)\ + INIT_4V(18, 19, 20, 21) INIT_4V(22, 23, 24, 25)\ + INIT_4V(26, 27, 28, 29) INIT_1V(30) + +#define SAVE_M4N25(mode) \ + UNIT_SAVE_M4N4_VC_##mode(6, 7, 8, 9) UNIT_SAVE_M4N4_VC_##mode(10, 11, 12, 13)\ + UNIT_SAVE_M4N4_VC_##mode(14, 15, 16, 17) UNIT_SAVE_M4N4_VC_##mode(18, 19, 20, 21)\ + UNIT_SAVE_M4N4_VC_##mode(22, 23, 24, 25) UNIT_SAVE_M4N4_VC_##mode(26, 27, 28, 29)\ + EDGE_SAVE_M4N1K1_##mode(30) + +#define KERNEL_M4N25_PRELOAD2 \ + "ldr x16,[x0],#8; ldr x17,[x1],#8; ldr x19,[x2],#8; ldr x20,[x3],#8\n\t"\ + "ldr q2,[x4]; ldr q3,[x4,#16]; ldr x10,[x4,#24]; add x4,x4,#200\n\t"\ + "mov w11,w16; bfi x11,x17,#32,#32; fmov d0,x11\n\t"\ + "mov w11,w19; bfi x11,x20,#32,#32; fmov v0.d[1],x11\n\t" + +#define KERNEL_M4N25_MAIN2(ap1, ap2) \ + "fmov v3.d[1],x10; ldr d4,[x4,#-168]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-160]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]; bfxil x17,x16,#32,#32\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d1,x17\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]; ldr x16,[x0],#8\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]; bfxil x20,x19,#32,#32\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "fmov v1.d[1],x20; ldr d2,[x4,#-152]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-144]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x4,#48]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-136]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-128]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]; ldr x17,[x1],#8\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-120]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-112]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]; mov w11,w16\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]; bfi x11,x17,#32,#32\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-104]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-96]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x"#ap1",#64]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-88]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-80]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]; ldr x19,[x2],#8\n\t"\ + "fmla v29.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-72]\n\t"\ + "fmla v30.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-64]\n\t"\ + "fmla v6.4s,v1.4s,v2.s[1]; prfm pldl1keep,[x4,#96]\n\t"\ + "fmla v7.4s,v1.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d0,x11\n\t"\ + "fmla v8.4s,v1.4s,v2.s[3]; mov w11,w19\n\t"\ + "fmla v9.4s,v1.4s,v3.s[0]\n\t"\ + "fmla v10.4s,v1.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-56]\n\t"\ + "fmla v11.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-48]\n\t"\ + "fmla v12.4s,v1.4s,v3.s[3]; ldr x20,[x3],#8\n\t"\ + "fmla v13.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr 
d3,[x4,#-40]\n\t"\ + "fmla v14.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-32]\n\t"\ + "fmla v15.4s,v1.4s,v4.s[2]; sub w5,w5,#2\n\t"\ + "fmla v16.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10\n\t"\ + "fmla v17.4s,v1.4s,v2.s[0]; bfi x11,x20,#32,#32\n\t"\ + "fmla v18.4s,v1.4s,v2.s[1]; cmp w5,#6\n\t"\ + "fmla v19.4s,v1.4s,v2.s[2]\n\t"\ + "ldr d4,[x4,#-24]; fmov v0.d[1],x11\n\t"\ + "fmla v20.4s,v1.4s,v2.s[3]; ldr x10,[x4,#-16]\n\t"\ + "fmla v21.4s,v1.4s,v3.s[0]; prfm pldl1keep,[x4,#144]\n\t"\ + "fmla v22.4s,v1.4s,v3.s[1]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-8]\n\t"\ + "fmla v23.4s,v1.4s,v3.s[2]; prfm pldl1keep,[x"#ap2",#64]\n\t"\ + "fmla v24.4s,v1.4s,v3.s[3]; add x4,x4,#200\n\t"\ + "fmla v25.4s,v1.4s,v4.s[0]\n\t"\ + "ldr d2,[x4,#-200]\n\t"\ + "fmla v26.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-192]\n\t"\ + "fmla v27.4s,v1.4s,v4.s[2]; prfm pldl1keep,[x4]\n\t"\ + "fmla v28.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-184]\n\t"\ + "fmla v29.4s,v1.4s,v5.s[0]; ldr x10,[x4,#-176]\n\t"\ + "fmla v30.4s,v1.4s,v5.s[1]\n\t" + +#define KERNEL_M4N25_TAIL2 \ + "fmov v3.d[1],x10; ldr d4,[x4,#-168]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-160]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]; bfxil x17,x16,#32,#32\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d1,x17\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]; bfxil x20,x19,#32,#32\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "fmov v1.d[1],x20; ldr d2,[x4,#-152]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-144]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x6]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-136]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-128]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-120]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-112]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-104]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-96]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x7]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-88]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-80]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v29.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-72]\n\t"\ + "fmla v30.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-64]\n\t"\ + "fmla v6.4s,v1.4s,v2.s[1]; prfm pldl1keep,[x8]\n\t"\ + "fmla v7.4s,v1.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v8.4s,v1.4s,v2.s[3]\n\t"\ + "fmla v9.4s,v1.4s,v3.s[0]\n\t"\ + "fmla v10.4s,v1.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-56]\n\t"\ + "fmla v11.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-48]\n\t"\ + "fmla v12.4s,v1.4s,v3.s[3]\n\t"\ + "fmla v13.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-40]\n\t"\ + "fmla v14.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-32]\n\t"\ + "fmla v15.4s,v1.4s,v4.s[2]; sub w5,w5,#2\n\t"\ + "fmla v16.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10\n\t"\ + "fmla v17.4s,v1.4s,v2.s[0]\n\t"\ + "fmla v18.4s,v1.4s,v2.s[1]\n\t"\ + "fmla v19.4s,v1.4s,v2.s[2]\n\t"\ + "ldr d4,[x4,#-24]\n\t"\ + "fmla v20.4s,v1.4s,v2.s[3]; ldr x10,[x4,#-16]\n\t"\ + "fmla v21.4s,v1.4s,v3.s[0]\n\t"\ + "fmla v22.4s,v1.4s,v3.s[1]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-8]\n\t"\ + "fmla v23.4s,v1.4s,v3.s[2]; prfm pldl1keep,[x9]\n\t"\ + "fmla v24.4s,v1.4s,v3.s[3]\n\t"\ + "fmla v25.4s,v1.4s,v4.s[0]\n\t"\ + "fmla v26.4s,v1.4s,v4.s[1]\n\t"\ + "fmla 
v27.4s,v1.4s,v4.s[2]\n\t"\ + "fmla v28.4s,v1.4s,v4.s[3]\n\t"\ + "fmla v29.4s,v1.4s,v5.s[0]\n\t"\ + "fmla v30.4s,v1.4s,v5.s[1]\n\t" + +#define KERNEL_M4N25_FIN1 \ + "ldr w16,[x0],#4; ldr q2,[x4]\n\t"\ + "ldr w17,[x1],#4; ldr d3,[x4,#16]\n\t"\ + "ldr w19,[x2],#4; ldr x10,[x4,#24]\n\t"\ + "ldr w20,[x3],#4; orr x16,x16,x17,LSL #32\n\t"\ + "fmov d0,x16; orr x19,x19,x20,LSL #32; fmov v0.d[1],x19\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#32]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#40]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#48]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#56]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#64]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#72]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#80]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#88]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]; add x4,x4,#100\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr s2,[x4,#-4]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v29.4s,v0.4s,v4.s[3]\n\t"\ + "fmla v30.4s,v0.4s,v2.s[0]\n\t" + + +#define INIT_M4N26 INIT_4V(6, 7, 8, 9) \ + INIT_4V(10, 11, 12, 13) INIT_4V(14, 15, 16, 17)\ + INIT_4V(18, 19, 20, 21) INIT_4V(22, 23, 24, 25)\ + INIT_4V(26, 27, 28, 29) INIT_2V(30, 31) + +#define SAVE_M4N26(mode) \ + UNIT_SAVE_M4N4_VC_##mode(6, 7, 8, 9) UNIT_SAVE_M4N4_VC_##mode(10, 11, 12, 13)\ + UNIT_SAVE_M4N4_VC_##mode(14, 15, 16, 17) UNIT_SAVE_M4N4_VC_##mode(18, 19, 20, 21)\ + UNIT_SAVE_M4N4_VC_##mode(22, 23, 24, 25) UNIT_SAVE_M4N4_VC_##mode(26, 27, 28, 29)\ + EDGE_SAVE_M4N1K1_##mode(30) EDGE_SAVE_M4N1K1_##mode(31) + +#define KERNEL_M4N26_PRELOAD2 \ + "ldr x16,[x0],#8; ldr x17,[x1],#8; ldr x19,[x2],#8; ldr x20,[x3],#8\n\t"\ + "ldr q2,[x4]; ldr q3,[x4,#16]; ldr x10,[x4,#24]; add x4,x4,#208\n\t"\ + "mov w11,w16; bfi x11,x17,#32,#32; fmov d0,x11\n\t"\ + "mov w11,w19; bfi x11,x20,#32,#32; fmov v0.d[1],x11\n\t" + +#define KERNEL_M4N26_MAIN2(ap1, ap2) \ + "fmov v3.d[1],x10; ldr d4,[x4,#-176]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-168]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]; bfxil x17,x16,#32,#32\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d1,x17\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]; ldr x16,[x0],#8\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]; bfxil x20,x19,#32,#32\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "fmov v1.d[1],x20; ldr d2,[x4,#-160]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-152]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x4,#48]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-144]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-136]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]; ldr x17,[x1],#8\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-128]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-120]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]; mov w11,w16\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]; bfi x11,x17,#32,#32\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr 
d2,[x4,#-112]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-104]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x"#ap1",#64]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-96]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-88]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]; ldr x19,[x2],#8\n\t"\ + "fmla v29.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-80]\n\t"\ + "fmla v30.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v31.4s,v0.4s,v2.s[1]; prfm pldl1keep,[x4,#96]\n\t"\ + "fmla v6.4s,v1.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d0,x11\n\t"\ + "fmla v7.4s,v1.4s,v2.s[3]; mov w11,w19\n\t"\ + "fmla v8.4s,v1.4s,v3.s[0]; prfm pldl1keep,[x"#ap2",#64]\n\t"\ + "fmla v9.4s,v1.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-64]\n\t"\ + "fmla v10.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-56]\n\t"\ + "fmla v11.4s,v1.4s,v3.s[3]; ldr x20,[x3],#8\n\t"\ + "fmla v12.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-48]\n\t"\ + "fmla v13.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla v14.4s,v1.4s,v4.s[2]; sub w5,w5,#2\n\t"\ + "fmla v15.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10\n\t"\ + "fmla v16.4s,v1.4s,v2.s[0]; bfi x11,x20,#32,#32\n\t"\ + "fmla v17.4s,v1.4s,v2.s[1]; cmp w5,#6\n\t"\ + "fmla v18.4s,v1.4s,v2.s[2]\n\t"\ + "ldr d4,[x4,#-32]; fmov v0.d[1],x11\n\t"\ + "fmla v19.4s,v1.4s,v2.s[3]; ldr x10,[x4,#-24]\n\t"\ + "fmla v20.4s,v1.4s,v3.s[0]; prfm pldl1keep,[x4,#144]\n\t"\ + "fmla v21.4s,v1.4s,v3.s[1]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-16]\n\t"\ + "fmla v22.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-8]\n\t"\ + "fmla v23.4s,v1.4s,v3.s[3]; add x4,x4,#208\n\t"\ + "fmla v24.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v5.d[1],x10; ldr d2,[x4,#-208]\n\t"\ + "fmla v25.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-200]\n\t"\ + "fmla v26.4s,v1.4s,v4.s[2]; prfm pldl1keep,[x4]\n\t"\ + "fmla v27.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v2.d[1],x10\n\t"\ + "fmla v28.4s,v1.4s,v5.s[0]\n\t"\ + "fmla v29.4s,v1.4s,v5.s[1]\n\t"\ + "ldr d3,[x4,#-192]\n\t"\ + "fmla v30.4s,v1.4s,v5.s[2]; ldr x10,[x4,#-184]\n\t"\ + "fmla v31.4s,v1.4s,v5.s[3]\n\t" + +#define KERNEL_M4N26_TAIL2 \ + "fmov v3.d[1],x10; ldr d4,[x4,#-176]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-168]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]; bfxil x17,x16,#32,#32\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10; fmov d1,x17\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]; bfxil x20,x19,#32,#32\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "fmov v1.d[1],x20; ldr d2,[x4,#-160]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-152]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x6]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-144]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-136]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-128]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-120]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-112]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]; ldr x10,[x4,#-104]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]; prfm pldl1keep,[x7]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-96]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]; ldr x10,[x4,#-88]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v29.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#-80]\n\t"\ + "fmla v30.4s,v0.4s,v2.s[0]; ldr x10,[x4,#-72]\n\t"\ + "fmla v31.4s,v0.4s,v2.s[1]; prfm 
pldl1keep,[x8]\n\t"\ + "fmla v6.4s,v1.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v7.4s,v1.4s,v2.s[3]\n\t"\ + "fmla v8.4s,v1.4s,v3.s[0]; prfm pldl1keep,[x9]\n\t"\ + "fmla v9.4s,v1.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-64]\n\t"\ + "fmla v10.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-56]\n\t"\ + "fmla v11.4s,v1.4s,v3.s[3]\n\t"\ + "fmla v12.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#-48]\n\t"\ + "fmla v13.4s,v1.4s,v4.s[1]; ldr x10,[x4,#-40]\n\t"\ + "fmla v14.4s,v1.4s,v4.s[2]; sub w5,w5,#2\n\t"\ + "fmla v15.4s,v1.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10\n\t"\ + "fmla v16.4s,v1.4s,v2.s[0]\n\t"\ + "fmla v17.4s,v1.4s,v2.s[1]\n\t"\ + "fmla v18.4s,v1.4s,v2.s[2]\n\t"\ + "ldr d4,[x4,#-32]\n\t"\ + "fmla v19.4s,v1.4s,v2.s[3]; ldr x10,[x4,#-24]\n\t"\ + "fmla v20.4s,v1.4s,v3.s[0]\n\t"\ + "fmla v21.4s,v1.4s,v3.s[1]\n\t"\ + "fmov v4.d[1],x10; ldr d5,[x4,#-16]\n\t"\ + "fmla v22.4s,v1.4s,v3.s[2]; ldr x10,[x4,#-8]\n\t"\ + "fmla v23.4s,v1.4s,v3.s[3]\n\t"\ + "fmla v24.4s,v1.4s,v4.s[0]\n\t"\ + "fmov v5.d[1],x10\n\t"\ + "fmla v25.4s,v1.4s,v4.s[1]\n\t"\ + "fmla v26.4s,v1.4s,v4.s[2]\n\t"\ + "fmla v27.4s,v1.4s,v4.s[3]\n\t"\ + "fmla v28.4s,v1.4s,v5.s[0]\n\t"\ + "fmla v29.4s,v1.4s,v5.s[1]\n\t"\ + "fmla v30.4s,v1.4s,v5.s[2]\n\t"\ + "fmla v31.4s,v1.4s,v5.s[3]\n\t" + +#define KERNEL_M4N26_FIN1 \ + "ldr w16,[x0],#4; ldr q2,[x4]\n\t"\ + "ldr w17,[x1],#4; ldr d3,[x4,#16]\n\t"\ + "ldr w19,[x2],#4; ldr x10,[x4,#24]\n\t"\ + "ldr w20,[x3],#4; orr x16,x16,x17,LSL #32\n\t"\ + "fmov d0,x16; orr x19,x19,x20,LSL #32; fmov v0.d[1],x19\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#32]\n\t"\ + "fmla v6.4s,v0.4s,v2.s[0]; ldr x10,[x4,#40]\n\t"\ + "fmla v7.4s,v0.4s,v2.s[1]\n\t"\ + "fmla v8.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v9.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v10.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v11.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#48]\n\t"\ + "fmla v12.4s,v0.4s,v3.s[2]; ldr x10,[x4,#56]\n\t"\ + "fmla v13.4s,v0.4s,v3.s[3]\n\t"\ + "fmla v14.4s,v0.4s,v4.s[0]\n\t"\ + "fmov v2.d[1],x10; ldr d3,[x4,#64]\n\t"\ + "fmla v15.4s,v0.4s,v4.s[1]; ldr x10,[x4,#72]\n\t"\ + "fmla v16.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v17.4s,v0.4s,v4.s[3]\n\t"\ + "fmov v3.d[1],x10; ldr d4,[x4,#80]\n\t"\ + "fmla v18.4s,v0.4s,v2.s[0]; ldr x10,[x4,#88]\n\t"\ + "fmla v19.4s,v0.4s,v2.s[1]; add x4,x4,#104\n\t"\ + "fmla v20.4s,v0.4s,v2.s[2]\n\t"\ + "fmov v4.d[1],x10\n\t"\ + "fmla v21.4s,v0.4s,v2.s[3]\n\t"\ + "fmla v22.4s,v0.4s,v3.s[0]\n\t"\ + "fmla v23.4s,v0.4s,v3.s[1]\n\t"\ + "ldr d2,[x4,#-8]\n\t"\ + "fmla v24.4s,v0.4s,v3.s[2]\n\t"\ + "fmla v25.4s,v0.4s,v3.s[3]\n\t"\ + "fmla v26.4s,v0.4s,v4.s[0]\n\t"\ + "fmla v27.4s,v0.4s,v4.s[1]\n\t"\ + "fmla v28.4s,v0.4s,v4.s[2]\n\t"\ + "fmla v29.4s,v0.4s,v4.s[3]\n\t"\ + "fmla v30.4s,v0.4s,v2.s[0]\n\t"\ + "fmla v31.4s,v0.4s,v2.s[1]\n\t" + +#define FUNC_K1(ndim) \ +static inline void sgemm_skinny1_a53_m4n##ndim(\ + const float * __restrict__ a_ptr, const float * __restrict__ b_scr,\ + float * __restrict__ c_ptr, uint32_t K, uint32_t LDA, uint32_t LDC,\ + uint8_t c_rowmajor, const float * __restrict__ beta_addr) {\ + __asm__ __volatile__ (\ + "mov x0,%[a_ptr]; add x1,%[a_ptr],%w[LDA],UXTW #2\n\t"\ + "add x2,%[a_ptr],%w[LDA],UXTW #3; add x3,x1,%w[LDA],UXTW #3\n\t"\ + "add x6,x0,%w[LDA],UXTW #4; add x7,x1,%w[LDA],UXTW #4\n\t"\ + "add x8,x2,%w[LDA],UXTW #4; add x9,x3,%w[LDA],UXTW #4\n\t"\ + "mov x4,%[b_scr]; mov w5,%w[K]\n\t"\ + INIT_M4N##ndim\ + "cmp w5,#2; b.lt 4f\n\t"\ + KERNEL_M4N##ndim##_PRELOAD2\ + "cmp w5,#6; b.lt 2f\n\t"\ + ".balign 16; 1:\n\t"\ + KERNEL_M4N##ndim##_MAIN2(0, 1)\ + KERNEL_M4N##ndim##_MAIN2(2, 3)\ + "b.ge 1b; 
2:\n\t"\ + "cmp w5,#4; b.lt 3f\n\t"\ + KERNEL_M4N##ndim##_MAIN2(0, 1)\ + KERNEL_M4N##ndim##_TAIL2\ + "b 4f; 3:\n\t"\ + KERNEL_M4N##ndim##_TAIL2\ + "4:\n\t"\ + "cmp w5,#1; b.lt 6f\n\t"\ + "5:\n\t"\ + KERNEL_M4N##ndim##_FIN1\ + "6:\n\t"\ + INIT_SAVE\ + "cmp %w[c_rowmajor],#0; b.eq 7f\n\t"\ + SAVE_M4N##ndim(CR) "b 8f\n\t"\ + "7:\n\t"\ + SAVE_M4N##ndim(CC)\ + "8:\n\t"\ + ::[a_ptr]"r"(a_ptr), [c_ptr]"r"(c_ptr), [b_scr]"r"(b_scr),\ + [K]"r"(K), [LDA]"r"(LDA), [LDC]"r"(LDC),\ + [beta_addr]"r"(beta_addr), [c_rowmajor]"r"(c_rowmajor)\ + :"cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x9",\ + "x10","x11","x12","x13","x14","x15","x16","x17","x19","x20",\ + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13",\ + "v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25",\ + "v26","v27","v28","v29","v30","v31");\ +} + +FUNC_K1(23) +FUNC_K1(24) +FUNC_K1(25) +FUNC_K1(26) + +#define INIT_M1N4 \ + float32x4_t cq1, cq2, cq3, cq4;\ + cq1 = cq2 = cq3 = cq4 = vdupq_n_f32(0.0f); + +#define INIT_M1N5 INIT_M1N4 float32x4_t cq5 = cq1; + +#define INIT_M1N6 INIT_M1N5 float32x4_t cq6 = cq1; + +#define INIT_M1N7 INIT_M1N6 float32x4_t cq7 = cq1; + +#define INIT_M1N8 \ + float32x4_t cq1, cq2; cq1 = cq2 = vdupq_n_f32(0.0f); + +#define INIT_M1N9 INIT_M1N8 float32x4_t cq3 = cq1; + +#define INIT_M1N10 INIT_M1N9 float32x4_t cq4 = cq1; + +#define INIT_M1N11 INIT_M1N10 float32x4_t cq5 = cq1; + +#define INIT_M1N12 \ + float32x4_t cq1, cq2, cq3; cq1 = cq2 = cq3 = vdupq_n_f32(0.0f); + +#define INIT_M1N13 INIT_M1N12 float32x4_t cq4 = cq1; + +#define INIT_M1N14 INIT_M1N13 float32x4_t cq5 = cq2; + +#define ACC_K4M1N4 \ + float32x4_t aq1 = vld1q_f32(a_rd); a_rd += 4;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 0);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 1);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 2);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 3); + +#define ACC_K4M1N5 ACC_K4M1N4 \ + bq1 = vld1q_f32(b_rd + 16); cq5 = vfmaq_f32(cq5, aq1, bq1); + +#define ACC_K4M1N6 ACC_K4M1N5 \ + bq2 = vld1q_f32(b_rd + 20); cq6 = vfmaq_f32(cq6, aq1, bq2); + +#define ACC_K4M1N7 ACC_K4M1N6 \ + bq3 = vld1q_f32(b_rd + 24); cq7 = vfmaq_f32(cq7, aq1, bq3); + +#define ACC_K4M1N8 \ + float32x4_t aq1 = vld1q_f32(a_rd); a_rd += 4;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 0); bq1 = vld1q_f32(b_rd + 8);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 0); bq2 = vld1q_f32(b_rd + 12);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 1); bq1 = vld1q_f32(b_rd + 16);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 1); bq2 = vld1q_f32(b_rd + 20);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 2); bq1 = vld1q_f32(b_rd + 24);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 2); bq2 = vld1q_f32(b_rd + 28);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 3);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 3); + +#define ACC_K4M1N9 ACC_K4M1N8 \ + bq1 = vld1q_f32(b_rd + 32); cq3 = vfmaq_f32(cq3, bq1, aq1); + +#define ACC_K4M1N10 ACC_K4M1N9 \ + bq2 = vld1q_f32(b_rd + 36); cq4 = vfmaq_f32(cq4, bq2, aq1); + +#define ACC_K4M1N11 ACC_K4M1N10 \ + bq1 = vld1q_f32(b_rd + 40); cq5 = vfmaq_f32(cq5, bq1, aq1); + +#define ACC_K4M1N12 \ + float32x4_t aq1 = vld1q_f32(a_rd); a_rd += 4;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 0); bq1 = 
vld1q_f32(b_rd + 12);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 0); bq2 = vld1q_f32(b_rd + 16);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 0); bq3 = vld1q_f32(b_rd + 20);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 1); bq1 = vld1q_f32(b_rd + 24);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 1); bq2 = vld1q_f32(b_rd + 28);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 1); bq3 = vld1q_f32(b_rd + 32);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 2); bq1 = vld1q_f32(b_rd + 36);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 2); bq2 = vld1q_f32(b_rd + 40);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 2); bq3 = vld1q_f32(b_rd + 44);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 3);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 3);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 3); + +#define ACC_K4M1N13 ACC_K4M1N12 \ + bq1 = vld1q_f32(b_rd + 48); cq4 = vfmaq_f32(cq4, bq1, aq1); + +#define ACC_K4M1N14 ACC_K4M1N13 \ + bq2 = vld1q_f32(b_rd + 52); cq5 = vfmaq_f32(cq5, bq2, aq1); + +#define REDUC_N4 \ + cq1 = vaddq_f32(cq1, cq2); cq3 = vaddq_f32(cq3, cq4);\ + cq1 = vaddq_f32(cq1, cq3); + +#define REDUC_N5 REDUC_N4 \ + float32x2_t cd1 = vadd_f32(vget_low_f32(cq5), vget_high_f32(cq5));\ + float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1); + +#define REDUC_N6 REDUC_N5 \ + float32x2_t cd2 = vadd_f32(vget_low_f32(cq6), vget_high_f32(cq6));\ + float cs2 = vget_lane_f32(cd2, 0) + vget_lane_f32(cd2, 1); + +#define REDUC_N7 REDUC_N6 \ + float32x2_t cd3 = vadd_f32(vget_low_f32(cq7), vget_high_f32(cq7));\ + float cs3 = vget_lane_f32(cd3, 0) + vget_lane_f32(cd3, 1); + +#define REDUC_N8 {} + +#define REDUC_N9 \ + float32x2_t cd1 = vadd_f32(vget_low_f32(cq3), vget_high_f32(cq3));\ + float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1); + +#define REDUC_N10 REDUC_N9 \ + float32x2_t cd2 = vadd_f32(vget_low_f32(cq4), vget_high_f32(cq4));\ + float cs2 = vget_lane_f32(cd2, 0) + vget_lane_f32(cd2, 1); + +#define REDUC_N11 REDUC_N10 \ + float32x2_t cd3 = vadd_f32(vget_low_f32(cq5), vget_high_f32(cq5));\ + float cs3 = vget_lane_f32(cd3, 0) + vget_lane_f32(cd3, 1); + +#define REDUC_N12 {} + +#define REDUC_N13 \ + float32x2_t cd1 = vadd_f32(vget_low_f32(cq4), vget_high_f32(cq4));\ + float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1); + +#define REDUC_N14 REDUC_N13 \ + float32x2_t cd2 = vadd_f32(vget_low_f32(cq5), vget_high_f32(cq5));\ + float cs2 = vget_lane_f32(cd2, 0) + vget_lane_f32(cd2, 1); + +#define ACC_K1M1N4 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1); + +#define ACC_K1M1N5 ACC_K1M1N4 cs1 += as1 * b_rd[4]; + +#define ACC_K1M1N6 ACC_K1M1N5 cs2 += as1 * b_rd[5]; + +#define ACC_K1M1N7 ACC_K1M1N6 cs3 += as1 * b_rd[6]; + +#define ACC_K1M1N8 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1);\ + cq2 = vfmaq_n_f32(cq2, bq2, as1); + +#define ACC_K1M1N9 ACC_K1M1N8 cs1 += as1 * b_rd[8]; + +#define ACC_K1M1N10 ACC_K1M1N9 cs2 += as1 * b_rd[9]; + +#define ACC_K1M1N11 ACC_K1M1N10 cs3 += as1 * b_rd[10]; + +#define ACC_K1M1N12 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1);\ + cq2 = vfmaq_n_f32(cq2, bq2, as1);\ + cq3 = vfmaq_n_f32(cq3, bq3, as1); + +#define ACC_K1M1N13 ACC_K1M1N12 cs1 += as1 * b_rd[12]; + +#define ACC_K1M1N14 ACC_K1M1N13 cs2 += as1 * b_rd[13]; + +#define UNIT_SAVE_M1N4_CC(cq1) \ + c_ptr[0] = c_ptr[0] * beta + vgetq_lane_f32(cq1, 0);\ + c_ptr[LDC] = c_ptr[LDC] * beta + 
vgetq_lane_f32(cq1, 1);\ + c_ptr += LDC * 2;\ + c_ptr[0] = c_ptr[0] * beta + vgetq_lane_f32(cq1, 2);\ + c_ptr[LDC] = c_ptr[LDC] * beta + vgetq_lane_f32(cq1, 3);\ + c_ptr += LDC * 2; + +#define UNIT_SAVE_M1N4_CR(cq1) \ + cq1 = vfmaq_n_f32(cq1, vld1q_f32(c_ptr), beta);\ + vst1q_f32(c_ptr, cq1); c_ptr += 4; + +#define UNIT_SAVE_M1N1_CC(cs1) \ + c_ptr[0] = c_ptr[0] * beta + cs1; c_ptr += LDC; + +#define UNIT_SAVE_M1N1_CR(cs1) \ + c_ptr[0] = c_ptr[0] * beta + cs1; c_ptr++; + +#define SAVE_M1N4(mode) UNIT_SAVE_M1N4_##mode(cq1) + +#define SAVE_M1N5(mode) SAVE_M1N4(mode) UNIT_SAVE_M1N1_##mode(cs1) + +#define SAVE_M1N6(mode) SAVE_M1N5(mode) UNIT_SAVE_M1N1_##mode(cs2) + +#define SAVE_M1N7(mode) SAVE_M1N6(mode) UNIT_SAVE_M1N1_##mode(cs3) + +#define SAVE_M1N8(mode) \ + UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2) + +#define SAVE_M1N9(mode) SAVE_M1N8(mode) UNIT_SAVE_M1N1_##mode(cs1) + +#define SAVE_M1N10(mode) SAVE_M1N9(mode) UNIT_SAVE_M1N1_##mode(cs2) + +#define SAVE_M1N11(mode) SAVE_M1N10(mode) UNIT_SAVE_M1N1_##mode(cs3) + +#define SAVE_M1N12(mode) \ + UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2) UNIT_SAVE_M1N4_##mode(cq3) + +#define SAVE_M1N13(mode) SAVE_M1N12(mode) UNIT_SAVE_M1N1_##mode(cs1) + +#define SAVE_M1N14(mode) SAVE_M1N13(mode) UNIT_SAVE_M1N1_##mode(cs2) + +#define FUNC_EDGE_K4(ndim) \ +static inline void sgemm_skinny1_a53_m1n##ndim(\ + const float * __restrict__ a_rd, const float * __restrict__ b_rd,\ + float * __restrict__ c_ptr, uint32_t k_left, uint32_t LDC,\ + uint8_t c_rowmajor, float beta) {\ + INIT_M1N##ndim\ + for (; k_left > 3; k_left -= 4) {\ + ACC_K4M1N##ndim b_rd += ndim * 4;\ + }\ + REDUC_N##ndim\ + for (; k_left > 0; k_left--) {\ + ACC_K1M1N##ndim b_rd += ndim;\ + }\ + if (c_rowmajor == 0) {\ + SAVE_M1N##ndim(CC)\ + } else {\ + SAVE_M1N##ndim(CR)\ + }\ +} + +FUNC_EDGE_K4(4) +FUNC_EDGE_K4(5) +FUNC_EDGE_K4(6) +FUNC_EDGE_K4(7) +FUNC_EDGE_K4(8) +FUNC_EDGE_K4(9) +FUNC_EDGE_K4(10) +FUNC_EDGE_K4(11) +FUNC_EDGE_K4(12) +FUNC_EDGE_K4(13) +FUNC_EDGE_K4(14) + +#define INIT_M1N15 \ + float32x4_t cq1, cq2, cq3, cq4, cq5, cq6;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = vdupq_n_f32(0.0f);\ + float32x2_t cd1, cd2, cd3;\ + cd1 = cd2 = cd3 = vdup_n_f32(0.0f); + +#define INIT_M1N16 \ + float32x4_t cq1, cq2, cq3, cq4;\ + cq1 = cq2 = cq3 = cq4 = vdupq_n_f32(0.0f); + +#define INIT_M1N17 INIT_M1N16 float32x2_t cd1 = vdup_n_f32(0.0f); + +#define INIT_M1N18 INIT_M1N17 float32x2_t cd2 = vdup_n_f32(0.0f); + +#define INIT_M1N19 INIT_M1N18 float32x2_t cd3 = vdup_n_f32(0.0f); + +#define INIT_M1N20 INIT_M1N16 float32x4_t cq5 = vdupq_n_f32(0.0f); + +#define INIT_M1N21 INIT_M1N20 float32x2_t cd1 = vdup_n_f32(0.0f); + +#define INIT_M1N22 INIT_M1N21 float32x2_t cd2 = vdup_n_f32(0.0f); + +#define ACC_M1N15K2 \ + float32x2_t ad1 = vld1_f32(a_rd); a_rd += 2;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + cq1 = vfmaq_lane_f32(cq1, bq1, ad1, 0); bq1 = vld1q_f32(b_rd + 12);\ + cq2 = vfmaq_lane_f32(cq2, bq2, ad1, 0); bq2 = vld1q_f32(b_rd + 16);\ + cq3 = vfmaq_lane_f32(cq3, bq3, ad1, 0); bq3 = vld1q_f32(b_rd + 20);\ + cq4 = vfmaq_lane_f32(cq4, bq1, ad1, 1); float32x2_t bd1 = vld1_f32(b_rd + 24);\ + cq5 = vfmaq_lane_f32(cq5, bq2, ad1, 1); float32x2_t bd2 = vld1_f32(b_rd + 26);\ + cq6 = vfmaq_lane_f32(cq6, bq3, ad1, 1); float32x2_t bd3 = vld1_f32(b_rd + 28);\ + cd1 = vfma_f32(cd1, ad1, bd1);\ + cd2 = vfma_f32(cd2, ad1, bd2);\ + cd3 = vfma_f32(cd3, ad1, bd3); + +#define ACC_M1N16K2 \ + float32x2_t ad1 = vld1_f32(a_rd); a_rd += 2;\ 
+ float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + cq1 = vfmaq_lane_f32(cq1, bq1, ad1, 0); bq1 = vld1q_f32(b_rd + 16);\ + cq2 = vfmaq_lane_f32(cq2, bq2, ad1, 0); bq2 = vld1q_f32(b_rd + 20);\ + cq3 = vfmaq_lane_f32(cq3, bq3, ad1, 0); bq3 = vld1q_f32(b_rd + 24);\ + cq4 = vfmaq_lane_f32(cq4, bq4, ad1, 0); bq4 = vld1q_f32(b_rd + 28);\ + cq1 = vfmaq_lane_f32(cq1, bq1, ad1, 1);\ + cq2 = vfmaq_lane_f32(cq2, bq2, ad1, 1);\ + cq3 = vfmaq_lane_f32(cq3, bq3, ad1, 1);\ + cq4 = vfmaq_lane_f32(cq4, bq4, ad1, 1); + +#define ACC_M1N17K2 ACC_M1N16K2 \ + float32x2_t bd1 = vld1_f32(b_rd + 32);\ + cd1 = vfma_f32(cd1, ad1, bd1); + +#define ACC_M1N18K2 ACC_M1N17K2 \ + bd1 = vld1_f32(b_rd + 34); cd2 = vfma_f32(cd2, ad1, bd1); + +#define ACC_M1N19K2 ACC_M1N18K2 \ + bd1 = vld1_f32(b_rd + 36); cd3 = vfma_f32(cd3, ad1, bd1); + +#define ACC_M1N20K2 \ + float32x2_t ad1 = vld1_f32(a_rd); a_rd += 2;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + float32x4_t bq5 = vld1q_f32(b_rd + 16);\ + cq1 = vfmaq_lane_f32(cq1, bq1, ad1, 0); bq1 = vld1q_f32(b_rd + 20);\ + cq2 = vfmaq_lane_f32(cq2, bq2, ad1, 0); bq2 = vld1q_f32(b_rd + 24);\ + cq3 = vfmaq_lane_f32(cq3, bq3, ad1, 0); bq3 = vld1q_f32(b_rd + 28);\ + cq4 = vfmaq_lane_f32(cq4, bq4, ad1, 0); bq4 = vld1q_f32(b_rd + 32);\ + cq5 = vfmaq_lane_f32(cq5, bq5, ad1, 0); bq5 = vld1q_f32(b_rd + 36);\ + cq1 = vfmaq_lane_f32(cq1, bq1, ad1, 1);\ + cq2 = vfmaq_lane_f32(cq2, bq2, ad1, 1);\ + cq3 = vfmaq_lane_f32(cq3, bq3, ad1, 1);\ + cq4 = vfmaq_lane_f32(cq4, bq4, ad1, 1);\ + cq5 = vfmaq_lane_f32(cq5, bq5, ad1, 1); + +#define ACC_M1N21K2 ACC_M1N20K2 \ + float32x2_t bd1 = vld1_f32(b_rd + 40); cd1 = vfma_f32(cd1, ad1, bd1); + +#define ACC_M1N22K2 ACC_M1N21K2 \ + float32x2_t bd2 = vld1_f32(b_rd + 42); cd2 = vfma_f32(cd2, ad1, bd2); + +#define REDUC_M1N15 \ + cq1 = vaddq_f32(cq1, cq4); cq2 = vaddq_f32(cq2, cq5); cq3 = vaddq_f32(cq3, cq6);\ + float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1);\ + float cs2 = vget_lane_f32(cd2, 0) + vget_lane_f32(cd2, 1);\ + float cs3 = vget_lane_f32(cd3, 0) + vget_lane_f32(cd3, 1); + +#define REDUC_M1N16 {} + +#define REDUC_M1N17 float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1); + +#define REDUC_M1N18 REDUC_M1N17 \ + float cs2 = vget_lane_f32(cd2, 0) + vget_lane_f32(cd2, 1); + +#define REDUC_M1N19 REDUC_M1N18 \ + float cs3 = vget_lane_f32(cd3, 0) + vget_lane_f32(cd3, 1); + +#define REDUC_M1N20 {} + +#define REDUC_M1N21 float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1); + +#define REDUC_M1N22 REDUC_M1N21 \ + float cs2 = vget_lane_f32(cd2, 0) + vget_lane_f32(cd2, 1); + +#define ACC_M1N15K1 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1); float bs1 = b_rd[12];\ + cq2 = vfmaq_n_f32(cq2, bq2, as1); float bs2 = b_rd[13];\ + cq3 = vfmaq_n_f32(cq3, bq3, as1); float bs3 = b_rd[14];\ + cs1 += as1 * bs1; cs2 += as1 * bs2; cs3 += as1 * bs3; + +#define ACC_M1N16K1 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1); cq2 = vfmaq_n_f32(cq2, bq2, as1);\ + cq3 = vfmaq_n_f32(cq3, bq3, as1); cq4 = vfmaq_n_f32(cq4, bq4, 
as1); + +#define ACC_M1N17K1 ACC_M1N16K1 cs1 += as1 * b_rd[16]; + +#define ACC_M1N18K1 ACC_M1N17K1 cs2 += as1 * b_rd[17]; + +#define ACC_M1N19K1 ACC_M1N18K1 cs3 += as1 * b_rd[18]; + +#define ACC_M1N20K1 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + float32x4_t bq5 = vld1q_f32(b_rd + 16);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1); cq2 = vfmaq_n_f32(cq2, bq2, as1);\ + cq3 = vfmaq_n_f32(cq3, bq3, as1); cq4 = vfmaq_n_f32(cq4, bq4, as1);\ + cq5 = vfmaq_n_f32(cq5, bq5, as1); + +#define ACC_M1N21K1 ACC_M1N20K1 cs1 += as1 * b_rd[20]; + +#define ACC_M1N22K1 ACC_M1N21K1 cs2 += as1 * b_rd[21]; + +#define SAVE_M1N15(mode) \ + UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2) UNIT_SAVE_M1N4_##mode(cq3)\ + UNIT_SAVE_M1N1_##mode(cs1) UNIT_SAVE_M1N1_##mode(cs2) UNIT_SAVE_M1N1_##mode(cs3) + +#define SAVE_M1N16(mode) \ + UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2)\ + UNIT_SAVE_M1N4_##mode(cq3) UNIT_SAVE_M1N4_##mode(cq4) + +#define SAVE_M1N17(mode) SAVE_M1N16(mode) UNIT_SAVE_M1N1_##mode(cs1) + +#define SAVE_M1N18(mode) SAVE_M1N17(mode) UNIT_SAVE_M1N1_##mode(cs2) + +#define SAVE_M1N19(mode) SAVE_M1N18(mode) UNIT_SAVE_M1N1_##mode(cs3) + +#define SAVE_M1N20(mode) SAVE_M1N16(mode) UNIT_SAVE_M1N4_##mode(cq5) + +#define SAVE_M1N21(mode) SAVE_M1N20(mode) UNIT_SAVE_M1N1_##mode(cs1) + +#define SAVE_M1N22(mode) SAVE_M1N21(mode) UNIT_SAVE_M1N1_##mode(cs2) + +#define FUNC_EDGE_K2(ndim) \ +static inline void sgemm_skinny1_a53_m1n##ndim(\ + const float * __restrict__ a_rd, const float * __restrict__ b_rd,\ + float * __restrict__ c_ptr, uint32_t k_left, uint32_t LDC,\ + uint8_t c_rowmajor, float beta) {\ + INIT_M1N##ndim\ + for (; k_left > 1; k_left -= 2) {\ + ACC_M1N##ndim##K2 b_rd += ndim * 2;\ + }\ + REDUC_M1N##ndim\ + for (; k_left > 0; k_left--) {\ + ACC_M1N##ndim##K1 b_rd += ndim;\ + }\ + if (c_rowmajor == 0) {\ + SAVE_M1N##ndim(CC)\ + } else {\ + SAVE_M1N##ndim(CR)\ + }\ +} + +FUNC_EDGE_K2(15) +FUNC_EDGE_K2(16) +FUNC_EDGE_K2(17) +FUNC_EDGE_K2(18) +FUNC_EDGE_K2(19) +FUNC_EDGE_K2(20) +FUNC_EDGE_K2(21) +FUNC_EDGE_K2(22) + +#define INIT_M1N23 INIT_M1N20 \ + float cs1 = 0.0f, cs2 = 0.0f, cs3 = 0.0f; + +#define INIT_M1N24 INIT_M1N20 float32x4_t cq6 = vdupq_n_f32(0.0f); + +#define INIT_M1N25 INIT_M1N24 float cs1 = 0.0f; + +#define INIT_M1N26 INIT_M1N25 float cs2 = 0.0f; + +#define ACC_M1N23K1 ACC_M1N20K1 \ + cs1 += as1 * b_rd[20]; cs2 += as1 * b_rd[21]; cs3 += as1 * b_rd[22]; + +#define ACC_M1N24K1 ACC_M1N20K1 \ + float32x4_t bq6 = vld1q_f32(b_rd + 20);\ + cq6 = vfmaq_n_f32(cq6, bq6, as1); + +#define ACC_M1N25K1 ACC_M1N24K1 cs1 += as1 * b_rd[24]; + +#define ACC_M1N26K1 ACC_M1N25K1 cs2 += as1 * b_rd[25]; + +#define SAVE_M1N23(mode) \ + UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2) UNIT_SAVE_M1N4_##mode(cq3)\ + UNIT_SAVE_M1N4_##mode(cq4) UNIT_SAVE_M1N4_##mode(cq5)\ + UNIT_SAVE_M1N1_##mode(cs1) UNIT_SAVE_M1N1_##mode(cs2) UNIT_SAVE_M1N1_##mode(cs3) + +#define SAVE_M1N24(mode) \ + UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2) UNIT_SAVE_M1N4_##mode(cq3)\ + UNIT_SAVE_M1N4_##mode(cq4) UNIT_SAVE_M1N4_##mode(cq5) UNIT_SAVE_M1N4_##mode(cq6) + +#define SAVE_M1N25(mode) SAVE_M1N24(mode) UNIT_SAVE_M1N1_##mode(cs1) + +#define SAVE_M1N26(mode) SAVE_M1N25(mode) UNIT_SAVE_M1N1_##mode(cs2) + +#define FUNC_EDGE_K1(ndim) \ +static inline void sgemm_skinny1_a53_m1n##ndim(\ + const float * __restrict__ a_rd, const float * __restrict__ b_rd,\ + float * __restrict__ c_ptr, 
uint32_t k_left, uint32_t LDC,\
+ uint8_t c_rowmajor, float beta) {\
+ INIT_M1N##ndim\
+ for (; k_left > 0; k_left--) {\
+ ACC_M1N##ndim##K1 b_rd += ndim;\
+ }\
+ if (c_rowmajor == 0) {\
+ SAVE_M1N##ndim(CC)\
+ } else {\
+ SAVE_M1N##ndim(CR)\
+ }\
+}
+
+FUNC_EDGE_K1(23)
+FUNC_EDGE_K1(24)
+FUNC_EDGE_K1(25)
+FUNC_EDGE_K1(26)
+
+#endif
diff --git a/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA7x.h b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA7x.h
new file mode 100644
index 0000000..016ce73
--- /dev/null
+++ b/include/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA7x.h
@@ -0,0 +1,2556 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#ifndef INCLUDE_A7X_KERNEL
+#define INCLUDE_A7X_KERNEL
+
+/* x0 - x3 for a_ptrs */
+/* x4 for b_ptr, x5 for k_left */
+/* x6 - x9 for a_pref */
+/* x12 - x15 for c_tmp */
+
+#define INIT_1V(c1) "movi v"#c1".16b,#0\n\t"
+
+#define INIT_2V(c1, c2) INIT_1V(c1) INIT_1V(c2)
+
+#define INIT_4V(c1, c2, c3, c4) INIT_2V(c1, c2) INIT_2V(c3, c4)
+
+#define INIT_SAVE \
+ "ldr s0,[%[beta_addr]]; mov x12,%[c_ptr]\n\t"\
+ "add x13,%[c_ptr],%w[LDC],UXTW #2; add x14,%[c_ptr],%w[LDC],UXTW #3\n\t"\
+ "add x15,x13,%w[LDC],UXTW #3\n\t"
+
+#define UNIT_SAVE_M4N4_CC(c1, c2, c3, c4) \
+ "trn1 v1.4s,v"#c1".4s,v"#c2".4s; trn1 v2.4s,v"#c3".4s,v"#c4".4s\n\t"\
+ "trn2 v3.4s,v"#c1".4s,v"#c2".4s; trn2 v4.4s,v"#c3".4s,v"#c4".4s\n\t"\
+ "trn1 v"#c1".2d,v1.2d,v2.2d; trn1 v"#c2".2d,v3.2d,v4.2d\n\t"\
+ "trn2 v"#c3".2d,v1.2d,v2.2d; trn2 v"#c4".2d,v3.2d,v4.2d\n\t"\
+ "ldr q1,[x12]; ldr q2,[x13]; ldr q3,[x14]; ldr q4,[x15]\n\t"\
+ "fmla v"#c1".4s,v1.4s,v0.s[0]; fmla v"#c2".4s,v2.4s,v0.s[0]\n\t"\
+ "fmla v"#c3".4s,v3.4s,v0.s[0]; fmla v"#c4".4s,v4.4s,v0.s[0]\n\t"\
+ "str q"#c1",[x12]; prfm pstl2keep,[x12,#32]; add x12,x12,%w[LDC],UXTW #4\n\t"\
+ "str q"#c2",[x13]; prfm pstl2keep,[x13,#32]; add x13,x13,%w[LDC],UXTW #4\n\t"\
+ "str q"#c3",[x14]; prfm pstl2keep,[x14,#32]; add x14,x14,%w[LDC],UXTW #4\n\t"\
+ "str q"#c4",[x15]; prfm pstl2keep,[x15,#32]; add x15,x15,%w[LDC],UXTW #4\n\t"
+
+#define EDGE_SAVE_M4N1_CC(c1, c2, c3, c4) \
+ "ldr q1,[x12]\n\t"\
+ "faddp v"#c1".4s,v"#c1".4s,v"#c2".4s\n\t"\
+ "faddp v"#c3".4s,v"#c3".4s,v"#c4".4s\n\t"\
+ "faddp v"#c1".4s,v"#c1".4s,v"#c3".4s\n\t"\
+ "fmla v"#c1".4s,v1.4s,v0.s[0]\n\t"\
+ "str q"#c1",[x12]; prfm pstl2keep,[x12,#32]\n\t"\
+ "add x12,x12,%w[LDC],UXTW #2\n\t"
+
+#define UNIT_SAVE_M4N4_CR(c1, c2, c3, c4) \
+ "ldr q1,[x12]; ldr q2,[x13]; ldr q3,[x14]; ldr q4,[x15]\n\t"\
+ "fmla v"#c1".4s,v1.4s,v0.s[0]; fmla v"#c2".4s,v2.4s,v0.s[0]\n\t"\
+ "fmla v"#c3".4s,v3.4s,v0.s[0]; fmla v"#c4".4s,v4.4s,v0.s[0]\n\t"\
+ "str q"#c1",[x12],#16; str q"#c2",[x13],#16\n\t"\
+ "str q"#c3",[x14],#16; str q"#c4",[x15],#16\n\t"
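+
+/* Shared conventions for the save macros in this file:
+ *  - INIT_SAVE loads beta into v0.s[0] and points x12-x15 at the first
+ *    four columns (CC) or rows (CR) of the C block, so every store path
+ *    computes C = beta * C + accumulator.
+ *  - UNIT_SAVE_M4N4_CC transposes the four row accumulators into
+ *    columns with trn1/trn2 before the column-major store; the CR
+ *    variant stores them directly, since they are already row-ordered.
+ *  - EDGE_SAVE_M4N1_* folds four element-wise accumulators into four
+ *    dot products via pairwise faddp, then applies the same beta
+ *    update before storing the leftover column. */
+
+#define EDGE_SAVE_M4N1_CR(c1, c2, c3, c4) \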
+ "ldr s1,[x12]; ldr s2,[x13]; ldr s3,[x14]; ldr s4,[x15]\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c2".4s\n\t"\ + "ins v1.s[1],v2.s[0]; ins v3.s[1],v4.s[0]\n\t"\ + "faddp v"#c3".4s,v"#c3".4s,v"#c4".4s\n\t"\ + "ins v1.d[1],v3.d[0]\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c3".4s\n\t"\ + "fmla v"#c1".4s,v1.4s,v0.s[0]\n\t"\ + "st1 {v"#c1".s}[0],[x12],#4; st1 {v"#c1".s}[1],[x13],#4\n\t"\ + "st1 {v"#c1".s}[2],[x14],#4; st1 {v"#c1".s}[3],[x15],#4\n\t" + +#define FUNC_M4(ndim) \ +static inline void sgemm_skinny1_a7x_m4n##ndim(\ + const float * __restrict__ a_ptr, const float * __restrict__ b_scr,\ + float * __restrict__ c_ptr, uint32_t K, uint32_t LDA, uint32_t LDC,\ + uint8_t c_rowmajor, const float * __restrict__ beta_addr) {\ + __asm__ __volatile__ (\ + "mov x0,%[a_ptr]; add x1,%[a_ptr],%w[LDA],UXTW #2\n\t"\ + "add x2,%[a_ptr],%w[LDA],UXTW #3; add x3,x1,%w[LDA],UXTW #3\n\t"\ + "add x6,x0,%w[LDA],UXTW #4; add x7,x1,%w[LDA],UXTW #4\n\t"\ + "add x8,x2,%w[LDA],UXTW #4; add x9,x3,%w[LDA],UXTW #4\n\t"\ + "mov x4,%[b_scr]; mov w5,%w[K]\n\t"\ + INIT_M4N##ndim\ + "cmp w5,#4; b.lt 4f\n\t"\ + KERNEL_M4N##ndim##_PRELOAD4\ + "cmp w5,#20; b.lt 1f\n\t"\ + ".balign 16; 9:\n\t"\ + "prfm pldl2keep,[x6]; add x6,x6,#64\n\t"\ + KERNEL_M4N##ndim##_MAIN4(0, 1, 2, 3, 4, 5, 6, 7)\ + "prfm pldl2keep,[x7]; add x7,x7,#64\n\t"\ + KERNEL_M4N##ndim##_MAIN4(4, 5, 6, 7, 0, 1, 2, 3)\ + "prfm pldl2keep,[x8]; add x8,x8,#64\n\t"\ + KERNEL_M4N##ndim##_MAIN4(0, 1, 2, 3, 4, 5, 6, 7)\ + "prfm pldl2keep,[x9]; add x9,x9,#64\n\t"\ + KERNEL_M4N##ndim##_MAIN4(4, 5, 6, 7, 0, 1, 2, 3)\ + "cmp w5,#20; b.ge 9b; 1:\n\t"\ + "cmp w5,#12; b.lt 2f\n\t"\ + KERNEL_M4N##ndim##_MAIN4(0, 1, 2, 3, 4, 5, 6, 7)\ + KERNEL_M4N##ndim##_MAIN4(4, 5, 6, 7, 0, 1, 2, 3)\ + "2:\n\t"\ + "cmp w5,#8; b.lt 3f\n\t"\ + KERNEL_M4N##ndim##_MAIN4(0, 1, 2, 3, 4, 5, 6, 7)\ + KERNEL_M4N##ndim##_TAIL4(4, 5, 6, 7)\ + "b 4f; 3:\n\t"\ + KERNEL_M4N##ndim##_TAIL4(0, 1, 2, 3)\ + "4:\n\t"\ + "cmp w5,#1; b.lt 6f\n\t"\ + "5:\n\t"\ + KERNEL_M4N##ndim##_TL1 "b.gt 5b\n\t"\ + "6:\n\t"\ + INIT_SAVE\ + "cmp %w[c_rowmajor],#0; b.eq 7f\n\t"\ + SAVE_M4N##ndim(CR) "b 8f\n\t"\ + "7:\n\t"\ + SAVE_M4N##ndim(CC)\ + "8:\n\t"\ + ::[a_ptr]"r"(a_ptr), [b_scr]"r"(b_scr), [c_ptr]"r"(c_ptr),\ + [K]"r"(K), [LDA]"r"(LDA), [LDC]"r"(LDC),\ + [beta_addr]"r"(beta_addr), [c_rowmajor]"r"(c_rowmajor)\ + :"cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x9",\ + "x12","x13","x14","x15",\ + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13",\ + "v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25",\ + "v26","v27","v28","v29","v30","v31");\ +} + +#define UNIT_SAVE_M3N4_CC(c1, c2, c3) \ + "ldr d1,[x12]; ldr s2,[x12,#8]\n\t"\ + "ldr d3,[x13]; ldr s4,[x13,#8]\n\t"\ + "trn1 v5.4s,v"#c1".4s,v"#c2".4s; trn2 v"#c2".4s,v"#c1".4s,v"#c2".4s\n\t"\ + "mov v6.8b,v"#c2".8b; mov v"#c1".16b,v5.16b\n\t"\ + "fmla v5.2s,v1.2s,v0.s[0]; fmla v6.2s,v3.2s,v0.s[0]\n\t"\ + "fmov s1,s"#c3"; ins v3.s[0],v"#c3".s[1]\n\t"\ + "fmla s1,s2,v0.s[0]; fmla s3,s4,v0.s[0]\n\t"\ + "str d5,[x12]; str s1,[x12,#8]; prfm pstl2keep,[x12,#24]\n\t"\ + "add x12,x12,%w[LDC],UXTW #4\n\t"\ + "str d6,[x13]; str s3,[x13,#8]; prfm pstl2keep,[x13,#24]\n\t"\ + "add x13,x13,%w[LDC],UXTW #4\n\t"\ + "ldr d1,[x14]; ldr s2,[x14,#8]\n\t"\ + "ldr d3,[x15]; ldr s4,[x15,#8]\n\t"\ + "ins v"#c1".d[0],v"#c1".d[1]; ins v"#c2".d[0],v"#c2".d[1]\n\t"\ + "ins v5.s[0],v"#c3".s[2]; ins v6.s[0],v"#c3".s[3]\n\t"\ + "fmla v"#c1".2s,v1.2s,v0.s[0]; fmla v"#c2".2s,v3.2s,v0.s[0]\n\t"\ + "fmla s5,s2,v0.s[0]; fmla s6,s4,v0.s[0]\n\t"\ + "str d"#c1",[x14]; 
str s5,[x14,#8]; prfm pstl2keep,[x14,#24]\n\t"\ + "add x14,x14,%w[LDC],UXTW #4\n\t"\ + "str d"#c2",[x15]; str s6,[x15,#8]; prfm pstl2keep,[x15,#24]\n\t"\ + "add x15,x15,%w[LDC],UXTW #4\n\t" + +#define UNIT_SAVE_M3N4_CR(c1, c2, c3) \ + "ldr q1,[x12]; ldr q2,[x13]; ldr q3,[x14]\n\t"\ + "fmla v"#c1".4s,v1.4s,v0.s[0]\n\t"\ + "fmla v"#c2".4s,v2.4s,v0.s[0]\n\t"\ + "fmla v"#c3".4s,v3.4s,v0.s[0]\n\t"\ + "str q"#c1",[x12],#16; str q"#c2",[x13],#16; str q"#c3",[x14],#16\n\t" + +#define EDGE_SAVE_M3N1_CC(c1, c2, c3) \ + "ldr d1,[x12]; ldr s2,[x12,#8]\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c2".4s\n\t"\ + "faddp v"#c3".4s,v"#c3".4s,v"#c3".4s\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c1".4s\n\t"\ + "faddp s"#c3",v"#c3".2s\n\t"\ + "fmla v"#c1".2s,v1.2s,v0.s[0]; fmla s"#c3",s2,v0.s[0]\n\t"\ + "str d"#c1",[x12]; str s"#c3",[x12,#8]\n\t"\ + "prfm pstl2keep,[x12,#24]\n\t"\ + "add x12,x12,%w[LDC],UXTW #2\n\t" + +#define EDGE_SAVE_M3N1_CR(c1, c2, c3) \ + "ldr s1,[x12]; ldr s2,[x13]; ldr s3,[x14]\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c2".4s\n\t"\ + "faddp v"#c3".4s,v"#c3".4s,v"#c3".4s\n\t"\ + "ins v1.s[1],v2.s[0]\n\t"\ + "faddp v"#c1".4s,v"#c1".4s,v"#c1".4s\n\t"\ + "faddp s"#c3",v"#c3".2s\n\t"\ + "fmla v"#c1".2s,v1.2s,v0.s[0]; fmla s"#c3",s3,v0.s[0]\n\t"\ + "st1 {v"#c1".s}[0],[x12],#4; st1 {v"#c1".s}[1],[x13],#4\n\t"\ + "str s"#c3",[x14],#4\n\t" + +#define FUNC_M3(ndim) \ +static inline void sgemm_skinny1_a7x_m3n##ndim(\ + const float * __restrict__ a_ptr, const float * __restrict__ b_scr,\ + float * __restrict__ c_ptr, uint32_t K, uint32_t LDA, uint32_t LDC,\ + uint8_t c_rowmajor, const float * __restrict__ beta_addr) {\ + __asm__ __volatile__ (\ + "mov x0,%[a_ptr]; add x1,%[a_ptr],%w[LDA],UXTW #2\n\t"\ + "add x2,%[a_ptr],%w[LDA],UXTW #3\n\t"\ + "add x6,x1,%w[LDA],UXTW #3; add x7,x0,%w[LDA],UXTW #4\n\t"\ + "add x8,x1,%w[LDA],UXTW #4\n\t"\ + "mov x4,%[b_scr]; mov w5,%w[K]\n\t"\ + INIT_M3N##ndim\ + "cmp w5,#4; b.lt 4f\n\t"\ + KERNEL_M3N##ndim##_PRELOAD4\ + "cmp w5,#20; b.lt 1f\n\t"\ + ".balign 16; 9:\n\t"\ + KERNEL_M3N##ndim##_MAIN4(0, 1, 2, 3, 4, 5)\ + "prfm pldl2keep,[x6]; add x6,x6,#64\n\t"\ + KERNEL_M3N##ndim##_MAIN4(3, 4, 5, 0, 1, 2)\ + "prfm pldl2keep,[x7]; add x7,x7,#64\n\t"\ + KERNEL_M3N##ndim##_MAIN4(0, 1, 2, 3, 4, 5)\ + "prfm pldl2keep,[x8]; add x8,x8,#64\n\t"\ + KERNEL_M3N##ndim##_MAIN4(3, 4, 5, 0, 1, 2)\ + "cmp w5,#20; b.ge 9b; 1:\n\t"\ + "cmp w5,#12; b.lt 2f\n\t"\ + KERNEL_M3N##ndim##_MAIN4(0, 1, 2, 3, 4, 5)\ + KERNEL_M3N##ndim##_MAIN4(3, 4, 5, 0, 1, 2)\ + "2:\n\t"\ + "cmp w5,#8; b.lt 3f\n\t"\ + KERNEL_M3N##ndim##_MAIN4(0, 1, 2, 3, 4, 5)\ + KERNEL_M3N##ndim##_TAIL4(3, 4, 5)\ + "b 4f; 3:\n\t"\ + KERNEL_M3N##ndim##_TAIL4(0, 1, 2)\ + "4:\n\t"\ + "cmp w5,#1; b.lt 6f\n\t"\ + "5:\n\t"\ + KERNEL_M3N##ndim##_TL1 "b.gt 5b\n\t"\ + "6:\n\t"\ + INIT_SAVE\ + "cmp %w[c_rowmajor],#0; b.eq 7f\n\t"\ + SAVE_M3N##ndim(CR) "b 8f\n\t"\ + "7:\n\t"\ + SAVE_M3N##ndim(CC)\ + "8:\n\t"\ + ::[a_ptr]"r"(a_ptr), [b_scr]"r"(b_scr), [c_ptr]"r"(c_ptr),\ + [K]"r"(K), [LDA]"r"(LDA), [LDC]"r"(LDC),\ + [beta_addr]"r"(beta_addr), [c_rowmajor]"r"(c_rowmajor)\ + :"cc","memory","x0","x1","x2","x4","x5","x6","x7","x8",\ + "x12","x13","x14","x15",\ + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13",\ + "v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25",\ + "v26","v27","v28","v29","v30","v31");\ +} + + +#define INIT_M4N4 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19) + +#define SAVE_M4N4(mode) UNIT_SAVE_M4N4_##mode(12, 13, 14, 15) + +#define KERNEL_M4N4_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; 
ldr q2,[x2],#16; ldr q3,[x3],#16\n\t"\ + "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]; ldr q11,[x4,#48]\n\t"\ + "add x4,x4,#64\n\t" + +#define KERNEL_M4N4_MAIN4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "ldr q"#an1",[x0],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "ldr q"#an2",[x1],#16\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\ + "ldr q9,[x4,#16]; sub w5,w5,#4\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; fmla v13.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "ldr q"#an3",[x2],#16\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[2]; fmla v15.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "ldr q10,[x4,#32]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "ldr q"#an4",[x3],#16\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "ldr q11,[x4,#48]; add x4,x4,#64\n\t" + +#define KERNEL_M4N4_TAIL4(ac1, ac2, ac3, ac4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\ + "prfm pldl2keep,[x7]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; fmla v13.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[2]; fmla v15.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "sub w5,w5,#4; prfm pldl2keep,[x9]\n\t"\ + "fadd v12.4s,v12.4s,v16.4s; fadd v13.4s,v13.4s,v17.4s\n\t"\ + "fadd v14.4s,v14.4s,v18.4s; fadd v15.4s,v15.4s,v19.4s\n\t" + +#define KERNEL_M4N4_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "ldr q8,[x4],#16\n\t"\ + "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\ + "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v8.4s,v3.s[0]\n\t" + + +#define INIT_M4N5 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\ + INIT_4V(20, 21, 22, 23) + +#define SAVE_M4N5(mode) UNIT_SAVE_M4N4_##mode(12, 13, 14, 15)\ + EDGE_SAVE_M4N1_##mode(20, 21, 22, 23) + +#define KERNEL_M4N5_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr q3,[x3],#16\n\t"\ + "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]; ldr q11,[x4,#48]\n\t"\ + "ldr q24,[x4,#64]; add x4,x4,#80\n\t" + +#define KERNEL_M4N5_MAIN4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "ldr q"#an1",[x0],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "ldr q"#an2",[x1],#16\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\ + "ldr q9,[x4,#16]; sub w5,w5,#4\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; fmla v13.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "ldr q"#an3",[x2],#16\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[2]; fmla v15.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "ldr q10,[x4,#32]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "ldr q"#an4",[x3],#16\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "ldr q11,[x4,#48]\n\t"\ + "fmla v20.4s,v24.4s,v"#ac1".4s; fmla 
v21.4s,v24.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v24.4s,v"#ac3".4s; fmla v23.4s,v24.4s,v"#ac4".4s\n\t"\ + "ldr q24,[x4,#64]; add x4,x4,#80\n\t" + +#define KERNEL_M4N5_TAIL4(ac1, ac2, ac3, ac4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\ + "prfm pldl2keep,[x7]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; fmla v13.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[2]; fmla v15.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "prfm pldl2keep,[x9]\n\t"\ + "fmla v20.4s,v24.4s,v"#ac1".4s; fmla v21.4s,v24.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v24.4s,v"#ac3".4s; fmla v23.4s,v24.4s,v"#ac4".4s\n\t"\ + "sub w5,w5,#4\n\t"\ + "fadd v12.4s,v12.4s,v16.4s; fadd v13.4s,v13.4s,v17.4s\n\t"\ + "fadd v14.4s,v14.4s,v18.4s; fadd v15.4s,v15.4s,v19.4s\n\t" + +#define KERNEL_M4N5_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "ldr q8,[x4],#16; ldr s9,[x4],#4\n\t"\ + "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\ + "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v8.4s,v3.s[0]\n\t"\ + "fmla v20.4s,v9.4s,v0.4s; fmla v21.4s,v9.4s,v1.4s\n\t"\ + "fmla v22.4s,v9.4s,v2.4s; fmla v23.4s,v9.4s,v3.4s\n\t" + + +#define INIT_M4N6 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\ + INIT_4V(20, 21, 22, 23) INIT_4V(24, 25, 26, 27) + +#define SAVE_M4N6(mode) UNIT_SAVE_M4N4_##mode(12, 13, 14, 15)\ + EDGE_SAVE_M4N1_##mode(20, 21, 22, 23) EDGE_SAVE_M4N1_##mode(24, 25, 26, 27) + +#define KERNEL_M4N6_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr q3,[x3],#16\n\t"\ + "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]; ldr q11,[x4,#48]\n\t"\ + "ldr q28,[x4,#64]; ldr q29,[x4,#80]; add x4,x4,#96\n\t" + +#define KERNEL_M4N6_MAIN4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "ldr q"#an1",[x0],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "ldr q"#an2",[x1],#16\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\ + "ldr q9,[x4,#16]; sub w5,w5,#4\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; fmla v13.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "ldr q"#an3",[x2],#16\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[2]; fmla v15.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "ldr q10,[x4,#32]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "ldr q"#an4",[x3],#16\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "ldr q11,[x4,#48]\n\t"\ + "fmla v20.4s,v28.4s,v"#ac1".4s; fmla v21.4s,v28.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v28.4s,v"#ac3".4s; fmla v23.4s,v28.4s,v"#ac4".4s\n\t"\ + "ldr q28,[x4,#64]\n\t"\ + "fmla v24.4s,v29.4s,v"#ac1".4s; fmla v25.4s,v29.4s,v"#ac2".4s\n\t"\ + "fmla v26.4s,v29.4s,v"#ac3".4s; fmla v27.4s,v29.4s,v"#ac4".4s\n\t"\ + "ldr q29,[x4,#80]; add x4,x4,#96\n\t" + +#define KERNEL_M4N6_TAIL4(ac1, ac2, ac3, ac4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + "fmla 
v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\ + "prfm pldl2keep,[x7]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; fmla v13.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[2]; fmla v15.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "prfm pldl2keep,[x9]\n\t"\ + "fmla v20.4s,v28.4s,v"#ac1".4s; fmla v21.4s,v28.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v28.4s,v"#ac3".4s; fmla v23.4s,v28.4s,v"#ac4".4s\n\t"\ + "fmla v24.4s,v29.4s,v"#ac1".4s; fmla v25.4s,v29.4s,v"#ac2".4s\n\t"\ + "fmla v26.4s,v29.4s,v"#ac3".4s; fmla v27.4s,v29.4s,v"#ac4".4s\n\t"\ + "sub w5,w5,#4\n\t"\ + "fadd v12.4s,v12.4s,v16.4s; fadd v13.4s,v13.4s,v17.4s\n\t"\ + "fadd v14.4s,v14.4s,v18.4s; fadd v15.4s,v15.4s,v19.4s\n\t" + +#define KERNEL_M4N6_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "ldr q8,[x4],#16; ldr s9,[x4],#4; ldr s10,[x4],#4\n\t"\ + "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\ + "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v8.4s,v3.s[0]\n\t"\ + "fmla v20.4s,v9.4s,v0.4s; fmla v21.4s,v9.4s,v1.4s\n\t"\ + "fmla v22.4s,v9.4s,v2.4s; fmla v23.4s,v9.4s,v3.4s\n\t"\ + "fmla v24.4s,v10.4s,v0.4s; fmla v25.4s,v10.4s,v1.4s\n\t"\ + "fmla v26.4s,v10.4s,v2.4s; fmla v27.4s,v10.4s,v3.4s\n\t" + + +#define INIT_M4N7 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\ + INIT_4V(20, 21, 22, 23) INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M4N7(mode) UNIT_SAVE_M4N4_##mode(12, 13, 14, 15)\ + EDGE_SAVE_M4N1_##mode(20, 21, 22, 23) EDGE_SAVE_M4N1_##mode(24, 25, 26, 27)\ + EDGE_SAVE_M4N1_##mode(28, 29, 30, 31) + +#define KERNEL_M4N7_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr q3,[x3],#16\n\t"\ + "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]; ldr q11,[x4,#48]\n\t"\ + "add x4,x4,#112\n\t" + +#define KERNEL_M4N7_MAIN4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "ldr q"#an1",[x0],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4,#-48]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "ldr q"#an2",[x1],#16\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\ + "ldr q9,[x4,#-32]; sub w5,w5,#4\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; fmla v13.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "ldr q"#an3",[x2],#16\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[2]; fmla v15.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "ldr q10,[x4,#-16]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "ldr q"#an4",[x3],#16\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "fmla v20.4s,v8.4s,v"#ac1".4s; fmla v21.4s,v8.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v8.4s,v"#ac3".4s; fmla v23.4s,v8.4s,v"#ac4".4s\n\t"\ + "ldr q8,[x4],#112\n\t"\ + "fmla v24.4s,v9.4s,v"#ac1".4s; fmla v25.4s,v9.4s,v"#ac2".4s\n\t"\ + "fmla v26.4s,v9.4s,v"#ac3".4s; fmla v27.4s,v9.4s,v"#ac4".4s\n\t"\ + "ldr q9,[x4,#-96]\n\t"\ + "fmla v28.4s,v10.4s,v"#ac1".4s; fmla v29.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmla v30.4s,v10.4s,v"#ac3".4s; fmla v31.4s,v10.4s,v"#ac4".4s\n\t"\ + "ldr q10,[x4,#-80]; ldr q11,[x4,#-64]\n\t" + +#define KERNEL_M4N7_TAIL4(ac1, ac2, ac3, ac4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + "fmla 
v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4,#-48]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\ + "prfm pldl2keep,[x7]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\ + "ldr q9,[x4,#-32]; sub w5,w5,#4\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[2]; fmla v13.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[2]; fmla v15.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "ldr q10,[x4,#-16]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "prfm pldl2keep,[x9]\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "fmla v20.4s,v8.4s,v"#ac1".4s; fmla v21.4s,v8.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v8.4s,v"#ac3".4s; fmla v23.4s,v8.4s,v"#ac4".4s\n\t"\ + "fmla v24.4s,v9.4s,v"#ac1".4s; fmla v25.4s,v9.4s,v"#ac2".4s\n\t"\ + "fmla v26.4s,v9.4s,v"#ac3".4s; fmla v27.4s,v9.4s,v"#ac4".4s\n\t"\ + "fmla v28.4s,v10.4s,v"#ac1".4s; fmla v29.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmla v30.4s,v10.4s,v"#ac3".4s; fmla v31.4s,v10.4s,v"#ac4".4s\n\t"\ + "fadd v12.4s,v12.4s,v16.4s; fadd v13.4s,v13.4s,v17.4s\n\t"\ + "fadd v14.4s,v14.4s,v18.4s; fadd v15.4s,v15.4s,v19.4s\n\t" + +#define KERNEL_M4N7_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "ldr q8,[x4],#16; ldr s9,[x4],#4; ldr s10,[x4],#4; ldr s11,[x4],#4\n\t"\ + "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\ + "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v8.4s,v3.s[0]\n\t"\ + "fmla v20.4s,v9.4s,v0.4s; fmla v21.4s,v9.4s,v1.4s\n\t"\ + "fmla v22.4s,v9.4s,v2.4s; fmla v23.4s,v9.4s,v3.4s\n\t"\ + "fmla v24.4s,v10.4s,v0.4s; fmla v25.4s,v10.4s,v1.4s\n\t"\ + "fmla v26.4s,v10.4s,v2.4s; fmla v27.4s,v10.4s,v3.4s\n\t"\ + "fmla v28.4s,v11.4s,v0.4s; fmla v29.4s,v11.4s,v1.4s\n\t"\ + "fmla v30.4s,v11.4s,v2.4s; fmla v31.4s,v11.4s,v3.4s\n\t" + + +#define INIT_M4N8 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19) + +#define SAVE_M4N8(mode) UNIT_SAVE_M4N4_##mode(12, 13, 14, 15)\ + UNIT_SAVE_M4N4_##mode(16, 17, 18, 19) + +#define KERNEL_M4N8_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr q3,[x3],#16\n\t"\ + "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]; ldr q11,[x4,#48]\n\t"\ + "add x4,x4,#128\n\t" + +#define KERNEL_M4N8_MAIN4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "ldr q"#an1",[x0],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4,#-64]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\ + "ldr q9,[x4,#-48]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[1]; fmla v13.4s,v10.4s,v"#ac2".s[1]\n\t"\ + "ldr q"#an2",[x1],#16\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[1]; fmla v15.4s,v10.4s,v"#ac4".s[1]\n\t"\ + "ldr q10,[x4,#-32]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[1]; fmla v17.4s,v11.4s,v"#ac2".s[1]\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[1]; fmla v19.4s,v11.4s,v"#ac4".s[1]\n\t"\ + "ldr q11,[x4,#-16]\n\t"\ + "fmla v12.4s,v8.4s,v"#ac1".s[2]; fmla v13.4s,v8.4s,v"#ac2".s[2]\n\t"\ + "ldr q"#an3",[x2],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[2]; fmla v15.4s,v8.4s,v"#ac4".s[2]\n\t"\ + "ldr q8,[x4],#128\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[2]; fmla v17.4s,v9.4s,v"#ac2".s[2]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[2]; fmla v19.4s,v9.4s,v"#ac4".s[2]\n\t"\ + "ldr q9,[x4,#-112]; sub w5,w5,#4\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[3]; fmla v13.4s,v10.4s,v"#ac2".s[3]\n\t"\ + 
"ldr q"#an4",[x3],#16\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[3]; fmla v15.4s,v10.4s,v"#ac4".s[3]\n\t"\ + "ldr q10,[x4,#-96]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t"\ + "ldr q11,[x4,#-80]\n\t" + +#define KERNEL_M4N8_TAIL4(ac1, ac2, ac3, ac4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4,#-64]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\ + "ldr q9,[x4,#-48]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[1]; fmla v13.4s,v10.4s,v"#ac2".s[1]\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[1]; fmla v15.4s,v10.4s,v"#ac4".s[1]\n\t"\ + "ldr q10,[x4,#-32]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[1]; fmla v17.4s,v11.4s,v"#ac2".s[1]\n\t"\ + "prfm pldl2keep,[x7]\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[1]; fmla v19.4s,v11.4s,v"#ac4".s[1]\n\t"\ + "ldr q11,[x4,#-16]\n\t"\ + "fmla v12.4s,v8.4s,v"#ac1".s[2]; fmla v13.4s,v8.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[2]; fmla v15.4s,v8.4s,v"#ac4".s[2]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[2]; fmla v17.4s,v9.4s,v"#ac2".s[2]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[2]; fmla v19.4s,v9.4s,v"#ac4".s[2]\n\t"\ + "sub w5,w5,#4\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[3]; fmla v13.4s,v10.4s,v"#ac2".s[3]\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[3]; fmla v15.4s,v10.4s,v"#ac4".s[3]\n\t"\ + "fmla v16.4s,v11.4s,v"#ac1".s[3]; fmla v17.4s,v11.4s,v"#ac2".s[3]\n\t"\ + "prfm pldl2keep,[x9]\n\t"\ + "fmla v18.4s,v11.4s,v"#ac3".s[3]; fmla v19.4s,v11.4s,v"#ac4".s[3]\n\t" + +#define KERNEL_M4N8_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "ldr q8,[x4],#16; ldr q9,[x4],#16\n\t"\ + "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\ + "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v8.4s,v3.s[0]\n\t"\ + "fmla v16.4s,v9.4s,v0.s[0]; fmla v17.4s,v9.4s,v1.s[0]\n\t"\ + "fmla v18.4s,v9.4s,v2.s[0]; fmla v19.4s,v9.4s,v3.s[0]\n\t" + + +#define INIT_M4N9 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\ + INIT_4V(20, 21, 22, 23) + +#define SAVE_M4N9(mode) UNIT_SAVE_M4N4_##mode(12, 13, 14, 15)\ + UNIT_SAVE_M4N4_##mode(16, 17, 18, 19) EDGE_SAVE_M4N1_##mode(20, 21, 22, 23) + +#define KERNEL_M4N9_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr q3,[x3],#16\n\t"\ + "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]\n\t"\ + "add x4,x4,#144\n\t" + +#define KERNEL_M4N9_MAIN4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "ldr q"#an1",[x0],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4,#-96]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\ + "ldr q9,[x4,#-80]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[1]; fmla v13.4s,v10.4s,v"#ac2".s[1]\n\t"\ + "ldr q"#an2",[x1],#16\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[1]; fmla v15.4s,v10.4s,v"#ac4".s[1]\n\t"\ + "ldr q10,[x4,#-64]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[1]; fmla v17.4s,v8.4s,v"#ac2".s[1]\n\t"\ + "fmla v18.4s,v8.4s,v"#ac3".s[1]; fmla v19.4s,v8.4s,v"#ac4".s[1]\n\t"\ + "ldr q8,[x4,#-48]\n\t"\ + "fmla v12.4s,v9.4s,v"#ac1".s[2]; fmla v13.4s,v9.4s,v"#ac2".s[2]\n\t"\ + "ldr q"#an3",[x2],#16\n\t"\ + "fmla v14.4s,v9.4s,v"#ac3".s[2]; 
fmla v15.4s,v9.4s,v"#ac4".s[2]\n\t"\ + "ldr q9,[x4,#-32]\n\t"\ + "fmla v16.4s,v10.4s,v"#ac1".s[2]; fmla v17.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "fmla v18.4s,v10.4s,v"#ac3".s[2]; fmla v19.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "ldr q10,[x4,#-16]\n\t"\ + "fmla v12.4s,v8.4s,v"#ac1".s[3]; fmla v13.4s,v8.4s,v"#ac2".s[3]\n\t"\ + "ldr q"#an4",[x3],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[3]; fmla v15.4s,v8.4s,v"#ac4".s[3]\n\t"\ + "ldr q8,[x4],#144\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[3]; fmla v17.4s,v9.4s,v"#ac2".s[3]\n\t"\ + "sub w5,w5,#4\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[3]; fmla v19.4s,v9.4s,v"#ac4".s[3]\n\t"\ + "ldr q9,[x4,#-128]\n\t"\ + "fmla v20.4s,v10.4s,v"#ac1".4s; fmla v21.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v10.4s,v"#ac3".4s; fmla v23.4s,v10.4s,v"#ac4".4s\n\t"\ + "ldr q10,[x4,#-112]\n\t" + +#define KERNEL_M4N9_TAIL4(ac1, ac2, ac3, ac4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4,#-96]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\ + "ldr q9,[x4,#-80]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[1]; fmla v13.4s,v10.4s,v"#ac2".s[1]\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[1]; fmla v15.4s,v10.4s,v"#ac4".s[1]\n\t"\ + "ldr q10,[x4,#-64]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[1]; fmla v17.4s,v8.4s,v"#ac2".s[1]\n\t"\ + "prfm pldl2keep,[x7]\n\t"\ + "fmla v18.4s,v8.4s,v"#ac3".s[1]; fmla v19.4s,v8.4s,v"#ac4".s[1]\n\t"\ + "ldr q8,[x4,#-48]\n\t"\ + "fmla v12.4s,v9.4s,v"#ac1".s[2]; fmla v13.4s,v9.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v9.4s,v"#ac3".s[2]; fmla v15.4s,v9.4s,v"#ac4".s[2]\n\t"\ + "ldr q9,[x4,#-32]\n\t"\ + "fmla v16.4s,v10.4s,v"#ac1".s[2]; fmla v17.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + "fmla v18.4s,v10.4s,v"#ac3".s[2]; fmla v19.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "ldr q10,[x4,#-16]\n\t"\ + "fmla v12.4s,v8.4s,v"#ac1".s[3]; fmla v13.4s,v8.4s,v"#ac2".s[3]\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[3]; fmla v15.4s,v8.4s,v"#ac4".s[3]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[3]; fmla v17.4s,v9.4s,v"#ac2".s[3]\n\t"\ + "prfm pldl2keep,[x9]; sub w5,w5,#4\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[3]; fmla v19.4s,v9.4s,v"#ac4".s[3]\n\t"\ + "fmla v20.4s,v10.4s,v"#ac1".4s; fmla v21.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v10.4s,v"#ac3".4s; fmla v23.4s,v10.4s,v"#ac4".4s\n\t" + +#define KERNEL_M4N9_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\ + "ldr q8,[x4],#16; ldr q9,[x4],#16; ldr s10,[x4],#4\n\t"\ + "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\ + "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v8.4s,v3.s[0]\n\t"\ + "fmla v16.4s,v9.4s,v0.s[0]; fmla v17.4s,v9.4s,v1.s[0]\n\t"\ + "fmla v18.4s,v9.4s,v2.s[0]; fmla v19.4s,v9.4s,v3.s[0]\n\t"\ + "fmla v20.4s,v10.4s,v0.4s; fmla v21.4s,v10.4s,v1.4s\n\t"\ + "fmla v22.4s,v10.4s,v2.4s; fmla v23.4s,v10.4s,v3.4s\n\t" + + +#define INIT_M4N10 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\ + INIT_4V(20, 21, 22, 23) INIT_4V(24, 25, 26, 27) + +#define SAVE_M4N10(mode) UNIT_SAVE_M4N4_##mode(12, 13, 14, 15)\ + UNIT_SAVE_M4N4_##mode(16, 17, 18, 19)\ + EDGE_SAVE_M4N1_##mode(20, 21, 22, 23) EDGE_SAVE_M4N1_##mode(24, 25, 26, 27) + +#define KERNEL_M4N10_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr q3,[x3],#16\n\t"\ + "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]\n\t"\ + "add x4,x4,#160\n\t" + +#define KERNEL_M4N10_MAIN4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \ + 
"fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "ldr q"#an1",[x0],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4,#-112]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\ + "ldr q9,[x4,#-96]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[1]; fmla v13.4s,v10.4s,v"#ac2".s[1]\n\t"\ + "ldr q"#an2",[x1],#16\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[1]; fmla v15.4s,v10.4s,v"#ac4".s[1]\n\t"\ + "ldr q10,[x4,#-80]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[1]; fmla v17.4s,v8.4s,v"#ac2".s[1]\n\t"\ + "fmla v18.4s,v8.4s,v"#ac3".s[1]; fmla v19.4s,v8.4s,v"#ac4".s[1]\n\t"\ + "ldr q8,[x4,#-64]\n\t"\ + "fmla v12.4s,v9.4s,v"#ac1".s[2]; fmla v13.4s,v9.4s,v"#ac2".s[2]\n\t"\ + "ldr q"#an3",[x2],#16\n\t"\ + "fmla v14.4s,v9.4s,v"#ac3".s[2]; fmla v15.4s,v9.4s,v"#ac4".s[2]\n\t"\ + "ldr q9,[x4,#-48]\n\t"\ + "fmla v16.4s,v10.4s,v"#ac1".s[2]; fmla v17.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "fmla v18.4s,v10.4s,v"#ac3".s[2]; fmla v19.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "ldr q10,[x4,#-32]\n\t"\ + "fmla v12.4s,v8.4s,v"#ac1".s[3]; fmla v13.4s,v8.4s,v"#ac2".s[3]\n\t"\ + "ldr q"#an4",[x3],#16\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[3]; fmla v15.4s,v8.4s,v"#ac4".s[3]\n\t"\ + "ldr q11,[x4,#-16]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[3]; fmla v17.4s,v9.4s,v"#ac2".s[3]\n\t"\ + "sub w5,w5,#4\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[3]; fmla v19.4s,v9.4s,v"#ac4".s[3]\n\t"\ + "ldr q8,[x4],#160\n\t"\ + "fmla v20.4s,v10.4s,v"#ac1".4s; fmla v21.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v10.4s,v"#ac3".4s; fmla v23.4s,v10.4s,v"#ac4".4s\n\t"\ + "ldr q9,[x4,#-144]\n\t"\ + "fmla v24.4s,v11.4s,v"#ac1".4s; fmla v25.4s,v11.4s,v"#ac2".4s\n\t"\ + "fmla v26.4s,v11.4s,v"#ac3".4s; fmla v27.4s,v11.4s,v"#ac4".4s\n\t"\ + "ldr q10,[x4,#-128]\n\t" + +#define KERNEL_M4N10_TAIL4(ac1, ac2, ac3, ac4) \ + "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\ + "ldr q8,[x4,#-112]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\ + "ldr q9,[x4,#-96]\n\t"\ + "fmla v12.4s,v10.4s,v"#ac1".s[1]; fmla v13.4s,v10.4s,v"#ac2".s[1]\n\t"\ + "fmla v14.4s,v10.4s,v"#ac3".s[1]; fmla v15.4s,v10.4s,v"#ac4".s[1]\n\t"\ + "ldr q10,[x4,#-80]\n\t"\ + "fmla v16.4s,v8.4s,v"#ac1".s[1]; fmla v17.4s,v8.4s,v"#ac2".s[1]\n\t"\ + "prfm pldl2keep,[x7]\n\t"\ + "fmla v18.4s,v8.4s,v"#ac3".s[1]; fmla v19.4s,v8.4s,v"#ac4".s[1]\n\t"\ + "ldr q8,[x4,#-64]\n\t"\ + "fmla v12.4s,v9.4s,v"#ac1".s[2]; fmla v13.4s,v9.4s,v"#ac2".s[2]\n\t"\ + "fmla v14.4s,v9.4s,v"#ac3".s[2]; fmla v15.4s,v9.4s,v"#ac4".s[2]\n\t"\ + "ldr q9,[x4,#-48]\n\t"\ + "fmla v16.4s,v10.4s,v"#ac1".s[2]; fmla v17.4s,v10.4s,v"#ac2".s[2]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + "fmla v18.4s,v10.4s,v"#ac3".s[2]; fmla v19.4s,v10.4s,v"#ac4".s[2]\n\t"\ + "ldr q10,[x4,#-32]\n\t"\ + "fmla v12.4s,v8.4s,v"#ac1".s[3]; fmla v13.4s,v8.4s,v"#ac2".s[3]\n\t"\ + "fmla v14.4s,v8.4s,v"#ac3".s[3]; fmla v15.4s,v8.4s,v"#ac4".s[3]\n\t"\ + "ldr q11,[x4,#-16]\n\t"\ + "fmla v16.4s,v9.4s,v"#ac1".s[3]; fmla v17.4s,v9.4s,v"#ac2".s[3]\n\t"\ + "prfm pldl2keep,[x9]; sub w5,w5,#4\n\t"\ + "fmla v18.4s,v9.4s,v"#ac3".s[3]; fmla v19.4s,v9.4s,v"#ac4".s[3]\n\t"\ + "fmla v20.4s,v10.4s,v"#ac1".4s; fmla v21.4s,v10.4s,v"#ac2".4s\n\t"\ + "fmla v22.4s,v10.4s,v"#ac3".4s; fmla v23.4s,v10.4s,v"#ac4".4s\n\t"\ + "fmla v24.4s,v11.4s,v"#ac1".4s; fmla 
+#define INIT_M4N11 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\
+ INIT_4V(20, 21, 22, 23) INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M4N11(mode) UNIT_SAVE_M4N4_##mode(12, 13, 14, 15)\
+ UNIT_SAVE_M4N4_##mode(16, 17, 18, 19)\
+ EDGE_SAVE_M4N1_##mode(20, 21, 22, 23) EDGE_SAVE_M4N1_##mode(24, 25, 26, 27)\
+ EDGE_SAVE_M4N1_##mode(28, 29, 30, 31)
+
+#define KERNEL_M4N11_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr q3,[x3],#16\n\t"\
+ "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]\n\t"\
+ "add x4,x4,#176\n\t"
+
+#define KERNEL_M4N11_MAIN4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \
+ "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "ldr q8,[x4,#-128]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\
+ "ldr q9,[x4,#-112]\n\t"\
+ "fmla v12.4s,v10.4s,v"#ac1".s[1]; fmla v13.4s,v10.4s,v"#ac2".s[1]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ "fmla v14.4s,v10.4s,v"#ac3".s[1]; fmla v15.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "ldr q10,[x4,#-96]\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[1]; fmla v17.4s,v8.4s,v"#ac2".s[1]\n\t"\
+ "fmla v18.4s,v8.4s,v"#ac3".s[1]; fmla v19.4s,v8.4s,v"#ac4".s[1]\n\t"\
+ "ldr q11,[x4,#-80]\n\t"\
+ "fmla v12.4s,v9.4s,v"#ac1".s[2]; fmla v13.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ "fmla v14.4s,v9.4s,v"#ac3".s[2]; fmla v15.4s,v9.4s,v"#ac4".s[2]\n\t"\
+ "ldr q8,[x4,#-64]\n\t"\
+ "fmla v16.4s,v10.4s,v"#ac1".s[2]; fmla v17.4s,v10.4s,v"#ac2".s[2]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac3".s[2]; fmla v19.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "ldr q9,[x4,#-48]\n\t"\
+ "fmla v12.4s,v11.4s,v"#ac1".s[3]; fmla v13.4s,v11.4s,v"#ac2".s[3]\n\t"\
+ "ldr q"#an4",[x3],#16\n\t"\
+ "fmla v14.4s,v11.4s,v"#ac3".s[3]; fmla v15.4s,v11.4s,v"#ac4".s[3]\n\t"\
+ "ldr q10,[x4,#-32]\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[3]; fmla v17.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ "fmla v18.4s,v8.4s,v"#ac3".s[3]; fmla v19.4s,v8.4s,v"#ac4".s[3]\n\t"\
+ "ldr q11,[x4,#-16]\n\t"\
+ "fmla v20.4s,v9.4s,v"#ac1".4s; fmla v21.4s,v9.4s,v"#ac2".4s\n\t"\
+ "fmla v22.4s,v9.4s,v"#ac3".4s; fmla v23.4s,v9.4s,v"#ac4".4s\n\t"\
+ "ldr q8,[x4],#176\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac1".4s; fmla v25.4s,v10.4s,v"#ac2".4s\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".4s; fmla v27.4s,v10.4s,v"#ac4".4s\n\t"\
+ "ldr q9,[x4,#-160]\n\t"\
+ "fmla v28.4s,v11.4s,v"#ac1".4s; fmla v29.4s,v11.4s,v"#ac2".4s\n\t"\
+ "fmla v30.4s,v11.4s,v"#ac3".4s; fmla v31.4s,v11.4s,v"#ac4".4s\n\t"\
+ "ldr q10,[x4,#-144]\n\t"
+
+#define KERNEL_M4N11_TAIL4(ac1, ac2, ac3, ac4) \
+ "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "ldr q8,[x4,#-128]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\
+ "ldr q9,[x4,#-112]\n\t"\
+ "fmla v12.4s,v10.4s,v"#ac1".s[1]; fmla v13.4s,v10.4s,v"#ac2".s[1]\n\t"\
+ "fmla v14.4s,v10.4s,v"#ac3".s[1]; fmla v15.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "ldr q10,[x4,#-96]\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[1]; fmla v17.4s,v8.4s,v"#ac2".s[1]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ "fmla v18.4s,v8.4s,v"#ac3".s[1]; fmla v19.4s,v8.4s,v"#ac4".s[1]\n\t"\
+ "ldr q11,[x4,#-80]\n\t"\
+ "fmla v12.4s,v9.4s,v"#ac1".s[2]; fmla v13.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmla v14.4s,v9.4s,v"#ac3".s[2]; fmla v15.4s,v9.4s,v"#ac4".s[2]\n\t"\
+ "ldr q8,[x4,#-64]\n\t"\
+ "fmla v16.4s,v10.4s,v"#ac1".s[2]; fmla v17.4s,v10.4s,v"#ac2".s[2]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ "fmla v18.4s,v10.4s,v"#ac3".s[2]; fmla v19.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "ldr q9,[x4,#-48]\n\t"\
+ "fmla v12.4s,v11.4s,v"#ac1".s[3]; fmla v13.4s,v11.4s,v"#ac2".s[3]\n\t"\
+ "fmla v14.4s,v11.4s,v"#ac3".s[3]; fmla v15.4s,v11.4s,v"#ac4".s[3]\n\t"\
+ "ldr q10,[x4,#-32]\n\t"\
+ "fmla v16.4s,v8.4s,v"#ac1".s[3]; fmla v17.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "prfm pldl2keep,[x9]; sub w5,w5,#4\n\t"\
+ "fmla v18.4s,v8.4s,v"#ac3".s[3]; fmla v19.4s,v8.4s,v"#ac4".s[3]\n\t"\
+ "ldr q11,[x4,#-16]\n\t"\
+ "fmla v20.4s,v9.4s,v"#ac1".4s; fmla v21.4s,v9.4s,v"#ac2".4s\n\t"\
+ "fmla v22.4s,v9.4s,v"#ac3".4s; fmla v23.4s,v9.4s,v"#ac4".4s\n\t"\
+ "fmla v24.4s,v10.4s,v"#ac1".4s; fmla v25.4s,v10.4s,v"#ac2".4s\n\t"\
+ "fmla v26.4s,v10.4s,v"#ac3".4s; fmla v27.4s,v10.4s,v"#ac4".4s\n\t"\
+ "fmla v28.4s,v11.4s,v"#ac1".4s; fmla v29.4s,v11.4s,v"#ac2".4s\n\t"\
+ "fmla v30.4s,v11.4s,v"#ac3".4s; fmla v31.4s,v11.4s,v"#ac4".4s\n\t"
+
+#define KERNEL_M4N11_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\
+ "ldr q8,[x4],#16; ldr q9,[x4],#16; ldr d10,[x4],#8; ldr s11,[x4],#4\n\t"\
+ "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v8.4s,v3.s[0]\n\t"\
+ "fmla v16.4s,v9.4s,v0.s[0]; fmla v17.4s,v9.4s,v1.s[0]\n\t"\
+ "fmla v18.4s,v9.4s,v2.s[0]; fmla v19.4s,v9.4s,v3.s[0]\n\t"\
+ "fmla v20.4s,v0.4s,v10.s[0]; fmla v21.4s,v1.4s,v10.s[0]\n\t"\
+ "fmla v22.4s,v2.4s,v10.s[0]; fmla v23.4s,v3.4s,v10.s[0]\n\t"\
+ "fmla v24.4s,v0.4s,v10.s[1]; fmla v25.4s,v1.4s,v10.s[1]\n\t"\
+ "fmla v26.4s,v2.4s,v10.s[1]; fmla v27.4s,v3.4s,v10.s[1]\n\t"\
+ "fmla v28.4s,v0.4s,v11.s[0]; fmla v29.4s,v1.4s,v11.s[0]\n\t"\
+ "fmla v30.4s,v2.4s,v11.s[0]; fmla v31.4s,v3.4s,v11.s[0]\n\t"
+
+
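+/* m4n12 kernel: 12 full columns in v12-v23, the widest m=4 tile, with no
+ * edge column. */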
+#define INIT_M4N12 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\
+ INIT_4V(20, 21, 22, 23)
+
+#define SAVE_M4N12(mode) UNIT_SAVE_M4N4_##mode(12, 13, 14, 15)\
+ UNIT_SAVE_M4N4_##mode(16, 17, 18, 19) UNIT_SAVE_M4N4_##mode(20, 21, 22, 23)
+
+#define KERNEL_M4N12_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16; ldr q3,[x3],#16\n\t"\
+ "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]\n\t"\
+ "add x4,x4,#192\n\t"
+
+#define KERNEL_M4N12_MAIN4(ac1, ac2, ac3, ac4, an1, an2, an3, an4) \
+ "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "ldr q8,[x4,#-144]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\
+ "ldr q9,[x4,#-128]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac1".s[0]; fmla v21.4s,v10.4s,v"#ac2".s[0]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[0]; fmla v23.4s,v10.4s,v"#ac4".s[0]\n\t"\
+ "ldr q10,[x4,#-112]\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[1]; fmla v13.4s,v8.4s,v"#ac2".s[1]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[1]; fmla v15.4s,v8.4s,v"#ac4".s[1]\n\t"\
+ "ldr q8,[x4,#-96]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\
+ "ldr q9,[x4,#-80]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac1".s[1]; fmla v21.4s,v10.4s,v"#ac2".s[1]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[1]; fmla v23.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "ldr q10,[x4,#-64]\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[2]; fmla v13.4s,v8.4s,v"#ac2".s[2]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[2]; fmla v15.4s,v8.4s,v"#ac4".s[2]\n\t"\
+ "ldr q8,[x4,#-48]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[2]; fmla v17.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[2]; fmla v19.4s,v9.4s,v"#ac4".s[2]\n\t"\
+ "ldr q9,[x4,#-32]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac1".s[2]; fmla v21.4s,v10.4s,v"#ac2".s[2]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[2]; fmla v23.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "ldr q10,[x4,#-16]\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[3]; fmla v13.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "ldr q"#an4",[x3],#16\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[3]; fmla v15.4s,v8.4s,v"#ac4".s[3]\n\t"\
+ "ldr q8,[x4],#192\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[3]; fmla v17.4s,v9.4s,v"#ac2".s[3]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[3]; fmla v19.4s,v9.4s,v"#ac4".s[3]\n\t"\
+ "ldr q9,[x4,#-176]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac1".s[3]; fmla v21.4s,v10.4s,v"#ac2".s[3]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[3]; fmla v23.4s,v10.4s,v"#ac4".s[3]\n\t"\
+ "ldr q10,[x4,#-160]\n\t"
+
+#define KERNEL_M4N12_TAIL4(ac1, ac2, ac3, ac4) \
+ "fmla v12.4s,v8.4s,v"#ac1".s[0]; fmla v13.4s,v8.4s,v"#ac2".s[0]\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[0]; fmla v15.4s,v8.4s,v"#ac4".s[0]\n\t"\
+ "ldr q8,[x4,#-144]; prfm pldl2keep,[x6]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[0]; fmla v17.4s,v9.4s,v"#ac2".s[0]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[0]; fmla v19.4s,v9.4s,v"#ac4".s[0]\n\t"\
+ "ldr q9,[x4,#-128]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac1".s[0]; fmla v21.4s,v10.4s,v"#ac2".s[0]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[0]; fmla v23.4s,v10.4s,v"#ac4".s[0]\n\t"\
+ "ldr q10,[x4,#-112]\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[1]; fmla v13.4s,v8.4s,v"#ac2".s[1]\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[1]; fmla v15.4s,v8.4s,v"#ac4".s[1]\n\t"\
+ "ldr q8,[x4,#-96]; prfm pldl2keep,[x7]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[1]; fmla v17.4s,v9.4s,v"#ac2".s[1]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[1]; fmla v19.4s,v9.4s,v"#ac4".s[1]\n\t"\
+ "ldr q9,[x4,#-80]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac1".s[1]; fmla v21.4s,v10.4s,v"#ac2".s[1]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[1]; fmla v23.4s,v10.4s,v"#ac4".s[1]\n\t"\
+ "ldr q10,[x4,#-64]\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[2]; fmla v13.4s,v8.4s,v"#ac2".s[2]\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[2]; fmla v15.4s,v8.4s,v"#ac4".s[2]\n\t"\
+ "ldr q8,[x4,#-48]; prfm pldl2keep,[x8]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[2]; fmla v17.4s,v9.4s,v"#ac2".s[2]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[2]; fmla v19.4s,v9.4s,v"#ac4".s[2]\n\t"\
+ "ldr q9,[x4,#-32]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac1".s[2]; fmla v21.4s,v10.4s,v"#ac2".s[2]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[2]; fmla v23.4s,v10.4s,v"#ac4".s[2]\n\t"\
+ "ldr q10,[x4,#-16]\n\t"\
+ "fmla v12.4s,v8.4s,v"#ac1".s[3]; fmla v13.4s,v8.4s,v"#ac2".s[3]\n\t"\
+ "fmla v14.4s,v8.4s,v"#ac3".s[3]; fmla v15.4s,v8.4s,v"#ac4".s[3]\n\t"\
+ "fmla v16.4s,v9.4s,v"#ac1".s[3]; fmla v17.4s,v9.4s,v"#ac2".s[3]\n\t"\
+ "sub w5,w5,#4; prfm pldl2keep,[x9]\n\t"\
+ "fmla v18.4s,v9.4s,v"#ac3".s[3]; fmla v19.4s,v9.4s,v"#ac4".s[3]\n\t"\
+ "fmla v20.4s,v10.4s,v"#ac1".s[3]; fmla v21.4s,v10.4s,v"#ac2".s[3]\n\t"\
+ "fmla v22.4s,v10.4s,v"#ac3".s[3]; fmla v23.4s,v10.4s,v"#ac4".s[3]\n\t"
+
+#define KERNEL_M4N12_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4; ldr s3,[x3],#4\n\t"\
+ "ldr q8,[x4],#16; ldr q9,[x4],#16; ldr q10,[x4],#16\n\t"\
+ "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v8.4s,v3.s[0]\n\t"\
+ "fmla v16.4s,v9.4s,v0.s[0]; fmla v17.4s,v9.4s,v1.s[0]\n\t"\
+ "fmla v18.4s,v9.4s,v2.s[0]; fmla v19.4s,v9.4s,v3.s[0]\n\t"\
+ "fmla v20.4s,v10.4s,v0.s[0]; fmla v21.4s,v10.4s,v1.s[0]\n\t"\
+ "fmla v22.4s,v10.4s,v2.s[0]; fmla v23.4s,v10.4s,v3.s[0]\n\t"
+
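+/* Instantiate the m=4 kernel functions for n = 4..12 from the macros
+ * above (FUNC_M4 is defined earlier in this file). */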
+FUNC_M4(4)
+FUNC_M4(5)
+FUNC_M4(6)
+FUNC_M4(7)
+FUNC_M4(8)
+FUNC_M4(9)
+FUNC_M4(10)
+FUNC_M4(11)
+FUNC_M4(12)
+
+#define FMA_M3N4(c1, c2, c3, a1, a2, a3, b1, k) \
+ "fmla v"#c1".4s,v"#b1".4s,v"#a1".s["#k"]\n\t"\
+ "fmla v"#c2".4s,v"#b1".4s,v"#a2".s["#k"]\n\t"\
+ "fmla v"#c3".4s,v"#b1".4s,v"#a3".s["#k"]\n\t"
+
+#define FMA_M3N1(c1, c2, c3, a1, a2, a3, b1) \
+ "fmla v"#c1".4s,v"#b1".4s,v"#a1".4s\n\t"\
+ "fmla v"#c2".4s,v"#b1".4s,v"#a2".4s\n\t"\
+ "fmla v"#c3".4s,v"#b1".4s,v"#a3".4s\n\t"
+
+
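+/* The m=3 kernels (n = 13..25) below are built from the two helpers above:
+ * FMA_M3N4 multiplies one 4-wide column block of B (register b1) by lane k
+ * of each of three A-row vectors; FMA_M3N1 does the element-wise
+ * accumulation used for edge columns. m3n13 = 12 full columns plus one
+ * edge column (v21-v23). */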
+#define INIT_M3N13 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\
+ INIT_4V(20, 21, 22, 23)
+
+#define SAVE_M3N13(mode) UNIT_SAVE_M3N4_##mode(12, 13, 14)\
+ UNIT_SAVE_M3N4_##mode(15, 16, 17) UNIT_SAVE_M3N4_##mode(18, 19, 20)\
+ EDGE_SAVE_M3N1_##mode(21, 22, 23)
+
+#define KERNEL_M3N13_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]\n\t"\
+ "add x4,x4,#208\n\t"
+
+#define KERNEL_M3N13_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-160]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 0) "ldr q9,[x4,#-144]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 0) "ldr q10,[x4,#-128]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-112]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 1) "ldr q9,[x4,#-96]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 1) "ldr q10,[x4,#-80]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 2) "ldr q8,[x4,#-64]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 2) "ldr q9,[x4,#-48]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 2) "ldr q10,[x4,#-32]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 3) "ldr q11,[x4,#-16]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 3) "ldr q8,[x4],#208\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 3) "ldr q9,[x4,#-192]\n\t"\
+ FMA_M3N1(21, 22, 23, ac1, ac2, ac3, 11) "ldr q10,[x4,#-176]\n\t"
+
+#define KERNEL_M3N13_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-160]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 0) "ldr q9,[x4,#-144]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 0) "ldr q10,[x4,#-128]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-112]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 1) "ldr q9,[x4,#-96]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 1) "ldr q10,[x4,#-80]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 2) "ldr q8,[x4,#-64]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 2) "ldr q9,[x4,#-48]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 2) "ldr q10,[x4,#-32]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 3) "ldr q11,[x4,#-16]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 3)\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 3)\
+ FMA_M3N1(21, 22, 23, ac1, ac2, ac3, 11)
+
+#define KERNEL_M3N13_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q8,[x4],#16; ldr q9,[x4],#16; ldr q10,[x4],#16; ldr s11,[x4],#4\n\t"\
+ "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v16.4s,v9.4s,v1.s[0]; fmla v17.4s,v9.4s,v2.s[0]\n\t"\
+ "fmla v18.4s,v10.4s,v0.s[0]; fmla v19.4s,v10.4s,v1.s[0]\n\t"\
+ "fmla v20.4s,v10.4s,v2.s[0]; fmla v21.4s,v0.4s,v11.s[0]\n\t"\
+ "fmla v22.4s,v1.4s,v11.s[0]; fmla v23.4s,v2.4s,v11.s[0]\n\t"
+
+
+#define INIT_M3N14 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\
+ INIT_4V(20, 21, 22, 23) INIT_2V(24, 25) INIT_1V(26)
+
+#define SAVE_M3N14(mode) UNIT_SAVE_M3N4_##mode(12, 13, 14)\
+ UNIT_SAVE_M3N4_##mode(15, 16, 17) UNIT_SAVE_M3N4_##mode(18, 19, 20)\
+ EDGE_SAVE_M3N1_##mode(21, 22, 23) EDGE_SAVE_M3N1_##mode(24, 25, 26)
+
+#define KERNEL_M3N14_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]\n\t"\
+ "add x4,x4,#224\n\t"
+
+#define KERNEL_M3N14_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-176]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 0) "ldr q9,[x4,#-160]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 0) "ldr q10,[x4,#-144]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-128]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 1) "ldr q9,[x4,#-112]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 1) "ldr q10,[x4,#-96]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 2) "ldr q11,[x4,#-80]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 2) "ldr q8,[x4,#-64]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 2) "ldr q9,[x4,#-48]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 11, 3) "ldr q10,[x4,#-32]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 8, 3) "ldr q11,[x4,#-16]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 9, 3) "ldr q8,[x4],#224\n\t"\
+ FMA_M3N1(21, 22, 23, ac1, ac2, ac3, 10) "ldr q9,[x4,#-208]\n\t"\
+ FMA_M3N1(24, 25, 26, ac1, ac2, ac3, 11) "ldr q10,[x4,#-192]\n\t"
+
+#define KERNEL_M3N14_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-176]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 0) "ldr q9,[x4,#-160]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 0) "ldr q10,[x4,#-144]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-128]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 1) "ldr q9,[x4,#-112]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 1) "ldr q10,[x4,#-96]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 2) "ldr q11,[x4,#-80]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 2) "ldr q8,[x4,#-64]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 2) "ldr q9,[x4,#-48]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 11, 3) "ldr q10,[x4,#-32]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 8, 3) "ldr q11,[x4,#-16]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 9, 3)\
+ FMA_M3N1(21, 22, 23, ac1, ac2, ac3, 10)\
+ FMA_M3N1(24, 25, 26, ac1, ac2, ac3, 11)
+
+#define KERNEL_M3N14_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q8,[x4],#16; ldr q9,[x4],#16; ldr q10,[x4],#16; ldr d11,[x4],#8\n\t"\
+ "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v16.4s,v9.4s,v1.s[0]; fmla v17.4s,v9.4s,v2.s[0]\n\t"\
+ "fmla v18.4s,v10.4s,v0.s[0]; fmla v19.4s,v10.4s,v1.s[0]\n\t"\
+ "fmla v20.4s,v10.4s,v2.s[0]; fmla v21.4s,v0.4s,v11.s[0]\n\t"\
+ "fmla v22.4s,v1.4s,v11.s[0]; fmla v23.4s,v2.4s,v11.s[0]\n\t"\
+ "fmla v24.4s,v0.4s,v11.s[1]; fmla v25.4s,v1.4s,v11.s[1]\n\t"\
+ "fmla v26.4s,v2.4s,v11.s[1]\n\t"
+
+
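+/* m3n15 kernel: 12 full columns plus three edge columns. */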
+#define INIT_M3N15 INIT_4V(12, 13, 14, 15) INIT_4V(16, 17, 18, 19)\
+ INIT_4V(20, 21, 22, 23) INIT_4V(24, 25, 26, 27) INIT_2V(28, 29)
+
+#define SAVE_M3N15(mode) UNIT_SAVE_M3N4_##mode(12, 13, 14)\
+ UNIT_SAVE_M3N4_##mode(15, 16, 17) UNIT_SAVE_M3N4_##mode(18, 19, 20)\
+ EDGE_SAVE_M3N1_##mode(21, 22, 23) EDGE_SAVE_M3N1_##mode(24, 25, 26)\
+ EDGE_SAVE_M3N1_##mode(27, 28, 29)
+
+#define KERNEL_M3N15_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q8,[x4]; ldr q9,[x4,#16]; ldr q10,[x4,#32]\n\t"\
+ "add x4,x4,#240\n\t"
+
+#define KERNEL_M3N15_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-192]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 0) "ldr q9,[x4,#-176]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 0) "ldr q10,[x4,#-160]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 1) "ldr q11,[x4,#-144]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 1) "ldr q8,[x4,#-128]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 1) "ldr q9,[x4,#-112]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 11, 2) "ldr q10,[x4,#-96]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 8, 2) "ldr q11,[x4,#-80]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 9, 2) "ldr q8,[x4,#-64]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 10, 3) "ldr q9,[x4,#-48]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 11, 3) "ldr q10,[x4,#-32]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 8, 3) "ldr q11,[x4,#-16]\n\t"\
+ FMA_M3N1(21, 22, 23, ac1, ac2, ac3, 9) "ldr q8,[x4],#240\n\t"\
+ FMA_M3N1(24, 25, 26, ac1, ac2, ac3, 10) "ldr q9,[x4,#-224]\n\t"\
+ FMA_M3N1(27, 28, 29, ac1, ac2, ac3, 11) "ldr q10,[x4,#-208]\n\t"
+
+#define KERNEL_M3N15_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-192]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 0) "ldr q9,[x4,#-176]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 0) "ldr q10,[x4,#-160]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 8, 1) "ldr q11,[x4,#-144]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 9, 1) "ldr q8,[x4,#-128]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 10, 1) "ldr q9,[x4,#-112]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 11, 2) "ldr q10,[x4,#-96]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 8, 2) "ldr q11,[x4,#-80]\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 9, 2) "ldr q8,[x4,#-64]\n\t"\
+ FMA_M3N4(12, 13, 14, ac1, ac2, ac3, 10, 3) "ldr q9,[x4,#-48]\n\t"\
+ FMA_M3N4(15, 16, 17, ac1, ac2, ac3, 11, 3) "ldr q10,[x4,#-32]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(18, 19, 20, ac1, ac2, ac3, 8, 3) "ldr q11,[x4,#-16]\n\t"\
+ FMA_M3N1(21, 22, 23, ac1, ac2, ac3, 9)\
+ FMA_M3N1(24, 25, 26, ac1, ac2, ac3, 10)\
+ FMA_M3N1(27, 28, 29, ac1, ac2, ac3, 11)
+
+#define KERNEL_M3N15_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q8,[x4],#16; ldr q9,[x4],#16; ldr q10,[x4],#16; ldr d11,[x4],#8\n\t"\
+ "fmla v12.4s,v8.4s,v0.s[0]; fmla v13.4s,v8.4s,v1.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v14.4s,v8.4s,v2.s[0]; fmla v15.4s,v9.4s,v0.s[0]\n\t"\
+ "ldr s8,[x4],#4\n\t"\
+ "fmla v16.4s,v9.4s,v1.s[0]; fmla v17.4s,v9.4s,v2.s[0]\n\t"\
+ "fmla v18.4s,v10.4s,v0.s[0]; fmla v19.4s,v10.4s,v1.s[0]\n\t"\
+ "fmla v20.4s,v10.4s,v2.s[0]; fmla v21.4s,v0.4s,v11.s[0]\n\t"\
+ "fmla v22.4s,v1.4s,v11.s[0]; fmla v23.4s,v2.4s,v11.s[0]\n\t"\
+ "fmla v24.4s,v0.4s,v11.s[1]; fmla v25.4s,v1.4s,v11.s[1]\n\t"\
+ "fmla v26.4s,v2.4s,v11.s[1]; fmla v27.4s,v0.4s,v8.s[0]\n\t"\
+ "fmla v28.4s,v1.4s,v8.s[0]; fmla v29.4s,v2.4s,v8.s[0]\n\t"
+
+
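+/* m3n16 kernel: 16 full columns; accumulators start at v10 so the B loads
+ * can rotate through v6-v9. */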
+#define INIT_M3N16 INIT_4V(10, 11, 12, 13) INIT_4V(14, 15, 16, 17)\
+ INIT_4V(18, 19, 20, 21)
+
+#define SAVE_M3N16(mode) UNIT_SAVE_M3N4_##mode(10, 11, 12)\
+ UNIT_SAVE_M3N4_##mode(13, 14, 15) UNIT_SAVE_M3N4_##mode(16, 17, 18)\
+ UNIT_SAVE_M3N4_##mode(19, 20, 21)
+
+#define KERNEL_M3N16_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q6,[x4]; ldr q7,[x4,#16]; ldr q8,[x4,#32]\n\t"\
+ "add x4,x4,#256\n\t"
+
+#define KERNEL_M3N16_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-208]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#-192]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-176]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-160]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-144]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-128]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-112]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-96]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 8, 2) "ldr q8,[x4,#-80]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 8, 2) "ldr q8,[x4,#-32]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 3) "ldr q9,[x4,#-16]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 3) "ldr q6,[x4]; add x4,x4,#256\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 3) "ldr q7,[x4,#-240]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 9, 3) "ldr q8,[x4,#-224]\n\t"
+
+#define KERNEL_M3N16_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-208]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#-192]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-176]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-160]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-144]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-128]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-112]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-96]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 8, 2) "ldr q8,[x4,#-80]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 8, 2) "ldr q8,[x4,#-32]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 3) "ldr q9,[x4,#-16]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 3)\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 3)\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 9, 3)
+
+#define KERNEL_M3N16_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q6,[x4],#16; ldr q7,[x4],#16; ldr q8,[x4],#16; ldr q9,[x4],#16\n\t"\
+ "fmla v10.4s,v6.4s,v0.s[0]; fmla v11.4s,v6.4s,v1.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v12.4s,v6.4s,v2.s[0]; fmla v13.4s,v7.4s,v0.s[0]\n\t"\
+ "fmla v14.4s,v7.4s,v1.s[0]; fmla v15.4s,v7.4s,v2.s[0]\n\t"\
+ "fmla v16.4s,v8.4s,v0.s[0]; fmla v17.4s,v8.4s,v1.s[0]\n\t"\
+ "fmla v18.4s,v8.4s,v2.s[0]; fmla v19.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v20.4s,v9.4s,v1.s[0]; fmla v21.4s,v9.4s,v2.s[0]\n\t"
+
+
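+/* m3n17 kernel: 16 full columns plus one edge column (v22-v24). */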
+#define INIT_M3N17 INIT_4V(10, 11, 12, 13) INIT_4V(14, 15, 16, 17)\
+ INIT_4V(18, 19, 20, 21) INIT_2V(22, 23) INIT_1V(24)
+
+#define SAVE_M3N17(mode) UNIT_SAVE_M3N4_##mode(10, 11, 12)\
+ UNIT_SAVE_M3N4_##mode(13, 14, 15) UNIT_SAVE_M3N4_##mode(16, 17, 18)\
+ UNIT_SAVE_M3N4_##mode(19, 20, 21) EDGE_SAVE_M3N1_##mode(22, 23, 24)
+
+#define KERNEL_M3N17_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q6,[x4]; ldr q7,[x4,#16]; ldr q8,[x4,#32]\n\t"\
+ "add x4,x4,#272\n\t"
+
+#define KERNEL_M3N17_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-224]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#-208]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-192]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-176]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-160]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-144]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-112]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 8, 2) "ldr q8,[x4,#-96]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 6, 2) "ldr q9,[x4,#-80]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 7, 2) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 8, 2) "ldr q7,[x4,#-48]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 9, 3) "ldr q8,[x4,#-32]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 6, 3) "ldr q9,[x4,#-16]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 7, 3) "ldr q6,[x4]; add x4,x4,#272\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 8, 3) "ldr q7,[x4,#-256]\n\t"\
+ FMA_M3N1(22, 23, 24, ac1, ac2, ac3, 9) "ldr q8,[x4,#-240]\n\t"
+
+#define KERNEL_M3N17_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-224]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#-208]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-192]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-176]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-160]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-144]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-112]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 8, 2) "ldr q8,[x4,#-96]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 6, 2) "ldr q9,[x4,#-80]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 7, 2) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 8, 2) "ldr q7,[x4,#-48]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 9, 3) "ldr q8,[x4,#-32]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 6, 3) "ldr q9,[x4,#-16]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 7, 3)\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 8, 3)\
+ FMA_M3N1(22, 23, 24, ac1, ac2, ac3, 9)
+
+#define KERNEL_M3N17_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q6,[x4],#16; ldr q7,[x4],#16; ldr q8,[x4],#16; ldr q9,[x4],#16\n\t"\
+ "fmla v10.4s,v6.4s,v0.s[0]; fmla v11.4s,v6.4s,v1.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v12.4s,v6.4s,v2.s[0]; fmla v13.4s,v7.4s,v0.s[0]; ldr s6,[x4],#4\n\t"\
+ "fmla v14.4s,v7.4s,v1.s[0]; fmla v15.4s,v7.4s,v2.s[0]\n\t"\
+ "fmla v16.4s,v8.4s,v0.s[0]; fmla v17.4s,v8.4s,v1.s[0]\n\t"\
+ "fmla v18.4s,v8.4s,v2.s[0]; fmla v19.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v20.4s,v9.4s,v1.s[0]; fmla v21.4s,v9.4s,v2.s[0]\n\t"\
+ "fmla v22.4s,v0.4s,v6.s[0]; fmla v23.4s,v1.4s,v6.s[0]\n\t"\
+ "fmla v24.4s,v2.4s,v6.s[0]\n\t"
+
+
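+/* m3n18 kernel: 16 full columns plus two edge columns. */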
+#define INIT_M3N18 INIT_4V(10, 11, 12, 13) INIT_4V(14, 15, 16, 17)\
+ INIT_4V(18, 19, 20, 21) INIT_4V(22, 23, 24, 25) INIT_2V(26, 27)
+
+#define SAVE_M3N18(mode) UNIT_SAVE_M3N4_##mode(10, 11, 12)\
+ UNIT_SAVE_M3N4_##mode(13, 14, 15) UNIT_SAVE_M3N4_##mode(16, 17, 18)\
+ UNIT_SAVE_M3N4_##mode(19, 20, 21) EDGE_SAVE_M3N1_##mode(22, 23, 24)\
+ EDGE_SAVE_M3N1_##mode(25, 26, 27)
+
+#define KERNEL_M3N18_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q6,[x4]; ldr q7,[x4,#16]; ldr q8,[x4,#32]\n\t"\
+ "add x4,x4,#288\n\t"
+
+#define KERNEL_M3N18_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-240]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#-224]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-208]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-176]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-160]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 1) "ldr q9,[x4,#-144]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 1) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 8, 2) "ldr q7,[x4,#-112]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 9, 2) "ldr q8,[x4,#-96]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 2) "ldr q9,[x4,#-80]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 2) "ldr q6,[x4,#-64]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 8, 3) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 9, 3) "ldr q8,[x4,#-32]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 3) "ldr q9,[x4,#-16]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 3) "ldr q6,[x4]\n\t"\
+ FMA_M3N1(22, 23, 24, ac1, ac2, ac3, 8) "ldr q7,[x4,#16]\n\t"\
+ FMA_M3N1(25, 26, 27, ac1, ac2, ac3, 9) "ldr q8,[x4,#32]; add x4,x4,#288\n\t"
+
+#define KERNEL_M3N18_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-240]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#-224]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-208]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-176]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 1) "ldr q8,[x4,#-160]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 1) "ldr q9,[x4,#-144]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 1) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 8, 2) "ldr q7,[x4,#-112]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 9, 2) "ldr q8,[x4,#-96]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 2) "ldr q9,[x4,#-80]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 2) "ldr q6,[x4,#-64]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 8, 3) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 9, 3) "ldr q8,[x4,#-32]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 6, 3) "ldr q9,[x4,#-16]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 7, 3)\
+ FMA_M3N1(22, 23, 24, ac1, ac2, ac3, 8)\
+ FMA_M3N1(25, 26, 27, ac1, ac2, ac3, 9)
+
+#define KERNEL_M3N18_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q6,[x4],#16; ldr q7,[x4],#16; ldr q8,[x4],#16; ldr q9,[x4],#16\n\t"\
+ "fmla v10.4s,v6.4s,v0.s[0]; fmla v11.4s,v6.4s,v1.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v12.4s,v6.4s,v2.s[0]; fmla v13.4s,v7.4s,v0.s[0]; ldr d6,[x4],#8\n\t"\
+ "fmla v14.4s,v7.4s,v1.s[0]; fmla v15.4s,v7.4s,v2.s[0]\n\t"\
+ "fmla v16.4s,v8.4s,v0.s[0]; fmla v17.4s,v8.4s,v1.s[0]\n\t"\
+ "fmla v18.4s,v8.4s,v2.s[0]; fmla v19.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v20.4s,v9.4s,v1.s[0]; fmla v21.4s,v9.4s,v2.s[0]\n\t"\
+ "fmla v22.4s,v0.4s,v6.s[0]; fmla v23.4s,v1.4s,v6.s[0]\n\t"\
+ "fmla v24.4s,v2.4s,v6.s[0]; fmla v25.4s,v0.4s,v6.s[1]\n\t"\
+ "fmla v26.4s,v1.4s,v6.s[1]; fmla v27.4s,v2.4s,v6.s[1]\n\t"
+
+
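+/* m3n19 kernel: 16 full columns plus three edge columns. */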
+#define INIT_M3N19 INIT_4V(10, 11, 12, 13) INIT_4V(14, 15, 16, 17)\
+ INIT_4V(18, 19, 20, 21) INIT_4V(22, 23, 24, 25)\
+ INIT_4V(26, 27, 28, 29) INIT_1V(30)
+
+#define SAVE_M3N19(mode) UNIT_SAVE_M3N4_##mode(10, 11, 12)\
+ UNIT_SAVE_M3N4_##mode(13, 14, 15) UNIT_SAVE_M3N4_##mode(16, 17, 18)\
+ UNIT_SAVE_M3N4_##mode(19, 20, 21) EDGE_SAVE_M3N1_##mode(22, 23, 24)\
+ EDGE_SAVE_M3N1_##mode(25, 26, 27) EDGE_SAVE_M3N1_##mode(28, 29, 30)
+
+#define KERNEL_M3N19_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q6,[x4]; ldr q7,[x4,#16]; ldr q8,[x4,#32]\n\t"\
+ "add x4,x4,#304\n\t"
+
+#define KERNEL_M3N19_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-256]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#-240]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-224]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 0) "ldr q9,[x4,#-208]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 1) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 1) "ldr q7,[x4,#-176]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 9, 1) "ldr q8,[x4,#-160]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 1) "ldr q9,[x4,#-144]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 2) "ldr q6,[x4,#-128]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 2) "ldr q7,[x4,#-112]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 9, 2) "ldr q8,[x4,#-96]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 2) "ldr q9,[x4,#-80]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 3) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 3) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 9, 3) "ldr q8,[x4,#-32]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 3) "ldr q9,[x4,#-16]\n\t"\
+ FMA_M3N1(22, 23, 24, ac1, ac2, ac3, 7) "ldr q6,[x4]\n\t"\
+ FMA_M3N1(25, 26, 27, ac1, ac2, ac3, 8) "ldr q7,[x4,#16]\n\t"\
+ FMA_M3N1(28, 29, 30, ac1, ac2, ac3, 9) "ldr q8,[x4,#32]; add x4,x4,#304\n\t"
+
+#define KERNEL_M3N19_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#-256]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#-240]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 8, 0) "ldr q8,[x4,#-224]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 0) "ldr q9,[x4,#-208]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 1) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 1) "ldr q7,[x4,#-176]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 9, 1) "ldr q8,[x4,#-160]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 1) "ldr q9,[x4,#-144]\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 2) "ldr q6,[x4,#-128]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 2) "ldr q7,[x4,#-112]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 9, 2) "ldr q8,[x4,#-96]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 2) "ldr q9,[x4,#-80]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(10, 11, 12, ac1, ac2, ac3, 7, 3) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(13, 14, 15, ac1, ac2, ac3, 8, 3) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(16, 17, 18, ac1, ac2, ac3, 9, 3) "ldr q8,[x4,#-32]\n\t"\
+ FMA_M3N4(19, 20, 21, ac1, ac2, ac3, 6, 3) "ldr q9,[x4,#-16]\n\t"\
+ FMA_M3N1(22, 23, 24, ac1, ac2, ac3, 7)\
+ FMA_M3N1(25, 26, 27, ac1, ac2, ac3, 8)\
+ FMA_M3N1(28, 29, 30, ac1, ac2, ac3, 9)
+
+#define KERNEL_M3N19_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q6,[x4],#16; ldr q7,[x4],#16; ldr q8,[x4],#16; ldr q9,[x4],#16\n\t"\
+ "fmla v10.4s,v6.4s,v0.s[0]; fmla v11.4s,v6.4s,v1.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v12.4s,v6.4s,v2.s[0]; fmla v13.4s,v7.4s,v0.s[0]; ldr d6,[x4],#8\n\t"\
+ "fmla v14.4s,v7.4s,v1.s[0]; fmla v15.4s,v7.4s,v2.s[0]; ldr s7,[x4],#4\n\t"\
+ "fmla v16.4s,v8.4s,v0.s[0]; fmla v17.4s,v8.4s,v1.s[0]\n\t"\
+ "fmla v18.4s,v8.4s,v2.s[0]; fmla v19.4s,v9.4s,v0.s[0]\n\t"\
+ "fmla v20.4s,v9.4s,v1.s[0]; fmla v21.4s,v9.4s,v2.s[0]\n\t"\
+ "fmla v22.4s,v0.4s,v6.s[0]; fmla v23.4s,v1.4s,v6.s[0]\n\t"\
+ "fmla v24.4s,v2.4s,v6.s[0]; fmla v25.4s,v0.4s,v6.s[1]\n\t"\
+ "fmla v26.4s,v1.4s,v6.s[1]; fmla v27.4s,v2.4s,v6.s[1]\n\t"\
+ "fmla v28.4s,v0.4s,v7.s[0]; fmla v29.4s,v1.4s,v7.s[0]\n\t"\
+ "fmla v30.4s,v2.4s,v7.s[0]\n\t"
+
+
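+/* m3n20 kernel: 20 full columns (v8-v22), no edge column. */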
+#define INIT_M3N20 INIT_4V(8, 9, 10, 11) INIT_4V(12, 13, 14, 15)\
+ INIT_4V(16, 17, 18, 19) INIT_2V(20, 21) INIT_1V(22)
+
+#define SAVE_M3N20(mode) UNIT_SAVE_M3N4_##mode(8, 9, 10)\
+ UNIT_SAVE_M3N4_##mode(11, 12, 13) UNIT_SAVE_M3N4_##mode(14, 15, 16)\
+ UNIT_SAVE_M3N4_##mode(17, 18, 19) UNIT_SAVE_M3N4_##mode(20, 21, 22)
+
+#define KERNEL_M3N20_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q6,[x4]; ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N20_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ "add x4,x4,#320\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-208]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-176]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-160]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-144]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-96]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-80]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 3) "ldr q6,[x4]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N20_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ "add x4,x4,#320\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-208]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-176]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-160]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-144]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-96]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-80]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 3)\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 3)
+
+#define KERNEL_M3N20_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q6,[x4],#16; ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(8, 9, 10, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(11, 12, 13, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(14, 15, 16, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(17, 18, 19, 0, 1, 2, 7, 0) "subs w5,w5,#1\n\t"\
+ FMA_M3N4(20, 21, 22, 0, 1, 2, 6, 0)
+
+
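+/* m3n21 kernel: 20 full columns plus one edge column (v23-v25). */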
+#define INIT_M3N21 INIT_4V(8, 9, 10, 11) INIT_4V(12, 13, 14, 15)\
+ INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23) INIT_2V(24, 25)
+
+#define SAVE_M3N21(mode) UNIT_SAVE_M3N4_##mode(8, 9, 10)\
+ UNIT_SAVE_M3N4_##mode(11, 12, 13) UNIT_SAVE_M3N4_##mode(14, 15, 16)\
+ UNIT_SAVE_M3N4_##mode(17, 18, 19) UNIT_SAVE_M3N4_##mode(20, 21, 22)\
+ EDGE_SAVE_M3N1_##mode(23, 24, 25)
+
+#define KERNEL_M3N21_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q6,[x4]; ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N21_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ "add x4,x4,#336\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-224]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-208]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-192]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-176]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-160]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-144]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-128]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-112]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-96]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-80]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-64]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-32]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 3) "ldr q6,[x4]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N1(23, 24, 25, ac1, ac2, ac3, 7) "ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N21_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ "add x4,x4,#336\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-224]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-208]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-192]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-176]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-160]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-144]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-128]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-112]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-96]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-80]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-64]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-32]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 3)\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N1(23, 24, 25, ac1, ac2, ac3, 7)
+
+#define KERNEL_M3N21_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q6,[x4],#16; ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(8, 9, 10, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(11, 12, 13, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(14, 15, 16, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(17, 18, 19, 0, 1, 2, 7, 0) "ldr s7,[x4],#4\n\t"\
+ FMA_M3N4(20, 21, 22, 0, 1, 2, 6, 0) "subs w5,w5,#1\n\t"\
+ "fmla v23.4s,v0.4s,v7.s[0]; fmla v24.4s,v1.4s,v7.s[0]\n\t"\
+ "fmla v25.4s,v2.4s,v7.s[0]\n\t"
+
+
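+/* m3n22 kernel: 20 full columns plus two edge columns. */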
+#define INIT_M3N22 INIT_4V(8, 9, 10, 11) INIT_4V(12, 13, 14, 15)\
+ INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\
+ INIT_4V(24, 25, 26, 27) INIT_1V(28)
+
+#define SAVE_M3N22(mode) UNIT_SAVE_M3N4_##mode(8, 9, 10)\
+ UNIT_SAVE_M3N4_##mode(11, 12, 13) UNIT_SAVE_M3N4_##mode(14, 15, 16)\
+ UNIT_SAVE_M3N4_##mode(17, 18, 19) UNIT_SAVE_M3N4_##mode(20, 21, 22)\
+ EDGE_SAVE_M3N1_##mode(23, 24, 25) EDGE_SAVE_M3N1_##mode(26, 27, 28)
+
+#define KERNEL_M3N22_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q6,[x4]; ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N22_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ "add x4,x4,#352\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-240]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-224]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-208]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-176]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-160]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-144]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-96]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-80]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N1(23, 24, 25, ac1, ac2, ac3, 6) "ldr q6,[x4]\n\t"\
+ FMA_M3N1(26, 27, 28, ac1, ac2, ac3, 7) "ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N22_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ "add x4,x4,#352\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-240]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-224]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-208]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-176]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-160]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-144]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-96]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-80]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-64]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N1(23, 24, 25, ac1, ac2, ac3, 6)\
+ FMA_M3N1(26, 27, 28, ac1, ac2, ac3, 7)
+
+#define KERNEL_M3N22_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q6,[x4],#16; ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(8, 9, 10, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(11, 12, 13, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(14, 15, 16, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(17, 18, 19, 0, 1, 2, 7, 0) "ldr d7,[x4],#8\n\t"\
+ FMA_M3N4(20, 21, 22, 0, 1, 2, 6, 0) "subs w5,w5,#1\n\t"\
+ "fmla v23.4s,v0.4s,v7.s[0]; fmla v24.4s,v1.4s,v7.s[0]\n\t"\
+ "fmla v25.4s,v2.4s,v7.s[0]; fmla v26.4s,v0.4s,v7.s[1]\n\t"\
+ "fmla v27.4s,v1.4s,v7.s[1]; fmla v28.4s,v2.4s,v7.s[1]\n\t"
+
+
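+/* m3n23 kernel: 20 full columns plus three edge columns; all of v8-v31
+ * is live. */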
+#define INIT_M3N23 INIT_4V(8, 9, 10, 11) INIT_4V(12, 13, 14, 15)\
+ INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\
+ INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31)
+
+#define SAVE_M3N23(mode) UNIT_SAVE_M3N4_##mode(8, 9, 10)\
+ UNIT_SAVE_M3N4_##mode(11, 12, 13) UNIT_SAVE_M3N4_##mode(14, 15, 16)\
+ UNIT_SAVE_M3N4_##mode(17, 18, 19) UNIT_SAVE_M3N4_##mode(20, 21, 22)\
+ EDGE_SAVE_M3N1_##mode(23, 24, 25) EDGE_SAVE_M3N1_##mode(26, 27, 28)\
+ EDGE_SAVE_M3N1_##mode(29, 30, 31)
+
+#define KERNEL_M3N23_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q6,[x4]; ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N23_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ "add x4,x4,#368\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-256]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-240]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-224]\n\t"\
+ "ldr q"#an2",[x1],#16\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-208]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-192]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-176]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-160]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-144]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-128]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-112]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-96]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-80]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-48]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-32]\n\t"\
+ FMA_M3N1(23, 24, 25, ac1, ac2, ac3, 6) "ldr q6,[x4]\n\t"\
+ FMA_M3N1(26, 27, 28, ac1, ac2, ac3, 7) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N1(29, 30, 31, ac1, ac2, ac3, 7) "ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N23_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ "add x4,x4,#368\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-256]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-240]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-224]\n\t"\
+ "prfm pldl2keep,[x7]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-208]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-192]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-176]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-160]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-144]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-128]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-112]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-96]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-80]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-48]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-32]\n\t"\
+ FMA_M3N1(23, 24, 25, ac1, ac2, ac3, 6)\
+ FMA_M3N1(26, 27, 28, ac1, ac2, ac3, 7) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N1(29, 30, 31, ac1, ac2, ac3, 7)
+
+#define KERNEL_M3N23_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q6,[x4],#16; ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(8, 9, 10, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(11, 12, 13, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(14, 15, 16, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(17, 18, 19, 0, 1, 2, 7, 0) "ldr d7,[x4],#8\n\t"\
+ FMA_M3N4(20, 21, 22, 0, 1, 2, 6, 0) "ldr s6,[x4],#4\n\t"\
+ "fmla v23.4s,v0.4s,v7.s[0]; fmla v24.4s,v1.4s,v7.s[0]; subs w5,w5,#1\n\t"\
+ "fmla v25.4s,v2.4s,v7.s[0]; fmla v26.4s,v0.4s,v7.s[1]\n\t"\
+ "fmla v27.4s,v1.4s,v7.s[1]; fmla v28.4s,v2.4s,v7.s[1]\n\t"\
+ "fmla v29.4s,v0.4s,v6.s[0]; fmla v30.4s,v1.4s,v6.s[0]\n\t"\
+ "fmla v31.4s,v2.4s,v6.s[0]\n\t"
+
+
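+/* m3n24 kernel: 24 full columns (v8-v25), no edge column. */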
+#define INIT_M3N24 INIT_4V(8, 9, 10, 11) INIT_4V(12, 13, 14, 15)\
+ INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23) INIT_2V(24, 25)
+
+#define SAVE_M3N24(mode) UNIT_SAVE_M3N4_##mode(8, 9, 10)\
+ UNIT_SAVE_M3N4_##mode(11, 12, 13) UNIT_SAVE_M3N4_##mode(14, 15, 16)\
+ UNIT_SAVE_M3N4_##mode(17, 18, 19) UNIT_SAVE_M3N4_##mode(20, 21, 22)\
+ UNIT_SAVE_M3N4_##mode(23, 24, 25)
+
+#define KERNEL_M3N24_PRELOAD4 \
+ "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\
+ "ldr q6,[x4]; ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N24_MAIN4(ac1, ac2, ac3, an1, an2, an3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "ldr q"#an1",[x0],#16\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#112]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#128]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#144]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#160]\n\t"\
+ "ldr q"#an2",[x1],#16; add x4,x4,#384\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-208]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-176]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-160]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-144]\n\t"\
+ "ldr q"#an3",[x2],#16\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-96]\n\t"\
+ FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-80]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-64]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 3) "ldr q6,[x4]\n\t"\
+ FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#16]\n\t"
+
+#define KERNEL_M3N24_TAIL4(ac1, ac2, ac3) \
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\
+ "prfm pldl2keep,[x6]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\
+ FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#112]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#128]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#144]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#160]\n\t"\
+ "prfm pldl2keep,[x7]; add x4,x4,#384\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-208]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-192]\n\t"\
+ FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-176]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-160]\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-144]\n\t"\
+ "prfm pldl2keep,[x8]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-128]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-96]\n\t"\
+ FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-80]\n\t"\
+ FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-64]\n\t"\
+ "sub w5,w5,#4\n\t"\
+ FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\
+ FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\
+ FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\
+ FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 3)\
+ FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 3)
+
+#define KERNEL_M3N24_TL1 \
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\
+ "ldr q6,[x4],#16; ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(8, 9, 10, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(11, 12, 13, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(14, 15, 16, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\
+ FMA_M3N4(17, 18, 19, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\
+ FMA_M3N4(20, 21, 22, 0, 1, 2, 6, 0) "subs w5,w5,#1\n\t"\
+ FMA_M3N4(23, 24, 25, 0, 1, 2, 7, 0)
+
+
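+/* m3n25 kernel: 24 full columns plus one edge column (v26-v28). */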
ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-96]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-80]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-64]\n\t"\ + "sub w5,w5,#4\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 3) "ldr q6,[x4]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#16]\n\t" + +#define KERNEL_M3N24_TAIL4(ac1, ac2, ac3) \ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#112]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#128]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#144]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#160]\n\t"\ + "prfm pldl2keep,[x7]; add x4,x4,#384\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-208]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-192]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-176]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-160]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-144]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-128]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-96]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-80]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-64]\n\t"\ + "sub w5,w5,#4\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 3)\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 3) + +#define KERNEL_M3N24_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\ + "ldr q6,[x4],#16; ldr q7,[x4],#16\n\t"\ + FMA_M3N4(8, 9, 10, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\ + FMA_M3N4(11, 12, 13, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\ + FMA_M3N4(14, 15, 16, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\ + FMA_M3N4(17, 18, 19, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\ + FMA_M3N4(20, 21, 22, 0, 1, 2, 6, 0) "subs w5,w5,#1\n\t"\ + FMA_M3N4(23, 24, 25, 0, 1, 2, 7, 0) + + +#define INIT_M3N25 INIT_4V(8, 9, 10, 11) INIT_4V(12, 13, 14, 15)\ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_1V(28) + +#define SAVE_M3N25(mode) UNIT_SAVE_M3N4_##mode(8, 9, 10)\ + UNIT_SAVE_M3N4_##mode(11, 12, 13) UNIT_SAVE_M3N4_##mode(14, 15, 16)\ + UNIT_SAVE_M3N4_##mode(17, 18, 19) UNIT_SAVE_M3N4_##mode(20, 21, 22)\ + UNIT_SAVE_M3N4_##mode(23, 24, 25) EDGE_SAVE_M3N1_##mode(26, 27, 28) + +#define KERNEL_M3N25_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\ + "ldr q6,[x4]; ldr q7,[x4,#16]\n\t" + +#define KERNEL_M3N25_MAIN4(ac1, ac2, ac3, an1, an2, an3) \ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 
0) "ldr q7,[x4,#48]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\ + "ldr q"#an1",[x0],#16\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#112]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#128]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#144]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#160]\n\t"\ + "ldr q"#an2",[x1],#16; add x4,x4,#400\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-224]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-208]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-192]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-176]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-160]\n\t"\ + "ldr q"#an3",[x2],#16\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-144]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-128]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-112]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-96]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-80]\n\t"\ + "sub w5,w5,#4\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-64]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-48]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-32]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 3) "ldr q6,[x4]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\ + FMA_M3N1(26, 27, 28, ac1, ac2, ac3, 7) "ldr q7,[x4,#16]\n\t" + +#define KERNEL_M3N25_TAIL4(ac1, ac2, ac3) \ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#112]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#128]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#144]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#160]\n\t"\ + "prfm pldl2keep,[x7]; add x4,x4,#400\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-224]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-208]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-192]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-176]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-160]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-144]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-128]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-112]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-96]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-80]\n\t"\ + "sub w5,w5,#4\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-64]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-48]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-32]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 3)\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\ + FMA_M3N1(26, 27, 28, ac1, ac2, ac3, 7) + +#define KERNEL_M3N25_TL1 \ 
+ "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\ + "ldr q6,[x4],#16; ldr q7,[x4],#16\n\t"\ + FMA_M3N4(8, 9, 10, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\ + FMA_M3N4(11, 12, 13, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\ + FMA_M3N4(14, 15, 16, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\ + FMA_M3N4(17, 18, 19, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\ + FMA_M3N4(20, 21, 22, 0, 1, 2, 6, 0) "ldr s6,[x4],#4\n\t"\ + FMA_M3N4(23, 24, 25, 0, 1, 2, 7, 0) "subs w5,w5,#1\n\t"\ + "fmla v26.4s,v0.4s,v6.s[0]; fmla v27.4s,v1.4s,v6.s[0]\n\t"\ + "fmla v28.4s,v2.4s,v6.s[0]\n\t" + + +#define INIT_M3N26 INIT_4V(8, 9, 10, 11) INIT_4V(12, 13, 14, 15)\ + INIT_4V(16, 17, 18, 19) INIT_4V(20, 21, 22, 23)\ + INIT_4V(24, 25, 26, 27) INIT_4V(28, 29, 30, 31) + +#define SAVE_M3N26(mode) UNIT_SAVE_M3N4_##mode(8, 9, 10)\ + UNIT_SAVE_M3N4_##mode(11, 12, 13) UNIT_SAVE_M3N4_##mode(14, 15, 16)\ + UNIT_SAVE_M3N4_##mode(17, 18, 19) UNIT_SAVE_M3N4_##mode(20, 21, 22)\ + UNIT_SAVE_M3N4_##mode(23, 24, 25) EDGE_SAVE_M3N1_##mode(26, 27, 28)\ + EDGE_SAVE_M3N1_##mode(29, 30, 31) + +#define KERNEL_M3N26_PRELOAD4 \ + "ldr q0,[x0],#16; ldr q1,[x1],#16; ldr q2,[x2],#16\n\t"\ + "ldr q6,[x4]; ldr q7,[x4,#16]\n\t" + +#define KERNEL_M3N26_MAIN4(ac1, ac2, ac3, an1, an2, an3) \ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\ + "ldr q"#an1",[x0],#16\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#112]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#128]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#144]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#160]\n\t"\ + "ldr q"#an2",[x1],#16; add x4,x4,#416\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-240]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-224]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-208]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-192]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-176]\n\t"\ + "ldr q"#an3",[x2],#16\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-160]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-144]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-128]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-96]\n\t"\ + "sub w5,w5,#4\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-80]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-64]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\ + FMA_M3N1(26, 27, 28, ac1, ac2, ac3, 6) "ldr q6,[x4]\n\t"\ + FMA_M3N1(29, 30, 31, ac1, ac2, ac3, 7) "ldr q7,[x4,#16]\n\t" + +#define KERNEL_M3N26_TAIL4(ac1, ac2, ac3) \ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#32]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#48]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#64]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 0) "ldr q7,[x4,#80]\n\t"\ + "prfm pldl2keep,[x6]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 0) "ldr q6,[x4,#96]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, 
ac3, 7, 0) "ldr q7,[x4,#112]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#128]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#144]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#160]\n\t"\ + "prfm pldl2keep,[x7]; add x4,x4,#416\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-240]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 1) "ldr q6,[x4,#-224]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 1) "ldr q7,[x4,#-208]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-192]\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-176]\n\t"\ + "prfm pldl2keep,[x8]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-160]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-144]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 2) "ldr q6,[x4,#-128]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 2) "ldr q7,[x4,#-112]\n\t"\ + FMA_M3N4(8, 9, 10, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-96]\n\t"\ + "sub w5,w5,#4\n\t"\ + FMA_M3N4(11, 12, 13, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-80]\n\t"\ + FMA_M3N4(14, 15, 16, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-64]\n\t"\ + FMA_M3N4(17, 18, 19, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-48]\n\t"\ + FMA_M3N4(20, 21, 22, ac1, ac2, ac3, 6, 3) "ldr q6,[x4,#-32]\n\t"\ + FMA_M3N4(23, 24, 25, ac1, ac2, ac3, 7, 3) "ldr q7,[x4,#-16]\n\t"\ + FMA_M3N1(26, 27, 28, ac1, ac2, ac3, 6)\ + FMA_M3N1(29, 30, 31, ac1, ac2, ac3, 7) + +#define KERNEL_M3N26_TL1 \ + "ldr s0,[x0],#4; ldr s1,[x1],#4; ldr s2,[x2],#4\n\t"\ + "ldr q6,[x4],#16; ldr q7,[x4],#16\n\t"\ + FMA_M3N4(8, 9, 10, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\ + FMA_M3N4(11, 12, 13, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\ + FMA_M3N4(14, 15, 16, 0, 1, 2, 6, 0) "ldr q6,[x4],#16\n\t"\ + FMA_M3N4(17, 18, 19, 0, 1, 2, 7, 0) "ldr q7,[x4],#16\n\t"\ + FMA_M3N4(20, 21, 22, 0, 1, 2, 6, 0) "ldr d6,[x4],#8\n\t"\ + FMA_M3N4(23, 24, 25, 0, 1, 2, 7, 0) "subs w5,w5,#1\n\t"\ + "fmla v26.4s,v0.4s,v6.s[0]; fmla v27.4s,v1.4s,v6.s[0]\n\t"\ + "fmla v28.4s,v2.4s,v6.s[0]; fmla v29.4s,v0.4s,v6.s[1]\n\t"\ + "fmla v30.4s,v1.4s,v6.s[1]; fmla v31.4s,v2.4s,v6.s[1]\n\t" + +FUNC_M3(13) +FUNC_M3(14) +FUNC_M3(15) +FUNC_M3(16) +FUNC_M3(17) +FUNC_M3(18) +FUNC_M3(19) +FUNC_M3(20) +FUNC_M3(21) +FUNC_M3(22) +FUNC_M3(23) +FUNC_M3(24) +FUNC_M3(25) +FUNC_M3(26) + + +#define INIT_M1N4 \ + float32x4_t cq1, cq2, cq3, cq4;\ + cq1 = cq2 = cq3 = cq4 = vdupq_n_f32(0.0f); + +#define INIT_M1N5 INIT_M1N4 float32x4_t cq5 = vdupq_n_f32(0.0f); +#define INIT_M1N6 INIT_M1N5 float32x4_t cq6 = vdupq_n_f32(0.0f); +#define INIT_M1N7 INIT_M1N6 float32x4_t cq7 = vdupq_n_f32(0.0f); + +#define INIT_M1N8 \ + float32x4_t cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8;\ + cq1 = cq2 = cq3 = cq4 = vdupq_n_f32(0.0f);\ + cq5 = cq6 = cq7 = cq8 = vdupq_n_f32(0.0f); + +#define INIT_M1N9 INIT_M1N8 float32x4_t cq9 = vdupq_n_f32(0.0f); +#define INIT_M1N10 INIT_M1N9 float32x4_t cq10 = vdupq_n_f32(0.0f); +#define INIT_M1N11 INIT_M1N10 float32x4_t cq11 = vdupq_n_f32(0.0f); + +#define INIT_M1N12 \ + float32x4_t cq1, cq2, cq3, cq4, cq5, cq6;\ + float32x4_t cq7, cq8, cq9, cq10, cq11, cq12;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = vdupq_n_f32(0.0f);\ + cq7 = cq8 = cq9 = cq10 = cq11 = cq12 = vdupq_n_f32(0.0f); + +#define INIT_M1N13 INIT_M1N12 float32x4_t cq13 = vdupq_n_f32(0.0f); +#define INIT_M1N14 INIT_M1N13 float32x4_t cq14 = vdupq_n_f32(0.0f); +#define INIT_M1N15 INIT_M1N14 float32x4_t cq15 = vdupq_n_f32(0.0f); + +#define INIT_M1N16 \ + float32x4_t cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = cq7 = cq8 = 
vdupq_n_f32(0.0f);\ + +#define INIT_M1N17 INIT_M1N16 float32x4_t cq9 = vdupq_n_f32(0.0f); +#define INIT_M1N18 INIT_M1N17 float32x4_t cq10 = vdupq_n_f32(0.0f); +#define INIT_M1N19 INIT_M1N18 float32x4_t cq11 = vdupq_n_f32(0.0f); + +#define INIT_M1N20 \ + float32x4_t cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, cq10;\ + cq1 = cq2 = cq3 = cq4 = cq5 = vdupq_n_f32(0.0f);\ + cq6 = cq7 = cq8 = cq9 = cq10 = vdupq_n_f32(0.0f); + +#define INIT_M1N21 INIT_M1N20 float32x4_t cq11 = vdupq_n_f32(0.0f); +#define INIT_M1N22 INIT_M1N21 float32x4_t cq12 = vdupq_n_f32(0.0f); +#define INIT_M1N23 INIT_M1N22 float32x4_t cq13 = vdupq_n_f32(0.0f); + +#define INIT_M1N24 \ + float32x4_t cq1, cq2, cq3, cq4, cq5, cq6;\ + float32x4_t cq7, cq8, cq9, cq10, cq11, cq12;\ + cq1 = cq2 = cq3 = cq4 = cq5 = cq6 = vdupq_n_f32(0.0f);\ + cq7 = cq8 = cq9 = cq10 = cq11 = cq12 = vdupq_n_f32(0.0f); + +#define INIT_M1N25 INIT_M1N24 float32x4_t cq13 = vdupq_n_f32(0.0f); +#define INIT_M1N26 INIT_M1N25 float32x4_t cq14 = vdupq_n_f32(0.0f); + +#define ACC_K4M1N4 \ + float32x4_t aq1 = vld1q_f32(a_rd); a_rd += 4;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 0);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 1);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 2);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 3); + +#define UNIT_ACC_K4M1N1(q_no, off) \ + float32x4_t bq##q_no = vld1q_f32(b_rd + off);\ + cq##q_no = vfmaq_f32(cq##q_no, bq##q_no, aq1); + +#define ACC_K4M1N5 ACC_K4M1N4 UNIT_ACC_K4M1N1(5, 16) +#define ACC_K4M1N6 ACC_K4M1N5 UNIT_ACC_K4M1N1(6, 20) +#define ACC_K4M1N7 ACC_K4M1N6 UNIT_ACC_K4M1N1(7, 24) + +#define ACC_K4M1N8 \ + float32x4_t aq1 = vld1q_f32(a_rd); a_rd += 4;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + float32x4_t bq5 = vld1q_f32(b_rd + 16);\ + float32x4_t bq6 = vld1q_f32(b_rd + 20);\ + float32x4_t bq7 = vld1q_f32(b_rd + 24);\ + float32x4_t bq8 = vld1q_f32(b_rd + 28);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 0);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 0);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 1);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 1);\ + cq5 = vfmaq_laneq_f32(cq5, bq5, aq1, 2);\ + cq6 = vfmaq_laneq_f32(cq6, bq6, aq1, 2);\ + cq7 = vfmaq_laneq_f32(cq7, bq7, aq1, 3);\ + cq8 = vfmaq_laneq_f32(cq8, bq8, aq1, 3); + +#define ACC_K4M1N9 ACC_K4M1N8 UNIT_ACC_K4M1N1(9, 32) +#define ACC_K4M1N10 ACC_K4M1N9 UNIT_ACC_K4M1N1(10, 36) +#define ACC_K4M1N11 ACC_K4M1N10 UNIT_ACC_K4M1N1(11, 40) + +#define ACC_K4M1N12 \ + float32x4_t aq1 = vld1q_f32(a_rd); a_rd += 4;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + float32x4_t bq5 = vld1q_f32(b_rd + 16);\ + float32x4_t bq6 = vld1q_f32(b_rd + 20);\ + float32x4_t bq7 = vld1q_f32(b_rd + 24);\ + float32x4_t bq8 = vld1q_f32(b_rd + 28);\ + float32x4_t bq9 = vld1q_f32(b_rd + 32);\ + float32x4_t bq10 = vld1q_f32(b_rd + 36);\ + float32x4_t bq11 = vld1q_f32(b_rd + 40);\ + float32x4_t bq12 = vld1q_f32(b_rd + 44);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 0);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 0);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 0);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 1);\ + cq5 = vfmaq_laneq_f32(cq5, bq5, aq1, 1);\ + cq6 = vfmaq_laneq_f32(cq6, bq6, aq1, 1);\ + cq7 = vfmaq_laneq_f32(cq7, bq7, aq1, 2);\ + cq8 = 
vfmaq_laneq_f32(cq8, bq8, aq1, 2);\ + cq9 = vfmaq_laneq_f32(cq9, bq9, aq1, 2);\ + cq10 = vfmaq_laneq_f32(cq10, bq10, aq1, 3);\ + cq11 = vfmaq_laneq_f32(cq11, bq11, aq1, 3);\ + cq12 = vfmaq_laneq_f32(cq12, bq12, aq1, 3); + +#define ACC_K4M1N13 ACC_K4M1N12 UNIT_ACC_K4M1N1(13, 48) +#define ACC_K4M1N14 ACC_K4M1N13 UNIT_ACC_K4M1N1(14, 52) +#define ACC_K4M1N15 ACC_K4M1N14 UNIT_ACC_K4M1N1(15, 56) + +#define ACC_K4M1N16 \ + float32x4_t aq1 = vld1q_f32(a_rd); a_rd += 4;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + float32x4_t bq5 = vld1q_f32(b_rd + 16);\ + float32x4_t bq6 = vld1q_f32(b_rd + 20);\ + float32x4_t bq7 = vld1q_f32(b_rd + 24);\ + float32x4_t bq8 = vld1q_f32(b_rd + 28);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 0); bq1 = vld1q_f32(b_rd + 32);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 0); bq2 = vld1q_f32(b_rd + 36);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 0); bq3 = vld1q_f32(b_rd + 40);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 0); bq4 = vld1q_f32(b_rd + 44);\ + cq5 = vfmaq_laneq_f32(cq5, bq5, aq1, 1); bq5 = vld1q_f32(b_rd + 48);\ + cq6 = vfmaq_laneq_f32(cq6, bq6, aq1, 1); bq6 = vld1q_f32(b_rd + 52);\ + cq7 = vfmaq_laneq_f32(cq7, bq7, aq1, 1); bq7 = vld1q_f32(b_rd + 56);\ + cq8 = vfmaq_laneq_f32(cq8, bq8, aq1, 1); bq8 = vld1q_f32(b_rd + 60);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 2);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 2);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 2);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 2);\ + cq5 = vfmaq_laneq_f32(cq5, bq5, aq1, 3);\ + cq6 = vfmaq_laneq_f32(cq6, bq6, aq1, 3);\ + cq7 = vfmaq_laneq_f32(cq7, bq7, aq1, 3);\ + cq8 = vfmaq_laneq_f32(cq8, bq8, aq1, 3); + +#define ACC_K4M1N17 ACC_K4M1N16 UNIT_ACC_K4M1N1(9, 64) +#define ACC_K4M1N18 ACC_K4M1N17 UNIT_ACC_K4M1N1(10, 68) +#define ACC_K4M1N19 ACC_K4M1N18 UNIT_ACC_K4M1N1(11, 72) + +#define ACC_K4M1N20 \ + float32x4_t aq1 = vld1q_f32(a_rd); a_rd += 4;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + float32x4_t bq5 = vld1q_f32(b_rd + 16);\ + float32x4_t bq6 = vld1q_f32(b_rd + 20);\ + float32x4_t bq7 = vld1q_f32(b_rd + 24);\ + float32x4_t bq8 = vld1q_f32(b_rd + 28);\ + float32x4_t bq9 = vld1q_f32(b_rd + 32);\ + float32x4_t bq10 = vld1q_f32(b_rd + 36);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 0); bq1 = vld1q_f32(b_rd + 40);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 0); bq2 = vld1q_f32(b_rd + 44);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 0); bq3 = vld1q_f32(b_rd + 48);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 0); bq4 = vld1q_f32(b_rd + 52);\ + cq5 = vfmaq_laneq_f32(cq5, bq5, aq1, 0); bq5 = vld1q_f32(b_rd + 56);\ + cq6 = vfmaq_laneq_f32(cq6, bq6, aq1, 1); bq6 = vld1q_f32(b_rd + 60);\ + cq7 = vfmaq_laneq_f32(cq7, bq7, aq1, 1); bq7 = vld1q_f32(b_rd + 64);\ + cq8 = vfmaq_laneq_f32(cq8, bq8, aq1, 1); bq8 = vld1q_f32(b_rd + 68);\ + cq9 = vfmaq_laneq_f32(cq9, bq9, aq1, 1); bq9 = vld1q_f32(b_rd + 72);\ + cq10 = vfmaq_laneq_f32(cq10, bq10, aq1, 1); bq10 = vld1q_f32(b_rd + 76);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 2);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 2);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 2);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 2);\ + cq5 = vfmaq_laneq_f32(cq5, bq5, aq1, 2);\ + cq6 = vfmaq_laneq_f32(cq6, bq6, aq1, 3);\ + cq7 = vfmaq_laneq_f32(cq7, bq7, aq1, 3);\ + cq8 = vfmaq_laneq_f32(cq8, bq8, aq1, 3);\ + cq9 = vfmaq_laneq_f32(cq9, bq9, aq1, 3);\ + cq10 = vfmaq_laneq_f32(cq10, 
bq10, aq1, 3); + +#define ACC_K4M1N21 ACC_K4M1N20 UNIT_ACC_K4M1N1(11, 80) +#define ACC_K4M1N22 ACC_K4M1N21 UNIT_ACC_K4M1N1(12, 84) +#define ACC_K4M1N23 ACC_K4M1N22 UNIT_ACC_K4M1N1(13, 88) + +#define ACC_K4M1N24 \ + float32x4_t aq1 = vld1q_f32(a_rd); a_rd += 4;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + float32x4_t bq5 = vld1q_f32(b_rd + 16);\ + float32x4_t bq6 = vld1q_f32(b_rd + 20);\ + float32x4_t bq7 = vld1q_f32(b_rd + 24);\ + float32x4_t bq8 = vld1q_f32(b_rd + 28);\ + float32x4_t bq9 = vld1q_f32(b_rd + 32);\ + float32x4_t bq10 = vld1q_f32(b_rd + 36);\ + float32x4_t bq11 = vld1q_f32(b_rd + 40);\ + float32x4_t bq12 = vld1q_f32(b_rd + 44);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 0); bq1 = vld1q_f32(b_rd + 48);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 0); bq2 = vld1q_f32(b_rd + 52);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 0); bq3 = vld1q_f32(b_rd + 56);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 0); bq4 = vld1q_f32(b_rd + 60);\ + cq5 = vfmaq_laneq_f32(cq5, bq5, aq1, 0); bq5 = vld1q_f32(b_rd + 64);\ + cq6 = vfmaq_laneq_f32(cq6, bq6, aq1, 0); bq6 = vld1q_f32(b_rd + 68);\ + cq7 = vfmaq_laneq_f32(cq7, bq7, aq1, 1); bq7 = vld1q_f32(b_rd + 72);\ + cq8 = vfmaq_laneq_f32(cq8, bq8, aq1, 1); bq8 = vld1q_f32(b_rd + 76);\ + cq9 = vfmaq_laneq_f32(cq9, bq9, aq1, 1); bq9 = vld1q_f32(b_rd + 80);\ + cq10 = vfmaq_laneq_f32(cq10, bq10, aq1, 1); bq10 = vld1q_f32(b_rd + 84);\ + cq11 = vfmaq_laneq_f32(cq11, bq11, aq1, 1); bq11 = vld1q_f32(b_rd + 88);\ + cq12 = vfmaq_laneq_f32(cq12, bq12, aq1, 1); bq12 = vld1q_f32(b_rd + 92);\ + cq1 = vfmaq_laneq_f32(cq1, bq1, aq1, 2);\ + cq2 = vfmaq_laneq_f32(cq2, bq2, aq1, 2);\ + cq3 = vfmaq_laneq_f32(cq3, bq3, aq1, 2);\ + cq4 = vfmaq_laneq_f32(cq4, bq4, aq1, 2);\ + cq5 = vfmaq_laneq_f32(cq5, bq5, aq1, 2);\ + cq6 = vfmaq_laneq_f32(cq6, bq6, aq1, 2);\ + cq7 = vfmaq_laneq_f32(cq7, bq7, aq1, 3);\ + cq8 = vfmaq_laneq_f32(cq8, bq8, aq1, 3);\ + cq9 = vfmaq_laneq_f32(cq9, bq9, aq1, 3);\ + cq10 = vfmaq_laneq_f32(cq10, bq10, aq1, 3);\ + cq11 = vfmaq_laneq_f32(cq11, bq11, aq1, 3);\ + cq12 = vfmaq_laneq_f32(cq12, bq12, aq1, 3); + +#define ACC_K4M1N25 ACC_K4M1N24 UNIT_ACC_K4M1N1(13, 96) +#define ACC_K4M1N26 ACC_K4M1N25 UNIT_ACC_K4M1N1(14, 100) + +#define REDUC_M1N4 \ + cq1 = vaddq_f32(cq1, cq2); cq3 = vaddq_f32(cq3, cq4);\ + cq1 = vaddq_f32(cq1, cq3); + +#define UNIT_REDUC_1V(q_no, s_no) \ + float32x2_t cd##s_no = vadd_f32(vget_low_f32(cq##q_no),\ + vget_high_f32(cq##q_no));\ + float cs##s_no = vget_lane_f32(cd##s_no, 0) + vget_lane_f32(cd##s_no, 1); + +#define REDUC_M1N5 REDUC_M1N4 UNIT_REDUC_1V(5, 1) +#define REDUC_M1N6 REDUC_M1N5 UNIT_REDUC_1V(6, 2) +#define REDUC_M1N7 REDUC_M1N6 UNIT_REDUC_1V(7, 3) + +#define REDUC_M1N8 \ + cq1 = vaddq_f32(cq1, cq3); cq2 = vaddq_f32(cq2, cq4);\ + cq5 = vaddq_f32(cq5, cq7); cq6 = vaddq_f32(cq6, cq8);\ + cq1 = vaddq_f32(cq1, cq5); cq2 = vaddq_f32(cq2, cq6); + +#define REDUC_M1N9 REDUC_M1N8 UNIT_REDUC_1V(9, 1) +#define REDUC_M1N10 REDUC_M1N9 UNIT_REDUC_1V(10, 2) +#define REDUC_M1N11 REDUC_M1N10 UNIT_REDUC_1V(11, 3) + +#define REDUC_M1N12 \ + cq1 = vaddq_f32(cq1, cq4); cq2 = vaddq_f32(cq2, cq5);\ + cq3 = vaddq_f32(cq3, cq6); cq7 = vaddq_f32(cq7, cq10);\ + cq8 = vaddq_f32(cq8, cq11); cq9 = vaddq_f32(cq9, cq12);\ + cq1 = vaddq_f32(cq1, cq7); cq2 = vaddq_f32(cq2, cq8);\ + cq3 = vaddq_f32(cq3, cq9); + +#define REDUC_M1N13 REDUC_M1N12 UNIT_REDUC_1V(13, 1) +#define REDUC_M1N14 REDUC_M1N13 UNIT_REDUC_1V(14, 2) +#define REDUC_M1N15 REDUC_M1N14 
UNIT_REDUC_1V(15, 3) + +#define REDUC_M1N16 \ + cq1 = vaddq_f32(cq1, cq5); cq2 = vaddq_f32(cq2, cq6);\ + cq3 = vaddq_f32(cq3, cq7); cq4 = vaddq_f32(cq4, cq8); + +#define REDUC_M1N17 REDUC_M1N16 UNIT_REDUC_1V(9, 1) +#define REDUC_M1N18 REDUC_M1N17 UNIT_REDUC_1V(10, 2) +#define REDUC_M1N19 REDUC_M1N18 UNIT_REDUC_1V(11, 3) + +#define REDUC_M1N20 \ + cq1 = vaddq_f32(cq1, cq6); cq2 = vaddq_f32(cq2, cq7);\ + cq3 = vaddq_f32(cq3, cq8); cq4 = vaddq_f32(cq4, cq9);\ + cq5 = vaddq_f32(cq5, cq10); + +#define REDUC_M1N21 REDUC_M1N20 UNIT_REDUC_1V(11, 1) +#define REDUC_M1N22 REDUC_M1N21 UNIT_REDUC_1V(12, 2) +#define REDUC_M1N23 REDUC_M1N22 UNIT_REDUC_1V(13, 3) + +#define REDUC_M1N24 \ + cq1 = vaddq_f32(cq1, cq7); cq2 = vaddq_f32(cq2, cq8);\ + cq3 = vaddq_f32(cq3, cq9); cq4 = vaddq_f32(cq4, cq10);\ + cq5 = vaddq_f32(cq5, cq11); cq6 = vaddq_f32(cq6, cq12); + +#define REDUC_M1N25 REDUC_M1N24 UNIT_REDUC_1V(13, 1) +#define REDUC_M1N26 REDUC_M1N25 UNIT_REDUC_1V(14, 2) + +#define ACC_K1M1N4 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1); + +#define ACC_K1M1N5 ACC_K1M1N4 cs1 += as1 * b_rd[4]; +#define ACC_K1M1N6 ACC_K1M1N5 cs2 += as1 * b_rd[5]; +#define ACC_K1M1N7 ACC_K1M1N6 cs3 += as1 * b_rd[6]; + +#define ACC_K1M1N8 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1);\ + cq2 = vfmaq_n_f32(cq2, bq2, as1); + +#define ACC_K1M1N9 ACC_K1M1N8 cs1 += as1 * b_rd[8]; +#define ACC_K1M1N10 ACC_K1M1N9 cs2 += as1 * b_rd[9]; +#define ACC_K1M1N11 ACC_K1M1N10 cs3 += as1 * b_rd[10]; + +#define ACC_K1M1N12 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1);\ + cq2 = vfmaq_n_f32(cq2, bq2, as1);\ + cq3 = vfmaq_n_f32(cq3, bq3, as1); + +#define ACC_K1M1N13 ACC_K1M1N12 cs1 += as1 * b_rd[12]; +#define ACC_K1M1N14 ACC_K1M1N13 cs2 += as1 * b_rd[13]; +#define ACC_K1M1N15 ACC_K1M1N14 cs3 += as1 * b_rd[14]; + +#define ACC_K1M1N16 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1);\ + cq2 = vfmaq_n_f32(cq2, bq2, as1);\ + cq3 = vfmaq_n_f32(cq3, bq3, as1);\ + cq4 = vfmaq_n_f32(cq4, bq4, as1); + +#define ACC_K1M1N17 ACC_K1M1N16 cs1 += as1 * b_rd[16]; +#define ACC_K1M1N18 ACC_K1M1N17 cs2 += as1 * b_rd[17]; +#define ACC_K1M1N19 ACC_K1M1N18 cs3 += as1 * b_rd[18]; + +#define ACC_K1M1N20 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + float32x4_t bq5 = vld1q_f32(b_rd + 16);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1);\ + cq2 = vfmaq_n_f32(cq2, bq2, as1);\ + cq3 = vfmaq_n_f32(cq3, bq3, as1);\ + cq4 = vfmaq_n_f32(cq4, bq4, as1);\ + cq5 = vfmaq_n_f32(cq5, bq5, as1); + +#define ACC_K1M1N21 ACC_K1M1N20 cs1 += as1 * b_rd[20]; +#define ACC_K1M1N22 ACC_K1M1N21 cs2 += as1 * b_rd[21]; +#define ACC_K1M1N23 ACC_K1M1N22 cs3 += as1 * b_rd[22]; + +#define ACC_K1M1N24 \ + float as1 = *a_rd++;\ + float32x4_t bq1 = vld1q_f32(b_rd);\ + float32x4_t bq2 = vld1q_f32(b_rd + 4);\ + float32x4_t bq3 = vld1q_f32(b_rd + 8);\ + float32x4_t bq4 = vld1q_f32(b_rd + 12);\ + float32x4_t bq5 = vld1q_f32(b_rd + 16);\ + float32x4_t bq6 = vld1q_f32(b_rd + 20);\ + cq1 = vfmaq_n_f32(cq1, bq1, as1);\ + cq2 = 
vfmaq_n_f32(cq2, bq2, as1);\
+  cq3 = vfmaq_n_f32(cq3, bq3, as1);\
+  cq4 = vfmaq_n_f32(cq4, bq4, as1);\
+  cq5 = vfmaq_n_f32(cq5, bq5, as1);\
+  cq6 = vfmaq_n_f32(cq6, bq6, as1);
+
+#define ACC_K1M1N25 ACC_K1M1N24 cs1 += as1 * b_rd[24];
+#define ACC_K1M1N26 ACC_K1M1N25 cs2 += as1 * b_rd[25];
+
+#define UNIT_SAVE_M1N4_CC(cq1) \
+  c_ptr[0] = c_ptr[0] * beta + vgetq_lane_f32(cq1, 0);\
+  c_ptr[LDC] = c_ptr[LDC] * beta + vgetq_lane_f32(cq1, 1);\
+  c_ptr += LDC * 2;\
+  c_ptr[0] = c_ptr[0] * beta + vgetq_lane_f32(cq1, 2);\
+  c_ptr[LDC] = c_ptr[LDC] * beta + vgetq_lane_f32(cq1, 3);\
+  c_ptr += LDC * 2;
+
+#define UNIT_SAVE_M1N4_CR(cq1) \
+  cq1 = vfmaq_n_f32(cq1, vld1q_f32(c_ptr), beta);\
+  vst1q_f32(c_ptr, cq1); c_ptr += 4;
+
+#define UNIT_SAVE_M1N1_CC(cs1) \
+  c_ptr[0] = c_ptr[0] * beta + cs1; c_ptr += LDC;
+
+#define UNIT_SAVE_M1N1_CR(cs1) \
+  c_ptr[0] = c_ptr[0] * beta + cs1; c_ptr++;
+
+#define SAVE_M1N4(mode) UNIT_SAVE_M1N4_##mode(cq1)
+
+#define SAVE_M1N5(mode) SAVE_M1N4(mode) UNIT_SAVE_M1N1_##mode(cs1)
+#define SAVE_M1N6(mode) SAVE_M1N5(mode) UNIT_SAVE_M1N1_##mode(cs2)
+#define SAVE_M1N7(mode) SAVE_M1N6(mode) UNIT_SAVE_M1N1_##mode(cs3)
+
+#define SAVE_M1N8(mode) \
+  UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2)
+
+#define SAVE_M1N9(mode) SAVE_M1N8(mode) UNIT_SAVE_M1N1_##mode(cs1)
+#define SAVE_M1N10(mode) SAVE_M1N9(mode) UNIT_SAVE_M1N1_##mode(cs2)
+#define SAVE_M1N11(mode) SAVE_M1N10(mode) UNIT_SAVE_M1N1_##mode(cs3)
+
+#define SAVE_M1N12(mode) \
+  UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2) UNIT_SAVE_M1N4_##mode(cq3)
+
+#define SAVE_M1N13(mode) SAVE_M1N12(mode) UNIT_SAVE_M1N1_##mode(cs1)
+#define SAVE_M1N14(mode) SAVE_M1N13(mode) UNIT_SAVE_M1N1_##mode(cs2)
+#define SAVE_M1N15(mode) SAVE_M1N14(mode) UNIT_SAVE_M1N1_##mode(cs3)
+
+#define SAVE_M1N16(mode) \
+  UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2)\
+  UNIT_SAVE_M1N4_##mode(cq3) UNIT_SAVE_M1N4_##mode(cq4)
+
+#define SAVE_M1N17(mode) SAVE_M1N16(mode) UNIT_SAVE_M1N1_##mode(cs1)
+#define SAVE_M1N18(mode) SAVE_M1N17(mode) UNIT_SAVE_M1N1_##mode(cs2)
+#define SAVE_M1N19(mode) SAVE_M1N18(mode) UNIT_SAVE_M1N1_##mode(cs3)
+
+#define SAVE_M1N20(mode) \
+  UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2)\
+  UNIT_SAVE_M1N4_##mode(cq3) UNIT_SAVE_M1N4_##mode(cq4) UNIT_SAVE_M1N4_##mode(cq5)
+
+#define SAVE_M1N21(mode) SAVE_M1N20(mode) UNIT_SAVE_M1N1_##mode(cs1)
+#define SAVE_M1N22(mode) SAVE_M1N21(mode) UNIT_SAVE_M1N1_##mode(cs2)
+#define SAVE_M1N23(mode) SAVE_M1N22(mode) UNIT_SAVE_M1N1_##mode(cs3)
+
+#define SAVE_M1N24(mode) \
+  UNIT_SAVE_M1N4_##mode(cq1) UNIT_SAVE_M1N4_##mode(cq2) UNIT_SAVE_M1N4_##mode(cq3)\
+  UNIT_SAVE_M1N4_##mode(cq4) UNIT_SAVE_M1N4_##mode(cq5) UNIT_SAVE_M1N4_##mode(cq6)
+
+#define SAVE_M1N25(mode) SAVE_M1N24(mode) UNIT_SAVE_M1N1_##mode(cs1)
+#define SAVE_M1N26(mode) SAVE_M1N25(mode) UNIT_SAVE_M1N1_##mode(cs2)
+
+#define FUNC_M1(ndim) \
+static inline void sgemm_skinny1_a7x_m1n##ndim(\
+  const float * __restrict__ a_rd, const float * __restrict__ b_rd,\
+  float * __restrict__ c_ptr, uint32_t k_left, uint32_t LDC,\
+  uint8_t c_rowmajor, float beta) {\
+  INIT_M1N##ndim\
+  for (; k_left > 3; k_left -= 4) {\
+    ACC_K4M1N##ndim\
+    b_rd += 4 * ndim;\
+  }\
+  REDUC_M1N##ndim\
+  for (; k_left > 0; k_left--) {\
+    ACC_K1M1N##ndim\
+    b_rd += ndim;\
+  }\
+  if (c_rowmajor == 0) {\
+    SAVE_M1N##ndim(CC)\
+  } else {\
+    SAVE_M1N##ndim(CR)\
+  }\
+}
+
+FUNC_M1(4)
+FUNC_M1(5)
+FUNC_M1(6)
+FUNC_M1(7)
+FUNC_M1(8)
+FUNC_M1(9)
+FUNC_M1(10)
+FUNC_M1(11)
+FUNC_M1(12)
+FUNC_M1(13)
+FUNC_M1(14)
+FUNC_M1(15)
+FUNC_M1(16)
+FUNC_M1(17)
+FUNC_M1(18)
+FUNC_M1(19)
+FUNC_M1(20)
+FUNC_M1(21)
+FUNC_M1(22)
+FUNC_M1(23)
+FUNC_M1(24)
+FUNC_M1(25)
+FUNC_M1(26)
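+
+/* Reference semantics of the FUNC_M1 kernels above (an illustrative
+ * sketch only, not part of the build; the _ref name is hypothetical):
+ * sgemm_skinny1_a7x_m1nX computes one row of C against an X-column B
+ * packed k-major in groups of X floats per k step. For X = 4 the
+ * generated kernel is equivalent to:
+ *
+ *   void sgemm_skinny1_a7x_m1n4_ref(const float *a, const float *b,
+ *       float *c, uint32_t k, uint32_t LDC, uint8_t c_rowmajor,
+ *       float beta) {
+ *       float acc[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ *       for (uint32_t p = 0; p < k; ++p) {
+ *           for (int n = 0; n < 4; ++n) acc[n] += a[p] * b[p * 4 + n];
+ *       }
+ *       for (int n = 0; n < 4; ++n) {
+ *           float *cp = c_rowmajor ? c + n : c + n * LDC;
+ *           *cp = *cp * beta + acc[n];  // CC: column stride LDC
+ *       }
+ *   }
+ */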
+
+#endif
diff --git a/src/arm_neon/ARMCompareAndSwap.c b/src/arm_neon/ARMCompareAndSwap.c
new file mode 100644
index 0000000..4dcf12f
--- /dev/null
+++ b/src/arm_neon/ARMCompareAndSwap.c
@@ -0,0 +1,112 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.         */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software      */
+/* distributed under the License is distributed on an "AS IS" BASIS,        */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and      */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <sys/auxv.h>
+
+#if __aarch64__
+/* detect ARMv8 CAS support */
+static __thread uint8_t blas_arm64_cas_type = 0;
+static __thread uint8_t blas_arm64_cas_init = 0;
+
+#ifndef HWCAP_ATOMICS
+#define HWCAP_ATOMICS (1 << 8)
+#endif
+
+static uint8_t blas_arm64_get_cas_support() {
+  if (!blas_arm64_cas_init) {
+    blas_arm64_cas_type = (getauxval(AT_HWCAP) & HWCAP_ATOMICS) ?
+      1 : 0;
+    blas_arm64_cas_init = 1;
+  }
+  return blas_arm64_cas_type;
+}
+
+#endif
+
+uint32_t atomicCAS_U32(uint32_t comp, uint32_t write, uint32_t *dst) {
+#if __aarch64__
+  if (blas_arm64_get_cas_support()) {
+    uint32_t tmp = comp;
+    __asm__ __volatile__(
+      "cas %w0,%w1,[%2]\n\t"
+      :"+r"(tmp):"r"(write),"r"(dst):"cc","memory");
+    return tmp;
+  } else {
+    register uint32_t tmp __asm("w0");
+    register uint32_t comp_asm __asm("w2") = comp;
+    register uint32_t write_asm __asm("w3") = write;
+    register uint32_t *dst_asm __asm("x4") = dst;
+    __asm__ __volatile__(
+      "1:\n\t"
+      "ldxr %w0,[%x3]; cmp %w0,%w1; bne 2f; stxr w1,%w2,[%x3]\n\t"
+      "cmp w1,#0; bne 1b\n\t"
+      "2:\n\t"
+      :"+r"(tmp):"r"(comp_asm),"r"(write_asm),"r"(dst_asm):"x1","cc","memory");
+    return tmp;
+  }
+#else
+  register uint32_t tmp __asm("r0");
+  register uint32_t comp_asm __asm("r2") = comp;
+  register uint32_t write_asm __asm("r3") = write;
+  register uint32_t *dst_asm __asm("r4") = dst;
+  __asm__ __volatile__(
+    "1:\n\t"
+    "ldrex %0,[%3]; cmp %0,%1; bne 2f; strex r1,%2,[%3]\n\t"
+    "cmp r1,#0; bne 1b\n\t"
+    "2:\n\t"
+    :"+r"(tmp):"r"(comp_asm),"r"(write_asm),"r"(dst_asm):"r1","cc","memory");
+  return tmp;
+#endif
+}
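+
+/* Both CAS routines return the value observed at *dst; the swap took
+ * effect iff that value equals comp. A typical retry loop around it
+ * (illustrative sketch only; "counter" is a hypothetical variable):
+ *
+ *   uint32_t old, seen;
+ *   do {
+ *       old = *counter;
+ *       seen = atomicCAS_U32(old, old + 1, counter);
+ *   } while (seen != old);   // lock-free increment of *counter
+ */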
+
+uint64_t atomicCAS_U64(uint64_t comp, uint64_t write, uint64_t *dst) {
+  uint64_t tmp;
+#if __aarch64__
+  if (blas_arm64_get_cas_support()) {
+    tmp = comp;
+    __asm__ __volatile__(
+      "cas %x0,%x1,[%2]\n\t"
+      :"+r"(tmp):"r"(write),"r"(dst):"cc","memory");
+  } else {
+    __asm__ __volatile__(
+      "mov x2,%x1; mov x4,%x2\n\t"
+      "1:\n\t"
+      "ldxr %x0,[%x3]; cmp %x0,x2; bne 2f; stxr w6,x4,[%x3]\n\t"
+      "cmp w6,#0; bne 1b\n\t"
+      "2:\n\t"
+      :"+r"(tmp):"r"(comp),"r"(write),"r"(dst):"x2","x4","w6","cc","memory");
+  }
+#else
+  uint64_t *comp_addr = &comp;
+  uint64_t *write_loc = &write;
+  uint64_t *tmp_addr = &tmp;
+  __asm__ __volatile__(
+    "ldr r2,[%0]; ldr r3,[%0,#4]; ldr r4,[%1]; ldr r5,[%1,#4]\n\t"
+    "1:\n\t"
+    "ldrexd r0,r1,[%2]; cmp r0,r2; bne 2f\n\t"
+    "cmp r1,r3; bne 2f; strexd r6,r4,r5,[%2]\n\t"
+    "cmp r6,#0; bne 1b\n\t"
+    "2:\n\t"
+    "str r0,[%3]; str r1,[%3,#4]\n\t"
+    ::"r"(comp_addr),"r"(write_loc),"r"(dst),"r"(tmp_addr)
+    :"r0","r1","r2","r3","r4","r5","r6","cc","memory");
+#endif
+  return tmp;
+}
+
diff --git a/src/arm_neon/ARMCpuType.c b/src/arm_neon/ARMCpuType.c
new file mode 100644
index 0000000..6d58056
--- /dev/null
+++ b/src/arm_neon/ARMCpuType.c
@@ -0,0 +1,451 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                           */
+/* Licensed under the Apache License, Version 2.0 (the "License");           */
+/* you may not use this file except in compliance with the License.         */
+/* You may obtain a copy of the License at                                   */
+/*                                                                           */
+/*     http://www.apache.org/licenses/LICENSE-2.0                            */
+/*                                                                           */
+/* Unless required by applicable law or agreed to in writing, software      */
+/* distributed under the License is distributed on an "AS IS" BASIS,        */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and      */
+/* limitations under the License.                                            */
+/*****************************************************************************/
+
+
+#include "arm_neon/ARMCpuType.h"
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <setjmp.h>
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
+#include <signal.h>
+#include <sys/auxv.h>
+
+#define MAX_CPU_COUNT 20
+
+struct ARM_CpuType {
+  bool m_init;
+  uint8_t m_cpuType[MAX_CPU_COUNT];
+};
+
+static struct ARM_CpuType blas_arm_cpu_type = {false, {0}};
+
+static pthread_mutex_t blas_arm_get_cpu_type_lock
+  = PTHREAD_MUTEX_INITIALIZER;
+
+static bool is_hex(char test) {
+  if (test >= 48 && test <= 57) return true; //0-9
+  else if (test >= 65 && test <= 70) return true; //A-F
+  else if (test >= 97 && test <= 102) return true; //a-f
+  else return false;
+}
+
+static uint16_t hex2num(char test) {
+  if (test >= 48 && test <= 57) return (test - 48); //0-9
+  else if (test >= 65 && test <= 70) return (test - 55); //A-F
+  else if (test >= 97 && test <= 102) return (test - 87); //a-f
+  else return 0;
+}
+
+static uint16_t extract_id(const char *line_header,
+  unsigned int header_start, unsigned int size) {
+
+  unsigned int header_read = header_start;
+  /* find the first colon */
+  for (; header_read < size; ++header_read) {
+    if (line_header[header_read] == ':') {
+      header_read++;
+      break;
+    }
+  }
+  /* skip space chars after the colon */
+  for (; header_read < size; ++header_read) {
+    if (line_header[header_read] != ' ') break;
+  }
+  /* detect 0x or 0X header */
+  bool hex_id = false;
+  if (header_read + 2 < size) {
+    if (line_header[header_read] == '0' &&
+      (line_header[header_read + 1] == 'x' || line_header[header_read + 1] == 'X')) {
+      hex_id = true;
+      header_read += 2;
+    }
+  }
+  /* read number */
+  uint16_t id = 0;
+  if (hex_id) {
+    for (; header_read < size; ++header_read) {
+      char test = line_header[header_read];
+      if (!is_hex(test)) break;
+      id = id * 16 + hex2num(test);
+    }
+  } else {//decimal
+    for (; header_read < size; ++header_read) {
+      char test = line_header[header_read];
+      if (test < 48 || test > 57) break;
+      id = id * 10 + (test - 48);
+    }
+  }
+  return id;
+}
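+
+/* Worked example: MIDR 0x410FD034 decodes as implementer = 0x41 (ARM),
+ * variant = 0x0, architecture = 0xF, part = 0xD03, revision = 0x4,
+ * so parse_midr() below reports a Cortex-A53 core (return value 53). */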
+
+/* parse_midr: get CPU model information from MIDR bits */
+static uint8_t parse_midr(uint32_t midr) {
+
+  uint8_t cputype = 0; //0 = generic
+  uint32_t implementer = midr >> 24;
+  uint32_t part = (midr >> 4) & 0xFFF;
+  uint32_t variant = (midr >> 20) & 0xF;
+  if (implementer == 0x41) { //0x41 == ARM
+    if (part == 0xD03) cputype = 53; //Cortex-A53
+    else if (part == 0xD04) cputype = 35; //Cortex-A35
+    else if (part == 0xD05) {
+      if (variant > 0) cputype = 55; //Cortex-A55
+      else cputype = 53; //dual-issue ability of Cortex-A55r0 is limited
+    }
+  }
+  else if (implementer == 0x51) { //0x51 == Qualcomm
+    if (part == 0x803 || part == 0x801) cputype = 53;
+    if (part == 0x805) cputype = 55;
+  }
+  return cputype;
+}
+
+/* MIDR: Main ID Register in ARM processors */
+/* direct access to MIDR is not possible in user mode without kernel modules */
+/* however the system (Linux/Android) reads MIDR and stores its info in /proc/cpuinfo */
+/* so we can assemble the bits of MIDR from the information in /proc/cpuinfo */
+static int read_midr(uint32_t *midr, uint8_t midr_size) {
+
+  FILE *fp = fopen("/proc/cpuinfo", "r");
+  if (fp == NULL) {
+    return -1; //file open failed
+  }
+
+  unsigned char num_cpu_detected = 0;
+  unsigned char num_cpu_part_parsed = 0;
+  unsigned char num_cpu_vendor_parsed = 0;
+
+  char buffer[300], line_header[30];
+  unsigned int header_read = 0, buffer_read = 0;
+  bool continue_find_endline = false, line_fill = false;
+  size_t bytes_read = 0;
+  unsigned int cpuid = 0;
+  do {
+    bytes_read = fread(buffer, 1, sizeof(buffer), fp);
+    if (ferror(fp)) {
+      fclose(fp);
+      return -2; //error during file read
+    }
+    for (buffer_read = 0; buffer_read < bytes_read; ) {
+      if (continue_find_endline) {
+        for (; buffer_read < bytes_read; ++buffer_read) {
+          if (buffer[buffer_read] == '\n') {
+            continue_find_endline = false;
+            buffer_read++;
+            break;
+          }
+        }
+      }
+      for (; buffer_read < bytes_read; ++buffer_read) {
+        if (header_read == sizeof(line_header) || buffer[buffer_read] == '\n') {
+          line_fill = true;
+          break;
+        }
+        line_header[header_read] = buffer[buffer_read]; header_read++;
+      }
+      if (line_fill) {
+        for (; header_read < sizeof(line_header); ++header_read) {
+          line_header[header_read] = '\0';
+        }
+        /* extract MIDR information from /proc/cpuinfo */
+        /* "CPU implementer : " */
+        /* "CPU variant : " */
+        /* "CPU architecture: " */
+        /* "CPU part : " */
+        /* "CPU revision : " */
+        if (line_header[0] == 'C' && line_header[1] == 'P' && line_header[2] == 'U'
+          && cpuid < midr_size) {
+
+          for (header_read = 3; header_read < sizeof(line_header); ++header_read) {
+            if (line_header[header_read] != ' ') break;
+          }
+          bool skip_detection = false;
+          /* extract architecture (MIDR[16:19]) */
+          if (header_read + 12 < sizeof(line_header)) {
+            if (line_header[header_read] == 'a' && line_header[header_read + 1] == 'r'
+              && line_header[header_read + 2] == 'c' && line_header[header_read + 3] == 'h'
+              && line_header[header_read + 4] == 'i' && line_header[header_read + 5] == 't') {
+
+              skip_detection = true;
+              header_read += 12;
+              midr[cpuid] |=
+                ((uint32_t)extract_id(line_header, header_read, sizeof(line_header)) << 16);
+            }
+          }
+          /* extract revision (MIDR[0:3]) */
+          if (!skip_detection && header_read + 8 < sizeof(line_header)) {
+            if (line_header[header_read] == 'r' && line_header[header_read + 1] == 'e'
+              && line_header[header_read + 2] == 'v' && line_header[header_read + 3] == 'i'
+              && line_header[header_read + 4] == 's' && line_header[header_read + 5] == 'i') {
+
+              skip_detection = true;
+              header_read += 8;
+              midr[cpuid] |=
+                ((uint32_t)extract_id(line_header, header_read, sizeof(line_header)));
+            }
+          }
+          /* extract variant (MIDR[20:23]) */
+          if (!skip_detection && header_read + 7 < sizeof(line_header)) {
+            if (line_header[header_read] == 'v' && line_header[header_read + 1] == 'a'
+              && line_header[header_read + 2] == 'r' && line_header[header_read + 3] == 'i'
+              && line_header[header_read + 4] == 'a' && line_header[header_read + 5] == 'n') {
+
+              skip_detection = true;
+              header_read += 7;
+              midr[cpuid] |=
+                ((uint32_t)extract_id(line_header, header_read, sizeof(line_header)) << 20);
+            }
+          }
+          /* extract implementer (MIDR[24:31]) */
+          if (!skip_detection && header_read + 11 < sizeof(line_header)) {
+            if (line_header[header_read] == 'i' && line_header[header_read + 1] == 'm'
+              && line_header[header_read + 2] == 'p' && line_header[header_read + 3] == 'l'
+              && line_header[header_read + 4] == 'e' && line_header[header_read + 5] == 'm') {
+
+              skip_detection = true;
+              header_read += 11;
+              midr[cpuid] |=
+                ((uint32_t)extract_id(line_header, header_read, sizeof(line_header))) << 24;
+              num_cpu_vendor_parsed++;
+            }
+          }
+          /* extract part number (MIDR[4:15]) */
+          if (!skip_detection && header_read + 4 < sizeof(line_header)) {
+            if (line_header[header_read] == 'p' && line_header[header_read + 1] == 'a'
+              && line_header[header_read + 2] == 'r' && line_header[header_read + 3] == 't') {
+
+              skip_detection = true;
+              header_read += 4;
+              midr[cpuid] |=
+                ((uint32_t)extract_id(line_header, header_read, sizeof(line_header))) << 4;
+              num_cpu_part_parsed++;
+            }
+          }
+        }
+        /* read processor id from /proc/cpuinfo */
+        /* "processor : " */
+        if (line_header[0] == 'p' && line_header[1] == 'r' && line_header[2] == 'o'
+          && line_header[3] == 'c' && line_header[4] == 'e' && line_header[5] == 's'
+          && line_header[6] == 's' && line_header[7] == 'o' && line_header[8] == 'r') {
+
+          header_read = 9;
+          cpuid = extract_id(line_header, header_read, sizeof(line_header));
+          if (cpuid < midr_size) midr[cpuid] = 0;
+          num_cpu_detected++;
+        }
+        line_fill = false;
+        header_read = 0;
+      }
+      for (; buffer_read < bytes_read; ++buffer_read) {
+        continue_find_endline = true;
+        if (buffer[buffer_read] == '\n') {
+          continue_find_endline = false;
+          buffer_read++;
+          break;
+        }
+      }
+    }
+  } while(bytes_read == sizeof(buffer));
+
+  fclose(fp);
+
+  /* on some platforms the Linux kernel is buggy,
+   * so the info from /proc/cpuinfo lacks some fields.
+   */
+  if (num_cpu_detected != num_cpu_part_parsed) return -3;
+  if (num_cpu_detected != num_cpu_vendor_parsed) return -3;
+  return num_cpu_detected;
+}
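+
+/* Example /proc/cpuinfo fragment and the MIDR assembled from it:
+ *
+ *   processor        : 0
+ *   CPU implementer  : 0x41
+ *   CPU architecture : 8
+ *   CPU variant      : 0x0
+ *   CPU part         : 0xd03
+ *   CPU revision     : 4
+ *
+ * read_midr() packs these as (0x41 << 24) | (0x0 << 20) | (8 << 16) |
+ * (0xd03 << 4) | 4 = 0x4108d034, which parse_midr() maps to 53
+ * (Cortex-A53). */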
+
+static char cpu_uevent[40] = "/sys/devices/system/cpu/cpu";
+
+static uint8_t get_cputype_from_uevent(uint8_t cpuid) {
+  /* first form the file path */
+  uint8_t digits[8];
+  uint8_t n_digits = 0;
+  uint8_t tmp = cpuid;
+  do {
+    digits[n_digits] = tmp % 10;
+    tmp /= 10;
+    n_digits++;
+  } while (tmp > 0);
+  for (uint8_t i = 0; i < n_digits; ++i) {
+    cpu_uevent[27 + i] = digits[n_digits - i - 1] + 48;
+  }
+  uint8_t tail_pos = 27 + n_digits;
+  cpu_uevent[tail_pos] = '/';
+  cpu_uevent[tail_pos + 1] = 'u';
+  cpu_uevent[tail_pos + 2] = 'e';
+  cpu_uevent[tail_pos + 3] = 'v';
+  cpu_uevent[tail_pos + 4] = 'e';
+  cpu_uevent[tail_pos + 5] = 'n';
+  cpu_uevent[tail_pos + 6] = 't';
+  cpu_uevent[tail_pos + 7] = '\0';
+  /* then open the file */
+  FILE *fp = fopen(cpu_uevent, "r");
+  if (fp == NULL) {
+    return 0; //file open failed
+  }
+  unsigned char buffer[100];
+  size_t len = fread(buffer, 1, sizeof(buffer), fp);
+  if (ferror(fp)) {
+    fclose(fp); //close the handle on the error path too
+    return 0; //error during read
+  }
+  fclose(fp); //the file is no longer needed once the buffer is filled
+  uint8_t cputype = 0;
+  /* search for patterns like "OF_COMPATIBLE_0=arm,cortex-a72" */
+  /* scan only the bytes actually read to avoid uninitialized data */
+  for (uint8_t i = 0; i + 40 < len; ++i) {
+    if (buffer[i] == 'O' && buffer[i + 1] == 'F' && buffer[i + 2] == '_') {
+      i += 3;
+      if (buffer[i] == 'C' && buffer[i + 1] == 'O' && buffer[i + 2] == 'M') {
+        i += 3;
+        if (buffer[i] == 'P' && buffer[i + 1] == 'A' && buffer[i + 2] == 'T') {
+          i += 10;
+          if (buffer[i] == 'a' && buffer[i + 1] == 'r' && buffer[i + 2] == 'm') {
+            i += 4;
+            if (buffer[i] == 'c' && buffer[i + 1] == 'o' && buffer[i + 2] == 'r') {
+              i += 5;
+              if (buffer[i] == 'x' && buffer[i + 1] == '-' && buffer[i + 2] == 'a') {
+                char tmp = buffer[i + 3];
+                if (tmp >= 48 && tmp <= 57) cputype = tmp - 48;
+                tmp = buffer[i + 4];
+                if (tmp >= 48 && tmp <= 57) cputype = cputype * 10 + (tmp - 48);
+                break;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return cputype;
+}
+
+uint8_t blas_arm_get_cpu_type(uint8_t cpuid) {
+  if (cpuid >= MAX_CPU_COUNT) return 0;
+  if (!blas_arm_cpu_type.m_init) {
+    int acc_lock = pthread_mutex_lock(&blas_arm_get_cpu_type_lock);
+    if (acc_lock != 0) return 0;
+    if (!blas_arm_cpu_type.m_init) {
+      uint32_t midr[MAX_CPU_COUNT];
+      for (int cpupos = 0; cpupos < MAX_CPU_COUNT; ++cpupos) {
+        midr[cpupos] = 0;
+      }
+      int midr_read_status = read_midr(midr, MAX_CPU_COUNT);
+      if (midr_read_status > MAX_CPU_COUNT) midr_read_status = MAX_CPU_COUNT;
+      if (midr_read_status >= 0) {
+        for (int cpupos = 0; cpupos < midr_read_status; ++cpupos) {
+          blas_arm_cpu_type.m_cpuType[cpupos] =
+            parse_midr(midr[cpupos]);
+        }
+      } else {
+        for (int cpupos = 0; cpupos < MAX_CPU_COUNT; ++cpupos) {
+          blas_arm_cpu_type.m_cpuType[cpupos] =
+            get_cputype_from_uevent(cpupos);
+        }
+      }
+      blas_arm_cpu_type.m_init = true;
+    }
+    pthread_mutex_unlock(&blas_arm_get_cpu_type_lock);
+  }
+  return blas_arm_cpu_type.m_cpuType[cpuid];
+}
+
+static __thread uint8_t blas_arm_fp16_type = 0;
+static __thread uint8_t blas_arm_fp16_init = 0;
+
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP (1 << 10)
+#endif
+#ifndef HWCAP_FPHP
+#define HWCAP_FPHP (1 << 9)
+#endif
+
+uint8_t blas_arm_get_fp16_support() {
+  if (!blas_arm_fp16_init) {
+    unsigned long hwcap = getauxval(AT_HWCAP);
+#if __aarch64__
+    blas_arm_fp16_type =
+      ((hwcap & HWCAP_ASIMDHP) && (hwcap & HWCAP_FPHP)) ? 2 : 1;
+#else
+    blas_arm_fp16_type =
+      ((hwcap & HWCAP_VFPv4) && (hwcap & HWCAP_NEON)) ?
1 : 0; +#endif + blas_arm_fp16_init = 1; + } + return blas_arm_fp16_type; +} + +#if __aarch64__ +#define GEMM_DEFAULT_I8I32_INST 1 +#else +#define GEMM_DEFAULT_I8I32_INST 0 +#endif + +static uint8_t blas_arm_i8i32_type = GEMM_DEFAULT_I8I32_INST + 1; +static uint8_t blas_arm_i8i32_init = 0; +static pthread_mutex_t blas_arm_set_int_lock + = PTHREAD_MUTEX_INITIALIZER; +static jmp_buf i8i32_ret_env; +static pthread_t int_tid; + +static void i8i32gemm_sigill_handler(int sigill) { + if (pthread_equal(int_tid, pthread_self()) != 0) { + blas_arm_i8i32_type = GEMM_DEFAULT_I8I32_INST; + longjmp(i8i32_ret_env, 1); + } else { + _Exit(EXIT_FAILURE); + } +} + +static void test_i8i32() { +#if __aarch64__ + __asm__ __volatile__("sdot v1.4s,v0.16b,v2.4b[0]":::"v0","v1","v2"); +#else + __asm__ __volatile__("vmlal.s16 q1,d0,d1[0]":::"q0","q1"); +#endif +} + +uint8_t blas_arm_get_i8i32_support() { + if (!blas_arm_i8i32_init) { + int acc_lock = pthread_mutex_lock(&blas_arm_set_int_lock); + if (acc_lock != 0) return GEMM_DEFAULT_I8I32_INST; + if (!blas_arm_i8i32_init) { + struct sigaction i8i32_act, old_act; + memset(&i8i32_act, '\0', sizeof(i8i32_act)); + i8i32_act.sa_handler = &i8i32gemm_sigill_handler; + int_tid = pthread_self(); + if (setjmp(i8i32_ret_env)) { + sigaction(SIGILL, &old_act, NULL); + blas_arm_i8i32_init = 1; + pthread_mutex_unlock(&blas_arm_set_int_lock); + return GEMM_DEFAULT_I8I32_INST; + } + __asm__ __volatile__("dsb sy":::"memory"); + sigaction(SIGILL, &i8i32_act, &old_act); + test_i8i32(); + sigaction(SIGILL, &old_act, NULL); + blas_arm_i8i32_init = 1; + } + pthread_mutex_unlock(&blas_arm_set_int_lock); + } + return blas_arm_i8i32_type; +} + diff --git a/src/neon_armv7a/Bias.c b/src/neon_armv7a/Bias.c new file mode 100644 index 0000000..e77da7c --- /dev/null +++ b/src/neon_armv7a/Bias.c @@ -0,0 +1,28 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "arm_neon/NeonBias.h" +#include "arm_neon/NeonSum.h" + +NEON_BIAS(float, float32x4_t, f32, 4, mla) + +NEON_BIAS(int32_t, int32x4_t, s32, 4, mla) + +NEON_I8I32_SUM(u, uint) + +NEON_I16_SUMSQUARE(s, int) + diff --git a/src/neon_armv7a/Layer.c b/src/neon_armv7a/Layer.c new file mode 100644 index 0000000..c64cabd --- /dev/null +++ b/src/neon_armv7a/Layer.c @@ -0,0 +1,24 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv7a/SgemmDriver.h" +#include "neon_armv7a/Bias.h" +#include "common/CommonLayer.h" +#include + +SIMPLE_FC_FUNC(sgemm, float, float, float) + diff --git a/src/neon_armv7a/Quant.c b/src/neon_armv7a/Quant.c new file mode 100644 index 0000000..3835294 --- /dev/null +++ b/src/neon_armv7a/Quant.c @@ -0,0 +1,52 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "common/CommonQuant.h" +#include "arm_neon/NeonQuant.h" + +NEON_FIND_EXTREME(float32_t, f32, float32x2_t, float32x4_t, 2) + +QUANTIZE_ASYMMETRIC(32, 8) + +QUANTIZE_SYMMETRIC(32, 8) + +QUANTIZE_ASYMMETRIC(32, 16) + +QUANTIZE_SYMMETRIC(32, 16) + +void dequantize_symmetric_f32_s32(const int32_t *src, float32_t *dst, + float32_t scale, uint32_t size) { + + inline_dequant_cvt_f32_s32(dst, src, scale, size); +} + +NEON_FIND_EXTREME(int32_t, s32, int32x2_t, int32x4_t, 2) + +NEON_FIND_EXTREME(int16_t, s16, int16x4_t, int16x8_t, 4) + +REQUANTIZE_ASYMMETRIC_MULHI(float, 32, 8, 64) + +REQUANTIZE_SYMMETRIC_MULHI(float, 32, 8, 64) + +REQUANTIZE_ASYMMETRIC_MULHI(float, 32, 16, 64) + +REQUANTIZE_SYMMETRIC_MULHI(float, 32, 16, 64) + +REQUANTIZE_ASYMMETRIC_MULHI(float, 16, 8, 32) + +REQUANTIZE_SYMMETRIC_MULHI(float, 16, 8, 32) + diff --git a/src/neon_armv7a/S8S32GemmDriver.c b/src/neon_armv7a/S8S32GemmDriver.c new file mode 100644 index 0000000..6089838 --- /dev/null +++ b/src/neon_armv7a/S8S32GemmDriver.c @@ -0,0 +1,43 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include "neon_armv7a/S8S32MlaGemmDriver.h" +#include "arm_neon/ARMCpuType.h" + +int s8s32gemm_serial(int a_rowmajor, int b_rowmajor, + const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t N, uint32_t K, int32_t beta_inp) { + + if (blas_arm_get_i8i32_support() == 0) { + return 2; + } + return s8s32mlagemm_serial(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp); +} + +int s8s32gemm(int a_rowmajor, int b_rowmajor, + const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t N, uint32_t K, + int32_t beta_inp, uint32_t num_threads) { + + if (blas_arm_get_i8i32_support() == 0) { + return 2; + } + return s8s32mlagemm(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp, num_threads); +} + diff --git a/src/neon_armv7a/S8S32MlaGemmCopy.c b/src/neon_armv7a/S8S32MlaGemmCopy.c new file mode 100644 index 0000000..5e7edb8 --- /dev/null +++ b/src/neon_armv7a/S8S32MlaGemmCopy.c @@ -0,0 +1,30 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifdef GEMM_UNSIGNED_INT +#undef GEMM_UNSIGNED_INT +#endif + +#include "common/CommonCopy.h" +#include "arm_neon/NeonI8I32MlaGemmCopy.h" + +GENERIC_NCOPY_FUNC(s8s32mlagemm, int8_t, int16_t, 6) +GENERIC_NCOPY_FUNC(s8s32mlagemm, int8_t, int16_t, 8) + +GENERIC_TCOPY_FUNC(s8s32mlagemm, int8_t, int16_t, 6) +GENERIC_TCOPY_FUNC(s8s32mlagemm, int8_t, int16_t, 8) + diff --git a/src/neon_armv7a/S8S32MlaGemmDriver.c b/src/neon_armv7a/S8S32MlaGemmDriver.c new file mode 100644 index 0000000..7b09908 --- /dev/null +++ b/src/neon_armv7a/S8S32MlaGemmDriver.c @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include "neon_armv7a/S8S32MlaGemmCopy.h" +#include "neon_armv7a/S8S32MlaGemmKernel.h" +#include "neon_armv7a/S8S32MlaGemmSkinnyGer.h" +#include "neon_armv7a/S8S32MlaGemmSkinnyDot.h" +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonDriver.h" + +GEMM_PARALLEL_FUNC(s8s32mlagemm, int8_t, int16_t, int8_t, int16_t, int32_t, + 6, 8, 4, 4, 4, 4) + diff --git a/src/neon_armv7a/S8S32MlaGemmKernel.c b/src/neon_armv7a/S8S32MlaGemmKernel.c new file mode 100644 index 0000000..bc0dcf1 --- /dev/null +++ b/src/neon_armv7a/S8S32MlaGemmKernel.c @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifdef GEMM_UNSIGNED_INT +#undef GEMM_UNSIGNED_INT +#endif + +#include "common/CommonKernel.h" +#include "neon_armv7a/I8I32MlaGemmKernel.h" + +DUALPACK_KERNEL_FUNC_LM(s8s32mlagemm, int16_t, int16_t, int32_t, 6, 8) +DUALPACK_KERNEL_FUNC_LN(s8s32mlagemm, int16_t, int16_t, int32_t, 8, 6) + diff --git a/src/neon_armv7a/S8S32MlaGemmSkinnyDot.c b/src/neon_armv7a/S8S32MlaGemmSkinnyDot.c new file mode 100644 index 0000000..2380dc3 --- /dev/null +++ b/src/neon_armv7a/S8S32MlaGemmSkinnyDot.c @@ -0,0 +1,29 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#ifdef GEMM_UNSIGNED_INT +#undef GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "arm_neon/NeonI8I32MlaGemmSkinnyDot.h" +#include "common/CommonSkinnyDot.h" + +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32mlagemm, 1, 15, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32mlagemm, 2, 15, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32mlagemm, 3, 15, 3, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32mlagemm, 4, 15, 3, 131072, int8_t, int8_t) \ No newline at end of file diff --git a/src/neon_armv7a/S8S32MlaGemmSkinnyGer.c b/src/neon_armv7a/S8S32MlaGemmSkinnyGer.c new file mode 100644 index 0000000..0ae4d36 --- /dev/null +++ b/src/neon_armv7a/S8S32MlaGemmSkinnyGer.c @@ -0,0 +1,29 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifdef GEMM_UNSIGNED_INT +#undef GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "arm_neon/NeonI8I32MlaGemmSkinnyGer.h" + +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 1, 5, 5, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 2, 5, 5, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 3, 5, 5, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 4, 5, 5, 8192, int8_t, int8_t) + diff --git a/src/neon_armv7a/SgemmCopy.c b/src/neon_armv7a/SgemmCopy.c new file mode 100644 index 0000000..58929e6 --- /dev/null +++ b/src/neon_armv7a/SgemmCopy.c @@ -0,0 +1,31 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include "common/CommonCopy.h" +#include "arm_neon/NeonSgemmCopy.h" + +#define NCOPY_float_float(unroll) NCOPY_UNROLL_##unroll + +GENERIC_NCOPY_FUNC(sgemm, float, float, 6) +GENERIC_NCOPY_FUNC(sgemm, float, float, 8) + +#define TCOPY_UNIT_float_float(src_ptr, dst_ptr, dst_offset, num_elements) \ + TCOPY_UNIT_##num_elements(src_ptr, dst_ptr, dst_offset) + +GENERIC_TCOPY_FUNC(sgemm, float, float, 6) +GENERIC_TCOPY_FUNC(sgemm, float, float, 8) + diff --git a/src/neon_armv7a/SgemmDriver.c b/src/neon_armv7a/SgemmDriver.c new file mode 100644 index 0000000..1fa8a15 --- /dev/null +++ b/src/neon_armv7a/SgemmDriver.c @@ -0,0 +1,26 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv7a/SgemmKernel.h" +#include "neon_armv7a/SgemmCopy.h" +#include "neon_armv7a/SgemmSkinnyGer.h" +#include "neon_armv7a/SgemmSkinnyDot.h" +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonDriver.h" + +GEMM_PARALLEL_FUNC(sgemm, float, float, float, float, float, 6, 8, 8, 8, 8, 8) + diff --git a/src/neon_armv7a/SgemmKernel.c b/src/neon_armv7a/SgemmKernel.c new file mode 100644 index 0000000..1b01135 --- /dev/null +++ b/src/neon_armv7a/SgemmKernel.c @@ -0,0 +1,328 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include "common/CommonKernel.h" +#include "arm_neon/NeonSgemmKernel.h" + +#define NEON_SGEMM_KERNEL_M6N8_PRELOAD_A53 \ + "vldr d0,[%13]; vldr d1,[%13,#8]; add %13,%13,#24\n\t"\ + "vldr d4,[%14]; vldr d5,[%14,#8]; ldr r2,[%14,#16]; ldr r3,[%14,#20]\n\t"\ + "add %14,%14,#32\n\t" + +#define NEON_SGEMM_KERNEL_M6N8_MAIN2_A53 \ + "vldr d3,[%13,#-8]; vmov d2,d1\n\t"\ + "vmla.f32 %q0,q0,d4[0]; ldr r0,[%14]\n\t"\ + "vmla.f32 %q1,q0,d4[1]; ldr r1,[%14,#4]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d7,[%14,#-8]; vmov d6,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]; ldr r2,[%13]\n\t"\ + "vmla.f32 %q4,q2,d3[0]; ldr r3,[%13,#4]\n\t"\ + "vmla.f32 %q5,q2,d3[1]\n\t"\ + "vldr d5,[%14,#8]; vmov d4,r0,r1\n\t"\ + "vmla.f32 %q6,q3,d0[0]; add %13,%13,#48\n\t"\ + "vmla.f32 %q7,q3,d0[1]; add %14,%14,#64\n\t"\ + "vmla.f32 %q8,q1,d6[0]; pld [%13,#128]\n\t"\ + "vldr d1,[%13,#-40]; vmov d0,r2,r3\n\t"\ + "vmla.f32 %q9,q1,d6[1]; ldr r2,[%14,#-48]\n\t"\ + "vmla.f32 %q10,q1,d7[0]; ldr r3,[%14,#-44]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t"\ + "vldr d3,[%13,#-32]; vmov d2,d1\n\t"\ + "vmla.f32 %q0,q0,d4[0]; ldr r0,[%14,#-32]\n\t"\ + "vmla.f32 %q1,q0,d4[1]; ldr r1,[%14,#-28]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d7,[%14,#-40]; vmov d6,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]; ldr r2,[%13,#-24]\n\t"\ + "vmla.f32 %q4,q2,d3[0]; ldr r3,[%13,#-20]\n\t"\ + "vmla.f32 %q5,q2,d3[1]\n\t"\ + "vldr d5,[%14,#-24]; vmov d4,r0,r1\n\t"\ + "vmla.f32 %q6,q3,d0[0]; sub %12,%12,#2\n\t"\ + "vmla.f32 %q7,q3,d0[1]; cmp %12,#2\n\t"\ + "vmla.f32 %q8,q1,d6[0]; pld [%14,#192]\n\t"\ + "vldr d1,[%13,#-16]; vmov d0,r2,r3\n\t"\ + "vmla.f32 %q9,q1,d6[1]; ldr r2,[%14,#-16]\n\t"\ + "vmla.f32 %q10,q1,d7[0]; ldr r3,[%14,#-12]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t" + +#define NEON_SGEMM_KERNEL_M6N8_TAIL2_A53 \ + "vldr d3,[%13,#-8]; vmov d2,d1\n\t"\ + "vmla.f32 %q0,q0,d4[0]; ldr r0,[%14]\n\t"\ + "vmla.f32 %q1,q0,d4[1]; ldr r1,[%14,#4]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d7,[%14,#-8]; vmov d6,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]; ldr r2,[%13]\n\t"\ + "vmla.f32 %q4,q2,d3[0]; ldr r3,[%13,#4]\n\t"\ + "vmla.f32 %q5,q2,d3[1]\n\t"\ + "vldr d5,[%14,#8]; vmov d4,r0,r1\n\t"\ + "vmla.f32 %q6,q3,d0[0]; add %13,%13,#24\n\t"\ + "vmla.f32 %q7,q3,d0[1]; add %14,%14,#32\n\t"\ + "vmla.f32 %q8,q1,d6[0]\n\t"\ + "vldr d1,[%13,#-16]; vmov d0,r2,r3\n\t"\ + "vmla.f32 %q9,q1,d6[1]; ldr r2,[%14,#-16]\n\t"\ + "vmla.f32 %q10,q1,d7[0]; ldr r3,[%14,#-12]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t"\ + "vldr d3,[%13,#-8]; vmov d2,d1\n\t"\ + "vmla.f32 %q0,q0,d4[0]\n\t"\ + "vmla.f32 %q1,q0,d4[1]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d7,[%14,#-8]; vmov d6,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]\n\t"\ + "vmla.f32 %q4,q2,d3[0]\n\t"\ + "vmla.f32 %q5,q2,d3[1]\n\t"\ + "vmla.f32 %q6,q3,d0[0]\n\t"\ + "vmla.f32 %q7,q3,d0[1]\n\t"\ + "vmla.f32 %q8,q1,d6[0]\n\t"\ + "vmla.f32 %q9,q1,d6[1]\n\t"\ + "vmla.f32 %q10,q1,d7[0]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t" + +#define NEON_SGEMM_KERNEL_M6N8_TAIL1_A53 \ + "vldr d3,[%13,#-8]; vmov d2,d1\n\t"\ + "vmla.f32 %q0,q0,d4[0]\n\t"\ + "vmla.f32 %q1,q0,d4[1]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d7,[%14,#-8]; vmov d6,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]\n\t"\ + "vmla.f32 %q4,q2,d3[0]\n\t"\ + "vmla.f32 %q5,q2,d3[1]\n\t"\ + "vmla.f32 %q6,q3,d0[0]\n\t"\ + "vmla.f32 %q7,q3,d0[1]\n\t"\ + "vmla.f32 %q8,q1,d6[0]\n\t"\ + "vmla.f32 %q9,q1,d6[1]\n\t"\ + "vmla.f32 %q10,q1,d7[0]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t" + +#define NEON_SGEMM_SAVE_M6N8_ASM \ + float32x4x2_t ct1 = vzipq_f32(cq05, 
cq06);\ + float32x2_t cd1 = vget_low_f32(ct1.val[0]);\ + float32x2_t cd2 = vget_high_f32(ct1.val[0]);\ +\ + cq01 = vmlaq_n_f32(cq01, vld1q_f32(c_tmp), beta);\ + cd1 = vmla_n_f32(cd1, vld1_f32(c_tmp + 4), beta);\ + cq02 = vmlaq_n_f32(cq02, vld1q_f32(c_tmp + ldc), beta);\ + cd2 = vmla_n_f32(cd2, vld1_f32(c_tmp + ldc + 4), beta);\ +\ + vst1q_f32(c_tmp, cq01); vst1_f32(c_tmp + 4, cd1); c_tmp += ldc;\ + vst1q_f32(c_tmp, cq02); vst1_f32(c_tmp + 4, cd2); c_tmp += ldc;\ + cd1 = vget_low_f32(ct1.val[1]);\ + cd2 = vget_high_f32(ct1.val[1]);\ +\ + cq03 = vmlaq_n_f32(cq03, vld1q_f32(c_tmp), beta);\ + cd1 = vmla_n_f32(cd1, vld1_f32(c_tmp + 4), beta);\ + cq04 = vmlaq_n_f32(cq04, vld1q_f32(c_tmp + ldc), beta);\ + cd2 = vmla_n_f32(cd2, vld1_f32(c_tmp + ldc + 4), beta);\ +\ + vst1q_f32(c_tmp, cq03); vst1_f32(c_tmp + 4, cd1); c_tmp += ldc;\ + vst1q_f32(c_tmp, cq04); vst1_f32(c_tmp + 4, cd2); c_tmp += ldc;\ + ct1 = vzipq_f32(cq07, cq08);\ + cd1 = vget_low_f32(ct1.val[0]);\ + cd2 = vget_high_f32(ct1.val[0]);\ +\ + cd1 = vmla_n_f32(cd1, vld1_f32(c_tmp), beta);\ + cq09 = vmlaq_n_f32(cq09, vld1q_f32(c_tmp + 2), beta);\ + cd2 = vmla_n_f32(cd2, vld1_f32(c_tmp + ldc), beta);\ + cq10 = vmlaq_n_f32(cq10, vld1q_f32(c_tmp + ldc + 2), beta);\ +\ + vst1_f32(c_tmp, cd1); vst1q_f32(c_tmp + 2, cq09); c_tmp += ldc;\ + vst1_f32(c_tmp, cd2); vst1q_f32(c_tmp + 2, cq10); c_tmp += ldc;\ + cd1 = vget_low_f32(ct1.val[1]);\ + cd2 = vget_high_f32(ct1.val[1]);\ +\ + cd1 = vmla_n_f32(cd1, vld1_f32(c_tmp), beta);\ + cq11 = vmlaq_n_f32(cq11, vld1q_f32(c_tmp + 2), beta);\ + cd2 = vmla_n_f32(cd2, vld1_f32(c_tmp + ldc), beta);\ + cq12 = vmlaq_n_f32(cq12, vld1q_f32(c_tmp + ldc + 2), beta);\ +\ + vst1_f32(c_tmp, cd1); vst1q_f32(c_tmp + 2, cq11); c_tmp += ldc;\ + vst1_f32(c_tmp, cd2); vst1q_f32(c_tmp + 2, cq12); + +#define NEON_SGEMM_KERNEL_M8N6_PRELOAD_A53 \ + "vldr d0,[%13]; vldr d1,[%13,#8]\n\t"\ + "ldr r2,[%13,#16]; ldr r3,[%13,#20]; add %13,%13,#32\n\t"\ + "vldr d4,[%14]; vldr d5,[%14,#8]; add %14,%14,#24\n\t" + +#define NEON_SGEMM_KERNEL_M8N6_MAIN2_A53 \ + "vldr d7,[%14,#-8]; vmov d6,d5\n\t"\ + "vmla.f32 %q0,q0,d4[0]; ldr r0,[%13]\n\t"\ + "vmla.f32 %q1,q0,d4[1]; ldr r1,[%13,#4]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d3,[%13,#-8]; vmov d2,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]; ldr r2,[%14]\n\t"\ + "vmla.f32 %q4,q0,d7[0]; ldr r3,[%14,#4]\n\t"\ + "vmla.f32 %q5,q0,d7[1]\n\t"\ + "vldr d1,[%13,#8]; vmov d0,r0,r1\n\t"\ + "vmla.f32 %q6,q1,d4[0]; add %13,%13,#64\n\t"\ + "vmla.f32 %q7,q1,d4[1]; add %14,%14,#48\n\t"\ + "vmla.f32 %q8,q1,d6[0]; pld [%13,#192]\n\t"\ + "vldr d5,[%14,#-40]; vmov d4,r2,r3\n\t"\ + "vmla.f32 %q9,q1,d6[1]; ldr r2,[%13,#-48]\n\t"\ + "vmla.f32 %q10,q1,d7[0]; ldr r3,[%13,#-44]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t"\ + "vldr d7,[%14,#-32]; vmov d6,d5\n\t"\ + "vmla.f32 %q0,q0,d4[0]; ldr r0,[%13,#-32]\n\t"\ + "vmla.f32 %q1,q0,d4[1]; ldr r1,[%13,#-28]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d3,[%13,#-40]; vmov d2,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]; ldr r2,[%14,#-24]\n\t"\ + "vmla.f32 %q4,q0,d7[0]; ldr r3,[%14,#-20]\n\t"\ + "vmla.f32 %q5,q0,d7[1]\n\t"\ + "vldr d1,[%13,#-24]; vmov d0,r0,r1\n\t"\ + "vmla.f32 %q6,q1,d4[0]; sub %12,%12,#2\n\t"\ + "vmla.f32 %q7,q1,d4[1]; cmp %12,#2\n\t"\ + "vmla.f32 %q8,q1,d6[0]; pld [%14,#128]\n\t"\ + "vldr d5,[%14,#-16]; vmov d4,r2,r3\n\t"\ + "vmla.f32 %q9,q1,d6[1]; ldr r2,[%13,#-16]\n\t"\ + "vmla.f32 %q10,q1,d7[0]; ldr r3,[%13,#-12]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t" + +#define NEON_SGEMM_KERNEL_M8N6_TAIL2_A53 \ + "vldr d7,[%14,#-8]; vmov d6,d5\n\t"\ + "vmla.f32 %q0,q0,d4[0]; ldr 
r0,[%13]\n\t"\ + "vmla.f32 %q1,q0,d4[1]; ldr r1,[%13,#4]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d3,[%13,#-8]; vmov d2,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]; ldr r2,[%14]\n\t"\ + "vmla.f32 %q4,q0,d7[0]; ldr r3,[%14,#4]\n\t"\ + "vmla.f32 %q5,q0,d7[1]\n\t"\ + "vldr d1,[%13,#8]; vmov d0,r0,r1\n\t"\ + "vmla.f32 %q6,q1,d4[0]; add %13,%13,#32\n\t"\ + "vmla.f32 %q7,q1,d4[1]; add %14,%14,#24\n\t"\ + "vmla.f32 %q8,q1,d6[0]\n\t"\ + "vldr d5,[%14,#-16]; vmov d4,r2,r3\n\t"\ + "vmla.f32 %q9,q1,d6[1]; ldr r2,[%13,#-16]\n\t"\ + "vmla.f32 %q10,q1,d7[0]; ldr r3,[%13,#-12]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t"\ + "vldr d7,[%14,#-8]; vmov d6,d5\n\t"\ + "vmla.f32 %q0,q0,d4[0]\n\t"\ + "vmla.f32 %q1,q0,d4[1]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d3,[%13,#-8]; vmov d2,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]\n\t"\ + "vmla.f32 %q4,q0,d7[0]\n\t"\ + "vmla.f32 %q5,q0,d7[1]\n\t"\ + "vmla.f32 %q6,q1,d4[0]\n\t"\ + "vmla.f32 %q7,q1,d4[1]\n\t"\ + "vmla.f32 %q8,q1,d6[0]\n\t"\ + "vmla.f32 %q9,q1,d6[1]\n\t"\ + "vmla.f32 %q10,q1,d7[0]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t" + +#define NEON_SGEMM_KERNEL_M8N6_TAIL1_A53 \ + "vldr d7,[%14,#-8]; vmov d6,d5\n\t"\ + "vmla.f32 %q0,q0,d4[0]\n\t"\ + "vmla.f32 %q1,q0,d4[1]\n\t"\ + "vmla.f32 %q2,q0,d5[0]\n\t"\ + "vldr d3,[%13,#-8]; vmov d2,r2,r3\n\t"\ + "vmla.f32 %q3,q0,d5[1]\n\t"\ + "vmla.f32 %q4,q0,d7[0]\n\t"\ + "vmla.f32 %q5,q0,d7[1]\n\t"\ + "vmla.f32 %q6,q1,d4[0]\n\t"\ + "vmla.f32 %q7,q1,d4[1]\n\t"\ + "vmla.f32 %q8,q1,d6[0]\n\t"\ + "vmla.f32 %q9,q1,d6[1]\n\t"\ + "vmla.f32 %q10,q1,d7[0]\n\t"\ + "vmla.f32 %q11,q1,d7[1]\n\t" + +#define NEON_SGEMM_SAVE_M8N6_ASM \ +\ + cq01 = vmlaq_n_f32(cq01, vld1q_f32(c_tmp), beta);\ + cq07 = vmlaq_n_f32(cq07, vld1q_f32(c_tmp + 4), beta);\ + cq02 = vmlaq_n_f32(cq02, vld1q_f32(c_tmp + ldc), beta);\ + cq08 = vmlaq_n_f32(cq08, vld1q_f32(c_tmp + ldc + 4), beta);\ +\ + vst1q_f32(c_tmp, cq01); vst1q_f32(c_tmp + 4, cq07); c_tmp += ldc;\ + vst1q_f32(c_tmp, cq02); vst1q_f32(c_tmp + 4, cq08); c_tmp += ldc;\ +\ + cq03 = vmlaq_n_f32(cq03, vld1q_f32(c_tmp), beta);\ + cq09 = vmlaq_n_f32(cq09, vld1q_f32(c_tmp + 4), beta);\ + cq04 = vmlaq_n_f32(cq04, vld1q_f32(c_tmp + ldc), beta);\ + cq10 = vmlaq_n_f32(cq10, vld1q_f32(c_tmp + ldc + 4), beta);\ +\ + vst1q_f32(c_tmp, cq03); vst1q_f32(c_tmp + 4, cq09); c_tmp += ldc;\ + vst1q_f32(c_tmp, cq04); vst1q_f32(c_tmp + 4, cq10); c_tmp += ldc;\ +\ + cq05 = vmlaq_n_f32(cq05, vld1q_f32(c_tmp), beta);\ + cq11 = vmlaq_n_f32(cq11, vld1q_f32(c_tmp + 4), beta);\ + cq06 = vmlaq_n_f32(cq06, vld1q_f32(c_tmp + ldc), beta);\ + cq12 = vmlaq_n_f32(cq12, vld1q_f32(c_tmp + ldc + 4), beta);\ +\ + vst1q_f32(c_tmp, cq05); vst1q_f32(c_tmp + 4, cq11); c_tmp += ldc;\ + vst1q_f32(c_tmp, cq06); vst1q_f32(c_tmp + 4, cq12); + +#define PREF_C_1_LANE(n, mdim) \ + pref_c(c_pref); pref_c(c_pref + mdim - 1); c_pref += ldc; +#define PREF_C(mdim, ndim) \ + MACRO_EXPANSION_##ndim(VOID_BASE, PREF_C_1_LANE, mdim) + +#define NEON_SGEMM_ASM(mdim, ndim, cputype) {\ + float *c_pref = c_ptr; PREF_C(mdim, ndim)\ + register float32x4_t cq01 __asm("q4");\ + register float32x4_t cq02 __asm("q5");\ + register float32x4_t cq03 __asm("q6");\ + register float32x4_t cq04 __asm("q7");\ + register float32x4_t cq05 __asm("q8");\ + register float32x4_t cq06 __asm("q9");\ + register float32x4_t cq07 __asm("q10");\ + register float32x4_t cq08 __asm("q11");\ + register float32x4_t cq09 __asm("q12");\ + register float32x4_t cq10 __asm("q13");\ + register float32x4_t cq11 __asm("q14");\ + register float32x4_t cq12 __asm("q15");\ + const float *a_ptr, *b_ptr;\ + uint32_t k_left;\ + 
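/* Note: cq01-cq12 are pinned to q4-q15 above, so the asm block below */\
+ /* both zeroes them and accumulates the whole MxN tile of C in named */\
+ /* registers; a_ptr/b_ptr walk the packed A/B panels while k_left */\
+ /* counts the remaining k-direction updates. */\
+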
b_ptr = b_head;\ + a_ptr = a_head;\ + k_left = K;\ + __asm__ __volatile__ (\ + "vmov.i8 %q0,#0; vmov.i8 %q1,#0; vmov %q2,%q0; vmov %q3,%q1\n\t"\ + "vmov %q4,%q0; vmov %q5,%q1; vmov %q6,%q0; vmov %q7,%q1\n\t"\ + "vmov %q8,%q0; vmov %q9,%q1; vmov %q10,%q0; vmov %q11,%q1\n\t"\ + "cmp %12,#0; beq 4f\n\t"\ + NEON_SGEMM_KERNEL_M##mdim##N##ndim##_PRELOAD_##cputype\ + "cmp %12,#2; ble 2f\n\t"\ + ".balign 16\n\t"\ + "1:\n\t"\ + NEON_SGEMM_KERNEL_M##mdim##N##ndim##_MAIN2_##cputype "bgt 1b\n\t"\ + "2:\n\t"\ + "cmp %12,#2; bne 3f\n\t"\ + NEON_SGEMM_KERNEL_M##mdim##N##ndim##_TAIL2_##cputype "b 4f\n\t"\ + "3:\n\t"\ + NEON_SGEMM_KERNEL_M##mdim##N##ndim##_TAIL1_##cputype\ + "4:\n\t"\ + :"=w"(cq01),"=w"(cq02),"=w"(cq03),"=w"(cq04),"=w"(cq05),"=w"(cq06),\ + "=w"(cq07),"=w"(cq08),"=w"(cq09),"=w"(cq10),"=w"(cq11),"=w"(cq12),\ + "+r"(k_left),"+r"(a_ptr),"+r"(b_ptr)\ + ::"d0","d1","d2","d3","d4","d5","d6","d7",\ + "r0","r1","r2","r3","cc","memory");\ + float *c_tmp = c_ptr;\ + NEON_SGEMM_SAVE_M##mdim##N##ndim##_ASM\ +} + +static inline void inline_dualpack_gemm_afloat_bfloat_cfloat_m6_n8( + const float *a_head, const float *b_head, float *c_ptr, + uint32_t K, float beta, uint32_t ldc) { + NEON_SGEMM_ASM(6, 8, A53) +} + +static inline void inline_dualpack_gemm_afloat_bfloat_cfloat_m8_n6( + const float *a_head, const float *b_head, float *c_ptr, + uint32_t K, float beta, uint32_t ldc) { + NEON_SGEMM_ASM(8, 6, A53) +} + +DUALPACK_KERNEL_FUNC_LM(sgemm, float, float, float, 6, 8) +DUALPACK_KERNEL_FUNC_LN(sgemm, float, float, float, 8, 6) + diff --git a/src/neon_armv7a/SgemmSkinnyDot.c b/src/neon_armv7a/SgemmSkinnyDot.c new file mode 100644 index 0000000..40c12d1 --- /dev/null +++ b/src/neon_armv7a/SgemmSkinnyDot.c @@ -0,0 +1,495 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "arm_neon/ARMCpuType.h" +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonSkinnyDot.h" +#include <arm_neon.h> + +typedef float sgemm_skinnydot_ascalar; +typedef float sgemm_skinnydot_bscalar; +typedef float sgemm_skinnydot_cscalar; + +static inline void inline_sgemm_arowmajor_bskinny_m4n1(const float *a_ptr1, + const float *b_ptr, float *c_ptr, uint32_t k_inc, uint32_t LDK, uint32_t LDM, + float beta, bool c_rowmajor) { + + float32x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8; + const float *a_ptr2 = a_ptr1 + LDK; + const float *a_ptr3 = a_ptr1 + LDK * 2; + const float *a_ptr4 = a_ptr2 + LDK * 2; + const float *a_pref = a_ptr4 + LDK; + const uint32_t pref_inc = LDK > k_inc ?
(LDK - k_inc) * sizeof(float) : 0; + uint32_t k_left = k_inc; + __asm__ __volatile__( + "mov r0,#0\n\t" + "vmov.i8 %[cd1],#0; vmov.i8 %[cd2],#0\n\t" + "vmov.i8 %[cd3],#0; vmov.i8 %[cd4],#0\n\t" + "vmov.i8 %[cd5],#0; vmov.i8 %[cd6],#0\n\t" + "vmov.i8 %[cd7],#0; vmov.i8 %[cd8],#0\n\t" + "cmp %[k_left],#4; blt 3f\n\t" + "vldr d2,[%[a_ptr1]]; vldr d6,[%[a_ptr1],#8]; add %[a_ptr1],%[a_ptr1],#16\n\t" + "vldr d3,[%[a_ptr2]]; vldr d7,[%[a_ptr2],#8]; add %[a_ptr2],%[a_ptr2],#16\n\t" + "vldr d4,[%[a_ptr3]]; vldr d8,[%[a_ptr3],#8]; add %[a_ptr3],%[a_ptr3],#16\n\t" + "vldr d5,[%[a_ptr4]]; vldr d9,[%[a_ptr4],#8]; add %[a_ptr4],%[a_ptr4],#16\n\t" + "vldm %[b_ptr]!,{d0,d1}\n\t" + "cmp %[k_left],#8; blt 2f\n\t" + ".balign 16; 1:\n\t" + "pld [%[a_pref]]; add %[a_pref],%[a_pref],#64; add r0,r0,#16\n\t" + "vmla.f32 %[cd1],d2,d0; vldr d2,[%[a_ptr1]]\n\t" + "cmp r0,%[k_inc]\n\t" + "vmla.f32 %[cd2],d3,d0; vldr d3,[%[a_ptr2]]\n\t" + "addgt %[a_pref],%[a_pref],%[pref_inc]\n\t" + "vmla.f32 %[cd3],d4,d0; vldr d4,[%[a_ptr3]]\n\t" + "movgt r0,#0\n\t" + "vmla.f32 %[cd4],d5,d0; vldr d5,[%[a_ptr4]]\n\t" + "vldr d0,[%[b_ptr]]; sub %[k_left],%[k_left],#4\n\t" + "vmla.f32 %[cd5],d6,d1; vldr d6,[%[a_ptr1],#8]\n\t" + "add %[a_ptr1],%[a_ptr1],#16\n\t" + "vmla.f32 %[cd6],d7,d1; vldr d7,[%[a_ptr2],#8]\n\t" + "add %[a_ptr2],%[a_ptr2],#16; cmp %[k_left],#8\n\t" + "vmla.f32 %[cd7],d8,d1; vldr d8,[%[a_ptr3],#8]\n\t" + "add %[a_ptr3],%[a_ptr3],#16\n\t" + "vmla.f32 %[cd8],d9,d1; vldr d9,[%[a_ptr4],#8]\n\t" + "add %[a_ptr4],%[a_ptr4],#16\n\t" + "vldr d1,[%[b_ptr],#8]; add %[b_ptr],%[b_ptr],#16; bge 1b\n\t" + "2:\n\t" + "vmla.f32 %[cd1],d2,d0; vmla.f32 %[cd2],d3,d0\n\t" + "vmla.f32 %[cd3],d4,d0; vmla.f32 %[cd4],d5,d0\n\t" + "sub %[k_left],%[k_left],#4\n\t" + "vmla.f32 %[cd5],d6,d1; vmla.f32 %[cd6],d7,d1\n\t" + "vmla.f32 %[cd7],d8,d1; vmla.f32 %[cd8],d9,d1\n\t" + "3:\n\t" + :[cd1]"=w"(cd1), [cd2]"=w"(cd2), [cd3]"=w"(cd3), [cd4]"=w"(cd4), + [cd5]"=w"(cd5), [cd6]"=w"(cd6), [cd7]"=w"(cd7), [cd8]"=w"(cd8), + [a_ptr1]"+r"(a_ptr1), [a_ptr2]"+r"(a_ptr2), [a_ptr3]"+r"(a_ptr3), + [a_ptr4]"+r"(a_ptr4), [b_ptr]"+r"(b_ptr), + [k_left]"+r"(k_left), [a_pref]"+r"(a_pref) + :[pref_inc]"r"(pref_inc), [k_inc]"r"(k_inc) + :"d0","d1","d2","d3","d4","d5","d6","d7","d8","d9", + "r0","cc","memory"); + + cd1 = vadd_f32(cd1, cd5); cd2 = vadd_f32(cd2, cd6); + cd3 = vadd_f32(cd3, cd7); cd4 = vadd_f32(cd4, cd8); + float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1); + float cs2 = vget_lane_f32(cd2, 0) + vget_lane_f32(cd2, 1); + float cs3 = vget_lane_f32(cd3, 0) + vget_lane_f32(cd3, 1); + float cs4 = vget_lane_f32(cd4, 0) + vget_lane_f32(cd4, 1); + for (; k_left > 0; k_left--) { + float bs1 = *b_ptr; b_ptr++; + cs1 += (*a_ptr1) * bs1; a_ptr1++; + cs2 += (*a_ptr2) * bs1; a_ptr2++; + cs3 += (*a_ptr3) * bs1; a_ptr3++; + cs4 += (*a_ptr4) * bs1; a_ptr4++; + } + c_ptr[0] = c_ptr[0] * beta + cs1; c_ptr[1] = c_ptr[1] * beta + cs2; + c_ptr[2] = c_ptr[2] * beta + cs3; c_ptr[3] = c_ptr[3] * beta + cs4; +} + +static inline void inline_sgemm_arowmajor_bskinny_m1n1(const float *a_ptr, + const float *b_ptr, float *c_ptr, uint32_t k_left, uint32_t LDK, uint32_t LDM, + float beta, bool c_rowmajor) { + + float32x4_t cq1; + __asm__ __volatile__( + "vmov.i8 d16,#0; vmov.i8 d17,#0\n\t" + "vmov d18,d16; vmov d19,d17\n\t" + "vmov d20,d16; vmov d21,d17\n\t" + "vmov d22,d16; vmov d23,d17\n\t" + "cmp %[K],#16; blt 4f\n\t" + "pld [%[a_ptr],#256]\n\t" + "add %[a_ptr],%[a_ptr],#64; add %[b_ptr],%[b_ptr],#64\n\t" + "vldr d24,[%[a_ptr],#-64]; vldr d8,[%[b_ptr],#-64]\n\t" + "vldr 
d25,[%[a_ptr],#-56]; vldr d9,[%[b_ptr],#-56]\n\t" + "vldr d26,[%[a_ptr],#-48]; vldr d10,[%[b_ptr],#-48]\n\t" + "vldr d27,[%[a_ptr],#-40]; vldr d11,[%[b_ptr],#-40]\n\t" + "vldr d28,[%[a_ptr],#-32]; vldr d12,[%[b_ptr],#-32]\n\t" + "vldr d29,[%[a_ptr],#-24]; vldr d13,[%[b_ptr],#-24]\n\t" + "vldr d30,[%[a_ptr],#-16]; vldr d14,[%[b_ptr],#-16]\n\t" + "vldr d31,[%[a_ptr],#-8]; vldr d15,[%[b_ptr],#-8]\n\t" + "cmp %[K],#32; blt 3f\n\t" + "2:\n\t" + "pld [%[a_ptr],#256]\n\t" + "add %[a_ptr],%[a_ptr],#64; add %[b_ptr],%[b_ptr],#64\n\t" + "vmla.f32 d16,d24,d8; vldr d24,[%[a_ptr],#-64]; vldr d8,[%[b_ptr],#-64]\n\t" + "vmla.f32 d17,d25,d9; vldr d25,[%[a_ptr],#-56]; vldr d9,[%[b_ptr],#-56]\n\t" + "vmla.f32 d18,d26,d10; vldr d26,[%[a_ptr],#-48]; vldr d10,[%[b_ptr],#-48]\n\t" + "vmla.f32 d19,d27,d11; vldr d27,[%[a_ptr],#-40]; vldr d11,[%[b_ptr],#-40]\n\t" + "sub %[K],%[K],#16\n\t" + "vmla.f32 d20,d28,d12; vldr d28,[%[a_ptr],#-32]; vldr d12,[%[b_ptr],#-32]\n\t" + "vmla.f32 d21,d29,d13; vldr d29,[%[a_ptr],#-24]; vldr d13,[%[b_ptr],#-24]\n\t" + "cmp %[K],#32\n\t" + "vmla.f32 d22,d30,d14; vldr d30,[%[a_ptr],#-16]; vldr d14,[%[b_ptr],#-16]\n\t" + "vmla.f32 d23,d31,d15; vldr d31,[%[a_ptr],#-8]; vldr d15,[%[b_ptr],#-8]\n\t" + "bge 2b\n\t" + "3:\n\t" + "vmla.f32 d16,d24,d8; vmla.f32 d17,d25,d9\n\t" + "vmla.f32 d18,d26,d10; vmla.f32 d19,d27,d11; sub %[K],%[K],#16\n\t" + "vmla.f32 d20,d28,d12; vmla.f32 d21,d29,d13\n\t" + "vmla.f32 d22,d30,d14; vmla.f32 d23,d31,d15\n\t" + "4:\n\t" + "vadd.f32 d16,d16,d20; vadd.f32 d17,d17,d21\n\t" + "vadd.f32 d18,d18,d22; vadd.f32 d19,d19,d23\n\t" + "cmp %[K],#8; blt 5f; add %[a_ptr],%[a_ptr],#32; add %[b_ptr],%[b_ptr],#32\n\t" + "vldr d24,[%[a_ptr],#-32]; vldr d8,[%[b_ptr],#-32]; vmla.f32 d16,d24,d8\n\t" + "vldr d25,[%[a_ptr],#-24]; vldr d9,[%[b_ptr],#-24]; vmla.f32 d17,d25,d9\n\t" + "sub %[K],%[K],#8\n\t" + "vldr d26,[%[a_ptr],#-16]; vldr d10,[%[b_ptr],#-16]; vmla.f32 d18,d26,d10\n\t" + "vldr d27,[%[a_ptr],#-8]; vldr d11,[%[b_ptr],#-8]; vmla.f32 d19,d27,d11\n\t" + "5:\n\t" + "vadd.f32 %e[cq1],d16,d17; vadd.f32 %f[cq1],d18,d19\n\t" + "cmp %[K],#4; blt 6f\n\t" + "add %[a_ptr],%[a_ptr],#16; add %[b_ptr],%[b_ptr],#16\n\t" + "vldr d24,[%[a_ptr],#-16]; vldr d8,[%[b_ptr],#-16]; vmla.f32 %e[cq1],d24,d8\n\t" + "sub %[K],%[K],#4\n\t" + "vldr d25,[%[a_ptr],#-8]; vldr d9,[%[b_ptr],#-8]; vmla.f32 %f[cq1],d25,d9\n\t" + "6:\n\t" + :[cq1]"=w"(cq1), [a_ptr]"+r"(a_ptr), [b_ptr]"+r"(b_ptr), [K]"+r"(k_left) + ::"cc","memory","q12","q13","q14","q15", + "q4","q5","q6","q7","q8","q9","q10","q11"); + + float32x2_t cd1 = vadd_f32(vget_low_f32(cq1), vget_high_f32(cq1)); + if (k_left > 1) { + float32x2_t ad1 = vld1_f32(a_ptr); a_ptr += 2; + float32x2_t bd1 = vld1_f32(b_ptr); b_ptr += 2; + cd1 = vmla_f32(cd1, ad1, bd1); + k_left -= 2; + } + + float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1); + if (k_left > 0) { + cs1 += a_ptr[0] * b_ptr[0]; + } + c_ptr[0] = c_ptr[0] * beta + cs1; +} + +/* k_mask = 7 */ +static inline void inline_sgemm_arowmajor_bskinny_m4n2(const float *a_ptr1, + const float *b_ptr, float *c_ptr, uint32_t k_inc, uint32_t LDK, uint32_t LDM, + float beta, bool c_rowmajor) { + + const float *a_ptr2 = a_ptr1 + LDK; + const float *a_ptr3 = a_ptr1 + LDK * 2; + const float *a_ptr4 = a_ptr1 + LDK * 3; + const float *a_pref = a_ptr1 + LDK * 4; + uint32_t k_left = k_inc; + const uint32_t pref_inc = LDK > k_inc ? 
(LDK - k_inc) * sizeof(float) : 0; + float32x4_t cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8; + __asm__ __volatile__( + "mov r0,#0\n\t" + "vmov.i8 %q[cq1],#0; vmov.i8 %q[cq2],#0\n\t" + "vmov.i8 %q[cq3],#0; vmov.i8 %q[cq4],#0\n\t" + "vmov.i8 %q[cq5],#0; vmov.i8 %q[cq6],#0\n\t" + "vmov.i8 %q[cq7],#0; vmov.i8 %q[cq8],#0\n\t" + "cmp %[k_left],#4; blt 3f\n\t" + "vldm %[a_ptr1]!,{q2}; vldm %[a_ptr2]!,{q3}\n\t" + "vldm %[a_ptr3]!,{q4}; vldm %[a_ptr4]!,{q5}\n\t" + "vldm %[b_ptr]!,{q0}; vldm %[b_ptr]!,{q1}\n\t" + "cmp %[k_left],#8; blt 2f\n\t" + ".balign 16; 1:\n\t" + "pld [%[a_pref]]; add %[a_pref],%[a_pref],#64; add r0,r0,#16\n\t" + "vmla.f32 %q[cq1],q2,q0; cmp r0,%[k_inc]\n\t" + "vmla.f32 %q[cq5],q2,q1; vldm %[a_ptr1]!,{q2}\n\t" + "vmla.f32 %q[cq2],q3,q0; addgt %[a_pref],%[a_pref],%[pref_inc]\n\t" + "vmla.f32 %q[cq6],q3,q1; vldm %[a_ptr2]!,{q3}\n\t" + "sub %[k_left],%[k_left],#4\n\t" + "vmla.f32 %q[cq3],q4,q0; movgt r0,#0\n\t" + "vmla.f32 %q[cq7],q4,q1; vldm %[a_ptr3]!,{q4}\n\t" + "vmla.f32 %q[cq4],q5,q0; cmp %[k_left],#8\n\t" + "vmla.f32 %q[cq8],q5,q1; vldm %[a_ptr4]!,{q5}\n\t" + "vldm %[b_ptr]!,{q0}; vldm %[b_ptr]!,{q1}; bge 1b\n\t" + "2:\n\t" + "vmla.f32 %q[cq1],q2,q0; vmla.f32 %q[cq5],q2,q1\n\t" + "vmla.f32 %q[cq2],q3,q0; vmla.f32 %q[cq6],q3,q1\n\t" + "vmla.f32 %q[cq3],q4,q0; vmla.f32 %q[cq7],q4,q1\n\t" + "vmla.f32 %q[cq4],q5,q0; vmla.f32 %q[cq8],q5,q1\n\t" + "sub %[k_left],%[k_left],#4\n\t" + "3:\n\t" + :[cq1]"=w"(cq1), [cq2]"=w"(cq2), [cq3]"=w"(cq3), [cq4]"=w"(cq4), + [cq5]"=w"(cq5), [cq6]"=w"(cq6), [cq7]"=w"(cq7), [cq8]"=w"(cq8), + [k_left]"+r"(k_left), [a_pref]"+r"(a_pref), [b_ptr]"+r"(b_ptr), + [a_ptr1]"+r"(a_ptr1), [a_ptr2]"+r"(a_ptr2), + [a_ptr3]"+r"(a_ptr3), [a_ptr4]"+r"(a_ptr4) + :[pref_inc]"r"(pref_inc), [k_inc]"r"(k_inc) + :"d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11", + "r0","cc","memory"); + + float32x2_t cd1 = vadd_f32(vget_low_f32(cq1), vget_high_f32(cq1)); + float32x2_t cd2 = vadd_f32(vget_low_f32(cq2), vget_high_f32(cq2)); + float32x2_t cd3 = vadd_f32(vget_low_f32(cq3), vget_high_f32(cq3)); + float32x2_t cd4 = vadd_f32(vget_low_f32(cq4), vget_high_f32(cq4)); + float32x2_t cd5 = vadd_f32(vget_low_f32(cq5), vget_high_f32(cq5)); + float32x2_t cd6 = vadd_f32(vget_low_f32(cq6), vget_high_f32(cq6)); + float32x2_t cd7 = vadd_f32(vget_low_f32(cq7), vget_high_f32(cq7)); + float32x2_t cd8 = vadd_f32(vget_low_f32(cq8), vget_high_f32(cq8)); + if (k_left >= 2) { + float32x2_t bd1 = vld1_f32(b_ptr); + float32x2_t bd2 = vld1_f32(b_ptr + 2); b_ptr += 4; + float32x2_t ad1 = vld1_f32(a_ptr1); a_ptr1 += 2; + float32x2_t ad2 = vld1_f32(a_ptr2); a_ptr2 += 2; + float32x2_t ad3 = vld1_f32(a_ptr3); a_ptr3 += 2; + float32x2_t ad4 = vld1_f32(a_ptr4); a_ptr4 += 2; + cd1 = vmla_f32(cd1, ad1, bd1); + cd2 = vmla_f32(cd2, ad2, bd1); + cd3 = vmla_f32(cd3, ad3, bd1); + cd4 = vmla_f32(cd4, ad4, bd1); + cd5 = vmla_f32(cd5, ad1, bd2); + cd6 = vmla_f32(cd6, ad2, bd2); + cd7 = vmla_f32(cd7, ad3, bd2); + cd8 = vmla_f32(cd8, ad4, bd2); + k_left -= 2; + } + float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1); + float cs2 = vget_lane_f32(cd2, 0) + vget_lane_f32(cd2, 1); + float cs3 = vget_lane_f32(cd3, 0) + vget_lane_f32(cd3, 1); + float cs4 = vget_lane_f32(cd4, 0) + vget_lane_f32(cd4, 1); + float cs5 = vget_lane_f32(cd5, 0) + vget_lane_f32(cd5, 1); + float cs6 = vget_lane_f32(cd6, 0) + vget_lane_f32(cd6, 1); + float cs7 = vget_lane_f32(cd7, 0) + vget_lane_f32(cd7, 1); + float cs8 = vget_lane_f32(cd8, 0) + vget_lane_f32(cd8, 1); + if (k_left > 0) { + float bs1 = b_ptr[0]; + float bs2 = b_ptr[1]; + 
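/* Scalar tail of the m4n2 micro-kernel: after the 4-wide vector loop */
+ /* and the 2-wide step above, at most one k element remains; it is */
+ /* folded into both accumulator columns (bs1 and bs2) below. */
+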
float as1 = *a_ptr1; + float as2 = *a_ptr2; + float as3 = *a_ptr3; + float as4 = *a_ptr4; + cs1 += as1 * bs1; cs2 += as2 * bs1; + cs3 += as3 * bs1; cs4 += as4 * bs1; + cs5 += as1 * bs2; cs6 += as2 * bs2; + cs7 += as3 * bs2; cs8 += as4 * bs2; + } + if (c_rowmajor) { + c_ptr[0] = c_ptr[0] * beta + cs1; c_ptr[1] = c_ptr[1] * beta + cs5; + c_ptr[2] = c_ptr[2] * beta + cs2; c_ptr[3] = c_ptr[3] * beta + cs6; + c_ptr[4] = c_ptr[4] * beta + cs3; c_ptr[5] = c_ptr[5] * beta + cs7; + c_ptr[6] = c_ptr[6] * beta + cs4; c_ptr[7] = c_ptr[7] * beta + cs8; + } else { + c_ptr[0] = c_ptr[0] * beta + cs1; c_ptr[1] = c_ptr[1] * beta + cs2; + c_ptr[2] = c_ptr[2] * beta + cs3; c_ptr[3] = c_ptr[3] * beta + cs4; + c_ptr += LDM; + c_ptr[0] = c_ptr[0] * beta + cs5; c_ptr[1] = c_ptr[1] * beta + cs6; + c_ptr[2] = c_ptr[2] * beta + cs7; c_ptr[3] = c_ptr[3] * beta + cs8; + } +} + +static inline void inline_sgemm_arowmajor_bskinny_m1n2(const float *a_ptr, + const float *b_ptr, float *c_ptr, uint32_t k_left, uint32_t LDK, uint32_t LDM, + float beta, bool c_rowmajor) { + + register float32x4_t cq1 __asm("q8"); + __asm__ __volatile__( + "vmov.i8 %q[cq1],#0; vmov.i8 q9,#0\n\t" + "vmov.i8 q10,#0; vmov.i8 q11,#0\n\t" + "cmp %[k_left],#16; blt 4f\n\t" + "pld [%[a_ptr],#256]\n\t" + "vldm %[a_ptr]!,{q12,q13,q14,q15}\n\t" + "vldm %[b_ptr]!,{q0,q1,q2,q3}\n\t" + "vldm %[b_ptr]!,{q4,q5,q6,q7}\n\t" + "cmp %[k_left],#32; blt 3f\n\t" + ".balign 16; 2:\n\t" + "pld [%[a_ptr],#256]\n\t" + "vmla.f32 %q[cq1],q12,q0; vldm %[b_ptr]!,{q0}\n\t" + "vmla.f32 q10,q12,q1; vldm %[b_ptr]!,{q1}; vldm %[a_ptr]!,{q12}\n\t" + "vmla.f32 q9,q13,q2; vldm %[b_ptr]!,{q2}\n\t" + "vmla.f32 q11,q13,q3; vldm %[b_ptr]!,{q3}; vldm %[a_ptr]!,{q13}\n\t" + "sub %[k_left],%[k_left],#16\n\t" + "vmla.f32 %q[cq1],q14,q4; vldm %[b_ptr]!,{q4}\n\t" + "vmla.f32 q10,q14,q5; vldm %[b_ptr]!,{q5}; vldm %[a_ptr]!,{q14}\n\t" + "cmp %[k_left],#32\n\t" + "vmla.f32 q9,q15,q6; vldm %[b_ptr]!,{q6}\n\t" + "vmla.f32 q11,q15,q7; vldm %[b_ptr]!,{q7}; vldm %[a_ptr]!,{q15}\n\t" + "bge 2b\n\t" + "3:\n\t" + "vmla.f32 %q[cq1],q12,q0; vmla.f32 q10,q12,q1; sub %[k_left],%[k_left],#16\n\t" + "vmla.f32 q9,q13,q2; vmla.f32 q11,q13,q3\n\t" + "vmla.f32 %q[cq1],q14,q4; vmla.f32 q10,q14,q5\n\t" + "vmla.f32 q9,q15,q6; vmla.f32 q11,q15,q7\n\t" + "4:\n\t" + "cmp %[k_left],#8; blt 5f\n\t" + "vldm %[a_ptr]!,{q12}; vldm %[b_ptr]!,{q0,q1}\n\t" + "vldm %[a_ptr]!,{q13}; vldm %[b_ptr]!,{q2,q3}\n\t" + "vmla.f32 %q[cq1],q12,q0; vmla.f32 q10,q12,q1\n\t" + "sub %[k_left],%[k_left],#8\n\t" + "vmla.f32 q9,q13,q2; vmla.f32 q11,q13,q3\n\t" + "5:\n\t" + "vadd.f32 %q[cq1],%q[cq1],q9; vadd.f32 q10,q10,q11\n\t" + "cmp %[k_left],#4; blt 6f\n\t" + "vldm %[a_ptr]!,{q12}; vldm %[b_ptr]!,{q4}; vldm %[b_ptr]!,{q0}\n\t" + "vmla.f32 %q[cq1],q12,q4; vmla.f32 q10,q12,q0\n\t" + "sub %[k_left],%[k_left],#4\n\t" + "6:\n\t" + "vadd.f32 %e[cq1],%e[cq1],%f[cq1]; vadd.f32 %f[cq1],d20,d21\n\t" + "cmp %[k_left],#2; blt 7f\n\t" + "vld1.32 {d24},[%[a_ptr]]!\n\t" + "vld1.32 {d8},[%[b_ptr]]!; vld1.32 {d0},[%[b_ptr]]!\n\t" + "vmla.f32 %e[cq1],d24,d8; vmla.f32 %f[cq1],d24,d0\n\t" + "sub %[k_left],%[k_left],#2\n\t" + "7:\n\t" + :[cq1]"=w"(cq1), [a_ptr]"+r"(a_ptr), + [k_left]"+r"(k_left), [b_ptr]"+r"(b_ptr) + ::"cc","memory","q0","q1","q2","q3","q4","q5","q6","q7", + "q9","q10","q11","q12","q13","q14","q15"); + + float32x2_t cd1 = vpadd_f32(vget_low_f32(cq1), vget_high_f32(cq1)); + if (k_left > 0) { + float as1 = *a_ptr; + float32x2_t bd1 = vld1_f32(b_ptr); + cd1 = vmla_n_f32(cd1, bd1, as1); + } + + if (c_rowmajor) { + cd1 = vmla_n_f32(cd1, 
vld1_f32(c_ptr), beta); + vst1_f32(c_ptr, cd1); + } else { + c_ptr[0] = c_ptr[0] * beta + vget_lane_f32(cd1, 0); + c_ptr[LDM] = c_ptr[LDM] * beta + vget_lane_f32(cd1, 1); + } +} + +static inline bool unroll_test_m4n1(uint32_t M, uint32_t K) { + return K <= 512; +} + +static inline bool unroll_test_m1n1(uint32_t M, uint32_t K) { + return true; +} + +static inline bool unroll_test_m4n2(uint32_t M, uint32_t K) { + return K <= 512; +} + +static inline bool unroll_test_m1n2(uint32_t M, uint32_t K) { + return true; +} + +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(sgemm, 1, 5, 5, 32768, float, float, unroll_test) +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(sgemm, 2, 7, 5, 32768, float, float, unroll_test) + +typedef float sgemm_skinnydot_avec1; +typedef float sgemm_skinnydot_bvec1; +typedef float sgemm_skinnydot_cvec1; + +typedef float32x2_t sgemm_skinnydot_avec2; +typedef float32x2_t sgemm_skinnydot_bvec2; +typedef float32x2_t sgemm_skinnydot_cvec2; + +typedef float32x4_t sgemm_skinnydot_avec4; +typedef float32x4_t sgemm_skinnydot_bvec4; +typedef float32x4_t sgemm_skinnydot_cvec4; + +typedef float32x4x2_t sgemm_skinnydot_avec8; +typedef float32x4x2_t sgemm_skinnydot_bvec8; +typedef float32x4x2_t sgemm_skinnydot_cvec8; + +GEMM_SKINNY_DOT_CALC_UNIT(sgemm, 8) { + float32x4x2_t ret; + ret.val[0] = vmlaq_f32(c_vec.val[0], a_vec.val[0], b_vec.val[0]); + ret.val[1] = vmlaq_f32(c_vec.val[1], a_vec.val[1], b_vec.val[1]); + return ret; +} + +GEMM_SKINNY_DOT_CALC_UNIT(sgemm, 4) { + return vmlaq_f32(c_vec, a_vec, b_vec); +} + +GEMM_SKINNY_DOT_CALC_UNIT(sgemm, 2) { + return vmla_f32(c_vec, a_vec, b_vec); +} + +GEMM_SKINNY_DOT_CALC_UNIT(sgemm, 1) { + return c_vec + a_vec * b_vec; +} + +GEMM_SKINNY_DOT_LOADA_UNIT(sgemm, 8) { + __asm__("pld [%0,#96]"::"r"(a_ptr):); + float32x4x2_t ret; + ret.val[0] = vld1q_f32(a_ptr); + ret.val[1] = vld1q_f32(a_ptr + 4); + return ret; +} + +GEMM_SKINNY_DOT_LOADA_UNIT(sgemm, 4) { + __asm__("pld [%0,#80]"::"r"(a_ptr):); + return vld1q_f32(a_ptr); +} + +GEMM_SKINNY_DOT_LOADA_UNIT(sgemm, 2) { + __asm__("pld [%0,#72]"::"r"(a_ptr):); + return vld1_f32(a_ptr); +} + +GEMM_SKINNY_DOT_LOADA_UNIT(sgemm, 1) { + return *a_ptr; +} + +GEMM_SKINNY_DOT_LOADB_UNIT(sgemm, 8) { + float32x4x2_t ret; + ret.val[0] = vld1q_f32(b_ptr); + ret.val[1] = vld1q_f32(b_ptr + 4); + return ret; +} + +GEMM_SKINNY_DOT_LOADB_UNIT(sgemm, 4) { + return vld1q_f32(b_ptr); +} + +GEMM_SKINNY_DOT_LOADB_UNIT(sgemm, 2) { + return vld1_f32(b_ptr); +} + +GEMM_SKINNY_DOT_LOADB_UNIT(sgemm, 1) { + return *b_ptr; +} + +GEMM_SKINNY_DOT_REDUC_UNIT(sgemm, 8, 4) { + return vaddq_f32(c_vec.val[0], c_vec.val[1]); +} + +GEMM_SKINNY_DOT_REDUC_UNIT(sgemm, 4, 2) { + return vadd_f32(vget_low_f32(c_vec), vget_high_f32(c_vec)); +} + +GEMM_SKINNY_DOT_REDUC_UNIT(sgemm, 2, 1) { + return vget_lane_f32(c_vec, 0) + vget_lane_f32(c_vec, 1); +} + +GEMM_SKINNY_DOT_INITC_UNIT(sgemm, 8) { + float32x4x2_t ret; + ret.val[0] = vdupq_n_f32(0); + ret.val[1] = vdupq_n_f32(0); + return ret; +} + +GEMM_SKINNY_DOT_INITC_UNIT(sgemm, 4) { + return vdupq_n_f32(0); +} + +GEMM_SKINNY_DOT_INITC_UNIT(sgemm, 2) { + return vdup_n_f32(0); +} + +GEMM_SKINNY_DOT_INITC_UNIT(sgemm, 1) { + return 0; +} + +GEMM_SKINNY_DOT_PARALLEL_FUNC(sgemm, 3, 3, 7, 32768, float, float) +GEMM_SKINNY_DOT_PARALLEL_FUNC(sgemm, 4, 3, 7, 32768, float, float) +GEMM_SKINNY_DOT_PARALLEL_FUNC(sgemm, 5, 3, 7, 32768, float, float) +GEMM_SKINNY_DOT_PARALLEL_FUNC(sgemm, 6, 3, 7, 32768, float, float) +GEMM_SKINNY_DOT_PARALLEL_FUNC(sgemm, 7, 3, 3, 32768, float, float) +GEMM_SKINNY_DOT_PARALLEL_FUNC(sgemm, 8, 
3, 3, 32768, float, float) diff --git a/src/neon_armv7a/SgemmSkinnyGer.c b/src/neon_armv7a/SgemmSkinnyGer.c new file mode 100644 index 0000000..a051de7 --- /dev/null +++ b/src/neon_armv7a/SgemmSkinnyGer.c @@ -0,0 +1,280 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonSkinnyGer.h" + +#include <arm_neon.h> + +typedef float sgemm_skinnyger_ascalar; +typedef float sgemm_skinnyger_bscalar; +typedef float sgemm_skinnyger_cscalar; + +typedef float sgemm_skinnyger_avec1; +typedef float sgemm_skinnyger_bvec1; +typedef float sgemm_skinnyger_cvec1; + +typedef float32x2_t sgemm_skinnyger_avec2; +typedef float32x2_t sgemm_skinnyger_bvec2; +typedef float32x2_t sgemm_skinnyger_cvec2; + +typedef float32x4_t sgemm_skinnyger_avec4; +typedef float32x4_t sgemm_skinnyger_bvec4; +typedef float32x4_t sgemm_skinnyger_cvec4; + +typedef float32x4x2_t sgemm_skinnyger_avec8; +typedef float32x4x2_t sgemm_skinnyger_bvec8; +typedef float32x4x2_t sgemm_skinnyger_cvec8; + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 4, 1) { + float32x4x2_t ret; + ret.val[0] = vmlaq_lane_f32(c_vec.val[0], a_vec.val[0], vget_low_f32(b_vec), 0); + ret.val[1] = vmlaq_lane_f32(c_vec.val[1], a_vec.val[1], vget_low_f32(b_vec), 0); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 4, 2) { + float32x4x2_t ret; + ret.val[0] = vmlaq_lane_f32(c_vec.val[0], a_vec.val[0], vget_low_f32(b_vec), 1); + ret.val[1] = vmlaq_lane_f32(c_vec.val[1], a_vec.val[1], vget_low_f32(b_vec), 1); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 4, 3) { + float32x4x2_t ret; + ret.val[0] = vmlaq_lane_f32(c_vec.val[0], a_vec.val[0], vget_high_f32(b_vec), 0); + ret.val[1] = vmlaq_lane_f32(c_vec.val[1], a_vec.val[1], vget_high_f32(b_vec), 0); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 4, 4) { + float32x4x2_t ret; + ret.val[0] = vmlaq_lane_f32(c_vec.val[0], a_vec.val[0], vget_high_f32(b_vec), 1); + ret.val[1] = vmlaq_lane_f32(c_vec.val[1], a_vec.val[1], vget_high_f32(b_vec), 1); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 2, 1) { + float32x4x2_t ret; + ret.val[0] = vmlaq_lane_f32(c_vec.val[0], a_vec.val[0], b_vec, 0); + ret.val[1] = vmlaq_lane_f32(c_vec.val[1], a_vec.val[1], b_vec, 0); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 2, 2) { + float32x4x2_t ret; + ret.val[0] = vmlaq_lane_f32(c_vec.val[0], a_vec.val[0], b_vec, 1); + ret.val[1] = vmlaq_lane_f32(c_vec.val[1], a_vec.val[1], b_vec, 1); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 1, 1) { + float32x4x2_t ret; + ret.val[0] = vmlaq_n_f32(c_vec.val[0], a_vec.val[0], b_vec); + ret.val[1] = vmlaq_n_f32(c_vec.val[1], a_vec.val[1], b_vec); + return ret; +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 4, 1) { + return vmlaq_lane_f32(c_vec, a_vec, vget_low_f32(b_vec), 0); +} + 
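+/* Judging by the variants in this file, GEMM_SKINNY_GER_CALC_UNIT(sgemm, */
+/* m, n, k) expands to one multiply-accumulate step of the GER update: m */
+/* is the A/C vector width, n the width of the loaded B fragment, and k */
+/* the (1-based) lane of B broadcast into the accumulator. */
+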
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 4, 2) { + return vmlaq_lane_f32(c_vec, a_vec, vget_low_f32(b_vec), 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 4, 3) { + return vmlaq_lane_f32(c_vec, a_vec, vget_high_f32(b_vec), 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 4, 4) { + return vmlaq_lane_f32(c_vec, a_vec, vget_high_f32(b_vec), 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 2, 1) { + return vmlaq_lane_f32(c_vec, a_vec, b_vec, 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 2, 2) { + return vmlaq_lane_f32(c_vec, a_vec, b_vec, 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 1, 1) { + return vmlaq_n_f32(c_vec, a_vec, b_vec); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 4, 1) { + return vmla_lane_f32(c_vec, a_vec, vget_low_f32(b_vec), 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 4, 2) { + return vmla_lane_f32(c_vec, a_vec, vget_low_f32(b_vec), 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 4, 3) { + return vmla_lane_f32(c_vec, a_vec, vget_high_f32(b_vec), 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 4, 4) { + return vmla_lane_f32(c_vec, a_vec, vget_high_f32(b_vec), 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 2, 1) { + return vmla_lane_f32(c_vec, a_vec, b_vec, 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 2, 2) { + return vmla_lane_f32(c_vec, a_vec, b_vec, 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 1, 1) { + return vmla_n_f32(c_vec, a_vec, b_vec); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 4, 1) { + return c_vec + a_vec * vgetq_lane_f32(b_vec, 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 4, 2) { + return c_vec + a_vec * vgetq_lane_f32(b_vec, 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 4, 3) { + return c_vec + a_vec * vgetq_lane_f32(b_vec, 2); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 4, 4) { + return c_vec + a_vec * vgetq_lane_f32(b_vec, 3); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 2, 1) { + return c_vec + a_vec * vget_lane_f32(b_vec, 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 2, 2) { + return c_vec + a_vec * vget_lane_f32(b_vec, 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 1, 1) { + return a_vec * b_vec + c_vec; +} + +GEMM_SKINNY_GER_LOADA_UNIT(sgemm, 8) { + float32x4x2_t ret; + ret.val[0] = vld1q_f32(a_ptr); + ret.val[1] = vld1q_f32(a_ptr + 4); + __asm__("pld [%0,#96]"::"r"(a_ptr):); + return ret; +} + +GEMM_SKINNY_GER_LOADA_UNIT(sgemm, 4) { + __asm__("pld [%0,#80]"::"r"(a_ptr):); + return vld1q_f32(a_ptr); +} + +GEMM_SKINNY_GER_LOADA_UNIT(sgemm, 2) { + __asm__("pld [%0,#72]"::"r"(a_ptr):); + return vld1_f32(a_ptr); +} + +GEMM_SKINNY_GER_LOADA_UNIT(sgemm, 1) { + return *a_ptr; +} + +GEMM_SKINNY_GER_LOADC_UNIT(sgemm, 8) { + float32x4x2_t ret; + ret.val[0] = vld1q_f32(c_ptr); + ret.val[1] = vld1q_f32(c_ptr + 4); + return ret; +} + +GEMM_SKINNY_GER_LOADC_UNIT(sgemm, 4) { + return vld1q_f32(c_ptr); +} + +GEMM_SKINNY_GER_LOADC_UNIT(sgemm, 2) { + return vld1_f32(c_ptr); +} + +GEMM_SKINNY_GER_LOADC_UNIT(sgemm, 1) { + return *c_ptr; +} + +GEMM_SKINNY_GER_STOREC_UNIT(sgemm, 8) { + vst1q_f32(c_ptr, c_vec.val[0]); + vst1q_f32(c_ptr + 4, c_vec.val[1]); +} + +GEMM_SKINNY_GER_STOREC_UNIT(sgemm, 4) { + vst1q_f32(c_ptr, c_vec); +} + +GEMM_SKINNY_GER_STOREC_UNIT(sgemm, 2) { + vst1_f32(c_ptr, c_vec); +} + +GEMM_SKINNY_GER_STOREC_UNIT(sgemm, 1) { + *c_ptr = c_vec; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(sgemm, 4) { + float32x4_t ret = vdupq_n_f32(0); + float b1 = *b_ptr; b_ptr += ldb; + float b2 = *b_ptr; b_ptr += ldb; + float b3 = *b_ptr; b_ptr += ldb; + float b4 = *b_ptr; + ret = vsetq_lane_f32(b1, ret, 0); + ret = vsetq_lane_f32(b2, ret, 1); + ret = vsetq_lane_f32(b3, ret, 2); + ret = 
vsetq_lane_f32(b4, ret, 3); + return ret; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(sgemm, 2) { + float32x2_t ret = vdup_n_f32(0); + float b1 = *b_ptr; + float b2 = b_ptr[ldb]; + ret = vset_lane_f32(b1, ret, 0); + ret = vset_lane_f32(b2, ret, 1); + return ret; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(sgemm, 1) { + return *b_ptr; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(sgemm, 4) { + return vld1q_f32(b_ptr); +} + +GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(sgemm, 2) { + return vld1_f32(b_ptr); +} + +GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(sgemm, 1) { + return *b_ptr; +} + +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 1, 7, 7, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 2, 7, 7, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 3, 7, 7, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 4, 7, 7, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 5, 7, 7, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 6, 7, 7, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 7, 7, 3, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 8, 7, 3, 8192, float, float) + diff --git a/src/neon_armv7a/U8U32GemmDriver.c b/src/neon_armv7a/U8U32GemmDriver.c new file mode 100644 index 0000000..f67fda7 --- /dev/null +++ b/src/neon_armv7a/U8U32GemmDriver.c @@ -0,0 +1,42 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv7a/U8U32MlaGemmDriver.h" +#include "arm_neon/ARMCpuType.h" + +int u8u32gemm_serial(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, uint32_t beta_inp) { + + if (blas_arm_get_i8i32_support() == 0) { + return 2; + } + return u8u32mlagemm_serial(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp); +} + +int u8u32gemm(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, + uint32_t beta_inp, uint32_t num_threads) { + + if (blas_arm_get_i8i32_support() == 0) { + return 2; + } + return u8u32mlagemm(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp, num_threads); +} diff --git a/src/neon_armv7a/U8U32MlaGemmCopy.c b/src/neon_armv7a/U8U32MlaGemmCopy.c new file mode 100644 index 0000000..be09b7f --- /dev/null +++ b/src/neon_armv7a/U8U32MlaGemmCopy.c @@ -0,0 +1,30 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifndef GEMM_UNSIGNED_INT +#define GEMM_UNSIGNED_INT +#endif + +#include "common/CommonCopy.h" +#include "arm_neon/NeonI8I32MlaGemmCopy.h" + +GENERIC_NCOPY_FUNC(u8u32mlagemm, uint8_t, uint16_t, 6) +GENERIC_NCOPY_FUNC(u8u32mlagemm, uint8_t, uint16_t, 8) + +GENERIC_TCOPY_FUNC(u8u32mlagemm, uint8_t, uint16_t, 6) +GENERIC_TCOPY_FUNC(u8u32mlagemm, uint8_t, uint16_t, 8) + diff --git a/src/neon_armv7a/U8U32MlaGemmDriver.c b/src/neon_armv7a/U8U32MlaGemmDriver.c new file mode 100644 index 0000000..071a898 --- /dev/null +++ b/src/neon_armv7a/U8U32MlaGemmDriver.c @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv7a/U8U32MlaGemmCopy.h" +#include "neon_armv7a/U8U32MlaGemmKernel.h" +#include "neon_armv7a/U8U32MlaGemmSkinnyDot.h" +#include "neon_armv7a/U8U32MlaGemmSkinnyGer.h" +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonDriver.h" + +GEMM_PARALLEL_FUNC(u8u32mlagemm, uint8_t, uint16_t, uint8_t, uint16_t, uint32_t, + 6, 8, 4, 4, 4, 4) + diff --git a/src/neon_armv7a/U8U32MlaGemmKernel.c b/src/neon_armv7a/U8U32MlaGemmKernel.c new file mode 100644 index 0000000..5638fd9 --- /dev/null +++ b/src/neon_armv7a/U8U32MlaGemmKernel.c @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#ifndef GEMM_UNSIGNED_INT +#define GEMM_UNSIGNED_INT +#endif + +#include "common/CommonKernel.h" +#include "neon_armv7a/I8I32MlaGemmKernel.h" + +DUALPACK_KERNEL_FUNC_LM(u8u32mlagemm, uint16_t, uint16_t, uint32_t, 6, 8) +DUALPACK_KERNEL_FUNC_LN(u8u32mlagemm, uint16_t, uint16_t, uint32_t, 8, 6) + diff --git a/src/neon_armv7a/U8U32MlaGemmSkinnyDot.c b/src/neon_armv7a/U8U32MlaGemmSkinnyDot.c new file mode 100644 index 0000000..36e1c25 --- /dev/null +++ b/src/neon_armv7a/U8U32MlaGemmSkinnyDot.c @@ -0,0 +1,29 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifndef GEMM_UNSIGNED_INT +#define GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "arm_neon/NeonI8I32MlaGemmSkinnyDot.h" +#include "common/CommonSkinnyDot.h" + +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32mlagemm, 1, 15, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32mlagemm, 2, 15, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32mlagemm, 3, 15, 3, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32mlagemm, 4, 15, 3, 131072, uint8_t, uint8_t) \ No newline at end of file diff --git a/src/neon_armv7a/U8U32MlaGemmSkinnyGer.c b/src/neon_armv7a/U8U32MlaGemmSkinnyGer.c new file mode 100644 index 0000000..8bc28a0 --- /dev/null +++ b/src/neon_armv7a/U8U32MlaGemmSkinnyGer.c @@ -0,0 +1,29 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#ifndef GEMM_UNSIGNED_INT +#define GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "arm_neon/NeonI8I32MlaGemmSkinnyGer.h" + +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 1, 5, 5, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 2, 5, 5, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 3, 5, 5, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 4, 5, 5, 8192, uint8_t, uint8_t) + diff --git a/src/neon_armv8a/Bias.c b/src/neon_armv8a/Bias.c new file mode 100644 index 0000000..08519e8 --- /dev/null +++ b/src/neon_armv8a/Bias.c @@ -0,0 +1,28 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "arm_neon/NeonBias.h" +#include "arm_neon/NeonSum.h" + +NEON_BIAS(float, float32x4_t, f32, 4, fma) + +NEON_BIAS(int32_t, int32x4_t, s32, 4, mla) + +NEON_I8I32_SUM(u, uint) + +NEON_I16_SUMSQUARE(s, int) + diff --git a/src/neon_armv8a/HgemmDriver.c b/src/neon_armv8a/HgemmDriver.c new file mode 100644 index 0000000..3c31786 --- /dev/null +++ b/src/neon_armv8a/HgemmDriver.c @@ -0,0 +1,28 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv8a/HgemmKernel.h" +#include "neon_armv8a/HgemmCopy.h" +#include "neon_armv8a/HgemmSkinnyDot.h" +#include "neon_armv8a/HgemmSkinnyGer.h" +#include "arm_neon/ARMCpuType.h" +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonDriver.h" + +GEMM_PARALLEL_FUNC(hgemm, float16_t, float16_t, float16_t, float16_t, float16_t, + 8, 16, 12, 12, 12, 12, || blas_arm_get_fp16_support() < 2) + diff --git a/src/neon_armv8a/Layer.c b/src/neon_armv8a/Layer.c new file mode 100644 index 0000000..1d8bee2 --- /dev/null +++ b/src/neon_armv8a/Layer.c @@ -0,0 +1,24 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. 
*/ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv8a/SgemmDriver.h" +#include "neon_armv8a/Bias.h" +#include "common/CommonLayer.h" +#include + +SIMPLE_FC_FUNC(sgemm, float, float, float) + diff --git a/src/neon_armv8a/Quant.c b/src/neon_armv8a/Quant.c new file mode 100644 index 0000000..3835294 --- /dev/null +++ b/src/neon_armv8a/Quant.c @@ -0,0 +1,52 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "common/CommonQuant.h" +#include "arm_neon/NeonQuant.h" + +NEON_FIND_EXTREME(float32_t, f32, float32x2_t, float32x4_t, 2) + +QUANTIZE_ASYMMETRIC(32, 8) + +QUANTIZE_SYMMETRIC(32, 8) + +QUANTIZE_ASYMMETRIC(32, 16) + +QUANTIZE_SYMMETRIC(32, 16) + +void dequantize_symmetric_f32_s32(const int32_t *src, float32_t *dst, + float32_t scale, uint32_t size) { + + inline_dequant_cvt_f32_s32(dst, src, scale, size); +} + +NEON_FIND_EXTREME(int32_t, s32, int32x2_t, int32x4_t, 2) + +NEON_FIND_EXTREME(int16_t, s16, int16x4_t, int16x8_t, 4) + +REQUANTIZE_ASYMMETRIC_MULHI(float, 32, 8, 64) + +REQUANTIZE_SYMMETRIC_MULHI(float, 32, 8, 64) + +REQUANTIZE_ASYMMETRIC_MULHI(float, 32, 16, 64) + +REQUANTIZE_SYMMETRIC_MULHI(float, 32, 16, 64) + +REQUANTIZE_ASYMMETRIC_MULHI(float, 16, 8, 32) + +REQUANTIZE_SYMMETRIC_MULHI(float, 16, 8, 32) + diff --git a/src/neon_armv8a/S8S32DotGemmDriver.c b/src/neon_armv8a/S8S32DotGemmDriver.c new file mode 100644 index 0000000..9d77d1f --- /dev/null +++ b/src/neon_armv8a/S8S32DotGemmDriver.c @@ -0,0 +1,36 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv8a/S8S32DotGemmCopy.h" +#include "neon_armv8a/S8S32DotGemmKernel.h" +#include "neon_armv8a/S8S32DotGemmSkinnyDot.h" +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonDriver.h" + +#ifdef SCRATCH_K_CORD +#undef SCRATCH_K_CORD +#define SCRATCH_K_CORD(k) ((k) >> 2) +#endif + +#ifdef GEMM_D_K +#undef GEMM_D_K +#define GEMM_D_K 768 +#endif + +GEMM_PARALLEL_FUNC(s8s32dotgemm, int8_t, int32_t, int8_t, int32_t, int32_t, + 8, 12, 12, 12, 0, 0) + diff --git a/src/neon_armv8a/S8S32GemmDriver.c b/src/neon_armv8a/S8S32GemmDriver.c new file mode 100644 index 0000000..7dbbf90 --- /dev/null +++ b/src/neon_armv8a/S8S32GemmDriver.c @@ -0,0 +1,48 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv8a/S8S32MlaGemmDriver.h" +#include "neon_armv8a/S8S32DotGemmDriver.h" +#include "arm_neon/ARMCpuType.h" + +int s8s32gemm_serial(int a_rowmajor, int b_rowmajor, + const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t N, uint32_t K, int32_t beta_inp) { + + if (blas_arm_get_i8i32_support() == 2) { + return s8s32dotgemm_serial(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp); + } else { + return s8s32mlagemm_serial(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp); + } +} + +int s8s32gemm(int a_rowmajor, int b_rowmajor, + const int8_t *A, const int8_t *B, + int32_t *C, uint32_t M, uint32_t N, uint32_t K, + int32_t beta_inp, uint32_t num_threads) { + + if (blas_arm_get_i8i32_support() == 2) { + return s8s32dotgemm(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp, num_threads); + } else { + return s8s32mlagemm(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp, num_threads); + } +} + diff --git a/src/neon_armv8a/S8S32MlaGemmCopy.c b/src/neon_armv8a/S8S32MlaGemmCopy.c new file mode 100644 index 0000000..42c9451 --- /dev/null +++ b/src/neon_armv8a/S8S32MlaGemmCopy.c @@ -0,0 +1,30 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#ifdef GEMM_UNSIGNED_INT +#undef GEMM_UNSIGNED_INT +#endif + +#include "common/CommonCopy.h" +#include "arm_neon/NeonI8I32MlaGemmCopy.h" + +GENERIC_NCOPY_FUNC(s8s32mlagemm, int8_t, int16_t, 8) +GENERIC_NCOPY_FUNC(s8s32mlagemm, int8_t, int16_t, 12) + +GENERIC_TCOPY_FUNC(s8s32mlagemm, int8_t, int16_t, 8) +GENERIC_TCOPY_FUNC(s8s32mlagemm, int8_t, int16_t, 12) + diff --git a/src/neon_armv8a/S8S32MlaGemmDriver.c b/src/neon_armv8a/S8S32MlaGemmDriver.c new file mode 100644 index 0000000..e52bd40 --- /dev/null +++ b/src/neon_armv8a/S8S32MlaGemmDriver.c @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv8a/S8S32MlaGemmCopy.h" +#include "neon_armv8a/S8S32MlaGemmKernel.h" +#include "neon_armv8a/S8S32MlaGemmSkinnyGer.h" +#include "neon_armv8a/S8S32MlaGemmSkinnyDot.h" +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonDriver.h" + +GEMM_PARALLEL_FUNC(s8s32mlagemm, int8_t, int16_t, int8_t, int16_t, int32_t, + 8, 12, 8, 8, 8, 8) + diff --git a/src/neon_armv8a/S8S32MlaGemmKernel.c b/src/neon_armv8a/S8S32MlaGemmKernel.c new file mode 100644 index 0000000..110f834 --- /dev/null +++ b/src/neon_armv8a/S8S32MlaGemmKernel.c @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifdef GEMM_UNSIGNED_INT +#undef GEMM_UNSIGNED_INT +#endif + +#include "common/CommonKernel.h" +#include "neon_armv8a/I8I32MlaGemmKernel.h" + +DUALPACK_KERNEL_FUNC_LM(s8s32mlagemm, int16_t, int16_t, int32_t, 8, 12) +DUALPACK_KERNEL_FUNC_LN(s8s32mlagemm, int16_t, int16_t, int32_t, 12, 8) + diff --git a/src/neon_armv8a/S8S32MlaGemmSkinnyDot.c b/src/neon_armv8a/S8S32MlaGemmSkinnyDot.c new file mode 100644 index 0000000..3d45eac --- /dev/null +++ b/src/neon_armv8a/S8S32MlaGemmSkinnyDot.c @@ -0,0 +1,34 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. 
*/ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifdef GEMM_UNSIGNED_INT +#undef GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "neon_armv8a/I8I32MlaGemmSkinnyDot.h" +#include "common/CommonSkinnyDot.h" + +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(s8s32mlagemm, 1, 31, 5, 131072, int8_t, int8_t, unroll_test) +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(s8s32mlagemm, 2, 31, 5, 131072, int8_t, int8_t, unroll_test) +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(s8s32mlagemm, 3, 31, 5, 131072, int8_t, int8_t, unroll_test) + +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32mlagemm, 4, 15, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32mlagemm, 5, 15, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32mlagemm, 6, 15, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32mlagemm, 7, 15, 3, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32mlagemm, 8, 15, 3, 131072, int8_t, int8_t) diff --git a/src/neon_armv8a/S8S32MlaGemmSkinnyGer.c b/src/neon_armv8a/S8S32MlaGemmSkinnyGer.c new file mode 100644 index 0000000..c8a739b --- /dev/null +++ b/src/neon_armv8a/S8S32MlaGemmSkinnyGer.c @@ -0,0 +1,32 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#ifdef GEMM_UNSIGNED_INT +#undef GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "arm_neon/NeonI8I32MlaGemmSkinnyGer.h" + +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 1, 5, 29, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 2, 5, 29, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 3, 5, 29, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 4, 5, 29, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 5, 5, 13, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 6, 5, 13, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 7, 5, 13, 8192, int8_t, int8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(s8s32mlagemm, 8, 5, 13, 8192, int8_t, int8_t) diff --git a/src/neon_armv8a/SgemmCopy.c b/src/neon_armv8a/SgemmCopy.c new file mode 100644 index 0000000..fe6308b --- /dev/null +++ b/src/neon_armv8a/SgemmCopy.c @@ -0,0 +1,30 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "common/CommonCopy.h" +#include "arm_neon/NeonSgemmCopy.h" + +#define NCOPY_float_float(unroll) NCOPY_UNROLL_##unroll + +GENERIC_NCOPY_FUNC(sgemm, float, float, 8) +GENERIC_NCOPY_FUNC(sgemm, float, float, 12) + +#define TCOPY_UNIT_float_float(src_ptr, dst_ptr, dst_offset, num_elements) \ + TCOPY_UNIT_##num_elements(src_ptr, dst_ptr, dst_offset) + +GENERIC_TCOPY_FUNC(sgemm, float, float, 8) +GENERIC_TCOPY_FUNC(sgemm, float, float, 12) diff --git a/src/neon_armv8a/SgemmDriver.c b/src/neon_armv8a/SgemmDriver.c new file mode 100644 index 0000000..09ed573 --- /dev/null +++ b/src/neon_armv8a/SgemmDriver.c @@ -0,0 +1,26 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/
+/*****************************************************************************/
+
+
+#include "neon_armv8a/SgemmKernel.h"
+#include "neon_armv8a/SgemmCopy.h"
+#include "neon_armv8a/SgemmSkinnyDot.h"
+#include "neon_armv8a/SgemmSkinnyGer.h"
+#include "arm_neon/ARMCompareAndSwap.h"
+#include "common/CommonDriver.h"
+
+GEMM_PARALLEL_FUNC(sgemm, float, float, float, float, float, 8, 12, 50, 50, 12, 12)
+
diff --git a/src/neon_armv8a/SgemmKernel.c b/src/neon_armv8a/SgemmKernel.c
new file mode 100644
index 0000000..7e9bc94
--- /dev/null
+++ b/src/neon_armv8a/SgemmKernel.c
@@ -0,0 +1,1071 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "common/CommonKernel.h"
+#include "arm_neon/NeonSgemmKernel.h"
+#include "arm_neon/ARMCpuType.h"
+#include <sched.h>
+
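+/* A53 path: each 128-bit vector load is split into a 64-bit "ldr d" plus a */
+/* 64-bit "ldr x" and an "fmov v.d[1]" insert, so loads can dual-issue with */
+/* the fmla stream on this in-order core (%25 = packed A, %26 = packed B). */
+/* The MAIN2 block consumes two K iterations per pass; TAIL2 and TAIL1 */
+/* handle the remainder. */
+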
+#define NEON_SGEMM_KERNEL_M8N12_PRELOAD_A53 \
+ "ldr q0,[%25]; add %25,%25,#32\n\t"\
+ "ldr q3,[%26]; ldr d5,[%26,#16]; ldr x0,[%26,#24]; add %26,%26,#48\n\t"
+
+#define NEON_SGEMM_KERNEL_M8N12_MAIN2_A53 \
+ "fmov v5.d[1],x0; ldr d7,[%26,#-16]\n\t"\
+ "fmla %0.4s,v0.4s,v3.s[0]; ldr x0,[%26,#-8]\n\t"\
+ "fmla %2.4s,v0.4s,v3.s[1]; fmla %4.4s,v0.4s,v3.s[2]\n\t"\
+ "fmov v7.d[1],x0; ldr d2,[%25,#-16]\n\t"\
+ "fmla %6.4s,v0.4s,v3.s[3]; ldr x0,[%25,#-8]\n\t"\
+ "fmla %8.4s,v0.4s,v5.s[0]; fmla %16.4s,v0.4s,v7.s[0]\n\t"\
+ "fmov v2.d[1],x0; ldr d4,[%26]\n\t"\
+ "fmla %18.4s,v0.4s,v7.s[1]; ldr x0,[%26,#8]\n\t"\
+ "fmla %20.4s,v0.4s,v7.s[2]; fmla %22.4s,v0.4s,v7.s[3]\n\t"\
+ "fmov v4.d[1],x0; ldr d1,[%25]\n\t"\
+ "fmla %17.4s,v2.4s,v7.s[0]; ldr x0,[%25,#8]\n\t"\
+ "fmla %19.4s,v2.4s,v7.s[1]; fmla %21.4s,v2.4s,v7.s[2]\n\t"\
+ "fmov v1.d[1],x0; ldr d6,[%26,#16]\n\t"\
+ "fmla %23.4s,v2.4s,v7.s[3]; ldr x0,[%26,#24]\n\t"\
+ "fmla %9.4s,v2.4s,v5.s[0]; fmla %1.4s,v2.4s,v3.s[0]\n\t"\
+ "fmov v6.d[1],x0; ldr d7,[%26,#32]\n\t"\
+ "fmla %3.4s,v2.4s,v3.s[1]; ldr x0,[%26,#40]\n\t"\
+ "fmla %5.4s,v2.4s,v3.s[2]; fmla %7.4s,v2.4s,v3.s[3]\n\t"\
+ "fmov v7.d[1],x0; ldr d3,[%26,#48]\n\t"\
+ "fmla %11.4s,v2.4s,v5.s[1]; ldr x0,[%26,#56]\n\t"\
+ "fmla %13.4s,v2.4s,v5.s[2]; fmla %15.4s,v2.4s,v5.s[3]\n\t"\
+ "fmov v3.d[1],x0; ldr d2,[%25,#16]\n\t"\
+ "fmla %10.4s,v0.4s,v5.s[1]; ldr x0,[%25,#24]\n\t"\
+ "fmla %12.4s,v0.4s,v5.s[2]; fmla %14.4s,v0.4s,v5.s[3]\n\t"\
+ "fmov v2.d[1],x0; ldr d0,[%25,#32]\n\t"\
+ "fmla %0.4s,v1.4s,v4.s[0]; ldr x0,[%25,#40]\n\t"\
+ "fmla %2.4s,v1.4s,v4.s[1]; fmla %4.4s,v1.4s,v4.s[2]\n\t"\
+ "fmov v0.d[1],x0; ldr d5,[%26,#64]\n\t"\
+ "fmla %6.4s,v1.4s,v4.s[3]; ldr x0,[%26,#72]\n\t"\
+ "fmla %8.4s,v1.4s,v6.s[0]; fmla %10.4s,v1.4s,v6.s[1]\n\t"\
+ "add %25,%25,#64\n\t"\
+ "fmla %12.4s,v1.4s,v6.s[2]\n\t"\
+ "fmla %14.4s,v1.4s,v6.s[3]; fmla %16.4s,v1.4s,v7.s[0]\n\t"\
+ "add %26,%26,#96\n\t"\
+ "fmla %18.4s,v1.4s,v7.s[1]\n\t"\
+ "fmla %20.4s,v1.4s,v7.s[2]; fmla %22.4s,v1.4s,v7.s[3]\n\t"\
+ "prfm pldl1keep,[%25,#128]\n\t"\
+ "fmla %1.4s,v2.4s,v4.s[0]\n\t"\
+ "fmla %3.4s,v2.4s,v4.s[1]; fmla %5.4s,v2.4s,v4.s[2]\n\t"\
+ "prfm pldl1keep,[%26,#192]\n\t"\
+ "fmla %7.4s,v2.4s,v4.s[3]\n\t"\
+ "fmla %9.4s,v2.4s,v6.s[0]; fmla %11.4s,v2.4s,v6.s[1]\n\t"\
+ "sub %w24,%w24,#2\n\t"\
+ "fmla %13.4s,v2.4s,v6.s[2]\n\t"\
+ "fmla %15.4s,v2.4s,v6.s[3]; fmla %17.4s,v2.4s,v7.s[0]\n\t"\
+ "cmp %w24,#2; prfm pldl1keep,[%26,#240]\n\t"\
+ "fmla %19.4s,v2.4s,v7.s[1]\n\t"\
+ "fmla %21.4s,v2.4s,v7.s[2]; fmla %23.4s,v2.4s,v7.s[3]\n\t"
+
+#define NEON_SGEMM_KERNEL_M8N12_TAIL2_A53 \
+ "fmov v5.d[1],x0; ldr d7,[%26,#-16]\n\t"\
+ "fmla %0.4s,v0.4s,v3.s[0]; ldr x0,[%26,#-8]\n\t"\
+ "fmla %2.4s,v0.4s,v3.s[1]; fmla %4.4s,v0.4s,v3.s[2]\n\t"\
+ "fmov v7.d[1],x0; ldr d2,[%25,#-16]\n\t"\
+ "fmla %6.4s,v0.4s,v3.s[3]; ldr x0,[%25,#-8]\n\t"\
+ "fmla %8.4s,v0.4s,v5.s[0]; fmla %16.4s,v0.4s,v7.s[0]\n\t"\
+ "fmov v2.d[1],x0; ldr d4,[%26]\n\t"\
+ "fmla %18.4s,v0.4s,v7.s[1]; ldr x0,[%26,#8]\n\t"\
+ "fmla %20.4s,v0.4s,v7.s[2]; fmla %22.4s,v0.4s,v7.s[3]\n\t"\
+ "fmov v4.d[1],x0; ldr d1,[%25]\n\t"\
+ "fmla %17.4s,v2.4s,v7.s[0]; ldr x0,[%25,#8]\n\t"\
+ "fmla %19.4s,v2.4s,v7.s[1]; fmla %21.4s,v2.4s,v7.s[2]\n\t"\
+ "fmov v1.d[1],x0; ldr d6,[%26,#16]\n\t"\
+ "fmla %23.4s,v2.4s,v7.s[3]; ldr x0,[%26,#24]\n\t"\
+ "fmla %9.4s,v2.4s,v5.s[0]; fmla %1.4s,v2.4s,v3.s[0]\n\t"\
+ "fmov v6.d[1],x0; ldr d7,[%26,#32]\n\t"\
+ "fmla %3.4s,v2.4s,v3.s[1]; ldr x0,[%26,#40]\n\t"\
+ "fmla %5.4s,v2.4s,v3.s[2]; fmla %7.4s,v2.4s,v3.s[3]\n\t"\
+ "fmov v7.d[1],x0\n\t"\
+ "fmla %11.4s,v2.4s,v5.s[1]\n\t"\
+ "fmla %13.4s,v2.4s,v5.s[2]; fmla %15.4s,v2.4s,v5.s[3]\n\t"\
+ "ldr d2,[%25,#16]\n\t"\
+ "fmla %10.4s,v0.4s,v5.s[1]; ldr x0,[%25,#24]\n\t"\
+ "fmla %12.4s,v0.4s,v5.s[2]; fmla %14.4s,v0.4s,v5.s[3]\n\t"\
+ "fmov v2.d[1],x0\n\t"\
+ "fmla %0.4s,v1.4s,v4.s[0]\n\t"\
+ "fmla %2.4s,v1.4s,v4.s[1]; fmla %4.4s,v1.4s,v4.s[2]\n\t"\
+ "fmla %6.4s,v1.4s,v4.s[3]\n\t"\
+ "fmla %8.4s,v1.4s,v6.s[0]; fmla %10.4s,v1.4s,v6.s[1]\n\t"\
+ "add %25,%25,#32\n\t"\
+ "fmla %12.4s,v1.4s,v6.s[2]\n\t"\
+ "fmla %14.4s,v1.4s,v6.s[3]; fmla %16.4s,v1.4s,v7.s[0]\n\t"\
+ "add %26,%26,#48\n\t"\
+ "fmla %18.4s,v1.4s,v7.s[1]\n\t"\
+ "fmla %20.4s,v1.4s,v7.s[2]; fmla %22.4s,v1.4s,v7.s[3]\n\t"\
+ "fmla %1.4s,v2.4s,v4.s[0]\n\t"\
+ "fmla %3.4s,v2.4s,v4.s[1]; fmla %5.4s,v2.4s,v4.s[2]\n\t"\
+ "fmla %7.4s,v2.4s,v4.s[3]\n\t"\
+ "fmla %9.4s,v2.4s,v6.s[0]; fmla %11.4s,v2.4s,v6.s[1]\n\t"\
+ "fmla %13.4s,v2.4s,v6.s[2]\n\t"\
+ "fmla %15.4s,v2.4s,v6.s[3]; fmla %17.4s,v2.4s,v7.s[0]\n\t"\
+ "fmla %19.4s,v2.4s,v7.s[1]\n\t"\
+ "fmla %21.4s,v2.4s,v7.s[2]; fmla %23.4s,v2.4s,v7.s[3]\n\t"
+
+#define NEON_SGEMM_KERNEL_M8N12_TAIL1_A53 \
+ "fmov v5.d[1],x0; ldr d7,[%26,#-16]\n\t"\
+ "fmla %0.4s,v0.4s,v3.s[0]; ldr x0,[%26,#-8]\n\t"\
+ "fmla %2.4s,v0.4s,v3.s[1]; fmla %4.4s,v0.4s,v3.s[2]\n\t"\
+ "fmov v7.d[1],x0; ldr d2,[%25,#-16]\n\t"\
+ "fmla %6.4s,v0.4s,v3.s[3]; ldr x0,[%25,#-8]\n\t"\
+ "fmla %8.4s,v0.4s,v5.s[0]; fmla %16.4s,v0.4s,v7.s[0]\n\t"\
+ "fmov v2.d[1],x0\n\t"\
+ "fmla %18.4s,v0.4s,v7.s[1]\n\t"\
+ "fmla %20.4s,v0.4s,v7.s[2]; fmla %22.4s,v0.4s,v7.s[3]\n\t"\
+ "fmla %17.4s,v2.4s,v7.s[0]\n\t"\
+ "fmla %19.4s,v2.4s,v7.s[1]; fmla %21.4s,v2.4s,v7.s[2]\n\t"\
+ "fmla %23.4s,v2.4s,v7.s[3]\n\t"\
+ "fmla %9.4s,v2.4s,v5.s[0]; fmla %1.4s,v2.4s,v3.s[0]\n\t"\
+ "fmla %3.4s,v2.4s,v3.s[1]\n\t"\
+ "fmla %5.4s,v2.4s,v3.s[2]; fmla %7.4s,v2.4s,v3.s[3]\n\t"\
+ "fmla %11.4s,v2.4s,v5.s[1]\n\t"\
+ "fmla %13.4s,v2.4s,v5.s[2]; fmla %15.4s,v2.4s,v5.s[3]\n\t"\
+ "fmla %10.4s,v0.4s,v5.s[1]\n\t"\
+ "fmla 
%12.4s,v0.4s,v5.s[2]; fmla %14.4s,v0.4s,v5.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M8N12_PRELOAD_A55 \ + "ldr q0,[%25]; ldr q1,[%25,#16]; add %25,%25,#32\n\t"\ + "ldr q4,[%26]; ldr d5,[%26,#16]; ldr x1,[%26,#24]; add %26,%26,#48\n\t" + +#define NEON_SGEMM_KERNEL_M8N12_MAIN2_A55 \ + "fmla %0.4s,v0.4s,v4.s[0]; ldr d2,[%25]\n\t"\ + "fmla %2.4s,v0.4s,v4.s[1]; ldr x0,[%25,#8]\n\t"\ + "fmla %4.4s,v0.4s,v4.s[2]\n\t"\ + "fmla %6.4s,v0.4s,v4.s[3]; fmov v5.d[1],x1\n\t"\ + "fmla %1.4s,v1.4s,v4.s[0]; ldr d6,[%26,#-16]\n\t"\ + "fmla %3.4s,v1.4s,v4.s[1]; ldr x1,[%26,#-8]\n\t"\ + "fmla %5.4s,v1.4s,v4.s[2]\n\t"\ + "fmla %7.4s,v1.4s,v4.s[3]; fmov v2.d[1],x0\n\t"\ + "fmla %8.4s,v0.4s,v5.s[0]; ldr d3,[%25,#16]\n\t"\ + "fmla %10.4s,v0.4s,v5.s[1]; ldr x0,[%25,#24]\n\t"\ + "fmla %12.4s,v0.4s,v5.s[2]\n\t"\ + "fmla %14.4s,v0.4s,v5.s[3]; fmov v6.d[1],x1\n\t"\ + "fmla %9.4s,v1.4s,v5.s[0]; ldr d4,[%26]\n\t"\ + "fmla %11.4s,v1.4s,v5.s[1]; ldr x1,[%26,#8]\n\t"\ + "fmla %13.4s,v1.4s,v5.s[2]\n\t"\ + "fmla %15.4s,v1.4s,v5.s[3]; fmov v3.d[1],x0\n\t"\ + "fmla %16.4s,v0.4s,v6.s[0]; ldr d5,[%26,#16]\n\t"\ + "fmla %18.4s,v0.4s,v6.s[1]; ldr x0,[%26,#24]\n\t"\ + "fmla %20.4s,v0.4s,v6.s[2]\n\t"\ + "fmla %22.4s,v0.4s,v6.s[3]; fmov v4.d[1],x1\n\t"\ + "fmla %17.4s,v1.4s,v6.s[0]; add %25,%25,#64\n\t"\ + "fmla %19.4s,v1.4s,v6.s[1]; add %26,%26,#96\n\t"\ + "fmla %21.4s,v1.4s,v6.s[2]\n\t"\ + "fmla %23.4s,v1.4s,v6.s[3]\n\t"\ + "fmla %0.4s,v2.4s,v4.s[0]; ldr d0,[%25,#-32]\n\t"\ + "fmla %2.4s,v2.4s,v4.s[1]; ldr x1,[%25,#-24]\n\t"\ + "fmla %4.4s,v2.4s,v4.s[2]\n\t"\ + "fmla %6.4s,v2.4s,v4.s[3]; fmov v5.d[1],x0\n\t"\ + "fmla %1.4s,v3.4s,v4.s[0]; ldr d6,[%26,#-64]\n\t"\ + "fmla %3.4s,v3.4s,v4.s[1]; ldr x0,[%26,#-56]\n\t"\ + "fmla %5.4s,v3.4s,v4.s[2]\n\t"\ + "fmla %7.4s,v3.4s,v4.s[3]; fmov v0.d[1],x1\n\t"\ + "fmla %8.4s,v2.4s,v5.s[0]; ldr d1,[%25,#-16]\n\t"\ + "fmla %10.4s,v2.4s,v5.s[1]; ldr x1,[%25,#-8]\n\t"\ + "fmla %12.4s,v2.4s,v5.s[2]\n\t"\ + "fmla %14.4s,v2.4s,v5.s[3]; fmov v6.d[1],x0\n\t"\ + "fmla %9.4s,v3.4s,v5.s[0]; ldr d4,[%26,#-48]\n\t"\ + "fmla %11.4s,v3.4s,v5.s[1]; ldr x0,[%26,#-40]\n\t"\ + "fmla %13.4s,v3.4s,v5.s[2]\n\t"\ + "fmla %15.4s,v3.4s,v5.s[3]; fmov v1.d[1],x1\n\t"\ + "fmla %16.4s,v2.4s,v6.s[0]; ldr d5,[%26,#-32]\n\t"\ + "fmla %18.4s,v2.4s,v6.s[1]; ldr x1,[%26,#-24]\n\t"\ + "fmla %20.4s,v2.4s,v6.s[2]\n\t"\ + "fmla %22.4s,v2.4s,v6.s[3]; fmov v4.d[1],x0\n\t"\ + "fmla %17.4s,v3.4s,v6.s[0]\n\t"\ + "fmla %19.4s,v3.4s,v6.s[1]; sub %w24,%w24,#2\n\t"\ + "fmla %21.4s,v3.4s,v6.s[2]; cmp %w24,#2\n\t"\ + "fmla %23.4s,v3.4s,v6.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M8N12_TAIL2_A55 \ + "fmla %0.4s,v0.4s,v4.s[0]; ldr d2,[%25]\n\t"\ + "fmla %2.4s,v0.4s,v4.s[1]; ldr x0,[%25,#8]\n\t"\ + "fmla %4.4s,v0.4s,v4.s[2]\n\t"\ + "fmla %6.4s,v0.4s,v4.s[3]; fmov v5.d[1],x1\n\t"\ + "fmla %1.4s,v1.4s,v4.s[0]; ldr d6,[%26,#-16]\n\t"\ + "fmla %3.4s,v1.4s,v4.s[1]; ldr x1,[%26,#-8]\n\t"\ + "fmla %5.4s,v1.4s,v4.s[2]\n\t"\ + "fmla %7.4s,v1.4s,v4.s[3]; fmov v2.d[1],x0\n\t"\ + "fmla %8.4s,v0.4s,v5.s[0]; ldr d3,[%25,#16]\n\t"\ + "fmla %10.4s,v0.4s,v5.s[1]; ldr x0,[%25,#24]\n\t"\ + "fmla %12.4s,v0.4s,v5.s[2]\n\t"\ + "fmla %14.4s,v0.4s,v5.s[3]; fmov v6.d[1],x1\n\t"\ + "fmla %9.4s,v1.4s,v5.s[0]; ldr d4,[%26]\n\t"\ + "fmla %11.4s,v1.4s,v5.s[1]; ldr x1,[%26,#8]\n\t"\ + "fmla %13.4s,v1.4s,v5.s[2]\n\t"\ + "fmla %15.4s,v1.4s,v5.s[3]; fmov v3.d[1],x0\n\t"\ + "fmla %16.4s,v0.4s,v6.s[0]; ldr d5,[%26,#16]\n\t"\ + "fmla %18.4s,v0.4s,v6.s[1]; ldr x0,[%26,#24]\n\t"\ + "fmla %20.4s,v0.4s,v6.s[2]\n\t"\ + "fmla %22.4s,v0.4s,v6.s[3]; fmov v4.d[1],x1\n\t"\ + "fmla 
%17.4s,v1.4s,v6.s[0]; add %25,%25,#32\n\t"\ + "fmla %19.4s,v1.4s,v6.s[1]; add %26,%26,#48\n\t"\ + "fmla %21.4s,v1.4s,v6.s[2]\n\t"\ + "fmla %23.4s,v1.4s,v6.s[3]\n\t"\ + "fmla %0.4s,v2.4s,v4.s[0]\n\t"\ + "fmla %2.4s,v2.4s,v4.s[1]\n\t"\ + "fmla %4.4s,v2.4s,v4.s[2]\n\t"\ + "fmla %6.4s,v2.4s,v4.s[3]; fmov v5.d[1],x0\n\t"\ + "fmla %1.4s,v3.4s,v4.s[0]; ldr d6,[%26,#-16]\n\t"\ + "fmla %3.4s,v3.4s,v4.s[1]; ldr x0,[%26,#-8]\n\t"\ + "fmla %5.4s,v3.4s,v4.s[2]\n\t"\ + "fmla %7.4s,v3.4s,v4.s[3]\n\t"\ + "fmla %8.4s,v2.4s,v5.s[0]\n\t"\ + "fmla %10.4s,v2.4s,v5.s[1]\n\t"\ + "fmla %12.4s,v2.4s,v5.s[2]\n\t"\ + "fmla %14.4s,v2.4s,v5.s[3]; fmov v6.d[1],x0\n\t"\ + "fmla %9.4s,v3.4s,v5.s[0]\n\t"\ + "fmla %11.4s,v3.4s,v5.s[1]\n\t"\ + "fmla %13.4s,v3.4s,v5.s[2]\n\t"\ + "fmla %15.4s,v3.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v6.s[0]\n\t"\ + "fmla %18.4s,v2.4s,v6.s[1]\n\t"\ + "fmla %20.4s,v2.4s,v6.s[2]\n\t"\ + "fmla %22.4s,v2.4s,v6.s[3]\n\t"\ + "fmla %17.4s,v3.4s,v6.s[0]\n\t"\ + "fmla %19.4s,v3.4s,v6.s[1]\n\t"\ + "fmla %21.4s,v3.4s,v6.s[2]\n\t"\ + "fmla %23.4s,v3.4s,v6.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M8N12_TAIL1_A55 \ + "fmla %0.4s,v0.4s,v4.s[0]\n\t"\ + "fmla %2.4s,v0.4s,v4.s[1]\n\t"\ + "fmla %4.4s,v0.4s,v4.s[2]\n\t"\ + "fmla %6.4s,v0.4s,v4.s[3]; fmov v5.d[1],x1\n\t"\ + "fmla %1.4s,v1.4s,v4.s[0]; ldr d6,[%26,#-16]\n\t"\ + "fmla %3.4s,v1.4s,v4.s[1]; ldr x1,[%26,#-8]\n\t"\ + "fmla %5.4s,v1.4s,v4.s[2]\n\t"\ + "fmla %7.4s,v1.4s,v4.s[3]\n\t"\ + "fmla %8.4s,v0.4s,v5.s[0]\n\t"\ + "fmla %10.4s,v0.4s,v5.s[1]\n\t"\ + "fmla %12.4s,v0.4s,v5.s[2]\n\t"\ + "fmla %14.4s,v0.4s,v5.s[3]; fmov v6.d[1],x1\n\t"\ + "fmla %9.4s,v1.4s,v5.s[0]\n\t"\ + "fmla %11.4s,v1.4s,v5.s[1]\n\t"\ + "fmla %13.4s,v1.4s,v5.s[2]\n\t"\ + "fmla %15.4s,v1.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v0.4s,v6.s[0]\n\t"\ + "fmla %18.4s,v0.4s,v6.s[1]\n\t"\ + "fmla %20.4s,v0.4s,v6.s[2]\n\t"\ + "fmla %22.4s,v0.4s,v6.s[3]\n\t"\ + "fmla %17.4s,v1.4s,v6.s[0]\n\t"\ + "fmla %19.4s,v1.4s,v6.s[1]\n\t"\ + "fmla %21.4s,v1.4s,v6.s[2]\n\t"\ + "fmla %23.4s,v1.4s,v6.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M8N12_PRELOAD_A72 \ + "ldr q0,[%25]; ldr q1,[%25,#16]; add %25,%25,#32\n\t"\ + "ldr q4,[%26]; ldr q5,[%26,#16]; add %26,%26,#48\n\t"\ + +#define NEON_SGEMM_KERNEL_M8N12_MAIN2_A72 \ + "fmla %0.4s,v0.4s,v4.s[0]; fmla %2.4s,v0.4s,v4.s[1]; ldr q6,[%26,#-16]\n\t"\ + "fmla %4.4s,v0.4s,v4.s[2]; fmla %6.4s,v0.4s,v4.s[3]\n\t"\ + "fmla %1.4s,v1.4s,v4.s[0]; fmla %3.4s,v1.4s,v4.s[1]; ldr q2,[%25],#64\n\t"\ + "fmla %5.4s,v1.4s,v4.s[2]; fmla %7.4s,v1.4s,v4.s[3]\n\t"\ + "fmla %8.4s,v0.4s,v5.s[0]; fmla %10.4s,v0.4s,v5.s[1]; ldr q4,[%26],#96\n\t"\ + "fmla %12.4s,v0.4s,v5.s[2]; fmla %14.4s,v0.4s,v5.s[3]\n\t"\ + "fmla %9.4s,v1.4s,v5.s[0]; fmla %11.4s,v1.4s,v5.s[1]; ldr q3,[%25,#-48]\n\t"\ + "fmla %13.4s,v1.4s,v5.s[2]; fmla %15.4s,v1.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v0.4s,v6.s[0]; fmla %18.4s,v0.4s,v6.s[1]; ldr q5,[%26,#-80]\n\t"\ + "fmla %20.4s,v0.4s,v6.s[2]; fmla %22.4s,v0.4s,v6.s[3]\n\t"\ + "fmla %17.4s,v1.4s,v6.s[0]; fmla %19.4s,v1.4s,v6.s[1]; sub %w24,%w24,#2\n\t"\ + "fmla %21.4s,v1.4s,v6.s[2]; fmla %23.4s,v1.4s,v6.s[3]\n\t"\ + "fmla %0.4s,v2.4s,v4.s[0]; fmla %2.4s,v2.4s,v4.s[1]; ldr q6,[%26,#-64]\n\t"\ + "fmla %4.4s,v2.4s,v4.s[2]; fmla %6.4s,v2.4s,v4.s[3]\n\t"\ + "fmla %1.4s,v3.4s,v4.s[0]; fmla %3.4s,v3.4s,v4.s[1]; ldr q0,[%25,#-32]\n\t"\ + "fmla %5.4s,v3.4s,v4.s[2]; fmla %7.4s,v3.4s,v4.s[3]\n\t"\ + "fmla %8.4s,v2.4s,v5.s[0]; fmla %10.4s,v2.4s,v5.s[1]; ldr q4,[%26,#-48]\n\t"\ + "fmla %12.4s,v2.4s,v5.s[2]; fmla %14.4s,v2.4s,v5.s[3]\n\t"\ + "fmla %9.4s,v3.4s,v5.s[0]; fmla %11.4s,v3.4s,v5.s[1]; ldr 
q1,[%25,#-16]\n\t"\ + "fmla %13.4s,v3.4s,v5.s[2]; fmla %15.4s,v3.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v6.s[0]; fmla %18.4s,v2.4s,v6.s[1]; ldr q5,[%26,#-32]\n\t"\ + "fmla %20.4s,v2.4s,v6.s[2]; fmla %22.4s,v2.4s,v6.s[3]\n\t"\ + "fmla %17.4s,v3.4s,v6.s[0]; fmla %19.4s,v3.4s,v6.s[1]; cmp %w24,#2\n\t"\ + "fmla %21.4s,v3.4s,v6.s[2]; fmla %23.4s,v3.4s,v6.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M8N12_TAIL2_A72 \ + "fmla %0.4s,v0.4s,v4.s[0]; fmla %2.4s,v0.4s,v4.s[1]; ldr q6,[%26,#-16]\n\t"\ + "fmla %4.4s,v0.4s,v4.s[2]; fmla %6.4s,v0.4s,v4.s[3]\n\t"\ + "fmla %1.4s,v1.4s,v4.s[0]; fmla %3.4s,v1.4s,v4.s[1]; ldr q2,[%25],#32\n\t"\ + "fmla %5.4s,v1.4s,v4.s[2]; fmla %7.4s,v1.4s,v4.s[3]\n\t"\ + "fmla %8.4s,v0.4s,v5.s[0]; fmla %10.4s,v0.4s,v5.s[1]; ldr q4,[%26],#48\n\t"\ + "fmla %12.4s,v0.4s,v5.s[2]; fmla %14.4s,v0.4s,v5.s[3]\n\t"\ + "fmla %9.4s,v1.4s,v5.s[0]; fmla %11.4s,v1.4s,v5.s[1]; ldr q3,[%25,#-16]\n\t"\ + "fmla %13.4s,v1.4s,v5.s[2]; fmla %15.4s,v1.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v0.4s,v6.s[0]; fmla %18.4s,v0.4s,v6.s[1]; ldr q5,[%26,#-32]\n\t"\ + "fmla %20.4s,v0.4s,v6.s[2]; fmla %22.4s,v0.4s,v6.s[3]\n\t"\ + "fmla %17.4s,v1.4s,v6.s[0]; fmla %19.4s,v1.4s,v6.s[1]\n\t"\ + "fmla %21.4s,v1.4s,v6.s[2]; fmla %23.4s,v1.4s,v6.s[3]\n\t"\ + "fmla %0.4s,v2.4s,v4.s[0]; fmla %2.4s,v2.4s,v4.s[1]; ldr q6,[%26,#-16]\n\t"\ + "fmla %4.4s,v2.4s,v4.s[2]; fmla %6.4s,v2.4s,v4.s[3]\n\t"\ + "fmla %1.4s,v3.4s,v4.s[0]; fmla %3.4s,v3.4s,v4.s[1]\n\t"\ + "fmla %5.4s,v3.4s,v4.s[2]; fmla %7.4s,v3.4s,v4.s[3]\n\t"\ + "fmla %8.4s,v2.4s,v5.s[0]; fmla %10.4s,v2.4s,v5.s[1]\n\t"\ + "fmla %12.4s,v2.4s,v5.s[2]; fmla %14.4s,v2.4s,v5.s[3]\n\t"\ + "fmla %9.4s,v3.4s,v5.s[0]; fmla %11.4s,v3.4s,v5.s[1]\n\t"\ + "fmla %13.4s,v3.4s,v5.s[2]; fmla %15.4s,v3.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v6.s[0]; fmla %18.4s,v2.4s,v6.s[1]\n\t"\ + "fmla %20.4s,v2.4s,v6.s[2]; fmla %22.4s,v2.4s,v6.s[3]\n\t"\ + "fmla %17.4s,v3.4s,v6.s[0]; fmla %19.4s,v3.4s,v6.s[1]\n\t"\ + "fmla %21.4s,v3.4s,v6.s[2]; fmla %23.4s,v3.4s,v6.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M8N12_TAIL1_A72 \ + "fmla %0.4s,v0.4s,v4.s[0]; fmla %2.4s,v0.4s,v4.s[1]; ldr q6,[%26,#-16]\n\t"\ + "fmla %4.4s,v0.4s,v4.s[2]; fmla %6.4s,v0.4s,v4.s[3]\n\t"\ + "fmla %1.4s,v1.4s,v4.s[0]; fmla %3.4s,v1.4s,v4.s[1]\n\t"\ + "fmla %5.4s,v1.4s,v4.s[2]; fmla %7.4s,v1.4s,v4.s[3]\n\t"\ + "fmla %8.4s,v0.4s,v5.s[0]; fmla %10.4s,v0.4s,v5.s[1]\n\t"\ + "fmla %12.4s,v0.4s,v5.s[2]; fmla %14.4s,v0.4s,v5.s[3]\n\t"\ + "fmla %9.4s,v1.4s,v5.s[0]; fmla %11.4s,v1.4s,v5.s[1]\n\t"\ + "fmla %13.4s,v1.4s,v5.s[2]; fmla %15.4s,v1.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v0.4s,v6.s[0]; fmla %18.4s,v0.4s,v6.s[1]\n\t"\ + "fmla %20.4s,v0.4s,v6.s[2]; fmla %22.4s,v0.4s,v6.s[3]\n\t"\ + "fmla %17.4s,v1.4s,v6.s[0]; fmla %19.4s,v1.4s,v6.s[1]\n\t"\ + "fmla %21.4s,v1.4s,v6.s[2]; fmla %23.4s,v1.4s,v6.s[3]\n\t" + +#define NEON_SGEMM_SAVE_M8N3_UNIT(cq1, cq2, cq3, cq4, cq5, cq6) \ + ct1 = vld1q_f32(c_tmp1); ct2 = vld1q_f32(c_tmp1 + 4);\ + ct3 = vld1q_f32(c_tmp2); ct4 = vld1q_f32(c_tmp2 + 4);\ + ct5 = vld1q_f32(c_tmp3); ct6 = vld1q_f32(c_tmp3 + 4);\ + cq1 = vfmaq_n_f32(cq1, ct1, beta); cq2 = vfmaq_n_f32(cq2, ct2, beta);\ + cq3 = vfmaq_n_f32(cq3, ct3, beta); cq4 = vfmaq_n_f32(cq4, ct4, beta);\ + cq5 = vfmaq_n_f32(cq5, ct5, beta); cq6 = vfmaq_n_f32(cq6, ct6, beta);\ + vst1q_f32(c_tmp1, cq1); vst1q_f32(c_tmp1 + 4, cq2); c_tmp1 += ldc3;\ + vst1q_f32(c_tmp2, cq3); vst1q_f32(c_tmp2 + 4, cq4); c_tmp2 += ldc3;\ + vst1q_f32(c_tmp3, cq5); vst1q_f32(c_tmp3 + 4, cq6); c_tmp3 += ldc3; + +#define NEON_SGEMM_SAVE_M8N12_ASM1 \ + float *c_tmp1 = c_ptr;\ + float *c_tmp2 = c_ptr + ldc;\ 
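+ /* cq01-cq24 hold the 8x12 result tile; each SAVE_M8N3_UNIT call below */\
+ /* merges three columns of C (two q-registers each): C = acc + beta * C. */\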
+ float *c_tmp3 = c_ptr + ldc * 2;\ + uint32_t ldc3 = ldc * 3;\ + float32x4_t ct1, ct2, ct3, ct4, ct5, ct6;\ + NEON_SGEMM_SAVE_M8N3_UNIT(cq01, cq02, cq03, cq04, cq05, cq06)\ + NEON_SGEMM_SAVE_M8N3_UNIT(cq07, cq08, cq09, cq10, cq11, cq12)\ + NEON_SGEMM_SAVE_M8N3_UNIT(cq13, cq14, cq15, cq16, cq17, cq18)\ + NEON_SGEMM_SAVE_M8N3_UNIT(cq19, cq20, cq21, cq22, cq23, cq24) + +#define NEON_SGEMM_KERNEL_M12N8_PRELOAD_A53 \ + "ldr q5,[%26]; add %26,%26,#32\n\t"\ + "ldr q0,[%25]; ldr d2,[%25,#16]; ldr x0,[%25,#24]; add %25,%25,#48\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_MAIN2_A53 \ + "fmov v2.d[1],x0; ldr d4,[%25,#-16]\n\t"\ + "fmla %0.4s,v0.4s,v5.s[0]; ldr x0,[%25,#-8]\n\t"\ + "fmla %1.4s,v0.4s,v5.s[1]; fmla %2.4s,v0.4s,v5.s[2]\n\t"\ + "fmov v4.d[1],x0; ldr d7,[%26,#-16]\n\t"\ + "fmla %3.4s,v0.4s,v5.s[3]; ldr x0,[%26,#-8]\n\t"\ + "fmla %8.4s,v2.4s,v5.s[0]; fmla %16.4s,v4.4s,v5.s[0]\n\t"\ + "fmov v7.d[1],x0; ldr d6,[%26]\n\t"\ + "fmla %17.4s,v4.4s,v5.s[1]; ldr x0,[%26,#8]\n\t"\ + "fmla %18.4s,v4.4s,v5.s[2]; fmla %20.4s,v4.4s,v7.s[0]\n\t"\ + "fmov v6.d[1],x0; ldr d1,[%25]\n\t"\ + "fmla %21.4s,v4.4s,v7.s[1]; ldr x0,[%25,#8]\n\t"\ + "fmla %22.4s,v4.4s,v7.s[2]; fmla %23.4s,v4.4s,v7.s[3]\n\t"\ + "fmov v1.d[1],x0; ldr d3,[%25,#16]\n\t"\ + "fmla %19.4s,v4.4s,v5.s[3]; ldr x0,[%25,#24]\n\t"\ + "fmla %4.4s,v0.4s,v7.s[0]; fmla %5.4s,v0.4s,v7.s[1]\n\t"\ + "fmov v3.d[1],x0; ldr d4,[%25,#32]\n\t"\ + "fmla %6.4s,v0.4s,v7.s[2]; ldr x0,[%25,#40]\n\t"\ + "fmla %7.4s,v0.4s,v7.s[3]; fmla %12.4s,v2.4s,v7.s[0]\n\t"\ + "fmov v4.d[1],x0; ldr d0,[%25,#48]\n\t"\ + "fmla %13.4s,v2.4s,v7.s[1]; ldr x0,[%25,#56]\n\t"\ + "fmla %14.4s,v2.4s,v7.s[2]; fmla %15.4s,v2.4s,v7.s[3]\n\t"\ + "fmov v0.d[1],x0; ldr d7,[%26,#16]\n\t"\ + "fmla %9.4s,v2.4s,v5.s[1]; ldr x0,[%26,#24]\n\t"\ + "fmla %10.4s,v2.4s,v5.s[2]; fmla %11.4s,v2.4s,v5.s[3]\n\t"\ + "fmov v7.d[1],x0; ldr d5,[%26,#32]\n\t"\ + "fmla %0.4s,v1.4s,v6.s[0]; ldr x0,[%26,#40]\n\t"\ + "fmla %1.4s,v1.4s,v6.s[1]; fmla %2.4s,v1.4s,v6.s[2]\n\t"\ + "fmov v5.d[1],x0; ldr d2,[%25,#64]\n\t"\ + "fmla %3.4s,v1.4s,v6.s[3]; ldr x0,[%25,#72]\n\t"\ + "fmla %4.4s,v1.4s,v7.s[0]; fmla %5.4s,v1.4s,v7.s[1]\n\t"\ + "add %25,%25,#96\n\t"\ + "fmla %6.4s,v1.4s,v7.s[2]\n\t"\ + "fmla %7.4s,v1.4s,v7.s[3]; fmla %8.4s,v3.4s,v6.s[0]\n\t"\ + "prfm pldl1keep,[%25,#192]\n\t"\ + "fmla %9.4s,v3.4s,v6.s[1]\n\t"\ + "fmla %10.4s,v3.4s,v6.s[2]; fmla %11.4s,v3.4s,v6.s[3]\n\t"\ + "add %26,%26,#64\n\t"\ + "fmla %12.4s,v3.4s,v7.s[0]\n\t"\ + "fmla %13.4s,v3.4s,v7.s[1]; fmla %14.4s,v3.4s,v7.s[2]\n\t"\ + "prfm pldl1keep,[%26,#128]\n\t"\ + "fmla %15.4s,v3.4s,v7.s[3]\n\t"\ + "fmla %16.4s,v4.4s,v6.s[0]; fmla %17.4s,v4.4s,v6.s[1]\n\t"\ + "sub %w24,%w24,#2\n\t"\ + "fmla %18.4s,v4.4s,v6.s[2]\n\t"\ + "fmla %19.4s,v4.4s,v6.s[3]; fmla %20.4s,v4.4s,v7.s[0]\n\t"\ + "cmp %w24,#2; prfm pldl1keep,[%25,#240]\n\t"\ + "fmla %21.4s,v4.4s,v7.s[1]\n\t"\ + "fmla %22.4s,v4.4s,v7.s[2]; fmla %23.4s,v4.4s,v7.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_TAIL2_A53 \ + "fmov v2.d[1],x0; ldr d4,[%25,#-16]\n\t"\ + "fmla %0.4s,v0.4s,v5.s[0]; ldr x0,[%25,#-8]\n\t"\ + "fmla %1.4s,v0.4s,v5.s[1]; fmla %2.4s,v0.4s,v5.s[2]\n\t"\ + "fmov v4.d[1],x0; ldr d7,[%26,#-16]\n\t"\ + "fmla %3.4s,v0.4s,v5.s[3]; ldr x0,[%26,#-8]\n\t"\ + "fmla %8.4s,v2.4s,v5.s[0]; fmla %16.4s,v4.4s,v5.s[0]\n\t"\ + "fmov v7.d[1],x0; ldr d6,[%26]\n\t"\ + "fmla %17.4s,v4.4s,v5.s[1]; ldr x0,[%26,#8]\n\t"\ + "fmla %18.4s,v4.4s,v5.s[2]; fmla %20.4s,v4.4s,v7.s[0]\n\t"\ + "fmov v6.d[1],x0; ldr d1,[%25]\n\t"\ + "fmla %21.4s,v4.4s,v7.s[1]; ldr x0,[%25,#8]\n\t"\ + "fmla %22.4s,v4.4s,v7.s[2]; fmla 
%23.4s,v4.4s,v7.s[3]\n\t"\ + "fmov v1.d[1],x0; ldr d3,[%25,#16]\n\t"\ + "fmla %19.4s,v4.4s,v5.s[3]; ldr x0,[%25,#24]\n\t"\ + "fmla %4.4s,v0.4s,v7.s[0]; fmla %5.4s,v0.4s,v7.s[1]\n\t"\ + "fmov v3.d[1],x0; ldr d4,[%25,#32]\n\t"\ + "fmla %6.4s,v0.4s,v7.s[2]; ldr x0,[%25,#40]\n\t"\ + "fmla %7.4s,v0.4s,v7.s[3]; fmla %12.4s,v2.4s,v7.s[0]\n\t"\ + "fmov v4.d[1],x0\n\t"\ + "fmla %13.4s,v2.4s,v7.s[1]\n\t"\ + "fmla %14.4s,v2.4s,v7.s[2]; fmla %15.4s,v2.4s,v7.s[3]\n\t"\ + "ldr d7,[%26,#16]\n\t"\ + "fmla %9.4s,v2.4s,v5.s[1]; ldr x0,[%26,#24]\n\t"\ + "fmla %10.4s,v2.4s,v5.s[2]; fmla %11.4s,v2.4s,v5.s[3]\n\t"\ + "fmov v7.d[1],x0\n\t"\ + "fmla %0.4s,v1.4s,v6.s[0]\n\t"\ + "fmla %1.4s,v1.4s,v6.s[1]; fmla %2.4s,v1.4s,v6.s[2]\n\t"\ + "fmla %3.4s,v1.4s,v6.s[3]\n\t"\ + "fmla %4.4s,v1.4s,v7.s[0]; fmla %5.4s,v1.4s,v7.s[1]\n\t"\ + "add %25,%25,#48\n\t"\ + "fmla %6.4s,v1.4s,v7.s[2]\n\t"\ + "fmla %7.4s,v1.4s,v7.s[3]; fmla %8.4s,v3.4s,v6.s[0]\n\t"\ + "add %26,%26,#32\n\t"\ + "fmla %9.4s,v3.4s,v6.s[1]\n\t"\ + "fmla %10.4s,v3.4s,v6.s[2]; fmla %11.4s,v3.4s,v6.s[3]\n\t"\ + "fmla %12.4s,v3.4s,v7.s[0]\n\t"\ + "fmla %13.4s,v3.4s,v7.s[1]; fmla %14.4s,v3.4s,v7.s[2]\n\t"\ + "fmla %15.4s,v3.4s,v7.s[3]\n\t"\ + "fmla %16.4s,v4.4s,v6.s[0]; fmla %17.4s,v4.4s,v6.s[1]\n\t"\ + "fmla %18.4s,v4.4s,v6.s[2]\n\t"\ + "fmla %19.4s,v4.4s,v6.s[3]; fmla %20.4s,v4.4s,v7.s[0]\n\t"\ + "fmla %21.4s,v4.4s,v7.s[1]\n\t"\ + "fmla %22.4s,v4.4s,v7.s[2]; fmla %23.4s,v4.4s,v7.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_TAIL1_A53 \ + "fmov v2.d[1],x0; ldr d4,[%25,#-16]\n\t"\ + "fmla %0.4s,v0.4s,v5.s[0]; ldr x0,[%25,#-8]\n\t"\ + "fmla %1.4s,v0.4s,v5.s[1]; fmla %2.4s,v0.4s,v5.s[2]\n\t"\ + "fmov v4.d[1],x0; ldr d7,[%26,#-16]\n\t"\ + "fmla %3.4s,v0.4s,v5.s[3]; ldr x0,[%26,#-8]\n\t"\ + "fmla %8.4s,v2.4s,v5.s[0]; fmla %16.4s,v4.4s,v5.s[0]\n\t"\ + "fmov v7.d[1],x0\n\t"\ + "fmla %17.4s,v4.4s,v5.s[1]\n\t"\ + "fmla %18.4s,v4.4s,v5.s[2]; fmla %20.4s,v4.4s,v7.s[0]\n\t"\ + "fmla %21.4s,v4.4s,v7.s[1]\n\t"\ + "fmla %22.4s,v4.4s,v7.s[2]; fmla %23.4s,v4.4s,v7.s[3]\n\t"\ + "fmla %19.4s,v4.4s,v5.s[3]\n\t"\ + "fmla %4.4s,v0.4s,v7.s[0]; fmla %5.4s,v0.4s,v7.s[1]\n\t"\ + "fmla %6.4s,v0.4s,v7.s[2]\n\t"\ + "fmla %7.4s,v0.4s,v7.s[3]; fmla %12.4s,v2.4s,v7.s[0]\n\t"\ + "fmla %13.4s,v2.4s,v7.s[1]\n\t"\ + "fmla %14.4s,v2.4s,v7.s[2]; fmla %15.4s,v2.4s,v7.s[3]\n\t"\ + "fmla %9.4s,v2.4s,v5.s[1]\n\t"\ + "fmla %10.4s,v2.4s,v5.s[2]; fmla %11.4s,v2.4s,v5.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_PRELOAD_A55 \ + "ldr q4,[%26]; ldr q5,[%26,#16]; add %26,%26,#32\n\t"\ + "ldr q0,[%25]; ldr d1,[%25,#16]; ldr x1,[%25,#24]; add %25,%25,#48\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_MAIN2_A55 \ + "fmla %0.4s,v0.4s,v4.s[0]; ldr d6,[%26]\n\t"\ + "fmla %1.4s,v0.4s,v4.s[1]; ldr x0,[%26,#8]\n\t"\ + "fmla %2.4s,v0.4s,v4.s[2]\n\t"\ + "fmla %3.4s,v0.4s,v4.s[3]; fmov v1.d[1],x1\n\t"\ + "fmla %4.4s,v0.4s,v5.s[0]; ldr d2,[%25,#-16]\n\t"\ + "fmla %5.4s,v0.4s,v5.s[1]; ldr x1,[%25,#-8]\n\t"\ + "fmla %6.4s,v0.4s,v5.s[2]\n\t"\ + "fmla %7.4s,v0.4s,v5.s[3]; fmov v6.d[1],x0\n\t"\ + "fmla %8.4s,v1.4s,v4.s[0]; ldr d7,[%26,#16]\n\t"\ + "fmla %9.4s,v1.4s,v4.s[1]; ldr x0,[%26,#24]\n\t"\ + "fmla %10.4s,v1.4s,v4.s[2]\n\t"\ + "fmla %11.4s,v1.4s,v4.s[3]; fmov v2.d[1],x1\n\t"\ + "fmla %12.4s,v1.4s,v5.s[0]; ldr d0,[%25]\n\t"\ + "fmla %13.4s,v1.4s,v5.s[1]; ldr x1,[%25,#8]\n\t"\ + "fmla %14.4s,v1.4s,v5.s[2]\n\t"\ + "fmla %15.4s,v1.4s,v5.s[3]; fmov v7.d[1],x0\n\t"\ + "fmla %16.4s,v2.4s,v4.s[0]; ldr d1,[%25,#16]\n\t"\ + "fmla %17.4s,v2.4s,v4.s[1]; ldr x0,[%25,#24]\n\t"\ + "fmla %18.4s,v2.4s,v4.s[2]\n\t"\ + "fmla %19.4s,v2.4s,v4.s[3]; 
fmov v0.d[1],x1\n\t"\ + "fmla %20.4s,v2.4s,v5.s[0]; add %25,%25,#96\n\t"\ + "fmla %21.4s,v2.4s,v5.s[1]; add %26,%26,#64\n\t"\ + "fmla %22.4s,v2.4s,v5.s[2]\n\t"\ + "fmla %23.4s,v2.4s,v5.s[3]\n\t"\ + "fmla %0.4s,v0.4s,v6.s[0]; ldr d4,[%26,#-32]\n\t"\ + "fmla %1.4s,v0.4s,v6.s[1]; ldr x1,[%26,#-24]\n\t"\ + "fmla %2.4s,v0.4s,v6.s[2]\n\t"\ + "fmla %3.4s,v0.4s,v6.s[3]; fmov v1.d[1],x0\n\t"\ + "fmla %4.4s,v0.4s,v7.s[0]; ldr d2,[%25,#-64]\n\t"\ + "fmla %5.4s,v0.4s,v7.s[1]; ldr x0,[%25,#-56]\n\t"\ + "fmla %6.4s,v0.4s,v7.s[2]\n\t"\ + "fmla %7.4s,v0.4s,v7.s[3]; fmov v4.d[1],x1\n\t"\ + "fmla %8.4s,v1.4s,v6.s[0]; ldr d5,[%26,#-16]\n\t"\ + "fmla %9.4s,v1.4s,v6.s[1]; ldr x1,[%26,#-8]\n\t"\ + "fmla %10.4s,v1.4s,v6.s[2]\n\t"\ + "fmla %11.4s,v1.4s,v6.s[3]; fmov v2.d[1],x0\n\t"\ + "fmla %12.4s,v1.4s,v7.s[0]; ldr d0,[%25,#-48]\n\t"\ + "fmla %13.4s,v1.4s,v7.s[1]; ldr x0,[%25,#-40]\n\t"\ + "fmla %14.4s,v1.4s,v7.s[2]\n\t"\ + "fmla %15.4s,v1.4s,v7.s[3]; fmov v5.d[1],x1\n\t"\ + "fmla %16.4s,v2.4s,v6.s[0]; ldr d1,[%25,#-32]\n\t"\ + "fmla %17.4s,v2.4s,v6.s[1]; ldr x1,[%25,#-24]\n\t"\ + "fmla %18.4s,v2.4s,v6.s[2]\n\t"\ + "fmla %19.4s,v2.4s,v6.s[3]; fmov v0.d[1],x0\n\t"\ + "fmla %20.4s,v2.4s,v7.s[0]\n\t"\ + "fmla %21.4s,v2.4s,v7.s[1]; sub %w24,%w24,#2\n\t"\ + "fmla %22.4s,v2.4s,v7.s[2]; cmp %w24,#2\n\t"\ + "fmla %23.4s,v2.4s,v7.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_TAIL2_A55 \ + "fmla %0.4s,v0.4s,v4.s[0]; ldr d6,[%26]\n\t"\ + "fmla %1.4s,v0.4s,v4.s[1]; ldr x0,[%26,#8]\n\t"\ + "fmla %2.4s,v0.4s,v4.s[2]\n\t"\ + "fmla %3.4s,v0.4s,v4.s[3]; fmov v1.d[1],x1\n\t"\ + "fmla %4.4s,v0.4s,v5.s[0]; ldr d2,[%25,#-16]\n\t"\ + "fmla %5.4s,v0.4s,v5.s[1]; ldr x1,[%25,#-8]\n\t"\ + "fmla %6.4s,v0.4s,v5.s[2]\n\t"\ + "fmla %7.4s,v0.4s,v5.s[3]; fmov v6.d[1],x0\n\t"\ + "fmla %8.4s,v1.4s,v4.s[0]; ldr d7,[%26,#16]\n\t"\ + "fmla %9.4s,v1.4s,v4.s[1]; ldr x0,[%26,#24]\n\t"\ + "fmla %10.4s,v1.4s,v4.s[2]\n\t"\ + "fmla %11.4s,v1.4s,v4.s[3]; fmov v2.d[1],x1\n\t"\ + "fmla %12.4s,v1.4s,v5.s[0]; ldr d0,[%25]\n\t"\ + "fmla %13.4s,v1.4s,v5.s[1]; ldr x1,[%25,#8]\n\t"\ + "fmla %14.4s,v1.4s,v5.s[2]\n\t"\ + "fmla %15.4s,v1.4s,v5.s[3]; fmov v7.d[1],x0\n\t"\ + "fmla %16.4s,v2.4s,v4.s[0]; ldr d1,[%25,#16]\n\t"\ + "fmla %17.4s,v2.4s,v4.s[1]; ldr x0,[%25,#24]\n\t"\ + "fmla %18.4s,v2.4s,v4.s[2]\n\t"\ + "fmla %19.4s,v2.4s,v4.s[3]; fmov v0.d[1],x1\n\t"\ + "fmla %20.4s,v2.4s,v5.s[0]; add %25,%25,#48\n\t"\ + "fmla %21.4s,v2.4s,v5.s[1]; add %26,%26,#32\n\t"\ + "fmla %22.4s,v2.4s,v5.s[2]\n\t"\ + "fmla %23.4s,v2.4s,v5.s[3]\n\t"\ + "fmla %0.4s,v0.4s,v6.s[0]\n\t"\ + "fmla %1.4s,v0.4s,v6.s[1]\n\t"\ + "fmla %2.4s,v0.4s,v6.s[2]\n\t"\ + "fmla %3.4s,v0.4s,v6.s[3]; fmov v1.d[1],x0\n\t"\ + "fmla %4.4s,v0.4s,v7.s[0]; ldr d2,[%25,#-16]\n\t"\ + "fmla %5.4s,v0.4s,v7.s[1]; ldr x0,[%25,#-8]\n\t"\ + "fmla %6.4s,v0.4s,v7.s[2]\n\t"\ + "fmla %7.4s,v0.4s,v7.s[3]\n\t"\ + "fmla %8.4s,v1.4s,v6.s[0]\n\t"\ + "fmla %9.4s,v1.4s,v6.s[1]\n\t"\ + "fmla %10.4s,v1.4s,v6.s[2]\n\t"\ + "fmla %11.4s,v1.4s,v6.s[3]; fmov v2.d[1],x0\n\t"\ + "fmla %12.4s,v1.4s,v7.s[0]\n\t"\ + "fmla %13.4s,v1.4s,v7.s[1]\n\t"\ + "fmla %14.4s,v1.4s,v7.s[2]\n\t"\ + "fmla %15.4s,v1.4s,v7.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v6.s[0]\n\t"\ + "fmla %17.4s,v2.4s,v6.s[1]\n\t"\ + "fmla %18.4s,v2.4s,v6.s[2]\n\t"\ + "fmla %19.4s,v2.4s,v6.s[3]\n\t"\ + "fmla %20.4s,v2.4s,v7.s[0]\n\t"\ + "fmla %21.4s,v2.4s,v7.s[1]\n\t"\ + "fmla %22.4s,v2.4s,v7.s[2]\n\t"\ + "fmla %23.4s,v2.4s,v7.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_TAIL1_A55 \ + "fmla %0.4s,v0.4s,v4.s[0]\n\t"\ + "fmla %1.4s,v0.4s,v4.s[1]\n\t"\ + "fmla %2.4s,v0.4s,v4.s[2]\n\t"\ + "fmla 
%3.4s,v0.4s,v4.s[3]; fmov v1.d[1],x1\n\t"\ + "fmla %4.4s,v0.4s,v5.s[0]; ldr d2,[%25,#-16]\n\t"\ + "fmla %5.4s,v0.4s,v5.s[1]; ldr x1,[%25,#-8]\n\t"\ + "fmla %6.4s,v0.4s,v5.s[2]\n\t"\ + "fmla %7.4s,v0.4s,v5.s[3]\n\t"\ + "fmla %8.4s,v1.4s,v4.s[0]\n\t"\ + "fmla %9.4s,v1.4s,v4.s[1]\n\t"\ + "fmla %10.4s,v1.4s,v4.s[2]\n\t"\ + "fmla %11.4s,v1.4s,v4.s[3]; fmov v2.d[1],x1\n\t"\ + "fmla %12.4s,v1.4s,v5.s[0]\n\t"\ + "fmla %13.4s,v1.4s,v5.s[1]\n\t"\ + "fmla %14.4s,v1.4s,v5.s[2]\n\t"\ + "fmla %15.4s,v1.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v4.s[0]\n\t"\ + "fmla %17.4s,v2.4s,v4.s[1]\n\t"\ + "fmla %18.4s,v2.4s,v4.s[2]\n\t"\ + "fmla %19.4s,v2.4s,v4.s[3]\n\t"\ + "fmla %20.4s,v2.4s,v5.s[0]\n\t"\ + "fmla %21.4s,v2.4s,v5.s[1]\n\t"\ + "fmla %22.4s,v2.4s,v5.s[2]\n\t"\ + "fmla %23.4s,v2.4s,v5.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_PRELOAD_A72 \ + "ldr q0,[%25]; ldr q1,[%25,#16]; add %25,%25,#48\n\t"\ + "ldr q4,[%26]; ldr q5,[%26,#16]; add %26,%26,#32\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_MAIN2_A72 \ + "fmla %0.4s,v0.4s,v4.s[0]; fmla %1.4s,v0.4s,v4.s[1]; ldr q2,[%25,#-16]\n\t"\ + "fmla %2.4s,v0.4s,v4.s[2]; fmla %3.4s,v0.4s,v4.s[3]\n\t"\ + "fmla %4.4s,v0.4s,v5.s[0]; fmla %5.4s,v0.4s,v5.s[1]; ldr q6,[%26],#64\n\t"\ + "fmla %6.4s,v0.4s,v5.s[2]; fmla %7.4s,v0.4s,v5.s[3]\n\t"\ + "fmla %8.4s,v1.4s,v4.s[0]; fmla %9.4s,v1.4s,v4.s[1]; ldr q0,[%25],#96\n\t"\ + "fmla %10.4s,v1.4s,v4.s[2]; fmla %11.4s,v1.4s,v4.s[3]\n\t"\ + "fmla %12.4s,v1.4s,v5.s[0]; fmla %13.4s,v1.4s,v5.s[1]; ldr q7,[%26,#-48]\n\t"\ + "fmla %14.4s,v1.4s,v5.s[2]; fmla %15.4s,v1.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v4.s[0]; fmla %17.4s,v2.4s,v4.s[1]; ldr q1,[%25,#-80]\n\t"\ + "fmla %18.4s,v2.4s,v4.s[2]; fmla %19.4s,v2.4s,v4.s[3]\n\t"\ + "fmla %20.4s,v2.4s,v5.s[0]; fmla %21.4s,v2.4s,v5.s[1]; sub %w24,%w24,#2\n\t"\ + "fmla %22.4s,v2.4s,v5.s[2]; fmla %23.4s,v2.4s,v5.s[3]\n\t"\ + "fmla %0.4s,v0.4s,v6.s[0]; fmla %1.4s,v0.4s,v6.s[1]; ldr q2,[%25,#-64]\n\t"\ + "fmla %2.4s,v0.4s,v6.s[2]; fmla %3.4s,v0.4s,v6.s[3]\n\t"\ + "fmla %4.4s,v0.4s,v7.s[0]; fmla %5.4s,v0.4s,v7.s[1]; ldr q4,[%26,#-32]\n\t"\ + "fmla %6.4s,v0.4s,v7.s[2]; fmla %7.4s,v0.4s,v7.s[3]\n\t"\ + "fmla %8.4s,v1.4s,v6.s[0]; fmla %9.4s,v1.4s,v6.s[1]; ldr q0,[%25,#-48]\n\t"\ + "fmla %10.4s,v1.4s,v6.s[2]; fmla %11.4s,v1.4s,v6.s[3]\n\t"\ + "fmla %12.4s,v1.4s,v7.s[0]; fmla %13.4s,v1.4s,v7.s[1]; ldr q5,[%26,#-16]\n\t"\ + "fmla %14.4s,v1.4s,v7.s[2]; fmla %15.4s,v1.4s,v7.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v6.s[0]; fmla %17.4s,v2.4s,v6.s[1]; ldr q1,[%25,#-32]\n\t"\ + "fmla %18.4s,v2.4s,v6.s[2]; fmla %19.4s,v2.4s,v6.s[3]\n\t"\ + "fmla %20.4s,v2.4s,v7.s[0]; fmla %21.4s,v2.4s,v7.s[1]; cmp %w24,#2\n\t"\ + "fmla %22.4s,v2.4s,v7.s[2]; fmla %23.4s,v2.4s,v7.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_TAIL2_A72 \ + "fmla %0.4s,v0.4s,v4.s[0]; fmla %1.4s,v0.4s,v4.s[1]; ldr q2,[%25,#-16]\n\t"\ + "fmla %2.4s,v0.4s,v4.s[2]; fmla %3.4s,v0.4s,v4.s[3]\n\t"\ + "fmla %4.4s,v0.4s,v5.s[0]; fmla %5.4s,v0.4s,v5.s[1]; ldr q6,[%26],#32\n\t"\ + "fmla %6.4s,v0.4s,v5.s[2]; fmla %7.4s,v0.4s,v5.s[3]\n\t"\ + "fmla %8.4s,v1.4s,v4.s[0]; fmla %9.4s,v1.4s,v4.s[1]; ldr q0,[%25],#48\n\t"\ + "fmla %10.4s,v1.4s,v4.s[2]; fmla %11.4s,v1.4s,v4.s[3]\n\t"\ + "fmla %12.4s,v1.4s,v5.s[0]; fmla %13.4s,v1.4s,v5.s[1]; ldr q7,[%26,#-16]\n\t"\ + "fmla %14.4s,v1.4s,v5.s[2]; fmla %15.4s,v1.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v4.s[0]; fmla %17.4s,v2.4s,v4.s[1]; ldr q1,[%25,#-32]\n\t"\ + "fmla %18.4s,v2.4s,v4.s[2]; fmla %19.4s,v2.4s,v4.s[3]\n\t"\ + "fmla %20.4s,v2.4s,v5.s[0]; fmla %21.4s,v2.4s,v5.s[1]\n\t"\ + "fmla %22.4s,v2.4s,v5.s[2]; fmla 
%23.4s,v2.4s,v5.s[3]\n\t"\ + "fmla %0.4s,v0.4s,v6.s[0]; fmla %1.4s,v0.4s,v6.s[1]; ldr q2,[%25,#-16]\n\t"\ + "fmla %2.4s,v0.4s,v6.s[2]; fmla %3.4s,v0.4s,v6.s[3]\n\t"\ + "fmla %4.4s,v0.4s,v7.s[0]; fmla %5.4s,v0.4s,v7.s[1]\n\t"\ + "fmla %6.4s,v0.4s,v7.s[2]; fmla %7.4s,v0.4s,v7.s[3]\n\t"\ + "fmla %8.4s,v1.4s,v6.s[0]; fmla %9.4s,v1.4s,v6.s[1]\n\t"\ + "fmla %10.4s,v1.4s,v6.s[2]; fmla %11.4s,v1.4s,v6.s[3]\n\t"\ + "fmla %12.4s,v1.4s,v7.s[0]; fmla %13.4s,v1.4s,v7.s[1]\n\t"\ + "fmla %14.4s,v1.4s,v7.s[2]; fmla %15.4s,v1.4s,v7.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v6.s[0]; fmla %17.4s,v2.4s,v6.s[1]\n\t"\ + "fmla %18.4s,v2.4s,v6.s[2]; fmla %19.4s,v2.4s,v6.s[3]\n\t"\ + "fmla %20.4s,v2.4s,v7.s[0]; fmla %21.4s,v2.4s,v7.s[1]\n\t"\ + "fmla %22.4s,v2.4s,v7.s[2]; fmla %23.4s,v2.4s,v7.s[3]\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_TAIL1_A72 \ + "fmla %0.4s,v0.4s,v4.s[0]; fmla %1.4s,v0.4s,v4.s[1]; ldr q2,[%25,#-16]\n\t"\ + "fmla %2.4s,v0.4s,v4.s[2]; fmla %3.4s,v0.4s,v4.s[3]\n\t"\ + "fmla %4.4s,v0.4s,v5.s[0]; fmla %5.4s,v0.4s,v5.s[1]\n\t"\ + "fmla %6.4s,v0.4s,v5.s[2]; fmla %7.4s,v0.4s,v5.s[3]\n\t"\ + "fmla %8.4s,v1.4s,v4.s[0]; fmla %9.4s,v1.4s,v4.s[1]\n\t"\ + "fmla %10.4s,v1.4s,v4.s[2]; fmla %11.4s,v1.4s,v4.s[3]\n\t"\ + "fmla %12.4s,v1.4s,v5.s[0]; fmla %13.4s,v1.4s,v5.s[1]\n\t"\ + "fmla %14.4s,v1.4s,v5.s[2]; fmla %15.4s,v1.4s,v5.s[3]\n\t"\ + "fmla %16.4s,v2.4s,v4.s[0]; fmla %17.4s,v2.4s,v4.s[1]\n\t"\ + "fmla %18.4s,v2.4s,v4.s[2]; fmla %19.4s,v2.4s,v4.s[3]\n\t"\ + "fmla %20.4s,v2.4s,v5.s[0]; fmla %21.4s,v2.4s,v5.s[1]\n\t"\ + "fmla %22.4s,v2.4s,v5.s[2]; fmla %23.4s,v2.4s,v5.s[3]\n\t" + +#define NEON_SGEMM_SAVE_M12N2_UNIT(cq1, cq2, cq3, cq4, cq5, cq6) \ + ct1 = vld1q_f32(c_tmp1);\ + ct2 = vld1q_f32(c_tmp1 + 4);\ + ct3 = vld1q_f32(c_tmp1 + 8);\ + ct4 = vld1q_f32(c_tmp2);\ + ct5 = vld1q_f32(c_tmp2 + 4);\ + ct6 = vld1q_f32(c_tmp2 + 8);\ + cq1 = vfmaq_n_f32(cq1, ct1, beta); cq2 = vfmaq_n_f32(cq2, ct2, beta);\ + cq3 = vfmaq_n_f32(cq3, ct3, beta); cq4 = vfmaq_n_f32(cq4, ct4, beta);\ + cq5 = vfmaq_n_f32(cq5, ct5, beta); cq6 = vfmaq_n_f32(cq6, ct6, beta);\ + vst1q_f32(c_tmp1, cq1);\ + vst1q_f32(c_tmp1 + 4, cq2);\ + vst1q_f32(c_tmp1 + 8, cq3); c_tmp1 += ldc2;\ + vst1q_f32(c_tmp2, cq4);\ + vst1q_f32(c_tmp2 + 4, cq5);\ + vst1q_f32(c_tmp2 + 8, cq6); c_tmp2 += ldc2; + +#define NEON_SGEMM_SAVE_M12N8_ASM1 \ + float *c_tmp1 = c_ptr;\ + float *c_tmp2 = c_ptr + ldc;\ + uint32_t ldc2 = ldc * 2;\ + float32x4_t ct1, ct2, ct3, ct4, ct5, ct6;\ + NEON_SGEMM_SAVE_M12N2_UNIT(cq01, cq09, cq17, cq02, cq10, cq18)\ + NEON_SGEMM_SAVE_M12N2_UNIT(cq03, cq11, cq19, cq04, cq12, cq20)\ + NEON_SGEMM_SAVE_M12N2_UNIT(cq05, cq13, cq21, cq06, cq14, cq22)\ + NEON_SGEMM_SAVE_M12N2_UNIT(cq07, cq15, cq23, cq08, cq16, cq24) + +#define PREF_C_1_LANE(n, mdim) \ + pref_c(c_pref); pref_c(c_pref + mdim - 1); c_pref += ldc; +#define PREF_C(mdim, ndim) \ + MACRO_EXPANSION_##ndim(VOID_BASE, PREF_C_1_LANE, mdim) + +#define NEON_SGEMM_COMPUTE_ASM1(mdim, ndim, cputype) \ + float *c_pref = c_ptr; PREF_C(mdim, ndim)\ + const float *b_ptr = b_head;\ + const float *a_ptr = a_head;\ + uint32_t k_left = K;\ + float32x4_t cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + float32x4_t cq09, cq10, cq11, cq12, cq13, cq14, cq15, cq16;\ + float32x4_t cq17, cq18, cq19, cq20, cq21, cq22, cq23, cq24;\ + __asm__ __volatile__ (\ + "movi %0.16b,#0; movi %1.16b,#0\n\t"\ + "mov %2.16b,%0.16b; mov %3.16b,%1.16b\n\t"\ + "mov %4.16b,%0.16b; mov %5.16b,%1.16b\n\t"\ + "mov %6.16b,%0.16b; mov %7.16b,%1.16b\n\t"\ + "mov %8.16b,%0.16b; mov %9.16b,%1.16b\n\t"\ + "mov %10.16b,%0.16b; mov %11.16b,%1.16b\n\t"\ 
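+ /* this movi/mov chain zeroes all 24 accumulators (%0-%23) before the */\
+ /* software-pipelined K loop below (labels 1: through 4:). */\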
+ "mov %12.16b,%0.16b; mov %13.16b,%1.16b\n\t"\ + "mov %14.16b,%0.16b; mov %15.16b,%1.16b\n\t"\ + "mov %16.16b,%0.16b; mov %17.16b,%1.16b\n\t"\ + "mov %18.16b,%0.16b; mov %19.16b,%1.16b\n\t"\ + "mov %20.16b,%0.16b; mov %21.16b,%1.16b\n\t"\ + "mov %22.16b,%0.16b; mov %23.16b,%1.16b\n\t"\ + "cmp %w24,#0; b.eq 4f\n\t"\ + NEON_SGEMM_KERNEL_M##mdim##N##ndim##_PRELOAD_##cputype\ + "cmp %w24,#2; b.le 2f\n\t"\ + ".balign 16\n\t"\ + "1:\n\t"\ + NEON_SGEMM_KERNEL_M##mdim##N##ndim##_MAIN2_##cputype "b.gt 1b\n\t"\ + "2:\n\t"\ + "cmp %w24,#2; b.ne 3f\n\t"\ + NEON_SGEMM_KERNEL_M##mdim##N##ndim##_TAIL2_##cputype "b 4f\n\t"\ + "3:\n\t"\ + NEON_SGEMM_KERNEL_M##mdim##N##ndim##_TAIL1_##cputype\ + "4:\n\t"\ + :"=w"(cq01),"=w"(cq02),"=w"(cq03),"=w"(cq04),"=w"(cq05),"=w"(cq06),\ + "=w"(cq07),"=w"(cq08),"=w"(cq09),"=w"(cq10),"=w"(cq11),"=w"(cq12),\ + "=w"(cq13),"=w"(cq14),"=w"(cq15),"=w"(cq16),"=w"(cq17),"=w"(cq18),\ + "=w"(cq19),"=w"(cq20),"=w"(cq21),"=w"(cq22),"=w"(cq23),"=w"(cq24),\ + "+r"(k_left),"+r"(a_ptr),"+r"(b_ptr)\ + ::"cc","memory","x0","x1","v0","v1","v2","v3","v4","v5","v6","v7");\ + NEON_SGEMM_SAVE_M##mdim##N##ndim##_ASM1 + +#define NEON_SGEMM_KERNEL_M12N8_HALF_PRELOAD_A35 \ + "ld1r {v0.2s},[%25],#4\n\t"\ + "ldr d4,[%26]; ldr d5,[%26,#8]; ldr d6,[%26,#16]; add %26,%26,#32\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_HALF_MAIN2_A35 \ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %0.2s,v0.2s,v4.2s; fmla %1.2s,v0.2s,v5.2s; fmla %2.2s,v0.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %4.2s,v1.2s,v4.2s; fmla %5.2s,v1.2s,v5.2s; fmla %6.2s,v1.2s,v6.2s\n\t"\ + "ldr d7,[%26,#-8]\n\t"\ + "fmla %8.2s,v2.2s,v4.2s; fmla %9.2s,v2.2s,v5.2s; fmla %10.2s,v2.2s,v6.2s\n\t"\ + "ld1r {v3.2s},[%25],#4\n\t"\ + "fmla %3.2s,v0.2s,v7.2s; fmla %7.2s,v1.2s,v7.2s; fmla %11.2s,v2.2s,v7.2s\n\t"\ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %12.2s,v3.2s,v4.2s; fmla %13.2s,v3.2s,v5.2s; fmla %14.2s,v3.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %16.2s,v1.2s,v4.2s; add %25,%25,#24\n\t"\ + "fmla %17.2s,v1.2s,v5.2s; fmla %18.2s,v1.2s,v6.2s\n\t"\ + "ld1r {v0.2s},[%25],#4\n\t"\ + "fmla %20.2s,v2.2s,v4.2s; fmla %21.2s,v2.2s,v5.2s; fmla %22.2s,v2.2s,v6.2s\n\t"\ + "ldr d4,[%26]; ldr d5,[%26,#8]; ldr d6,[%26,#16]\n\t"\ + "fmla %15.2s,v3.2s,v7.2s; add %26,%26,#64\n\t"\ + "fmla %19.2s,v1.2s,v7.2s\n\t"\ + "fmla %23.2s,v2.2s,v7.2s\n\t"\ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %0.2s,v0.2s,v4.2s; fmla %1.2s,v0.2s,v5.2s; fmla %2.2s,v0.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %4.2s,v1.2s,v4.2s; fmla %5.2s,v1.2s,v5.2s; fmla %6.2s,v1.2s,v6.2s\n\t"\ + "ldr d7,[%26,#-40]\n\t"\ + "fmla %8.2s,v2.2s,v4.2s; fmla %9.2s,v2.2s,v5.2s; fmla %10.2s,v2.2s,v6.2s\n\t"\ + "ld1r {v3.2s},[%25],#4\n\t"\ + "fmla %3.2s,v0.2s,v7.2s; fmla %7.2s,v1.2s,v7.2s; fmla %11.2s,v2.2s,v7.2s\n\t"\ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %12.2s,v3.2s,v4.2s; fmla %13.2s,v3.2s,v5.2s; fmla %14.2s,v3.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %16.2s,v1.2s,v4.2s; add %25,%25,#24\n\t"\ + "fmla %17.2s,v1.2s,v5.2s; fmla %18.2s,v1.2s,v6.2s\n\t"\ + "ld1r {v0.2s},[%25],#4\n\t"\ + "fmla %20.2s,v2.2s,v4.2s; fmla %21.2s,v2.2s,v5.2s; fmla %22.2s,v2.2s,v6.2s\n\t"\ + "ldr d4,[%26,#-32]; ldr d5,[%26,#-24]; ldr d6,[%26,#-16]\n\t"\ + "fmla %15.2s,v3.2s,v7.2s; sub %w24,%w24,#2\n\t"\ + "fmla %19.2s,v1.2s,v7.2s; cmp %w24,#2\n\t"\ + "fmla %23.2s,v2.2s,v7.2s\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_HALF_TAIL2_A35 \ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %0.2s,v0.2s,v4.2s; fmla %1.2s,v0.2s,v5.2s; fmla %2.2s,v0.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %4.2s,v1.2s,v4.2s; fmla 
%5.2s,v1.2s,v5.2s; fmla %6.2s,v1.2s,v6.2s\n\t"\ + "ldr d7,[%26,#-8]\n\t"\ + "fmla %8.2s,v2.2s,v4.2s; fmla %9.2s,v2.2s,v5.2s; fmla %10.2s,v2.2s,v6.2s\n\t"\ + "ld1r {v3.2s},[%25],#4\n\t"\ + "fmla %3.2s,v0.2s,v7.2s; fmla %7.2s,v1.2s,v7.2s; fmla %11.2s,v2.2s,v7.2s\n\t"\ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %12.2s,v3.2s,v4.2s; fmla %13.2s,v3.2s,v5.2s; fmla %14.2s,v3.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %16.2s,v1.2s,v4.2s; add %25,%25,#24\n\t"\ + "fmla %17.2s,v1.2s,v5.2s; fmla %18.2s,v1.2s,v6.2s\n\t"\ + "ld1r {v0.2s},[%25],#4\n\t"\ + "fmla %20.2s,v2.2s,v4.2s; fmla %21.2s,v2.2s,v5.2s; fmla %22.2s,v2.2s,v6.2s\n\t"\ + "ldr d4,[%26]; ldr d5,[%26,#8]; ldr d6,[%26,#16]\n\t"\ + "fmla %15.2s,v3.2s,v7.2s; add %26,%26,#32\n\t"\ + "fmla %19.2s,v1.2s,v7.2s\n\t"\ + "fmla %23.2s,v2.2s,v7.2s\n\t"\ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %0.2s,v0.2s,v4.2s; fmla %1.2s,v0.2s,v5.2s; fmla %2.2s,v0.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %4.2s,v1.2s,v4.2s; fmla %5.2s,v1.2s,v5.2s; fmla %6.2s,v1.2s,v6.2s\n\t"\ + "ldr d7,[%26,#-8]\n\t"\ + "fmla %8.2s,v2.2s,v4.2s; fmla %9.2s,v2.2s,v5.2s; fmla %10.2s,v2.2s,v6.2s\n\t"\ + "ld1r {v3.2s},[%25],#4\n\t"\ + "fmla %3.2s,v0.2s,v7.2s; fmla %7.2s,v1.2s,v7.2s; fmla %11.2s,v2.2s,v7.2s\n\t"\ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %12.2s,v3.2s,v4.2s; fmla %13.2s,v3.2s,v5.2s; fmla %14.2s,v3.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %16.2s,v1.2s,v4.2s; fmla %17.2s,v1.2s,v5.2s; fmla %18.2s,v1.2s,v6.2s\n\t"\ + "fmla %20.2s,v2.2s,v4.2s; fmla %21.2s,v2.2s,v5.2s; fmla %22.2s,v2.2s,v6.2s\n\t"\ + "fmla %15.2s,v3.2s,v7.2s; add %25,%25,#24\n\t"\ + "fmla %19.2s,v1.2s,v7.2s\n\t"\ + "fmla %23.2s,v2.2s,v7.2s\n\t" + +#define NEON_SGEMM_KERNEL_M12N8_HALF_TAIL1_A35 \ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %0.2s,v0.2s,v4.2s; fmla %1.2s,v0.2s,v5.2s; fmla %2.2s,v0.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %4.2s,v1.2s,v4.2s; fmla %5.2s,v1.2s,v5.2s; fmla %6.2s,v1.2s,v6.2s\n\t"\ + "ldr d7,[%26,#-8]\n\t"\ + "fmla %8.2s,v2.2s,v4.2s; fmla %9.2s,v2.2s,v5.2s; fmla %10.2s,v2.2s,v6.2s\n\t"\ + "ld1r {v3.2s},[%25],#4\n\t"\ + "fmla %3.2s,v0.2s,v7.2s; fmla %7.2s,v1.2s,v7.2s; fmla %11.2s,v2.2s,v7.2s\n\t"\ + "ld1r {v1.2s},[%25],#4\n\t"\ + "fmla %12.2s,v3.2s,v4.2s; fmla %13.2s,v3.2s,v5.2s; fmla %14.2s,v3.2s,v6.2s\n\t"\ + "ld1r {v2.2s},[%25],#4\n\t"\ + "fmla %16.2s,v1.2s,v4.2s; fmla %17.2s,v1.2s,v5.2s; fmla %18.2s,v1.2s,v6.2s\n\t"\ + "fmla %20.2s,v2.2s,v4.2s; fmla %21.2s,v2.2s,v5.2s; fmla %22.2s,v2.2s,v6.2s\n\t"\ + "fmla %15.2s,v3.2s,v7.2s; add %25,%25,#24\n\t"\ + "fmla %19.2s,v1.2s,v7.2s\n\t"\ + "fmla %23.2s,v2.2s,v7.2s\n\t" + +#define NEON_SGEMM_SAVE_M6N2_UNIT_A35(c1, c2, c3, c4, c5, c6) \ + ct1 = vzip1_f32(c1, c2); ct2 = vzip1_f32(c3, c4); ct3 = vzip1_f32(c5, c6);\ + ct4 = vld1_f32(c_tmp), ct5 = vld1_f32(c_tmp + 2); ct6 = vld1_f32(c_tmp + 4);\ + ct1 = vfma_f32(ct1, ct4, beta_d);\ + ct2 = vfma_f32(ct2, ct5, beta_d);\ + ct3 = vfma_f32(ct3, ct6, beta_d);\ + vst1_f32(c_tmp, ct1); vst1_f32(c_tmp + 2, ct2); vst1_f32(c_tmp + 4, ct3);\ + c_tmp += ldc;\ + ct1 = vzip2_f32(c1, c2); ct2 = vzip2_f32(c3, c4); ct3 = vzip2_f32(c5, c6);\ + ct4 = vld1_f32(c_tmp), ct5 = vld1_f32(c_tmp + 2); ct6 = vld1_f32(c_tmp + 4);\ + ct1 = vfma_f32(ct1, ct4, beta_d);\ + ct2 = vfma_f32(ct2, ct5, beta_d);\ + ct3 = vfma_f32(ct3, ct6, beta_d);\ + vst1_f32(c_tmp, ct1); vst1_f32(c_tmp + 2, ct2); vst1_f32(c_tmp + 4, ct3);\ + c_tmp += ldc; + +#define NEON_SGEMM_SAVE_M6N8_A35 \ + NEON_SGEMM_SAVE_M6N2_UNIT_A35(c01, c05, c09, c13, c17, c21)\ + NEON_SGEMM_SAVE_M6N2_UNIT_A35(c02, c06, c10, c14, c18, c22)\ + 
NEON_SGEMM_SAVE_M6N2_UNIT_A35(c03, c07, c11, c15, c19, c23)\ + NEON_SGEMM_SAVE_M6N2_UNIT_A35(c04, c08, c12, c16, c20, c24) + +#define NEON_SGEMM_SAVE_M8N1_UNIT_A35(c1, c2, c3, c4) \ + ct1 = vld1_f32(c_tmp); ct2 = vld1_f32(c_tmp + 2);\ + ct3 = vld1_f32(c_tmp + 4); ct4 = vld1_f32(c_tmp + 6);\ + c1 = vfma_f32(c1, ct1, beta_d); c2 = vfma_f32(c2, ct2, beta_d);\ + c3 = vfma_f32(c3, ct3, beta_d); c4 = vfma_f32(c4, ct4, beta_d);\ + vst1_f32(c_tmp, c1); vst1_f32(c_tmp + 2, c2);\ + vst1_f32(c_tmp + 4, c3); vst1_f32(c_tmp + 6, c4); c_tmp += ldc; + +#define NEON_SGEMM_SAVE_M8N6_A35 \ + NEON_SGEMM_SAVE_M8N1_UNIT_A35(c01, c02, c03, c04)\ + NEON_SGEMM_SAVE_M8N1_UNIT_A35(c05, c06, c07, c08)\ + NEON_SGEMM_SAVE_M8N1_UNIT_A35(c09, c10, c11, c12)\ + NEON_SGEMM_SAVE_M8N1_UNIT_A35(c13, c14, c15, c16)\ + NEON_SGEMM_SAVE_M8N1_UNIT_A35(c17, c18, c19, c20)\ + NEON_SGEMM_SAVE_M8N1_UNIT_A35(c21, c22, c23, c24) + +#define NEON_SGEMM_KERNEL_M12N8_HALF_A35(a_ptr, b_ptr) \ + k_left = K;\ + __asm__ __volatile__ (\ + "movi %0.8b,#0; movi %1.8b,#0\n\t"\ + "mov %2.8b,%0.8b; mov %3.8b,%1.8b\n\t"\ + "mov %4.8b,%0.8b; mov %5.8b,%1.8b\n\t"\ + "mov %6.8b,%0.8b; mov %7.8b,%1.8b\n\t"\ + "mov %8.8b,%0.8b; mov %9.8b,%1.8b\n\t"\ + "mov %10.8b,%0.8b; mov %11.8b,%1.8b\n\t"\ + "mov %12.8b,%0.8b; mov %13.8b,%1.8b\n\t"\ + "mov %14.8b,%0.8b; mov %15.8b,%1.8b\n\t"\ + "mov %16.8b,%0.8b; mov %17.8b,%1.8b\n\t"\ + "mov %18.8b,%0.8b; mov %19.8b,%1.8b\n\t"\ + "mov %20.8b,%0.8b; mov %21.8b,%1.8b\n\t"\ + "mov %22.8b,%0.8b; mov %23.8b,%1.8b\n\t"\ + "cmp %w24,#0; b.eq 4f\n\t"\ + NEON_SGEMM_KERNEL_M12N8_HALF_PRELOAD_A35\ + "cmp %w24,#2; b.le 2f\n\t"\ + ".balign 16\n\t"\ + "1:\n\t"\ + NEON_SGEMM_KERNEL_M12N8_HALF_MAIN2_A35 "b.gt 1b\n\t"\ + "2:\n\t"\ + "cmp %w24,#2; b.ne 3f\n\t"\ + NEON_SGEMM_KERNEL_M12N8_HALF_TAIL2_A35 "b 4f\n\t"\ + "3:\n\t"\ + NEON_SGEMM_KERNEL_M12N8_HALF_TAIL1_A35\ + "4:\n\t"\ + :"=w"(c01),"=w"(c02),"=w"(c03),"=w"(c04),"=w"(c05),"=w"(c06),\ + "=w"(c07),"=w"(c08),"=w"(c09),"=w"(c10),"=w"(c11),"=w"(c12),\ + "=w"(c13),"=w"(c14),"=w"(c15),"=w"(c16),"=w"(c17),"=w"(c18),\ + "=w"(c19),"=w"(c20),"=w"(c21),"=w"(c22),"=w"(c23),"=w"(c24),\ + "+r"(k_left),"+r"(a_ptr),"+r"(b_ptr)\ + ::"cc","memory","v0","v1","v2","v3","v4","v5","v6","v7"); + +#define NEON_SGEMM_COMPUTE_M8N12_A35 \ + uint32_t k_left;\ + float32x2_t c01, c02, c03, c04, c05, c06, c07, c08;\ + float32x2_t c09, c10, c11, c12, c13, c14, c15, c16;\ + float32x2_t c17, c18, c19, c20, c21, c22, c23, c24;\ + float *c_pref = c_ptr; PREF_C(8, 6)\ + const float *a_ptr = a_head;\ + const float *b_ptr = b_head;\ + NEON_SGEMM_KERNEL_M12N8_HALF_A35(b_ptr, a_ptr)\ + const float32x2_t beta_d = vdup_n_f32(beta);\ + float *c_tmp = c_ptr;\ + float32x2_t ct1, ct2, ct3, ct4;\ + NEON_SGEMM_SAVE_M8N6_A35\ + a_ptr = a_head; b_ptr = b_head + 6;\ + PREF_C(8, 6)\ + NEON_SGEMM_KERNEL_M12N8_HALF_A35(b_ptr, a_ptr)\ + NEON_SGEMM_SAVE_M8N6_A35 + +#define NEON_SGEMM_COMPUTE_M12N8_A35 \ + uint32_t k_left;\ + float32x2_t c01, c02, c03, c04, c05, c06, c07, c08;\ + float32x2_t c09, c10, c11, c12, c13, c14, c15, c16;\ + float32x2_t c17, c18, c19, c20, c21, c22, c23, c24;\ + float *c_pref = c_ptr; PREF_C(6, 8)\ + const float *a_ptr = a_head;\ + const float *b_ptr = b_head;\ + NEON_SGEMM_KERNEL_M12N8_HALF_A35(a_ptr, b_ptr)\ + const float32x2_t beta_d = vdup_n_f32(beta);\ + float *c_tmp = c_ptr;\ + float32x2_t ct1, ct2, ct3, ct4, ct5, ct6;\ + NEON_SGEMM_SAVE_M6N8_A35\ + c_tmp -= 8 * ldc;\ + c_tmp += 6;\ + c_pref = c_ptr + 6; PREF_C(6, 8)\ + b_ptr = b_head; a_ptr = a_head + 6;\ + NEON_SGEMM_KERNEL_M12N8_HALF_A35(a_ptr, 
b_ptr)\ + NEON_SGEMM_SAVE_M6N8_A35 + +#define CPUID_DETECT_MNK 1000000 + +void sgemm_kernel_lm_m8n12(uint32_t M, uint32_t N, uint32_t K, float beta, + const float * __restrict__ sa, const float * __restrict__ sb, + float * __restrict__ C, uint32_t ldc) { + uint32_t n_left = N; + const float *b_head = sb; + float *c_head = C; + uint32_t acc_mnk = CPUID_DETECT_MNK; + uint8_t cpuid = 0, cputype = 0; + for (; n_left > 11; n_left -= 12) { + if (acc_mnk >= CPUID_DETECT_MNK) { + cpuid = sched_getcpu(); + cputype = blas_arm_get_cpu_type(cpuid); + acc_mnk = 0; + } + const float *a_head = sa; + float *c_ptr = c_head; + uint32_t m_left = M; + if (cputype == 53) { + for (; m_left > 7; m_left -= 8) { + NEON_SGEMM_COMPUTE_ASM1(8, 12, A53) + a_head += 8 * K; + c_ptr += 8; + } + } else if (cputype == 55) { + for (; m_left > 7; m_left -= 8) { + NEON_SGEMM_COMPUTE_ASM1(8, 12, A55) + a_head += 8 * K; + c_ptr += 8; + } + } else if (cputype == 35) { + for (; m_left > 7; m_left -= 8) { + NEON_SGEMM_COMPUTE_M8N12_A35 + a_head += 8 * K; + c_ptr += 8; + } + } else { + for (; m_left > 7; m_left -= 8) { + NEON_SGEMM_COMPUTE_ASM1(8, 12, A72) + a_head += 8 * K; + c_ptr += 8; + } + } + MICRO_COMPUTE_LM(4, 12, float, float, float) + b_head += K * 12; + c_head += ldc * 12; + acc_mnk += 12 * K * M; + } + ASSEMBLE_DUALPACK_COMPUTE_LM(8, float, float, float, 8) +} + +void sgemm_kernel_ln_m12n8(uint32_t M, uint32_t N, uint32_t K, float beta, + const float * __restrict__ sa, const float * __restrict__ sb, + float * __restrict__ C, uint32_t ldc) { + uint32_t m_left = M; + const float *a_head = sa; + float *c_head = C; + uint32_t acc_mnk = CPUID_DETECT_MNK; + uint8_t cpuid = 0, cputype = 0; + for (; m_left > 11; m_left -= 12) { + if (acc_mnk >= CPUID_DETECT_MNK) { + cpuid = sched_getcpu(); + cputype = blas_arm_get_cpu_type(cpuid); + acc_mnk = 0; + } + const float *b_head = sb; + float *c_ptr = c_head; + uint32_t n_left = N; + if (cputype == 53) { + for (; n_left > 7; n_left -= 8) { + NEON_SGEMM_COMPUTE_ASM1(12, 8, A53) + b_head += 8 * K; + c_ptr += 8 * ldc; + } + } else if (cputype == 55) { + for (; n_left > 7; n_left -= 8) { + NEON_SGEMM_COMPUTE_ASM1(12, 8, A55) + b_head += 8 * K; + c_ptr += 8 * ldc; + } + } else if (cputype == 35) { + for (; n_left > 7; n_left -= 8) { + NEON_SGEMM_COMPUTE_M12N8_A35 + b_head += 8 * K; + c_ptr += 8 * ldc; + } + } else { + for (; n_left > 7; n_left -= 8) { + NEON_SGEMM_COMPUTE_ASM1(12, 8, A72) + b_head += 8 * K; + c_ptr += 8 * ldc; + } + } + MICRO_COMPUTE_LN(12, 4, float, float, float) + a_head += K * 12; + c_head += 12; + acc_mnk += 12 * N * K; + } + ASSEMBLE_DUALPACK_COMPUTE_LN(8, float, float, float, 8) +} + diff --git a/src/neon_armv8a/SgemmSkinnyDot.c b/src/neon_armv8a/SgemmSkinnyDot.c new file mode 100644 index 0000000..3c20438 --- /dev/null +++ b/src/neon_armv8a/SgemmSkinnyDot.c @@ -0,0 +1,800 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+#define _GNU_SOURCE
+#include "arm_neon/ARMCompareAndSwap.h"
+#include "arm_neon/ARMCpuType.h"
+#include "common/CommonSkinnyDot.h"
+#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.h"
+#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA53.h"
+#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA7x.h"
+#include <arm_neon.h>
+#include <sched.h>
+
+typedef float sgemm_skinnydot_ascalar;
+typedef float sgemm_skinnydot_bscalar;
+typedef float sgemm_skinnydot_cscalar;
+
+static inline void inline_sgemm_arowmajor_bskinny_m4n1(const float *a_ptr1,
+  const float *b_ptr, float *c_ptr, uint32_t k_inc, uint32_t LDK,
+  uint32_t LDM, float beta, bool c_rowmajor) {
+
+  const float *a_ptr2 = a_ptr1 + LDK;
+  const float *a_ptr3 = a_ptr1 + LDK * 2;
+  const float *a_ptr4 = a_ptr2 + LDK * 2;
+
+  float32x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
+  const float *a_pref = a_ptr4 + LDK;
+  const uint32_t pref_inc = (LDK > k_inc) ?
+    (LDK - k_inc) * sizeof(float) : 64;
+  uint32_t k_left = k_inc;
+  __asm__ __volatile__(
+    "movz w0,#0; movz w1,#64\n\t" //pref
+    "movi %[cd1].8b,#0; movi %[cd2].8b,#0\n\t"
+    "movi %[cd3].8b,#0; movi %[cd4].8b,#0\n\t"
+    "movi %[cd5].8b,#0; movi %[cd6].8b,#0\n\t"
+    "movi %[cd7].8b,#0; movi %[cd8].8b,#0\n\t"
+    "cmp %w[k_left],#4; b.lt 3f\n\t"
+    "ldr d2,[%[a_ptr1]],#16; ldr d3,[%[a_ptr2]],#16\n\t"
+    "ldr d4,[%[a_ptr3]],#16; ldr d5,[%[a_ptr4]],#16\n\t"
+    "ldr d0,[%[b_ptr]],#16\n\t"
+    "ldr d6,[%[a_ptr1],#-8]; ldr d7,[%[a_ptr2],#-8]\n\t"
+    "ldr d8,[%[a_ptr3],#-8]; ldr d9,[%[a_ptr4],#-8]\n\t"
+    "ldr d1,[%[b_ptr],#-8]\n\t"
+    "cmp %w[k_left],#8; b.lt 2f\n\t"
+    ".balign 16; 1:\n\t"
+    "prfm pldl2keep,[%[a_pref]]; add w0,w0,#16\n\t"
+    "fmla %[cd1].2s,v2.2s,v0.2s; ldr d2,[%[a_ptr1]],#16\n\t"
+    "cmp w0,%w[k_inc]\n\t"
+    "fmla %[cd2].2s,v3.2s,v0.2s; ldr d3,[%[a_ptr2]],#16\n\t"
+    "csel w2,%w[pref_inc],w1,gt\n\t"
+    "fmla %[cd3].2s,v4.2s,v0.2s; ldr d4,[%[a_ptr3]],#16\n\t"
+    "fmla %[cd4].2s,v5.2s,v0.2s; ldr d5,[%[a_ptr4]],#16\n\t"
+    "csel w0,wzr,w0,gt\n\t"
+    "ldr d0,[%[b_ptr]],#16; sub %w[k_left],%w[k_left],#4\n\t"
+    "fmla %[cd5].2s,v6.2s,v1.2s; ldr d6,[%[a_ptr1],#-8]\n\t"
+    "add %[a_pref],%[a_pref],x2\n\t"
+    "fmla %[cd6].2s,v7.2s,v1.2s; ldr d7,[%[a_ptr2],#-8]\n\t"
+    "cmp %w[k_left],#8\n\t"
+    "fmla %[cd7].2s,v8.2s,v1.2s; ldr d8,[%[a_ptr3],#-8]\n\t"
+    "fmla %[cd8].2s,v9.2s,v1.2s; ldr d9,[%[a_ptr4],#-8]\n\t"
+    "ldr d1,[%[b_ptr],#-8]; b.ge 1b\n\t"
+    "2:\n\t"
+    "fmla %[cd1].2s,v2.2s,v0.2s; fmla %[cd2].2s,v3.2s,v0.2s\n\t"
+    "fmla %[cd3].2s,v4.2s,v0.2s; fmla %[cd4].2s,v5.2s,v0.2s\n\t"
+    "sub %w[k_left],%w[k_left],#4\n\t"
+    "fmla %[cd5].2s,v6.2s,v1.2s; fmla %[cd6].2s,v7.2s,v1.2s\n\t"
+    "fmla %[cd7].2s,v8.2s,v1.2s; fmla %[cd8].2s,v9.2s,v1.2s\n\t"
+    "3:\n\t"
+    :[cd1]"=w"(cd1), [cd2]"=w"(cd2), [cd3]"=w"(cd3), [cd4]"=w"(cd4),
+     [cd5]"=w"(cd5), [cd6]"=w"(cd6), [cd7]"=w"(cd7), [cd8]"=w"(cd8),
+     [k_left]"+r"(k_left), [a_pref]"+r"(a_pref), [b_ptr]"+r"(b_ptr),
+     [a_ptr1]"+r"(a_ptr1), [a_ptr2]"+r"(a_ptr2),
+     [a_ptr3]"+r"(a_ptr3), [a_ptr4]"+r"(a_ptr4)
+    :[k_inc]"r"(k_inc), [pref_inc]"r"(pref_inc)
+    :"cc","memory","x0","x1","x2",
+     "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9");
+
+  cd1 = vadd_f32(cd1, cd5); cd2 = vadd_f32(cd2, cd6);
+  cd3 = vadd_f32(cd3, cd7); cd4 = vadd_f32(cd4, cd8);
+  float cs1 = vpadds_f32(cd1);
+  float cs2 = vpadds_f32(cd2);
+  float cs3 = vpadds_f32(cd3);
+  float cs4 = vpadds_f32(cd4);
+
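+  /* scalar tail: handle the k_inc % 4 leftover K iterations that the */
+  /* NEON loop above does not cover, then apply the beta scaling on C */
+ 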
for (; k_left > 0; k_left--) { + float bs1 = *b_ptr; b_ptr++; + cs1 += (*a_ptr1) * bs1; a_ptr1++; + cs2 += (*a_ptr2) * bs1; a_ptr2++; + cs3 += (*a_ptr3) * bs1; a_ptr3++; + cs4 += (*a_ptr4) * bs1; a_ptr4++; + } + c_ptr[0] = c_ptr[0] * beta + cs1; + c_ptr[1] = c_ptr[1] * beta + cs2; + c_ptr[2] = c_ptr[2] * beta + cs3; + c_ptr[3] = c_ptr[3] * beta + cs4; +} + +static inline void inline_sgemm_arowmajor_bskinny_m1n1(const float *a_ptr, + const float *b_ptr, float *c_ptr, uint32_t k_inc, uint32_t LDK, + uint32_t LDM, float beta, bool c_rowmajor) { + + float cs1; + __asm__ __volatile__( + "movi v16.8b,#0; movi v17.8b,#0\n\t" + "mov v18.8b,v16.8b; mov v19.8b,v17.8b\n\t" + "mov v20.8b,v16.8b; mov v21.8b,v17.8b\n\t" + "mov v22.8b,v16.8b; mov v23.8b,v17.8b\n\t" + "cmp %w[K],#16; b.lt 4f\n\t" + "prfm pldl1keep,[%[a_ptr],#256]\n\t" + "ldr d0,[%[a_ptr]],#64; ldr d8,[%[b_ptr]],#64\n\t" + "ldr d1,[%[a_ptr],#-56]; ldr d9,[%[b_ptr],#-56]\n\t" + "ldr d2,[%[a_ptr],#-48]; ldr d10,[%[b_ptr],#-48]\n\t" + "ldr d3,[%[a_ptr],#-40]; ldr d11,[%[b_ptr],#-40]\n\t" + "ldr d4,[%[a_ptr],#-32]; ldr d12,[%[b_ptr],#-32]\n\t" + "ldr d5,[%[a_ptr],#-24]; ldr d13,[%[b_ptr],#-24]\n\t" + "ldr d6,[%[a_ptr],#-16]; ldr d14,[%[b_ptr],#-16]\n\t" + "ldr d7,[%[a_ptr],#-8]; ldr d15,[%[b_ptr],#-8]\n\t" + "cmp %w[K],#32; b.lt 3f\n\t" + "2:\n\t" + "prfm pldl1keep,[%[a_ptr],#256]\n\t" + "fmla v16.2s,v0.2s,v8.2s; ldr d0,[%[a_ptr]],#64; ldr d8,[%[b_ptr]],#64\n\t" + "fmla v17.2s,v1.2s,v9.2s; ldr d1,[%[a_ptr],#-56]; ldr d9,[%[b_ptr],#-56]\n\t" + "fmla v18.2s,v2.2s,v10.2s; ldr d2,[%[a_ptr],#-48]; ldr d10,[%[b_ptr],#-48]\n\t" + "fmla v19.2s,v3.2s,v11.2s; ldr d3,[%[a_ptr],#-40]; ldr d11,[%[b_ptr],#-40]\n\t" + "sub %w[K],%w[K],#16\n\t" + "fmla v20.2s,v4.2s,v12.2s; ldr d4,[%[a_ptr],#-32]; ldr d12,[%[b_ptr],#-32]\n\t" + "fmla v21.2s,v5.2s,v13.2s; ldr d5,[%[a_ptr],#-24]; ldr d13,[%[b_ptr],#-24]\n\t" + "cmp %w[K],#32\n\t" + "fmla v22.2s,v6.2s,v14.2s; ldr d6,[%[a_ptr],#-16]; ldr d14,[%[b_ptr],#-16]\n\t" + "fmla v23.2s,v7.2s,v15.2s; ldr d7,[%[a_ptr],#-8]; ldr d15,[%[b_ptr],#-8]\n\t" + "b.ge 2b\n\t" + "3:\n\t" + "fmla v16.2s,v0.2s,v8.2s; fmla v17.2s,v1.2s,v9.2s\n\t" + "fmla v18.2s,v2.2s,v10.2s; fmla v19.2s,v3.2s,v11.2s; sub %w[K],%w[K],#16\n\t" + "fmla v20.2s,v4.2s,v12.2s; fmla v21.2s,v5.2s,v13.2s\n\t" + "fmla v22.2s,v6.2s,v14.2s; fmla v23.2s,v7.2s,v15.2s\n\t" + "4:\n\t" + "fadd v16.2s,v16.2s,v20.2s; fadd v17.2s,v17.2s,v21.2s\n\t" + "fadd v18.2s,v18.2s,v22.2s; fadd v19.2s,v19.2s,v23.2s\n\t" + "cmp %w[K],#8; b.lt 5f\n\t" + "ldr d0,[%[a_ptr]],#32; ldr d8,[%[b_ptr]],#32; fmla v16.2s,v0.2s,v8.2s\n\t" + "ldr d1,[%[a_ptr],#-24]; ldr d9,[%[b_ptr],#-24]; fmla v17.2s,v1.2s,v9.2s\n\t" + "sub %w[K],%w[K],#8\n\t" + "ldr d2,[%[a_ptr],#-16]; ldr d10,[%[b_ptr],#-16]; fmla v18.2s,v2.2s,v10.2s\n\t" + "ldr d3,[%[a_ptr],#-8]; ldr d11,[%[b_ptr],#-8]; fmla v19.2s,v3.2s,v11.2s\n\t" + "5:\n\t" + "fadd v16.2s,v16.2s,v18.2s; fadd v17.2s,v17.2s,v19.2s\n\t" + "cmp %w[K],#4; b.lt 6f\n\t" + "ldr d0,[%[a_ptr]],#16; ldr d8,[%[b_ptr]],#16; fmla v16.2s,v0.2s,v8.2s\n\t" + "sub %w[K],%w[K],#4\n\t" + "ldr d1,[%[a_ptr],#-8]; ldr d9,[%[b_ptr],#-8]; fmla v17.2s,v1.2s,v9.2s\n\t" + "6:\n\t" + "fadd v16.2s,v16.2s,v17.2s\n\t" + "cmp %w[K],#2; b.lt 7f\n\t" + "ldr d0,[%[a_ptr]],#8; ldr d8,[%[b_ptr]],#8; fmla v16.2s,v0.2s,v8.2s\n\t" + "sub %w[K],%w[K],#2\n\t" + "7:\n\t" + "faddp %s[cs1],v16.2s\n\t" + "cmp %w[K],#1; b.lt 10f\n\t" + "ldr s0,[%[a_ptr]],#4; ldr s8,[%[b_ptr]],#4; fmla %s[cs1],s0,v8.s[0]\n\t" + "10:\n\t" + :[cs1]"=w"(cs1), [a_ptr]"+r"(a_ptr), [b_ptr]"+r"(b_ptr), [K]"+r"(k_inc) + 
::"cc","memory","v0","v1","v2","v3","v4","v5", + "v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16","v17", + "v18","v19","v20","v21","v22","v23"); + c_ptr[0] = c_ptr[0] * beta + cs1; +} + +/* k_mask = 7 */ +static inline void inline_sgemm_arowmajor_bskinny_m4n2(const float *a_ptr1, + const float *b_ptr, float *c_ptr, uint32_t k_inc, uint32_t LDK, + uint32_t LDM, float beta, bool c_rowmajor) { + + float32x4_t cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8; + const float *a_ptr2 = a_ptr1 + LDK; + const float *a_ptr3 = a_ptr1 + LDK * 2; + const float *a_ptr4 = a_ptr2 + LDK * 2; + uint32_t k_left = k_inc; + const float *a_pref = a_ptr4 + LDK; + const uint32_t pref_inc = (LDK > k_inc) ? + (LDK - k_inc) * sizeof(float) : 64; + __asm__ __volatile__( + "movz w0,#0; movz w1,#64\n\t" //pref + "movi %[cq1].16b,#0; movi %[cq2].16b,#0\n\t" + "movi %[cq3].16b,#0; movi %[cq4].16b,#0\n\t" + "movi %[cq5].16b,#0; movi %[cq6].16b,#0\n\t" + "movi %[cq7].16b,#0; movi %[cq8].16b,#0\n\t" + "cmp %w[k_left],#4; b.lt 3f\n\t" + "ldr q2,[%[a_ptr1]],#16; ldr q3,[%[a_ptr2]],#16\n\t" + "ldr q4,[%[a_ptr3]],#16; ldr q5,[%[a_ptr4]],#16\n\t" + "ldr q0,[%[b_ptr]]; ldr q1,[%[b_ptr],#16]; add %[b_ptr],%[b_ptr],#32\n\t" + "cmp %w[k_left],#8; b.lt 2f\n\t" + ".balign 16; 1:\n\t" + "prfm pldl2keep,[%[a_pref]]; add w0,w0,#16\n\t" + "fmla %[cq1].4s,v2.4s,v0.4s; fmla %[cq5].4s,v2.4s,v1.4s\n\t" + "ldr q2,[%[a_ptr1]],#16; cmp w0,%w[k_inc]\n\t" + "fmla %[cq2].4s,v3.4s,v0.4s; fmla %[cq6].4s,v3.4s,v1.4s\n\t" + "ldr q3,[%[a_ptr2]],#16; csel w2,%w[pref_inc],w1,gt\n\t" + "sub %w[k_left],%w[k_left],#4\n\t" + "fmla %[cq3].4s,v4.4s,v0.4s; fmla %[cq7].4s,v4.4s,v1.4s\n\t" + "ldr q4,[%[a_ptr3]],#16; csel w0,wzr,w0,gt\n\t" + "cmp %w[k_left],#8\n\t" + "fmla %[cq4].4s,v5.4s,v0.4s; fmla %[cq8].4s,v5.4s,v1.4s\n\t" + "ldr q5,[%[a_ptr4]],#16; add %[a_pref],%[a_pref],x2\n\t" + "ldr q0,[%[b_ptr]]; ldr q1,[%[b_ptr],#16]\n\t" + "add %[b_ptr],%[b_ptr],#32; b.ge 1b\n\t" + "2:\n\t" + "fmla %[cq1].4s,v2.4s,v0.4s; fmla %[cq5].4s,v2.4s,v1.4s\n\t" + "fmla %[cq2].4s,v3.4s,v0.4s; fmla %[cq6].4s,v3.4s,v1.4s\n\t" + "fmla %[cq3].4s,v4.4s,v0.4s; fmla %[cq7].4s,v4.4s,v1.4s\n\t" + "fmla %[cq4].4s,v5.4s,v0.4s; fmla %[cq8].4s,v5.4s,v1.4s\n\t" + "sub %w[k_left],%w[k_left],#4\n\t" + "3:\n\t" + :[cq1]"=w"(cq1), [cq2]"=w"(cq2), [cq3]"=w"(cq3), [cq4]"=w"(cq4), + [cq5]"=w"(cq5), [cq6]"=w"(cq6), [cq7]"=w"(cq7), [cq8]"=w"(cq8), + [k_left]"+r"(k_left), [a_pref]"+r"(a_pref), + [b_ptr]"+r"(b_ptr), [a_ptr1]"+r"(a_ptr1), [a_ptr2]"+r"(a_ptr2), + [a_ptr3]"+r"(a_ptr3), [a_ptr4]"+r"(a_ptr4) + :[k_inc]"r"(k_inc), [pref_inc]"r"(pref_inc) + :"x0","x1","x2","v0","v1","v2","v3","v4","v5","cc","memory"); + + cq1 = vpaddq_f32(cq1, cq5); + cq2 = vpaddq_f32(cq2, cq6); + cq3 = vpaddq_f32(cq3, cq7); + cq4 = vpaddq_f32(cq4, cq8); + + if (k_left >= 2) { + float32x4_t bq1 = vld1q_f32(b_ptr); b_ptr += 4; + float32x2_t ad1 = vld1_f32(a_ptr1); a_ptr1 += 2; + float32x2_t ad2 = vld1_f32(a_ptr2); a_ptr2 += 2; + float32x2_t ad3 = vld1_f32(a_ptr3); a_ptr3 += 2; + float32x2_t ad4 = vld1_f32(a_ptr4); a_ptr4 += 2; + float32x4_t aq1 = vcombine_f32(ad1, ad1); + float32x4_t aq2 = vcombine_f32(ad2, ad2); + float32x4_t aq3 = vcombine_f32(ad3, ad3); + float32x4_t aq4 = vcombine_f32(ad4, ad4); + cq1 = vfmaq_f32(cq1, aq1, bq1); + cq2 = vfmaq_f32(cq2, aq2, bq1); + cq3 = vfmaq_f32(cq3, aq3, bq1); + cq4 = vfmaq_f32(cq4, aq4, bq1); + k_left -= 2; + } + + float32x2_t cd1 = vget_low_f32(vpaddq_f32(cq1, cq1)); + float32x2_t cd2 = vget_low_f32(vpaddq_f32(cq2, cq2)); + float32x2_t cd3 = vget_low_f32(vpaddq_f32(cq3, cq3)); + 
float32x2_t cd4 = vget_low_f32(vpaddq_f32(cq4, cq4)); + + if (k_left > 0) { + float32x2_t bd1 = vld1_f32(b_ptr); + float32x2_t ad1 = vld1_dup_f32(a_ptr1); + float32x2_t ad2 = vld1_dup_f32(a_ptr2); + float32x2_t ad3 = vld1_dup_f32(a_ptr3); + float32x2_t ad4 = vld1_dup_f32(a_ptr4); + cd1 = vfma_f32(cd1, ad1, bd1); + cd2 = vfma_f32(cd2, ad2, bd1); + cd3 = vfma_f32(cd3, ad3, bd1); + cd4 = vfma_f32(cd4, ad4, bd1); + } + + if (c_rowmajor) { + cd1 = vfma_n_f32(cd1, vld1_f32(c_ptr), beta); + cd2 = vfma_n_f32(cd2, vld1_f32(c_ptr + 2), beta); + cd3 = vfma_n_f32(cd3, vld1_f32(c_ptr + 4), beta); + cd4 = vfma_n_f32(cd4, vld1_f32(c_ptr + 6), beta); + vst1_f32(c_ptr, cd1); + vst1_f32(c_ptr + 2, cd2); + vst1_f32(c_ptr + 4, cd3); + vst1_f32(c_ptr + 6, cd4); + } else { + float32x2_t cd00 = vzip1_f32(cd1, cd2); + float32x2_t cd01 = vzip1_f32(cd3, cd4); + float32x2_t cd10 = vzip2_f32(cd1, cd2); + float32x2_t cd11 = vzip2_f32(cd3, cd4); + float *c_ptr1 = c_ptr; + float *c_ptr2 = c_ptr + LDM; + cd00 = vfma_n_f32(cd00, vld1_f32(c_ptr1), beta); + cd01 = vfma_n_f32(cd01, vld1_f32(c_ptr1 + 2), beta); + cd10 = vfma_n_f32(cd10, vld1_f32(c_ptr2), beta); + cd11 = vfma_n_f32(cd11, vld1_f32(c_ptr2 + 2), beta); + vst1_f32(c_ptr1, cd00); + vst1_f32(c_ptr1 + 2, cd01); + vst1_f32(c_ptr2, cd10); + vst1_f32(c_ptr2 + 2, cd11); + } +} + +static inline void inline_sgemm_arowmajor_bskinny_m1n2(const float *a_ptr, + const float *b_ptr, float *c_ptr, uint32_t k_inc, uint32_t LDK, + uint32_t LDM, float beta, bool c_rowmajor) { + + uint32_t k_left = k_inc; + float cs1, cs2; + __asm__ __volatile__ ( + "movi v8.16b,#0; movi v9.16b,#0\n\t" + "mov v10.16b,v8.16b; mov v11.16b,v9.16b\n\t" + "mov v16.16b,v8.16b; mov v17.16b,v9.16b\n\t" + "mov v18.16b,v8.16b; mov v19.16b,v9.16b\n\t" + "cmp %w[k_left],#16; b.lt 4f\n\t" + "prfm pldl1keep,[%[a_ptr],#256]\n\t" + "ldr q0,[%[a_ptr]]; ldr q1,[%[a_ptr],#16]\n\t" + "ldr q2,[%[a_ptr],#32]; ldr q3,[%[a_ptr],#48]\n\t" + "add %[a_ptr],%[a_ptr],#64\n\t" + "ldr q4,[%[b_ptr]]; ldr q12,[%[b_ptr],#16]\n\t" + "ldr q5,[%[b_ptr],#32]; ldr q13,[%[b_ptr],#48]\n\t" + "ldr q6,[%[b_ptr],#64]; ldr q14,[%[b_ptr],#80]\n\t" + "ldr q7,[%[b_ptr],#96]; ldr q15,[%[b_ptr],#112]\n\t" + "add %[b_ptr],%[b_ptr],#128\n\t" + "cmp %w[k_left],#32; b.lt 3f\n\t" + ".balign 16; 2:\n\t" + "prfm pldl1keep,[%[a_ptr],#256]\n\t" + "fmla v8.4s,v0.4s,v4.4s; ldr q4,[%[b_ptr]]\n\t" + "fmla v10.4s,v0.4s,v12.4s; ldr q12,[%[b_ptr],#16]; ldr q0,[%[a_ptr]],#64\n\t" + "fmla v9.4s,v1.4s,v5.4s; ldr q5,[%[b_ptr],#32]\n\t" + "fmla v11.4s,v1.4s,v13.4s; ldr q13,[%[b_ptr],#48]; ldr q1,[%[a_ptr],#-48]\n\t" + "sub %w[k_left],%w[k_left],#16\n\t" + "fmla v16.4s,v2.4s,v6.4s; ldr q6,[%[b_ptr],#64]\n\t" + "fmla v18.4s,v2.4s,v14.4s; ldr q14,[%[b_ptr],#80]; ldr q2,[%[a_ptr],#-32]\n\t" + "cmp %w[k_left],#32\n\t" + "fmla v17.4s,v3.4s,v7.4s; ldr q7,[%[b_ptr],#96]\n\t" + "fmla v19.4s,v3.4s,v15.4s; ldr q15,[%[b_ptr],#112]; ldr q3,[%[a_ptr],#-16]\n\t" + "add %[b_ptr],%[b_ptr],#128; b.ge 2b\n\t" + "3:\n\t" + "fmla v8.4s,v0.4s,v4.4s; fmla v10.4s,v0.4s,v12.4s\n\t" + "fmla v9.4s,v1.4s,v5.4s; fmla v11.4s,v1.4s,v13.4s\n\t" + "sub %w[k_left],%w[k_left],#16\n\t" + "fmla v16.4s,v2.4s,v6.4s; fmla v18.4s,v2.4s,v14.4s\n\t" + "fmla v17.4s,v3.4s,v7.4s; fmla v19.4s,v3.4s,v15.4s\n\t" + "4:\n\t" + "fadd v8.4s,v8.4s,v16.4s; fadd v9.4s,v9.4s,v17.4s\n\t" + "fadd v10.4s,v10.4s,v18.4s; fadd v11.4s,v11.4s,v19.4s\n\t" + "cmp %w[k_left],#8; b.lt 5f\n\t" + "ldr q0,[%[a_ptr]],#32; ldr q4,[%[b_ptr]]; ldr q12,[%[b_ptr],#16]\n\t" + "fmla v8.4s,v0.4s,v4.4s; fmla v10.4s,v0.4s,v12.4s\n\t" + "sub 
%w[k_left],%w[k_left],#8\n\t" + "ldr q1,[%[a_ptr],#-16]; ldr q5,[%[b_ptr],#32]; ldr q13,[%[b_ptr],#48]\n\t" + "add %[b_ptr],%[b_ptr],#64\n\t" + "fmla v9.4s,v1.4s,v5.4s; fmla v11.4s,v1.4s,v13.4s\n\t" + "5:\n\t" + "fadd v8.4s,v8.4s,v9.4s; fadd v10.4s,v10.4s,v11.4s\n\t" + "cmp %w[k_left],#4; b.lt 6f\n\t" + "ldr q0,[%[a_ptr]],#16; ldr q4,[%[b_ptr]]; ldr q12,[%[b_ptr],#16]\n\t" + "fmla v8.4s,v0.4s,v4.4s; fmla v10.4s,v0.4s,v12.4s\n\t" + "add %[b_ptr],%[b_ptr],#32; sub %w[k_left],%w[k_left],#4\n\t" + "6:\n\t" + "movi v9.16b,#0; faddp v8.4s,v8.4s,v9.4s; faddp v10.4s,v10.4s,v9.4s\n\t" + "cmp %w[k_left],#2; b.lt 7f\n\t" + "ldr d0,[%[a_ptr]],#8; ldr d4,[%[b_ptr]]; ldr d12,[%[b_ptr],#8]\n\t" + "fmla v8.2s,v0.2s,v4.2s; fmla v10.2s,v0.2s,v12.2s\n\t" + "add %[b_ptr],%[b_ptr],#16; sub %w[k_left],%w[k_left],#2\n\t" + "7:\n\t" + "faddp %s[cs1],v8.2s; faddp %s[cs2],v10.2s\n\t" + "cmp %w[k_left],#1; b.lt 10f\n\t" + "ldr s0,[%[a_ptr]],#4; ldr s4,[%[b_ptr]]; ldr s12,[%[b_ptr],#4]\n\t" + "fmla %s[cs1],s0,v4.s[0]; fmla %s[cs2],s0,v12.s[0]\n\t" + "10:\n\t" + :[cs1]"=w"(cs1), [cs2]"=w"(cs2), + [a_ptr]"+r"(a_ptr), [b_ptr]"+r"(b_ptr), [k_left]"+r"(k_left) + ::"cc","memory","v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11","v12","v13","v14","v15","v16","v17","v18","v19"); + + if (c_rowmajor) { + c_ptr[0] = c_ptr[0] * beta + cs1; + c_ptr[1] = c_ptr[1] * beta + cs2; + } else { + c_ptr[0] = c_ptr[0] * beta + cs1; + c_ptr[LDM] = c_ptr[LDM] * beta + cs2; + } +} + +/* k_mask = 7 */ +static inline void inline_sgemm_arowmajor_bskinny_m4n3(const float *a_ptr1, + const float *b_ptr, float *c_ptr, uint32_t k_inc, uint32_t LDK, + uint32_t LDM, float beta, bool c_rowmajor) { + + const float *a_ptr2 = a_ptr1 + LDK; + const float *a_ptr3 = a_ptr1 + LDK * 2; + const float *a_ptr4 = a_ptr2 + LDK * 2; + uint32_t k_left = k_inc; + uint32_t next_pref = (LDK * 4 >= k_inc) ? 
+ (LDK * 4 - k_inc + 4) * sizeof(float) : 64; + float32x4_t cq1, cq2, cq3; + __asm__ __volatile__( + "movi %[q1].16b,#0; movi %[q2].16b,#0; movi %[q3].16b,#0\n\t" + "movi v10.16b,#0; movi v11.16b,#0; movi v12.16b,#0\n\t" + "movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t" + "movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0\n\t" + "cmp %w[k_left],#4; b.lt 4f\n\t" + "ldr q0,[%[a_ptr1]],#16; ldr q1,[%[a_ptr2]],#16\n\t" + "ldr q2,[%[a_ptr3]],#16; ldr q3,[%[a_ptr4]],#16\n\t" + "ldr q4,[%[b_ptr]]; ldr q5,[%[b_ptr],#16]\n\t" + "ldr q6,[%[b_ptr],#32]; add %[b_ptr],%[b_ptr],#48\n\t" + "cmp %w[k_left],#12; b.lt 2f\n\t" + ".balign 16; 1:\n\t" + "fmla %[q1].4s,v0.4s,v4.4s; ldr q7,[%[b_ptr]],#96\n\t" + "fmla %[q2].4s,v0.4s,v5.4s\n\t" + "fmla %[q3].4s,v0.4s,v6.4s; ldr q0,[%[a_ptr1]],#32\n\t" + "fmla v10.4s,v1.4s,v4.4s; ldr q8,[%[b_ptr],#-80]\n\t" + "fmla v11.4s,v1.4s,v5.4s; prfm pldl1keep,[%[a_ptr1],#64]\n\t" + "fmla v12.4s,v1.4s,v6.4s; ldr q1,[%[a_ptr2]],#32\n\t" + "fmla v13.4s,v2.4s,v4.4s; ldr q9,[%[b_ptr],#-64]\n\t" + "fmla v14.4s,v2.4s,v5.4s; prfm pldl1keep,[%[a_ptr2],#64]\n\t" + "fmla v15.4s,v2.4s,v6.4s; ldr q2,[%[a_ptr3]],#32\n\t" + "fmla v16.4s,v3.4s,v4.4s\n\t" + "fmla v17.4s,v3.4s,v5.4s; prfm pldl1keep,[%[a_ptr3],#64]\n\t" + "fmla v18.4s,v3.4s,v6.4s; ldr q3,[%[a_ptr4]],#32\n\t" + "fmla %[q1].4s,v0.4s,v7.4s; ldr q4,[%[b_ptr],#-48]\n\t" + "fmla %[q2].4s,v0.4s,v8.4s; prfm pldl1keep,[%[a_ptr4],#64]\n\t" + "fmla %[q3].4s,v0.4s,v9.4s; ldr q0,[%[a_ptr1],#-16]\n\t" + "fmla v10.4s,v1.4s,v7.4s; ldr q5,[%[b_ptr],#-32]\n\t" + "fmla v11.4s,v1.4s,v8.4s\n\t" + "fmla v12.4s,v1.4s,v9.4s; ldr q1,[%[a_ptr2],#-16]\n\t" + "fmla v13.4s,v2.4s,v7.4s; ldr q6,[%[b_ptr],#-16]\n\t" + "fmla v14.4s,v2.4s,v8.4s; sub %w[k_left],%w[k_left],#8\n\t" + "fmla v15.4s,v2.4s,v9.4s; ldr q2,[%[a_ptr3],#-16]\n\t" + "fmla v16.4s,v3.4s,v7.4s; cmp %w[k_left],#12\n\t" + "fmla v17.4s,v3.4s,v8.4s\n\t" + "fmla v18.4s,v3.4s,v9.4s; ldr q3,[%[a_ptr4],#-16]; b.ge 1b\n\t" + "2:\n\t" + "cmp %w[k_left],#8; b.lt 3f\n\t" + "fmla %[q1].4s,v0.4s,v4.4s; ldr q7,[%[b_ptr]],#48\n\t" + "fmla %[q2].4s,v0.4s,v5.4s\n\t" + "fmla %[q3].4s,v0.4s,v6.4s; ldr q0,[%[a_ptr1]],#16\n\t" + "fmla v10.4s,v1.4s,v4.4s; ldr q8,[%[b_ptr],#-32]\n\t" + "fmla v11.4s,v1.4s,v5.4s\n\t" + "prfm pldl1keep,[%[a_ptr1],%w[next_pref],SXTW #0]\n\t" + "fmla v12.4s,v1.4s,v6.4s; ldr q1,[%[a_ptr2]],#16\n\t" + "fmla v13.4s,v2.4s,v4.4s; ldr q9,[%[b_ptr],#-16]\n\t" + "fmla v14.4s,v2.4s,v5.4s\n\t" + "prfm pldl1keep,[%[a_ptr2],%w[next_pref],SXTW #0]\n\t" + "fmla v15.4s,v2.4s,v6.4s; ldr q2,[%[a_ptr3]],#16\n\t" + "fmla v16.4s,v3.4s,v4.4s\n\t" + "fmla v17.4s,v3.4s,v5.4s\n\t" + "prfm pldl1keep,[%[a_ptr3],%w[next_pref],SXTW #0]\n\t" + "fmla v18.4s,v3.4s,v6.4s; ldr q3,[%[a_ptr4]],#16\n\t" + "fmla %[q1].4s,v0.4s,v7.4s\n\t" + "fmla %[q2].4s,v0.4s,v8.4s\n\t" + "prfm pldl1keep,[%[a_ptr4],%w[next_pref],SXTW #0]\n\t" + "fmla %[q3].4s,v0.4s,v9.4s\n\t" + "fmla v10.4s,v1.4s,v7.4s\n\t" + "fmla v11.4s,v1.4s,v8.4s\n\t" + "fmla v12.4s,v1.4s,v9.4s\n\t" + "fmla v13.4s,v2.4s,v7.4s\n\t" + "fmla v14.4s,v2.4s,v8.4s; sub %w[k_left],%w[k_left],#8\n\t" + "fmla v15.4s,v2.4s,v9.4s\n\t" + "fmla v16.4s,v3.4s,v7.4s\n\t" + "fmla v17.4s,v3.4s,v8.4s\n\t" + "fmla v18.4s,v3.4s,v9.4s; b 4f\n\t" + "3:\n\t" + "fmla %[q1].4s,v0.4s,v4.4s\n\t" + "fmla %[q2].4s,v0.4s,v5.4s\n\t" + "prfm pldl1keep,[%[a_ptr1],%w[next_pref],SXTW #0]\n\t" + "fmla %[q3].4s,v0.4s,v6.4s\n\t" + "fmla v10.4s,v1.4s,v4.4s\n\t" + "prfm pldl1keep,[%[a_ptr2],%w[next_pref],SXTW #0]\n\t" + "fmla v11.4s,v1.4s,v5.4s\n\t" + "fmla v12.4s,v1.4s,v6.4s\n\t" + "prfm 
pldl1keep,[%[a_ptr3],%w[next_pref],SXTW #0]\n\t" + "fmla v13.4s,v2.4s,v4.4s\n\t" + "fmla v14.4s,v2.4s,v5.4s; sub %w[k_left],%w[k_left],#4\n\t" + "prfm pldl1keep,[%[a_ptr4],%w[next_pref],SXTW #0]\n\t" + "fmla v15.4s,v2.4s,v6.4s\n\t" + "fmla v16.4s,v3.4s,v4.4s\n\t" + "fmla v17.4s,v3.4s,v5.4s\n\t" + "fmla v18.4s,v3.4s,v6.4s\n\t" + "4:\n\t" + "faddp %[q1].4s,%[q1].4s,v10.4s; faddp v13.4s,v13.4s,v16.4s\n\t" + "faddp %[q2].4s,%[q2].4s,v11.4s; faddp v14.4s,v14.4s,v17.4s\n\t" + "faddp %[q3].4s,%[q3].4s,v12.4s; faddp v15.4s,v15.4s,v18.4s\n\t" + "cmp %w[k_left],#2; b.lt 5f\n\t" + "ldr d0,[%[a_ptr1]],#8; ldr d1,[%[a_ptr2]],#8\n\t" + "ldr d2,[%[a_ptr3]],#8; ldr d3,[%[a_ptr4]],#8\n\t" + "ld1r {v4.2d},[%[b_ptr]],#8; ins v0.d[1],v1.d[0]\n\t" + "ld1r {v5.2d},[%[b_ptr]],#8; ins v2.d[1],v3.d[0]\n\t" + "ld1r {v6.2d},[%[b_ptr]],#8; sub %w[k_left],%w[k_left],#2\n\t" + "fmla %[q1].4s,v0.4s,v4.4s\n\t" + "fmla %[q2].4s,v0.4s,v5.4s\n\t" + "fmla %[q3].4s,v0.4s,v6.4s\n\t" + "fmla v13.4s,v2.4s,v4.4s\n\t" + "fmla v14.4s,v2.4s,v5.4s\n\t" + "fmla v15.4s,v2.4s,v6.4s\n\t" + "5:\n\t" + "faddp %[q1].4s,%[q1].4s,v13.4s\n\t" + "faddp %[q2].4s,%[q2].4s,v14.4s\n\t" + "faddp %[q3].4s,%[q3].4s,v15.4s\n\t" + "cmp %w[k_left],#1; b.lt 6f\n\t" + "ldr s0,[%[a_ptr1]],#4; ldr s1,[%[a_ptr2]],#4\n\t" + "ldr s2,[%[a_ptr3]],#4; ldr s3,[%[a_ptr4]],#4\n\t" + "ldr s4,[%[b_ptr]],#4; ins v0.s[1],v1.s[0]\n\t" + "ldr s5,[%[b_ptr]],#4; ins v2.s[1],v3.s[0]\n\t" + "ldr s6,[%[b_ptr]],#4; ins v0.d[1],v2.d[0]\n\t" + "sub %w[k_left],%w[k_left],#1\n\t" + "fmla %[q1].4s,v0.4s,v4.s[0]\n\t" + "fmla %[q2].4s,v0.4s,v5.s[0]\n\t" + "fmla %[q3].4s,v0.4s,v6.s[0]\n\t" + "6:\n\t" + :[q1]"=w"(cq1), [q2]"=w"(cq2), [q3]"=w"(cq3), [k_left]"+r"(k_left), + [a_ptr1]"+r"(a_ptr1), [a_ptr2]"+r"(a_ptr2), [a_ptr3]"+r"(a_ptr3), + [a_ptr4]"+r"(a_ptr4), [b_ptr]"+r"(b_ptr), [next_pref]"+r"(next_pref) + ::"cc","memory","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9", + "v10","v11","v12","v13","v14","v15","v16","v17","v18"); + + if (c_rowmajor) { + float32x4x3_t cqt1 = vld3q_f32(c_ptr); + cqt1.val[0] = vfmaq_n_f32(cq1, cqt1.val[0], beta); + cqt1.val[1] = vfmaq_n_f32(cq2, cqt1.val[1], beta); + cqt1.val[2] = vfmaq_n_f32(cq3, cqt1.val[2], beta); + vst3q_f32(c_ptr, cqt1); + } else { + cq1 = vfmaq_n_f32(cq1, vld1q_f32(c_ptr), beta); + cq2 = vfmaq_n_f32(cq2, vld1q_f32(c_ptr + LDM), beta); + cq3 = vfmaq_n_f32(cq3, vld1q_f32(c_ptr + LDM * 2), beta); + + vst1q_f32(c_ptr, cq1); c_ptr += LDM; + vst1q_f32(c_ptr, cq2); c_ptr += LDM; + vst1q_f32(c_ptr, cq3); + } +} + +static inline void inline_sgemm_arowmajor_bskinny_m1n3(const float *a_ptr, + const float *b_scr, float *c_ptr, uint32_t k_inc, uint32_t LDK, + uint32_t LDM, float beta, bool c_rowmajor) { + + const float *sb_ptr = b_scr; + + float32x4_t cq01, cq02, cq03, cq04, cq05, cq06; + cq01 = cq02 = cq03 = cq04 = cq05 = cq06 = vdupq_n_f32(0.0f); + float32x4_t cq07, cq08, cq09, cq10, cq11, cq12; + cq07 = cq08 = cq09 = cq10 = cq11 = cq12 = vdupq_n_f32(0.0f); + float32x4_t aq1, aq2, bq01, bq02, bq03, bq04, bq05, bq06; + float32x4_t aq3, aq4, bq07, bq08, bq09, bq10, bq11, bq12; + uint32_t k_left = k_inc; + if (k_left > 7) { + aq1 = vld1q_f32(a_ptr); aq2 = vld1q_f32(a_ptr + 4); a_ptr += 8; + bq01 = vld1q_f32(sb_ptr); bq02 = vld1q_f32(sb_ptr + 4); + bq03 = vld1q_f32(sb_ptr + 8); bq04 = vld1q_f32(sb_ptr + 12); + bq05 = vld1q_f32(sb_ptr + 16); bq06 = vld1q_f32(sb_ptr + 20); + sb_ptr += 24; + } + for (; k_left > 23; k_left -= 16) { + aq3 = vld1q_f32(a_ptr); + cq01 = vfmaq_f32(cq01, aq1, bq01); bq07 = vld1q_f32(sb_ptr); + cq02 = vfmaq_f32(cq02, aq1, 
bq02); bq08 = vld1q_f32(sb_ptr + 4); + cq03 = vfmaq_f32(cq03, aq1, bq03); bq09 = vld1q_f32(sb_ptr + 8); + aq4 = vld1q_f32(a_ptr + 4); + cq04 = vfmaq_f32(cq04, aq2, bq04); bq10 = vld1q_f32(sb_ptr + 12); + cq05 = vfmaq_f32(cq05, aq2, bq05); bq11 = vld1q_f32(sb_ptr + 16); + cq06 = vfmaq_f32(cq06, aq2, bq06); bq12 = vld1q_f32(sb_ptr + 20); + aq1 = vld1q_f32(a_ptr + 8); + cq07 = vfmaq_f32(cq07, aq3, bq07); bq01 = vld1q_f32(sb_ptr + 24); + cq08 = vfmaq_f32(cq08, aq3, bq08); bq02 = vld1q_f32(sb_ptr + 28); + cq09 = vfmaq_f32(cq09, aq3, bq09); bq03 = vld1q_f32(sb_ptr + 32); + aq2 = vld1q_f32(a_ptr + 12); a_ptr += 16; + cq10 = vfmaq_f32(cq10, aq4, bq10); bq04 = vld1q_f32(sb_ptr + 36); + cq11 = vfmaq_f32(cq11, aq4, bq11); bq05 = vld1q_f32(sb_ptr + 40); + cq12 = vfmaq_f32(cq12, aq4, bq12); bq06 = vld1q_f32(sb_ptr + 44); + sb_ptr += 48; + } + if (k_left > 15) { + aq3 = vld1q_f32(a_ptr); + cq01 = vfmaq_f32(cq01, aq1, bq01); bq07 = vld1q_f32(sb_ptr); + cq02 = vfmaq_f32(cq02, aq1, bq02); bq08 = vld1q_f32(sb_ptr + 4); + cq03 = vfmaq_f32(cq03, aq1, bq03); bq09 = vld1q_f32(sb_ptr + 8); + aq4 = vld1q_f32(a_ptr + 4); a_ptr += 8; + cq04 = vfmaq_f32(cq04, aq2, bq04); bq10 = vld1q_f32(sb_ptr + 12); + cq05 = vfmaq_f32(cq05, aq2, bq05); bq11 = vld1q_f32(sb_ptr + 16); + cq06 = vfmaq_f32(cq06, aq2, bq06); bq12 = vld1q_f32(sb_ptr + 20); + cq07 = vfmaq_f32(cq07, aq3, bq07); sb_ptr += 24; + cq08 = vfmaq_f32(cq08, aq3, bq08); k_left -= 16; + cq09 = vfmaq_f32(cq09, aq3, bq09); + cq10 = vfmaq_f32(cq10, aq4, bq10); + cq11 = vfmaq_f32(cq11, aq4, bq11); + cq12 = vfmaq_f32(cq12, aq4, bq12); + } + if (k_left > 7) { + cq01 = vfmaq_f32(cq01, aq1, bq01); k_left -= 8; + cq02 = vfmaq_f32(cq02, aq1, bq02); + cq03 = vfmaq_f32(cq03, aq1, bq03); + cq04 = vfmaq_f32(cq04, aq2, bq04); + cq05 = vfmaq_f32(cq05, aq2, bq05); + cq06 = vfmaq_f32(cq06, aq2, bq06); + } + cq01 = vaddq_f32(cq01, cq07); cq02 = vaddq_f32(cq02, cq08); + cq03 = vaddq_f32(cq03, cq09); cq04 = vaddq_f32(cq04, cq10); + cq05 = vaddq_f32(cq05, cq11); cq06 = vaddq_f32(cq06, cq12); + cq01 = vaddq_f32(cq01, cq04); cq02 = vaddq_f32(cq02, cq05); + cq03 = vaddq_f32(cq03, cq06); + + if (k_left > 3) { + aq1 = vld1q_f32(a_ptr); a_ptr += 4; + bq01 = vld1q_f32(sb_ptr); bq02 = vld1q_f32(sb_ptr + 4); + bq03 = vld1q_f32(sb_ptr + 8); sb_ptr += 12; + cq01 = vfmaq_f32(cq01, aq1, bq01); k_left -= 4; + cq02 = vfmaq_f32(cq02, aq1, bq02); + cq03 = vfmaq_f32(cq03, aq1, bq03); + } + float32x2_t cd1 = vadd_f32(vget_low_f32(cq01), vget_high_f32(cq01)); + float32x2_t cd2 = vadd_f32(vget_low_f32(cq02), vget_high_f32(cq02)); + float32x2_t cd3 = vadd_f32(vget_low_f32(cq03), vget_high_f32(cq03)); + if (k_left > 1) { + float32x2_t ad1 = vld1_f32(a_ptr); a_ptr += 2; + float32x2_t bd1 = vld1_f32(sb_ptr); + float32x2_t bd2 = vld1_f32(sb_ptr + 2); + float32x2_t bd3 = vld1_f32(sb_ptr + 4); sb_ptr += 6; + cd1 = vfma_f32(cd1, ad1, bd1); k_left -= 2; + cd2 = vfma_f32(cd2, ad1, bd2); + cd3 = vfma_f32(cd3, ad1, bd3); + } + float cs1 = vget_lane_f32(cd1, 0) + vget_lane_f32(cd1, 1); + float cs2 = vget_lane_f32(cd2, 0) + vget_lane_f32(cd2, 1); + float cs3 = vget_lane_f32(cd3, 0) + vget_lane_f32(cd3, 1); + if (k_left > 0) { + float as1 = *a_ptr++; + cs1 += as1 * sb_ptr[0]; + cs2 += as1 * sb_ptr[1]; + cs3 += as1 * sb_ptr[2]; + } + + if (c_rowmajor) { + c_ptr[0] = c_ptr[0] * beta + cs1; + c_ptr[1] = c_ptr[1] * beta + cs2; + c_ptr[2] = c_ptr[2] * beta + cs3; + } else { + c_ptr[0] = c_ptr[0] * beta + cs1; + c_ptr[LDM] = c_ptr[LDM] * beta + cs2; + c_ptr[LDM * 2] = c_ptr[LDM * 2] * beta + cs3; + } +} + +#define 
DEFAULT_SGEMV1_THRESH_K_UNROLL_M 512 +#define DEFAULT_SGEMV1_THRESH_DETECT_CPU 30000 + +static inline bool unroll_test_m4n1(uint32_t M, uint32_t K) { + unsigned char cpu_type = 0, cpu_id = 0; + uint32_t gemv1_thresh_k_unroll_m = DEFAULT_SGEMV1_THRESH_K_UNROLL_M; + if ((uint64_t)M * (uint64_t)K > DEFAULT_SGEMV1_THRESH_DETECT_CPU) { + cpu_id = sched_getcpu(); + cpu_type = blas_arm_get_cpu_type(cpu_id); + /* Based on a number of BLAS tests, + * unrolling M on Cortex-A55 degrades performance in all cases */ + /* Unrolling M on other ARM cores can improve performance when K is small */ + gemv1_thresh_k_unroll_m = cpu_type == 55 ? + 0 : DEFAULT_SGEMV1_THRESH_K_UNROLL_M; + } + return K <= gemv1_thresh_k_unroll_m; +} + +static inline bool unroll_test_m1n1(uint32_t M, uint32_t K) { + return true; +} + +static inline bool unroll_test_m4n2(uint32_t M, uint32_t K) { + return unroll_test_m4n1(M, K); +} + +static inline bool unroll_test_m1n2(uint32_t M, uint32_t K) { + return true; +} + +static inline bool unroll_test_m4n3(uint32_t M, uint32_t K) { + unsigned char cpu_type = 0, cpu_id = 0; + if ((uint64_t)M * (uint64_t)K > DEFAULT_SGEMV1_THRESH_DETECT_CPU) { + cpu_id = sched_getcpu(); + cpu_type = blas_arm_get_cpu_type(cpu_id); + if (cpu_type == 53 || cpu_type == 35) { + return true; + } + return false; + } + return false; +} + +static inline bool unroll_test_m1n3(uint32_t M, uint32_t K) { + return true; +} + +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(sgemm, 1, 7, 5, 32768, float, float, unroll_test) +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(sgemm, 2, 7, 5, 32768, float, float, unroll_test) +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(sgemm, 3, 7, 5, 32768, float, float, unroll_test) + +#define SGEMM_SKINNY1_FUNC_TEMPLATE(ndim) \ +void sgemm_arowmajor_bskinny_afloat_bfloat_n##ndim(\ + const float *A, const float *B, float *C,\ + uint32_t M, uint32_t K, uint8_t b_c_order, float beta) {\ +\ + unsigned char cpu_type = 0;\ + if ((uint64_t)M * (uint64_t)K * ndim > \ + DEFAULT_SGEMV1_THRESH_DETECT_CPU << 3) {\ + unsigned char cpu_id = sched_getcpu();\ + cpu_type = blas_arm_get_cpu_type(cpu_id);\ + }\ +\ + const uint32_t LDB = (b_c_order & 1) ? ndim : K;\ + const uint32_t LDC = (b_c_order & 2) ? ndim : M;\ + if (cpu_type == 35) {\ + sgemm_skinny1_arowmajor_n##ndim##_a35(A, B, C, M, K, K, LDB, LDC,\ + b_c_order, beta);\ + } else if (cpu_type == 53 || cpu_type == 55) {\ + sgemm_skinny1_arowmajor_n##ndim##_a53(A, B, C, M, K, K, LDB, LDC,\ + b_c_order, beta);\ + } else {\ + sgemm_skinny1_arowmajor_n##ndim##_a7x(A, B, C, M, K, K, LDB, LDC,\ + b_c_order, beta);\ + }\ +}\ +\ +void sgemm_arowmajor_bskinny_afloat_bfloat_n##ndim##_omp(const float *A,\ + const float *B, float *C,\ + uint32_t M, uint32_t K, uint8_t b_c_order,\ + float beta, uint32_t num_threads) {\ +\ + const uint32_t LDC = (b_c_order & 2) ? ndim : M;\ + if (num_threads <= 1) {\ + sgemm_arowmajor_bskinny_afloat_bfloat_n##ndim(A, B, C, M, K,\ + b_c_order, beta);\ + return;\ + }\ +\ + unsigned char cpu_type = 0;\ + if ((uint64_t)M * (uint64_t)K * ndim > \ + DEFAULT_SGEMV1_THRESH_DETECT_CPU << 3) {\ + unsigned char cpu_id = sched_getcpu();\ + cpu_type = blas_arm_get_cpu_type(cpu_id);\ + }\ +\ + const uint32_t LDB = (b_c_order & 1) ? 
ndim : K;\ + if (cpu_type == 35) {\ + sgemm_skinny1_arowmajor_n##ndim##_a35_omp(A, B, C, M, K, K,\ + LDB, LDC, b_c_order, beta, num_threads);\ + } else if (cpu_type == 53 || cpu_type == 55) {\ + sgemm_skinny1_arowmajor_n##ndim##_a53_omp(A, B, C, M, K, K,\ + LDB, LDC, b_c_order, beta, num_threads);\ + } else {\ + sgemm_skinny1_arowmajor_n##ndim##_a7x_omp(A, B, C, M, K, K,\ + LDB, LDC, b_c_order, beta, num_threads);\ + }\ +} + +SGEMM_SKINNY1_FUNC_TEMPLATE(4) +SGEMM_SKINNY1_FUNC_TEMPLATE(5) +SGEMM_SKINNY1_FUNC_TEMPLATE(6) +SGEMM_SKINNY1_FUNC_TEMPLATE(7) +SGEMM_SKINNY1_FUNC_TEMPLATE(8) +SGEMM_SKINNY1_FUNC_TEMPLATE(9) +SGEMM_SKINNY1_FUNC_TEMPLATE(10) +SGEMM_SKINNY1_FUNC_TEMPLATE(11) +SGEMM_SKINNY1_FUNC_TEMPLATE(12) +SGEMM_SKINNY1_FUNC_TEMPLATE(13) +SGEMM_SKINNY1_FUNC_TEMPLATE(14) +SGEMM_SKINNY1_FUNC_TEMPLATE(15) +SGEMM_SKINNY1_FUNC_TEMPLATE(16) +SGEMM_SKINNY1_FUNC_TEMPLATE(17) +SGEMM_SKINNY1_FUNC_TEMPLATE(18) +SGEMM_SKINNY1_FUNC_TEMPLATE(19) +SGEMM_SKINNY1_FUNC_TEMPLATE(20) +SGEMM_SKINNY1_FUNC_TEMPLATE(21) +SGEMM_SKINNY1_FUNC_TEMPLATE(22) +SGEMM_SKINNY1_FUNC_TEMPLATE(23) +SGEMM_SKINNY1_FUNC_TEMPLATE(24) +SGEMM_SKINNY1_FUNC_TEMPLATE(25) +SGEMM_SKINNY1_FUNC_TEMPLATE(26) +SGEMM_SKINNY1_FUNC_TEMPLATE(27) +SGEMM_SKINNY1_FUNC_TEMPLATE(28) +SGEMM_SKINNY1_FUNC_TEMPLATE(29) +SGEMM_SKINNY1_FUNC_TEMPLATE(30) +SGEMM_SKINNY1_FUNC_TEMPLATE(31) +SGEMM_SKINNY1_FUNC_TEMPLATE(32) +SGEMM_SKINNY1_FUNC_TEMPLATE(33) +SGEMM_SKINNY1_FUNC_TEMPLATE(34) +SGEMM_SKINNY1_FUNC_TEMPLATE(35) +SGEMM_SKINNY1_FUNC_TEMPLATE(36) +SGEMM_SKINNY1_FUNC_TEMPLATE(37) +SGEMM_SKINNY1_FUNC_TEMPLATE(38) +SGEMM_SKINNY1_FUNC_TEMPLATE(39) +SGEMM_SKINNY1_FUNC_TEMPLATE(40) +SGEMM_SKINNY1_FUNC_TEMPLATE(41) +SGEMM_SKINNY1_FUNC_TEMPLATE(42) +SGEMM_SKINNY1_FUNC_TEMPLATE(43) +SGEMM_SKINNY1_FUNC_TEMPLATE(44) +SGEMM_SKINNY1_FUNC_TEMPLATE(45) +SGEMM_SKINNY1_FUNC_TEMPLATE(46) +SGEMM_SKINNY1_FUNC_TEMPLATE(47) +SGEMM_SKINNY1_FUNC_TEMPLATE(48) +SGEMM_SKINNY1_FUNC_TEMPLATE(49) +SGEMM_SKINNY1_FUNC_TEMPLATE(50) + diff --git a/src/neon_armv8a/SgemmSkinnyGer.c b/src/neon_armv8a/SgemmSkinnyGer.c new file mode 100644 index 0000000..0d64809 --- /dev/null +++ b/src/neon_armv8a/SgemmSkinnyGer.c @@ -0,0 +1,283 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/
+/*****************************************************************************/
+
+
+#include "arm_neon/ARMCompareAndSwap.h"
+#include "common/CommonSkinnyGer.h"
+
+#include <arm_neon.h>
+
+typedef float sgemm_skinnyger_ascalar;
+typedef float sgemm_skinnyger_bscalar;
+typedef float sgemm_skinnyger_cscalar;
+
+typedef float sgemm_skinnyger_avec1;
+typedef float sgemm_skinnyger_bvec1;
+typedef float sgemm_skinnyger_cvec1;
+
+typedef float32x2_t sgemm_skinnyger_avec2;
+typedef float32x2_t sgemm_skinnyger_bvec2;
+typedef float32x2_t sgemm_skinnyger_cvec2;
+
+typedef float32x4_t sgemm_skinnyger_avec4;
+typedef float32x4_t sgemm_skinnyger_bvec4;
+typedef float32x4_t sgemm_skinnyger_cvec4;
+
+typedef float32x4x2_t sgemm_skinnyger_avec8;
+typedef float32x4x2_t sgemm_skinnyger_bvec8;
+typedef float32x4x2_t sgemm_skinnyger_cvec8;
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 4, 1) {
+  float32x4x2_t ret;
+  ret.val[0] = vfmaq_laneq_f32(c_vec.val[0], a_vec.val[0], b_vec, 0);
+  ret.val[1] = vfmaq_laneq_f32(c_vec.val[1], a_vec.val[1], b_vec, 0);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 4, 2) {
+  float32x4x2_t ret;
+  ret.val[0] = vfmaq_laneq_f32(c_vec.val[0], a_vec.val[0], b_vec, 1);
+  ret.val[1] = vfmaq_laneq_f32(c_vec.val[1], a_vec.val[1], b_vec, 1);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 4, 3) {
+  float32x4x2_t ret;
+  ret.val[0] = vfmaq_laneq_f32(c_vec.val[0], a_vec.val[0], b_vec, 2);
+  ret.val[1] = vfmaq_laneq_f32(c_vec.val[1], a_vec.val[1], b_vec, 2);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 4, 4) {
+  float32x4x2_t ret;
+  ret.val[0] = vfmaq_laneq_f32(c_vec.val[0], a_vec.val[0], b_vec, 3);
+  ret.val[1] = vfmaq_laneq_f32(c_vec.val[1], a_vec.val[1], b_vec, 3);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 2, 1) {
+  float32x4x2_t ret;
+  ret.val[0] = vfmaq_lane_f32(c_vec.val[0], a_vec.val[0], b_vec, 0);
+  ret.val[1] = vfmaq_lane_f32(c_vec.val[1], a_vec.val[1], b_vec, 0);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 2, 2) {
+  float32x4x2_t ret;
+  ret.val[0] = vfmaq_lane_f32(c_vec.val[0], a_vec.val[0], b_vec, 1);
+  ret.val[1] = vfmaq_lane_f32(c_vec.val[1], a_vec.val[1], b_vec, 1);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 8, 1, 1) {
+  float32x4x2_t ret;
+  ret.val[0] = vfmaq_n_f32(c_vec.val[0], a_vec.val[0], b_vec);
+  ret.val[1] = vfmaq_n_f32(c_vec.val[1], a_vec.val[1], b_vec);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 4, 1) {
+  return vfmaq_laneq_f32(c_vec, a_vec, b_vec, 0);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 4, 2) {
+  return vfmaq_laneq_f32(c_vec, a_vec, b_vec, 1);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 4, 3) {
+  return vfmaq_laneq_f32(c_vec, a_vec, b_vec, 2);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 4, 4) {
+  return vfmaq_laneq_f32(c_vec, a_vec, b_vec, 3);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 2, 1) {
+  return vfmaq_lane_f32(c_vec, a_vec, b_vec, 0);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 2, 2) {
+  return vfmaq_lane_f32(c_vec, a_vec, b_vec, 1);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 4, 1, 1) {
+  return vfmaq_n_f32(c_vec, a_vec, b_vec);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 4, 1) {
+  return vfma_laneq_f32(c_vec, a_vec, b_vec, 0);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 4, 2) {
+  return vfma_laneq_f32(c_vec, a_vec, b_vec, 1);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 4, 3) {
+  return vfma_laneq_f32(c_vec, a_vec, b_vec, 2);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 4, 4) {
+  return vfma_laneq_f32(c_vec, a_vec, b_vec, 3);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 2, 1) {
+  return vfma_lane_f32(c_vec, a_vec, 
b_vec, 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 2, 2) { + return vfma_lane_f32(c_vec, a_vec, b_vec, 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 2, 1, 1) { + return vfma_n_f32(c_vec, a_vec, b_vec); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 4, 1) { + return vfmas_laneq_f32(c_vec, a_vec, b_vec, 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 4, 2) { + return vfmas_laneq_f32(c_vec, a_vec, b_vec, 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 4, 3) { + return vfmas_laneq_f32(c_vec, a_vec, b_vec, 2); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 4, 4) { + return vfmas_laneq_f32(c_vec, a_vec, b_vec, 3); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 2, 1) { + return vfmas_lane_f32(c_vec, a_vec, b_vec, 0); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 2, 2) { + return vfmas_lane_f32(c_vec, a_vec, b_vec, 1); +} + +GEMM_SKINNY_GER_CALC_UNIT(sgemm, 1, 1, 1) { + return a_vec * b_vec + c_vec; +} + +GEMM_SKINNY_GER_LOADA_UNIT(sgemm, 8) { + float32x4x2_t ret; + ret.val[0] = vld1q_f32(a_ptr); + ret.val[1] = vld1q_f32(a_ptr + 4); + __asm__("prfm pldl1keep,[%0,#96]"::"r"(a_ptr):); + return ret; +} + +GEMM_SKINNY_GER_LOADA_UNIT(sgemm, 4) { + __asm__("prfm pldl1keep,[%0,#80]"::"r"(a_ptr):); + return vld1q_f32(a_ptr); +} + +GEMM_SKINNY_GER_LOADA_UNIT(sgemm, 2) { + return vld1_f32(a_ptr); +} + +GEMM_SKINNY_GER_LOADA_UNIT(sgemm, 1) { + return *a_ptr; +} + +GEMM_SKINNY_GER_LOADC_UNIT(sgemm, 8) { + float32x4x2_t ret; + ret.val[0] = vld1q_f32(c_ptr); + ret.val[1] = vld1q_f32(c_ptr + 4); + return ret; +} + +GEMM_SKINNY_GER_LOADC_UNIT(sgemm, 4) { + return vld1q_f32(c_ptr); +} + +GEMM_SKINNY_GER_LOADC_UNIT(sgemm, 2) { + return vld1_f32(c_ptr); +} + +GEMM_SKINNY_GER_LOADC_UNIT(sgemm, 1) { + return *c_ptr; +} + +GEMM_SKINNY_GER_STOREC_UNIT(sgemm, 8) { + vst1q_f32(c_ptr, c_vec.val[0]); + vst1q_f32(c_ptr + 4, c_vec.val[1]); +} + +GEMM_SKINNY_GER_STOREC_UNIT(sgemm, 4) { + vst1q_f32(c_ptr, c_vec); +} + +GEMM_SKINNY_GER_STOREC_UNIT(sgemm, 2) { + vst1_f32(c_ptr, c_vec); +} + +GEMM_SKINNY_GER_STOREC_UNIT(sgemm, 1) { + *c_ptr = c_vec; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(sgemm, 4) { + float32x4_t ret = vdupq_n_f32(0); + float b1 = *b_ptr; b_ptr += ldb; + float b2 = *b_ptr; b_ptr += ldb; + float b3 = *b_ptr; b_ptr += ldb; + float b4 = *b_ptr; + ret = vsetq_lane_f32(b1, ret, 0); + ret = vsetq_lane_f32(b2, ret, 1); + ret = vsetq_lane_f32(b3, ret, 2); + ret = vsetq_lane_f32(b4, ret, 3); + return ret; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(sgemm, 2) { + float32x2_t ret = vdup_n_f32(0); + float b1 = *b_ptr; + float b2 = b_ptr[ldb]; + ret = vset_lane_f32(b1, ret, 0); + ret = vset_lane_f32(b2, ret, 1); + return ret; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(sgemm, 1) { + return *b_ptr; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(sgemm, 4) { + return vld1q_f32(b_ptr); +} + +GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(sgemm, 2) { + return vld1_f32(b_ptr); +} + +GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(sgemm, 1) { + return *b_ptr; +} + +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 1, 7, 15, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 2, 7, 15, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 3, 7, 15, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 4, 7, 15, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 5, 7, 15, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 6, 7, 15, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 7, 7, 15, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 8, 7, 7, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 9, 7, 7, 8192, float, float) 
+GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 10, 7, 7, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 11, 7, 7, 8192, float, float) +GEMM_SKINNY_GER_PARALLEL_FUNC(sgemm, 12, 7, 7, 8192, float, float) + diff --git a/src/neon_armv8a/U8U32DotGemmDriver.c b/src/neon_armv8a/U8U32DotGemmDriver.c new file mode 100644 index 0000000..b08fd5c --- /dev/null +++ b/src/neon_armv8a/U8U32DotGemmDriver.c @@ -0,0 +1,36 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv8a/U8U32DotGemmCopy.h" +#include "neon_armv8a/U8U32DotGemmKernel.h" +#include "neon_armv8a/U8U32DotGemmSkinnyDot.h" +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonDriver.h" + +#ifdef SCRATCH_K_CORD +#undef SCRATCH_K_CORD +#define SCRATCH_K_CORD(k) ((k) >> 2) +#endif + +#ifdef GEMM_D_K +#undef GEMM_D_K +#define GEMM_D_K 768 +#endif + +GEMM_PARALLEL_FUNC(u8u32dotgemm, uint8_t, uint32_t, uint8_t, uint32_t, uint32_t, + 8, 12, 12, 12, 0, 0) + diff --git a/src/neon_armv8a/U8U32GemmDriver.c b/src/neon_armv8a/U8U32GemmDriver.c new file mode 100644 index 0000000..88db447 --- /dev/null +++ b/src/neon_armv8a/U8U32GemmDriver.c @@ -0,0 +1,48 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include "neon_armv8a/U8U32MlaGemmDriver.h" +#include "neon_armv8a/U8U32DotGemmDriver.h" +#include "arm_neon/ARMCpuType.h" + +int u8u32gemm_serial(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, uint32_t beta_inp) { + + if (blas_arm_get_i8i32_support() == 2) { + return u8u32dotgemm_serial(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp); + } else { + return u8u32mlagemm_serial(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp); + } +} + +int u8u32gemm(int a_rowmajor, int b_rowmajor, + const uint8_t *A, const uint8_t *B, + uint32_t *C, uint32_t M, uint32_t N, uint32_t K, + uint32_t beta_inp, uint32_t num_threads) { + + if (blas_arm_get_i8i32_support() == 2) { + return u8u32dotgemm(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp, num_threads); + } else { + return u8u32mlagemm(a_rowmajor, b_rowmajor, A, B, C, + M, N, K, beta_inp, num_threads); + } +} + diff --git a/src/neon_armv8a/U8U32MlaGemmCopy.c b/src/neon_armv8a/U8U32MlaGemmCopy.c new file mode 100644 index 0000000..a27d2c0 --- /dev/null +++ b/src/neon_armv8a/U8U32MlaGemmCopy.c @@ -0,0 +1,30 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifndef GEMM_UNSIGNED_INT +#define GEMM_UNSIGNED_INT +#endif + +#include "common/CommonCopy.h" +#include "arm_neon/NeonI8I32MlaGemmCopy.h" + +GENERIC_NCOPY_FUNC(u8u32mlagemm, uint8_t, uint16_t, 8) +GENERIC_NCOPY_FUNC(u8u32mlagemm, uint8_t, uint16_t, 12) + +GENERIC_TCOPY_FUNC(u8u32mlagemm, uint8_t, uint16_t, 8) +GENERIC_TCOPY_FUNC(u8u32mlagemm, uint8_t, uint16_t, 12) + diff --git a/src/neon_armv8a/U8U32MlaGemmDriver.c b/src/neon_armv8a/U8U32MlaGemmDriver.c new file mode 100644 index 0000000..f2a562c --- /dev/null +++ b/src/neon_armv8a/U8U32MlaGemmDriver.c @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include "neon_armv8a/U8U32MlaGemmCopy.h" +#include "neon_armv8a/U8U32MlaGemmKernel.h" +#include "neon_armv8a/U8U32MlaGemmSkinnyGer.h" +#include "neon_armv8a/U8U32MlaGemmSkinnyDot.h" +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonDriver.h" + +GEMM_PARALLEL_FUNC(u8u32mlagemm, uint8_t, uint16_t, uint8_t, uint16_t, uint32_t, + 8, 12, 8, 8, 8, 8) + diff --git a/src/neon_armv8a/U8U32MlaGemmKernel.c b/src/neon_armv8a/U8U32MlaGemmKernel.c new file mode 100644 index 0000000..b9f3cd4 --- /dev/null +++ b/src/neon_armv8a/U8U32MlaGemmKernel.c @@ -0,0 +1,27 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifndef GEMM_UNSIGNED_INT +#define GEMM_UNSIGNED_INT +#endif + +#include "common/CommonKernel.h" +#include "neon_armv8a/I8I32MlaGemmKernel.h" + +DUALPACK_KERNEL_FUNC_LM(u8u32mlagemm, uint16_t, uint16_t, uint32_t, 8, 12) +DUALPACK_KERNEL_FUNC_LN(u8u32mlagemm, uint16_t, uint16_t, uint32_t, 12, 8) + diff --git a/src/neon_armv8a/U8U32MlaGemmSkinnyDot.c b/src/neon_armv8a/U8U32MlaGemmSkinnyDot.c new file mode 100644 index 0000000..4cb495e --- /dev/null +++ b/src/neon_armv8a/U8U32MlaGemmSkinnyDot.c @@ -0,0 +1,34 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#ifndef GEMM_UNSIGNED_INT +#define GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "neon_armv8a/I8I32MlaGemmSkinnyDot.h" +#include "common/CommonSkinnyDot.h" + +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(u8u32mlagemm, 1, 31, 5, 131072, uint8_t, uint8_t, unroll_test) +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(u8u32mlagemm, 2, 31, 5, 131072, uint8_t, uint8_t, unroll_test) +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(u8u32mlagemm, 3, 31, 5, 131072, uint8_t, uint8_t, unroll_test) + +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32mlagemm, 4, 15, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32mlagemm, 5, 15, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32mlagemm, 6, 15, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32mlagemm, 7, 15, 3, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32mlagemm, 8, 15, 3, 131072, uint8_t, uint8_t) \ No newline at end of file diff --git a/src/neon_armv8a/U8U32MlaGemmSkinnyGer.c b/src/neon_armv8a/U8U32MlaGemmSkinnyGer.c new file mode 100644 index 0000000..fc9948c --- /dev/null +++ b/src/neon_armv8a/U8U32MlaGemmSkinnyGer.c @@ -0,0 +1,32 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifndef GEMM_UNSIGNED_INT +#define GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "arm_neon/NeonI8I32MlaGemmSkinnyGer.h" + +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 1, 5, 29, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 2, 5, 29, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 3, 5, 29, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 4, 5, 29, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 5, 5, 13, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 6, 5, 13, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 7, 5, 13, 8192, uint8_t, uint8_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(u8u32mlagemm, 8, 5, 13, 8192, uint8_t, uint8_t) diff --git a/src/neon_armv8a/extension/HgemmCopy.c b/src/neon_armv8a/extension/HgemmCopy.c new file mode 100644 index 0000000..d31fd58 --- /dev/null +++ b/src/neon_armv8a/extension/HgemmCopy.c @@ -0,0 +1,111 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/
+/* You may obtain a copy of the License at                                  */
+/*                                                                          */
+/*     http://www.apache.org/licenses/LICENSE-2.0                           */
+/*                                                                          */
+/* Unless required by applicable law or agreed to in writing, software      */
+/* distributed under the License is distributed on an "AS IS" BASIS,        */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and      */
+/* limitations under the License.                                           */
+/*****************************************************************************/
+
+
+#include "common/CommonCopy.h"
+#include <arm_neon.h>
+
+/* Prefetch the cache line 64 bytes ahead of the current packing position. */
+static inline void pref_ab(const float16_t *dat) {
+  __asm__ ("prfm pldl1keep,[%0,#64]\n\t"::"r"(dat):);
+}
+
+/* Pack 4 source columns in parallel, 8 rows per iteration: load 8 fp16
+ * values from each column and scatter them with vst4q_lane_f16 so the 4
+ * values of one k-step sit together, "inc" elements from the next k-step. */
+#define NCOPY_NEON_LOOP_K8_UNROLL4(inc, dst_ptr, src1, src2, src3, src4) \
+  for (dim1_count = dim1_cache; dim1_count > 7; dim1_count -= 8) {\
+    float16x8x4_t t1;\
+    t1.val[0] = vld1q_f16(src1); src1 += 8; pref_ab(src1);\
+    t1.val[1] = vld1q_f16(src2); src2 += 8; pref_ab(src2);\
+    t1.val[2] = vld1q_f16(src3); src3 += 8; pref_ab(src3);\
+    t1.val[3] = vld1q_f16(src4); src4 += 8; pref_ab(src4);\
+    vst4q_lane_f16(dst_ptr, t1, 0);\
+    vst4q_lane_f16(dst_ptr + inc, t1, 1);\
+    vst4q_lane_f16(dst_ptr + inc * 2, t1, 2);\
+    vst4q_lane_f16(dst_ptr + inc * 3, t1, 3);\
+    vst4q_lane_f16(dst_ptr + inc * 4, t1, 4);\
+    vst4q_lane_f16(dst_ptr + inc * 5, t1, 5);\
+    vst4q_lane_f16(dst_ptr + inc * 6, t1, 6);\
+    vst4q_lane_f16(dst_ptr + inc * 7, t1, 7);\
+    dst_ptr += inc * 8;\
+  }
+
+#define NCOPY_UNROLL_16 {\
+  float16_t *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\
+  NCOPY_NEON_LOOP_K8_UNROLL4(16, dst_h1, src1, src2, src3, src4)\
+  dst_h1 = dst1 + 4;\
+  NCOPY_NEON_LOOP_K8_UNROLL4(16, dst_h1, src5, src6, src7, src8)\
+  dst_h1 = dst1 + 8;\
+  NCOPY_NEON_LOOP_K8_UNROLL4(16, dst_h1, src9, src10, src11, src12)\
+  dst_h1 = dst1 + 12;\
+  NCOPY_NEON_LOOP_K8_UNROLL4(16, dst_h1, src13, src14, src15, src16)\
+  dst1 = dst_h1 - 12;\
+  NCOPY_STD(16)\
+}
+
+#define NCOPY_UNROLL_8 {\
+  float16_t *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\
+  NCOPY_NEON_LOOP_K8_UNROLL4(8, dst_h1, src1, src2, src3, src4)\
+  dst_h1 = dst1 + 4;\
+  NCOPY_NEON_LOOP_K8_UNROLL4(8, dst_h1, src5, src6, src7, src8)\
+  dst1 = dst_h1 - 4;\
+  NCOPY_STD(8)\
+}
+
+#define NCOPY_UNROLL_4 {\
+  float16_t *dst_h1 = dst1; uint32_t dim1_cache = dim1_count;\
+  NCOPY_NEON_LOOP_K8_UNROLL4(4, dst_h1, src1, src2, src3, src4)\
+  dst1 = dst_h1;\
+  NCOPY_STD(4)\
+}
+
+#define NCOPY_UNROLL_2 NCOPY_STD(2)
+#define NCOPY_UNROLL_1 NCOPY_STD(1)
+
+#define NCOPY_float16_t_float16_t(unroll) NCOPY_UNROLL_##unroll
+
+
+/* TCOPY: the source is already contiguous in the packed direction, so
+ * each unit is a plain vector (or scalar) copy plus a prefetch. */
+#define TCOPY_UNIT_1(src_ptr, dst_ptr, dst_offset) \
+  dst_ptr[dst_offset] = *src_ptr;
+
+#define TCOPY_UNIT_2(src_ptr, dst_ptr, dst_offset) {\
+  dst_ptr[dst_offset] = *src_ptr;\
+  dst_ptr[dst_offset + 1] = src_ptr[1];\
+}
+
+#define TCOPY_UNIT_4(src_ptr, dst_ptr, dst_offset) {\
+  float16x4_t tmp = vld1_f16(src_ptr); pref_ab(src_ptr + 4);\
+  vst1_f16(dst_ptr + dst_offset, tmp);\
+}
+
+#define TCOPY_UNIT_8(src_ptr, dst_ptr, dst_offset) {\
+  float16x8_t tmp = vld1q_f16(src_ptr); pref_ab(src_ptr + 8);\
+  vst1q_f16(dst_ptr + dst_offset, tmp);\
+}
+
+#define TCOPY_UNIT_16(src_ptr, dst_ptr, dst_offset) {\
+  float16x8_t tmp1 = vld1q_f16(src_ptr);\
+  float16x8_t tmp2 = vld1q_f16(src_ptr + 8); pref_ab(src_ptr + 16);\
+  vst1q_f16(dst_ptr + dst_offset, tmp1);\
+  vst1q_f16(dst_ptr + dst_offset + 8, tmp2);\
+}
+
+#define TCOPY_UNIT_float16_t_float16_t(src_ptr, dst_ptr, dst_offset, num_elements) \
+  TCOPY_UNIT_##num_elements(src_ptr, dst_ptr, dst_offset)
+
+GENERIC_NCOPY_FUNC(hgemm, float16_t, float16_t, 8)
+GENERIC_NCOPY_FUNC(hgemm, float16_t, float16_t, 16)
+
+GENERIC_TCOPY_FUNC(hgemm, float16_t, float16_t, 8)
+GENERIC_TCOPY_FUNC(hgemm, float16_t, float16_t, 16)
+
diff --git a/src/neon_armv8a/extension/HgemmKernel.c b/src/neon_armv8a/extension/HgemmKernel.c
new file mode 100644
index 0000000..b7cb731
--- /dev/null
+++ b/src/neon_armv8a/extension/HgemmKernel.c
@@ -0,0 +1,1420 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                          */
+/* Licensed under the Apache License, Version 2.0 (the "License");          */
+/* you may not use this file except in compliance with the License.         */
+/* You may obtain a copy of the License at                                  */
+/*                                                                          */
+/*     http://www.apache.org/licenses/LICENSE-2.0                           */
+/*                                                                          */
+/* Unless required by applicable law or agreed to in writing, software      */
+/* distributed under the License is distributed on an "AS IS" BASIS,        */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and      */
+/* limitations under the License.                                           */
+/*****************************************************************************/
+
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "common/CommonKernel.h"
+#include "arm_neon/ARMCpuType.h"
+#include <arm_neon.h>
+#include <sched.h>
+
+/* Prefetch a line of the output matrix C for store. */
+static inline void pref_c(float16_t *dat) {
+  __asm__ ("prfm pstl1keep,[%0]\n\t"::"r"(dat):);
+}
+
+/* Prefetch the C tile touched by the kernel, one column (ldc apart) per step. */
+#define PREF_N1 pref_c(c_pref); c_pref += ldc;
+#define PREF_N2 PREF_N1 PREF_N1
+#define PREF_N4 PREF_N2 PREF_N2
+#define PREF_N8 PREF_N4 PREF_N4
+#define PREF_N16 PREF_N8 PREF_N8
+
+#define DECLARE_C_8X8 \
+  float16x8_t cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;
+
+#define DECLARE_C_8X16 DECLARE_C_8X8 \
+  float16x8_t cq09, cq10, cq11, cq12, cq13, cq14, cq15, cq16;
+
+/* fp16 FMA kernel for general out-of-order ARM cores (e.g. Cortex-A76): */
+/* q0 and q1 hold data from matrix A, */
+/* q2 and q3 hold data from matrix B. */
+#define KERNEL_M8N16_A76 \
+  DECLARE_C_8X16\
+  float16_t *c_pref = c_ptr + 7; PREF_N16\
+  const float16_t *a_ptr = a_head;\
+  const float16_t *b_ptr1 = b_head;\
+  uint32_t k_left = K;\
+  __asm__ __volatile__(\
+    "movi %0.16b,#0; movi %1.16b,#0\n\t"\
+    "mov %2.16b,%0.16b; mov %3.16b,%1.16b\n\t"\
+    "mov %4.16b,%0.16b; mov %5.16b,%1.16b\n\t"\
+    "mov %6.16b,%0.16b; mov %7.16b,%1.16b\n\t"\
+    "mov %8.16b,%0.16b; mov %9.16b,%1.16b\n\t"\
+    "mov %10.16b,%0.16b; mov %11.16b,%1.16b\n\t"\
+    "mov %12.16b,%0.16b; mov %13.16b,%1.16b\n\t"\
+    "mov %14.16b,%0.16b; mov %15.16b,%1.16b\n\t"\
+    "cmp %w16,#0; b.eq 004f\n\t"\
+    "ldr q0,[%17],#16; ldr q2,[%18],#16; ldr q3,[%18],#16\n\t"\
+    "cmp %w16,#2; b.le 002f\n\t"\
+    "001:\n\t"\
+    "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\
+    "ldr q1,[%17],#32\n\t"\
+    "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\
+    "fmla %4.8h,v0.8h,v2.h[4]; fmla %5.8h,v0.8h,v2.h[5]\n\t"\
+    "prfm pldl1keep,[%17,#128]\n\t"\
+    "fmla %6.8h,v0.8h,v2.h[6]; fmla %7.8h,v0.8h,v2.h[7]\n\t"\
+    "ldr q2,[%18],#64\n\t"\
+    "fmla %8.8h,v0.8h,v3.h[0]; fmla %9.8h,v0.8h,v3.h[1]\n\t"\
+    "fmla %10.8h,v0.8h,v3.h[2]; fmla %11.8h,v0.8h,v3.h[3]\n\t"\
+    "sub %w16,%w16,#2\n\t"\
+    "fmla %12.8h,v0.8h,v3.h[4]; fmla %13.8h,v0.8h,v3.h[5]\n\t"\
+    "fmla %14.8h,v0.8h,v3.h[6]; fmla %15.8h,v0.8h,v3.h[7]\n\t"\
+    "ldr q3,[%18,#-48]\n\t"\
+    "fmla %0.8h,v1.8h,v2.h[0]; fmla %1.8h,v1.8h,v2.h[1]\n\t"\
+    "ldr q0,[%17,#-16]\n\t"\
+    "fmla %2.8h,v1.8h,v2.h[2]; fmla %3.8h,v1.8h,v2.h[3]\n\t"\
+    "fmla %4.8h,v1.8h,v2.h[4]; fmla %5.8h,v1.8h,v2.h[5]\n\t"\
+    "cmp %w16,#2\n\t"\
+
"fmla %6.8h,v1.8h,v2.h[6]; fmla %7.8h,v1.8h,v2.h[7]\n\t"\ + "ldr q2,[%18,#-32]\n\t"\ + "fmla %8.8h,v1.8h,v3.h[0]; fmla %9.8h,v1.8h,v3.h[1]\n\t"\ + "fmla %10.8h,v1.8h,v3.h[2]; fmla %11.8h,v1.8h,v3.h[3]\n\t"\ + "fmla %12.8h,v1.8h,v3.h[4]; fmla %13.8h,v1.8h,v3.h[5]\n\t"\ + "fmla %14.8h,v1.8h,v3.h[6]; fmla %15.8h,v1.8h,v3.h[7]\n\t"\ + "ldr q3,[%18,#-16]; b.gt 001b\n\t"\ + "002:\n\t"\ + "cmp %w16,#2; b.ne 003f\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "ldr q1,[%17],#16\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %4.8h,v0.8h,v2.h[4]; fmla %5.8h,v0.8h,v2.h[5]\n\t"\ + "fmla %6.8h,v0.8h,v2.h[6]; fmla %7.8h,v0.8h,v2.h[7]\n\t"\ + "ldr q2,[%18],#32\n\t"\ + "fmla %8.8h,v0.8h,v3.h[0]; fmla %9.8h,v0.8h,v3.h[1]\n\t"\ + "fmla %10.8h,v0.8h,v3.h[2]; fmla %11.8h,v0.8h,v3.h[3]\n\t"\ + "sub %w16,%w16,#2\n\t"\ + "fmla %12.8h,v0.8h,v3.h[4]; fmla %13.8h,v0.8h,v3.h[5]\n\t"\ + "fmla %14.8h,v0.8h,v3.h[6]; fmla %15.8h,v0.8h,v3.h[7]\n\t"\ + "ldr q3,[%18,#-16]\n\t"\ + "fmla %0.8h,v1.8h,v2.h[0]; fmla %1.8h,v1.8h,v2.h[1]\n\t"\ + "fmla %2.8h,v1.8h,v2.h[2]; fmla %3.8h,v1.8h,v2.h[3]\n\t"\ + "fmla %4.8h,v1.8h,v2.h[4]; fmla %5.8h,v1.8h,v2.h[5]\n\t"\ + "fmla %6.8h,v1.8h,v2.h[6]; fmla %7.8h,v1.8h,v2.h[7]\n\t"\ + "fmla %8.8h,v1.8h,v3.h[0]; fmla %9.8h,v1.8h,v3.h[1]\n\t"\ + "fmla %10.8h,v1.8h,v3.h[2]; fmla %11.8h,v1.8h,v3.h[3]\n\t"\ + "fmla %12.8h,v1.8h,v3.h[4]; fmla %13.8h,v1.8h,v3.h[5]\n\t"\ + "fmla %14.8h,v1.8h,v3.h[6]; fmla %15.8h,v1.8h,v3.h[7]\n\t"\ + "b 004f\n\t"\ + "003:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %4.8h,v0.8h,v2.h[4]; fmla %5.8h,v0.8h,v2.h[5]\n\t"\ + "fmla %6.8h,v0.8h,v2.h[6]; fmla %7.8h,v0.8h,v2.h[7]\n\t"\ + "fmla %8.8h,v0.8h,v3.h[0]; fmla %9.8h,v0.8h,v3.h[1]\n\t"\ + "fmla %10.8h,v0.8h,v3.h[2]; fmla %11.8h,v0.8h,v3.h[3]\n\t"\ + "sub %w16,%w16,#1\n\t"\ + "fmla %12.8h,v0.8h,v3.h[4]; fmla %13.8h,v0.8h,v3.h[5]\n\t"\ + "fmla %14.8h,v0.8h,v3.h[6]; fmla %15.8h,v0.8h,v3.h[7]\n\t"\ + "004:\n\t"\ + :"=w"(cq01),"=w"(cq02),"=w"(cq03),"=w"(cq04)\ + ,"=w"(cq05),"=w"(cq06),"=w"(cq07),"=w"(cq08)\ + ,"=w"(cq09),"=w"(cq10),"=w"(cq11),"=w"(cq12)\ + ,"=w"(cq13),"=w"(cq14),"=w"(cq15),"=w"(cq16)\ + ,"+r"(k_left),"+r"(a_ptr),"+r"(b_ptr1)\ + ::"cc","memory","v0","v1","v2","v3"); + +#define KERNEL_M16N8_A76 \ + DECLARE_C_8X16\ + float16_t *c_pref = c_ptr + 15; PREF_N8\ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + uint32_t k_left = K;\ + __asm__ __volatile__(\ + "movi %0.16b,#0; movi %1.16b,#0\n\t"\ + "mov %2.16b,%0.16b; mov %3.16b,%1.16b\n\t"\ + "mov %4.16b,%0.16b; mov %5.16b,%1.16b\n\t"\ + "mov %6.16b,%0.16b; mov %7.16b,%1.16b\n\t"\ + "mov %8.16b,%0.16b; mov %9.16b,%1.16b\n\t"\ + "mov %10.16b,%0.16b; mov %11.16b,%1.16b\n\t"\ + "mov %12.16b,%0.16b; mov %13.16b,%1.16b\n\t"\ + "mov %14.16b,%0.16b; mov %15.16b,%1.16b\n\t"\ + "cmp %w16,#0; b.eq 004f\n\t"\ + "ldr q0,[%17],#32; ldr q2,[%18],#16; ldr q1,[%17,#-16]\n\t"\ + "cmp %w16,#2; b.le 002f\n\t"\ + "001:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %2.8h,v0.8h,v2.h[1]\n\t"\ + "ldr q3,[%18],#32\n\t"\ + "fmla %4.8h,v0.8h,v2.h[2]; fmla %6.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v2.h[4]; fmla %10.8h,v0.8h,v2.h[5]\n\t"\ + "prfm pldl1keep,[%18,#128]\n\t"\ + "fmla %12.8h,v0.8h,v2.h[6]; fmla %14.8h,v0.8h,v2.h[7]\n\t"\ + "ldr q0,[%17],#64\n\t"\ + "fmla %1.8h,v1.8h,v2.h[0]; fmla %3.8h,v1.8h,v2.h[1]\n\t"\ + "fmla %5.8h,v1.8h,v2.h[2]; fmla %7.8h,v1.8h,v2.h[3]\n\t"\ + "sub %w16,%w16,#2\n\t"\ + "fmla %9.8h,v1.8h,v2.h[4]; fmla 
%11.8h,v1.8h,v2.h[5]\n\t"\ + "fmla %13.8h,v1.8h,v2.h[6]; fmla %15.8h,v1.8h,v2.h[7]\n\t"\ + "ldr q1,[%17,#-48]\n\t"\ + "fmla %0.8h,v0.8h,v3.h[0]; fmla %2.8h,v0.8h,v3.h[1]\n\t"\ + "ldr q2,[%18,#-16]\n\t"\ + "fmla %4.8h,v0.8h,v3.h[2]; fmla %6.8h,v0.8h,v3.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v3.h[4]; fmla %10.8h,v0.8h,v3.h[5]\n\t"\ + "cmp %w16,#2\n\t"\ + "fmla %12.8h,v0.8h,v3.h[6]; fmla %14.8h,v0.8h,v3.h[7]\n\t"\ + "ldr q0,[%17,#-32]\n\t"\ + "fmla %1.8h,v1.8h,v3.h[0]; fmla %3.8h,v1.8h,v3.h[1]\n\t"\ + "fmla %5.8h,v1.8h,v3.h[2]; fmla %7.8h,v1.8h,v3.h[3]\n\t"\ + "fmla %9.8h,v1.8h,v3.h[4]; fmla %11.8h,v1.8h,v3.h[5]\n\t"\ + "fmla %13.8h,v1.8h,v3.h[6]; fmla %15.8h,v1.8h,v3.h[7]\n\t"\ + "ldr q1,[%17,#-16]; b.gt 001b\n\t"\ + "002:\n\t"\ + "cmp %w16,#2; b.ne 003f\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %2.8h,v0.8h,v2.h[1]\n\t"\ + "ldr q3,[%18],#16\n\t"\ + "fmla %4.8h,v0.8h,v2.h[2]; fmla %6.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v2.h[4]; fmla %10.8h,v0.8h,v2.h[5]\n\t"\ + "fmla %12.8h,v0.8h,v2.h[6]; fmla %14.8h,v0.8h,v2.h[7]\n\t"\ + "ldr q0,[%17],#32\n\t"\ + "fmla %1.8h,v1.8h,v2.h[0]; fmla %3.8h,v1.8h,v2.h[1]\n\t"\ + "fmla %5.8h,v1.8h,v2.h[2]; fmla %7.8h,v1.8h,v2.h[3]\n\t"\ + "sub %w16,%w16,#2\n\t"\ + "fmla %9.8h,v1.8h,v2.h[4]; fmla %11.8h,v1.8h,v2.h[5]\n\t"\ + "fmla %13.8h,v1.8h,v2.h[6]; fmla %15.8h,v1.8h,v2.h[7]\n\t"\ + "ldr q1,[%17,#-16]\n\t"\ + "fmla %0.8h,v0.8h,v3.h[0]; fmla %2.8h,v0.8h,v3.h[1]\n\t"\ + "fmla %4.8h,v0.8h,v3.h[2]; fmla %6.8h,v0.8h,v3.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v3.h[4]; fmla %10.8h,v0.8h,v3.h[5]\n\t"\ + "fmla %12.8h,v0.8h,v3.h[6]; fmla %14.8h,v0.8h,v3.h[7]\n\t"\ + "fmla %1.8h,v1.8h,v3.h[0]; fmla %3.8h,v1.8h,v3.h[1]\n\t"\ + "fmla %5.8h,v1.8h,v3.h[2]; fmla %7.8h,v1.8h,v3.h[3]\n\t"\ + "fmla %9.8h,v1.8h,v3.h[4]; fmla %11.8h,v1.8h,v3.h[5]\n\t"\ + "fmla %13.8h,v1.8h,v3.h[6]; fmla %15.8h,v1.8h,v3.h[7]\n\t"\ + "b 004f\n\t"\ + "003:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %2.8h,v0.8h,v2.h[1]\n\t"\ + "fmla %4.8h,v0.8h,v2.h[2]; fmla %6.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v2.h[4]; fmla %10.8h,v0.8h,v2.h[5]\n\t"\ + "fmla %12.8h,v0.8h,v2.h[6]; fmla %14.8h,v0.8h,v2.h[7]\n\t"\ + "fmla %1.8h,v1.8h,v2.h[0]; fmla %3.8h,v1.8h,v2.h[1]\n\t"\ + "fmla %5.8h,v1.8h,v2.h[2]; fmla %7.8h,v1.8h,v2.h[3]\n\t"\ + "sub %w16,%w16,#1\n\t"\ + "fmla %9.8h,v1.8h,v2.h[4]; fmla %11.8h,v1.8h,v2.h[5]\n\t"\ + "fmla %13.8h,v1.8h,v2.h[6]; fmla %15.8h,v1.8h,v2.h[7]\n\t"\ + "004:\n\t"\ + :"=w"(cq01),"=w"(cq02),"=w"(cq03),"=w"(cq04)\ + ,"=w"(cq05),"=w"(cq06),"=w"(cq07),"=w"(cq08)\ + ,"=w"(cq09),"=w"(cq10),"=w"(cq11),"=w"(cq12)\ + ,"=w"(cq13),"=w"(cq14),"=w"(cq15),"=w"(cq16)\ + ,"+r"(k_left),"+r"(a_ptr),"+r"(b_ptr1)\ + ::"cc","memory","v0","v1","v2","v3"); + +#define KERNEL_M8N8_A76 \ + DECLARE_C_8X8\ + float16_t *c_pref = c_ptr + 7; PREF_N8\ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + uint32_t k_left = K;\ + __asm__ __volatile__(\ + "movi %0.16b,#0; movi %1.16b,#0\n\t"\ + "mov %2.16b,%0.16b; mov %3.16b,%1.16b\n\t"\ + "mov %4.16b,%0.16b; mov %5.16b,%1.16b\n\t"\ + "mov %6.16b,%0.16b; mov %7.16b,%1.16b\n\t"\ + "cmp %w8,#0; b.eq 104f\n\t"\ + "ldr q0,[%9],#16; ldr q2,[%10],#16\n\t"\ + "cmp %w8,#2; b.le 102f\n\t"\ + "101:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "ldr q1,[%9],#32\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\ + "ldr q3,[%10],#32\n\t"\ + "fmla %4.8h,v0.8h,v2.h[4]; fmla %5.8h,v0.8h,v2.h[5]\n\t"\ + "prfm pldl1keep,[%9,#128]\n\t"\ + "fmla %6.8h,v0.8h,v2.h[6]; fmla %7.8h,v0.8h,v2.h[7]\n\t"\ + "ldr q0,[%9,#-16]\n\t"\ + "fmla %0.8h,v1.8h,v3.h[0]; 
fmla %1.8h,v1.8h,v3.h[1]\n\t"\ + "ldr q2,[%10,#-16]\n\t"\ + "fmla %2.8h,v1.8h,v3.h[2]; fmla %3.8h,v1.8h,v3.h[3]\n\t"\ + "sub %w8,%w8,#2\n\t"\ + "fmla %4.8h,v1.8h,v3.h[4]; fmla %5.8h,v1.8h,v3.h[5]\n\t"\ + "cmp %w8,#2\n\t"\ + "fmla %6.8h,v1.8h,v3.h[6]; fmla %7.8h,v1.8h,v3.h[7]\n\t"\ + "b.gt 101b\n\t"\ + "102:\n\t"\ + "cmp %w8,#2; b.ne 103f\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "ldr q1,[%9],#16\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\ + "ldr q3,[%10],#16\n\t"\ + "fmla %4.8h,v0.8h,v2.h[4]; fmla %5.8h,v0.8h,v2.h[5]\n\t"\ + "prfm pldl1keep,[%9,#128]\n\t"\ + "fmla %6.8h,v0.8h,v2.h[6]; fmla %7.8h,v0.8h,v2.h[7]\n\t"\ + "fmla %0.8h,v1.8h,v3.h[0]; fmla %1.8h,v1.8h,v3.h[1]\n\t"\ + "fmla %2.8h,v1.8h,v3.h[2]; fmla %3.8h,v1.8h,v3.h[3]\n\t"\ + "sub %w8,%w8,#2\n\t"\ + "fmla %4.8h,v1.8h,v3.h[4]; fmla %5.8h,v1.8h,v3.h[5]\n\t"\ + "fmla %6.8h,v1.8h,v3.h[6]; fmla %7.8h,v1.8h,v3.h[7]\n\t"\ + "b 104f\n\t"\ + "103:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %4.8h,v0.8h,v2.h[4]; fmla %5.8h,v0.8h,v2.h[5]\n\t"\ + "fmla %6.8h,v0.8h,v2.h[6]; fmla %7.8h,v0.8h,v2.h[7]\n\t"\ + "sub %w8,%w8,#1\n\t"\ + "104:\n\t"\ + :"=w"(cq01),"=w"(cq02),"=w"(cq03),"=w"(cq04)\ + ,"=w"(cq05),"=w"(cq06),"=w"(cq07),"=w"(cq08)\ + ,"+r"(k_left),"+r"(a_ptr),"+r"(b_ptr1)\ + ::"cc","memory","v0","v1","v2","v3"); + +/* fp16-fma kernel for A55 specially */ +#define KERNEL_M8N16_A55 \ + DECLARE_C_8X16\ + float16_t *c_pref = c_ptr + 7; PREF_N16\ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + uint32_t k_left = K;\ + __asm__ __volatile__(\ + "movi %0.16b,#0; movi %1.16b,#0\n\t"\ + "mov %2.16b,%0.16b; mov %3.16b,%1.16b\n\t"\ + "mov %4.16b,%0.16b; mov %5.16b,%1.16b\n\t"\ + "mov %6.16b,%0.16b; mov %7.16b,%1.16b\n\t"\ + "mov %8.16b,%0.16b; mov %9.16b,%1.16b\n\t"\ + "mov %10.16b,%0.16b; mov %11.16b,%1.16b\n\t"\ + "mov %12.16b,%0.16b; mov %13.16b,%1.16b\n\t"\ + "mov %14.16b,%0.16b; mov %15.16b,%1.16b\n\t"\ + "cmp %w16,#0; b.eq 004f\n\t"\ + "ldr q0,[%17],#16; ldr d2,[%18],#32; ldr d3,[%18,#-24]\n\t"\ + "ldr d4,[%18,#-16]; ldr d5,[%18,#-8]\n\t"\ + "cmp %w16,#2; b.le 002f\n\t"\ + "001:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "ldr d1,[%17],#32\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\ + "ldr x0,[%17,#-24]\n\t"\ + "fmla %4.8h,v0.8h,v3.h[0]\n\t"\ + "ldr d2,[%18],#64\n\t"\ + "fmla %5.8h,v0.8h,v3.h[1]\n\t"\ + "prfm pldl1keep,[%17,#128]\n\t"\ + "fmla %6.8h,v0.8h,v3.h[2]; fmla %7.8h,v0.8h,v3.h[3]\n\t"\ + "ldr d3,[%18,#-56]\n\t"\ + "fmla %8.8h,v0.8h,v4.h[0]; fmla %9.8h,v0.8h,v4.h[1]\n\t"\ + "fmov v1.d[1],x0\n\t"\ + "fmla %10.8h,v0.8h,v4.h[2]; fmla %11.8h,v0.8h,v4.h[3]\n\t"\ + "sub %w16,%w16,#2\n\t"\ + "fmla %12.8h,v0.8h,v5.h[0]\n\t"\ + "ldr d4,[%18,#-48]\n\t"\ + "fmla %13.8h,v0.8h,v5.h[1]\n\t"\ + "fmla %14.8h,v0.8h,v5.h[2]; fmla %15.8h,v0.8h,v5.h[3]\n\t"\ + "ldr d5,[%18,#-40]\n\t"\ + "fmla %0.8h,v1.8h,v2.h[0]; fmla %1.8h,v1.8h,v2.h[1]\n\t"\ + "ldr d0,[%17,#-16]\n\t"\ + "fmla %2.8h,v1.8h,v2.h[2]; fmla %3.8h,v1.8h,v2.h[3]\n\t"\ + "ldr x0,[%17,#-8]\n\t"\ + "fmla %4.8h,v1.8h,v3.h[0]\n\t"\ + "ldr d2,[%18,#-32]\n\t"\ + "fmla %5.8h,v1.8h,v3.h[1]\n\t"\ + "cmp %w16,#2\n\t"\ + "fmla %6.8h,v1.8h,v3.h[2]; fmla %7.8h,v1.8h,v3.h[3]\n\t"\ + "ldr d3,[%18,#-24]\n\t"\ + "fmla %8.8h,v1.8h,v4.h[0]; fmla %9.8h,v1.8h,v4.h[1]\n\t"\ + "fmla %10.8h,v1.8h,v4.h[2]; fmla %11.8h,v1.8h,v4.h[3]\n\t"\ + "fmov v0.d[1],x0\n\t"\ + "fmla %12.8h,v1.8h,v5.h[0]\n\t"\ + "ldr 
d4,[%18,#-16]\n\t"\ + "fmla %13.8h,v1.8h,v5.h[1]\n\t"\ + "fmla %14.8h,v1.8h,v5.h[2]; fmla %15.8h,v1.8h,v5.h[3]\n\t"\ + "ldr d5,[%18,#-8]; b.gt 001b\n\t"\ + "002:\n\t"\ + "cmp %w16,#2; b.ne 003f\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "ldr d1,[%17],#16\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\ + "ldr x0,[%17,#-8]\n\t"\ + "fmla %4.8h,v0.8h,v3.h[0]\n\t"\ + "ldr d2,[%18],#32\n\t"\ + "fmla %5.8h,v0.8h,v3.h[1]\n\t"\ + "fmla %6.8h,v0.8h,v3.h[2]; fmla %7.8h,v0.8h,v3.h[3]\n\t"\ + "ldr d3,[%18,#-24]\n\t"\ + "fmla %8.8h,v0.8h,v4.h[0]; fmla %9.8h,v0.8h,v4.h[1]\n\t"\ + "fmla %10.8h,v0.8h,v4.h[2]; fmla %11.8h,v0.8h,v4.h[3]\n\t"\ + "sub %w16,%w16,#2\n\t"\ + "fmla %12.8h,v0.8h,v5.h[0]\n\t"\ + "ldr d4,[%18,#-16]\n\t"\ + "fmla %13.8h,v0.8h,v5.h[1]\n\t"\ + "fmov v1.d[1],x0\n\t"\ + "fmla %14.8h,v0.8h,v5.h[2]; fmla %15.8h,v0.8h,v5.h[3]\n\t"\ + "ldr d5,[%18,#-8]\n\t"\ + "fmla %0.8h,v1.8h,v2.h[0]; fmla %1.8h,v1.8h,v2.h[1]\n\t"\ + "fmla %2.8h,v1.8h,v2.h[2]; fmla %3.8h,v1.8h,v2.h[3]\n\t"\ + "fmla %4.8h,v1.8h,v3.h[0]; fmla %5.8h,v1.8h,v3.h[1]\n\t"\ + "fmla %6.8h,v1.8h,v3.h[2]; fmla %7.8h,v1.8h,v3.h[3]\n\t"\ + "fmla %8.8h,v1.8h,v4.h[0]; fmla %9.8h,v1.8h,v4.h[1]\n\t"\ + "fmla %10.8h,v1.8h,v4.h[2]; fmla %11.8h,v1.8h,v4.h[3]\n\t"\ + "fmla %12.8h,v1.8h,v5.h[0]; fmla %13.8h,v1.8h,v5.h[1]\n\t"\ + "fmla %14.8h,v1.8h,v5.h[2]; fmla %15.8h,v1.8h,v5.h[3]\n\t"\ + "b 004f\n\t"\ + "003:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %4.8h,v0.8h,v3.h[0]; fmla %5.8h,v0.8h,v3.h[1]\n\t"\ + "fmla %6.8h,v0.8h,v3.h[2]; fmla %7.8h,v0.8h,v3.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v4.h[0]; fmla %9.8h,v0.8h,v4.h[1]\n\t"\ + "fmla %10.8h,v0.8h,v4.h[2]; fmla %11.8h,v0.8h,v4.h[3]\n\t"\ + "sub %w16,%w16,#1\n\t"\ + "fmla %12.8h,v0.8h,v5.h[0]; fmla %13.8h,v0.8h,v5.h[1]\n\t"\ + "fmla %14.8h,v0.8h,v5.h[2]; fmla %15.8h,v0.8h,v5.h[3]\n\t"\ + "004:\n\t"\ + :"=w"(cq01),"=w"(cq02),"=w"(cq03),"=w"(cq04)\ + ,"=w"(cq05),"=w"(cq06),"=w"(cq07),"=w"(cq08)\ + ,"=w"(cq09),"=w"(cq10),"=w"(cq11),"=w"(cq12)\ + ,"=w"(cq13),"=w"(cq14),"=w"(cq15),"=w"(cq16)\ + ,"+r"(k_left),"+r"(a_ptr),"+r"(b_ptr1)\ + ::"cc","memory","v0","v1","v2","v3","v4","v5","x0"); + +#define KERNEL_M16N8_A55 \ + DECLARE_C_8X16\ + float16_t *c_pref = c_ptr + 15; PREF_N8\ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + uint32_t k_left = K;\ + __asm__ __volatile__(\ + "movi %0.16b,#0; movi %1.16b,#0\n\t"\ + "mov %2.16b,%0.16b; mov %3.16b,%1.16b\n\t"\ + "mov %4.16b,%0.16b; mov %5.16b,%1.16b\n\t"\ + "mov %6.16b,%0.16b; mov %7.16b,%1.16b\n\t"\ + "mov %8.16b,%0.16b; mov %9.16b,%1.16b\n\t"\ + "mov %10.16b,%0.16b; mov %11.16b,%1.16b\n\t"\ + "mov %12.16b,%0.16b; mov %13.16b,%1.16b\n\t"\ + "mov %14.16b,%0.16b; mov %15.16b,%1.16b\n\t"\ + "cmp %w16,#0; b.eq 004f\n\t"\ + "ldr q0,[%17],#32\n\t"\ + "ldr d2,[%18],#16; ldr d3,[%18,#-8]\n\t"\ + "cmp %w16,#2; b.le 002f\n\t"\ + "001:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]\n\t"\ + "fmla %2.8h,v0.8h,v2.h[1]; ldr d1,[%17,#-16]\n\t"\ + "fmla %4.8h,v0.8h,v2.h[2]; ldr x0,[%17,#-8]\n\t"\ + "fmla %6.8h,v0.8h,v2.h[3]; prfm pldl2keep,[%18,#128]\n\t"\ + "fmla %8.8h,v0.8h,v3.h[0]; ldr d4,[%18],#32\n\t"\ + "fmla %10.8h,v0.8h,v3.h[1]; fmov v1.d[1],x0\n\t"\ + "fmla %12.8h,v0.8h,v3.h[2]\n\t"\ + "fmla %14.8h,v0.8h,v3.h[3]; ldr d5,[%18,#-24]\n\t"\ + "fmla %1.8h,v1.8h,v2.h[0]; ldr d0,[%17],#64\n\t"\ + "fmla %3.8h,v1.8h,v2.h[1]\n\t"\ + "fmla %5.8h,v1.8h,v2.h[2]; ldr x0,[%17,#-56]\n\t"\ + "fmla %7.8h,v1.8h,v2.h[3]\n\t"\ + "fmla 
%9.8h,v1.8h,v3.h[0]\n\t"\ + "fmla %11.8h,v1.8h,v3.h[1]; fmov v0.d[1],x0\n\t"\ + "fmla %13.8h,v1.8h,v3.h[2]\n\t"\ + "fmla %15.8h,v1.8h,v3.h[3]\n\t"\ + "fmla %0.8h,v0.8h,v4.h[0]; ldr d1,[%17,#-48]\n\t"\ + "fmla %2.8h,v0.8h,v4.h[1]; ldr x0,[%17,#-40]\n\t"\ + "fmla %4.8h,v0.8h,v4.h[2]; ldr d2,[%18,#-16]\n\t"\ + "fmla %6.8h,v0.8h,v4.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v5.h[0]\n\t"\ + "fmla %10.8h,v0.8h,v5.h[1]; fmov v1.d[1],x0\n\t"\ + "fmla %12.8h,v0.8h,v5.h[2]; ldr d3,[%18,#-8]\n\t"\ + "fmla %14.8h,v0.8h,v5.h[3]\n\t"\ + "fmla %1.8h,v1.8h,v4.h[0]; ldr d0,[%17,#-32]\n\t"\ + "fmla %3.8h,v1.8h,v4.h[1]; ldr x0,[%17,#-24]\n\t"\ + "fmla %5.8h,v1.8h,v4.h[2]\n\t"\ + "fmla %7.8h,v1.8h,v4.h[3]; sub %w16,%w16,#2\n\t"\ + "fmla %9.8h,v1.8h,v5.h[0]\n\t"\ + "fmla %11.8h,v1.8h,v5.h[1]; fmov v0.d[1],x0\n\t"\ + "fmla %13.8h,v1.8h,v5.h[2]; cmp %w16,#2\n\t"\ + "fmla %15.8h,v1.8h,v5.h[3]; b.gt 001b\n\t"\ + "002:\n\t"\ + "cmp %w16,#2; b.ne 003f\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]\n\t"\ + "fmla %2.8h,v0.8h,v2.h[1]; ldr d1,[%17,#-16]\n\t"\ + "fmla %4.8h,v0.8h,v2.h[2]; ldr x0,[%17,#-8]\n\t"\ + "fmla %6.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v3.h[0]; ldr d4,[%18],#16\n\t"\ + "fmla %10.8h,v0.8h,v3.h[1]; fmov v1.d[1],x0\n\t"\ + "fmla %12.8h,v0.8h,v3.h[2]\n\t"\ + "fmla %14.8h,v0.8h,v3.h[3]; ldr d5,[%18,#-8]\n\t"\ + "fmla %1.8h,v1.8h,v2.h[0]; ldr d0,[%17],#32\n\t"\ + "fmla %3.8h,v1.8h,v2.h[1]\n\t"\ + "fmla %5.8h,v1.8h,v2.h[2]; ldr x0,[%17,#-24]\n\t"\ + "fmla %7.8h,v1.8h,v2.h[3]\n\t"\ + "fmla %9.8h,v1.8h,v3.h[0]\n\t"\ + "fmla %11.8h,v1.8h,v3.h[1]; fmov v0.d[1],x0\n\t"\ + "fmla %13.8h,v1.8h,v3.h[2]\n\t"\ + "fmla %15.8h,v1.8h,v3.h[3]\n\t"\ + "fmla %0.8h,v0.8h,v4.h[0]; ldr d1,[%17,#-16]\n\t"\ + "fmla %2.8h,v0.8h,v4.h[1]; ldr x0,[%17,#-8]\n\t"\ + "fmla %4.8h,v0.8h,v4.h[2]\n\t"\ + "fmla %6.8h,v0.8h,v4.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v5.h[0]\n\t"\ + "fmla %10.8h,v0.8h,v5.h[1]; fmov v1.d[1],x0\n\t"\ + "fmla %12.8h,v0.8h,v5.h[2]\n\t"\ + "fmla %14.8h,v0.8h,v5.h[3]\n\t"\ + "fmla %1.8h,v1.8h,v4.h[0]\n\t"\ + "fmla %3.8h,v1.8h,v4.h[1]\n\t"\ + "fmla %5.8h,v1.8h,v4.h[2]\n\t"\ + "fmla %7.8h,v1.8h,v4.h[3]; sub %w16,%w16,#2\n\t"\ + "fmla %9.8h,v1.8h,v5.h[0]\n\t"\ + "fmla %11.8h,v1.8h,v5.h[1]\n\t"\ + "fmla %13.8h,v1.8h,v5.h[2]\n\t"\ + "fmla %15.8h,v1.8h,v5.h[3]; b 004f\n\t"\ + "003:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]\n\t"\ + "fmla %2.8h,v0.8h,v2.h[1]; ldr d1,[%17,#-16]\n\t"\ + "fmla %4.8h,v0.8h,v2.h[2]; ldr x0,[%17,#-8]\n\t"\ + "fmla %6.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %8.8h,v0.8h,v3.h[0]\n\t"\ + "fmla %10.8h,v0.8h,v3.h[1]; fmov v1.d[1],x0\n\t"\ + "fmla %12.8h,v0.8h,v3.h[2]\n\t"\ + "fmla %14.8h,v0.8h,v3.h[3]\n\t"\ + "fmla %1.8h,v1.8h,v2.h[0]\n\t"\ + "fmla %3.8h,v1.8h,v2.h[1]\n\t"\ + "fmla %5.8h,v1.8h,v2.h[2]\n\t"\ + "fmla %7.8h,v1.8h,v2.h[3]\n\t"\ + "fmla %9.8h,v1.8h,v3.h[0]\n\t"\ + "fmla %11.8h,v1.8h,v3.h[1]; sub %w16,%w16,#1\n\t"\ + "fmla %13.8h,v1.8h,v3.h[2]\n\t"\ + "fmla %15.8h,v1.8h,v3.h[3]\n\t"\ + "004:\n\t"\ + :"=w"(cq01),"=w"(cq02),"=w"(cq03),"=w"(cq04)\ + ,"=w"(cq05),"=w"(cq06),"=w"(cq07),"=w"(cq08)\ + ,"=w"(cq09),"=w"(cq10),"=w"(cq11),"=w"(cq12)\ + ,"=w"(cq13),"=w"(cq14),"=w"(cq15),"=w"(cq16)\ + ,"+r"(k_left),"+r"(a_ptr),"+r"(b_ptr1)\ + ::"cc","memory","v0","v1","v2","v3","v4","v5","x0"); + +#define KERNEL_M8N8_A55 \ + DECLARE_C_8X8\ + float16_t *c_pref = c_ptr + 7; PREF_N8\ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + uint32_t k_left = K;\ + __asm__ __volatile__(\ + "movi %0.16b,#0; movi %1.16b,#0\n\t"\ + "mov %2.16b,%0.16b; mov %3.16b,%1.16b\n\t"\ + "mov %4.16b,%0.16b; mov %5.16b,%1.16b\n\t"\ + "mov 
%6.16b,%0.16b; mov %7.16b,%1.16b\n\t"\ + "cmp %w8,#0; b.eq 104f\n\t"\ + "ldr q0,[%9],#16; ldr d2,[%10],#16; ldr d3,[%10,#-8]\n\t"\ + "cmp %w8,#2; b.le 102f\n\t"\ + "101:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; ldr d1,[%9],#32\n\t"\ + "fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; ldr x0,[%9,#-24]\n\t"\ + "fmla %3.8h,v0.8h,v2.h[3]; prfm pldl1keep,[%9,#128]\n\t"\ + "fmla %4.8h,v0.8h,v3.h[0]; ldr d2,[%10],#32\n\t"\ + "fmla %5.8h,v0.8h,v3.h[1]; fmov v1.d[1],x0\n\t"\ + "fmla %6.8h,v0.8h,v3.h[2]\n\t"\ + "fmla %7.8h,v0.8h,v3.h[3]; ldr d3,[%10,#-24]\n\t"\ + "fmla %0.8h,v1.8h,v2.h[0]; ldr d0,[%9,#-16]\n\t"\ + "fmla %1.8h,v1.8h,v2.h[1]; ldr x0,[%9,#-8]\n\t"\ + "fmla %2.8h,v1.8h,v2.h[2]\n\t"\ + "fmla %3.8h,v1.8h,v2.h[3]; ldr d2,[%10,#-16]\n\t"\ + "fmla %4.8h,v1.8h,v3.h[0]; fmov v0.d[1],x0\n\t"\ + "fmla %5.8h,v1.8h,v3.h[1]; sub %w8,%w8,#2\n\t"\ + "fmla %6.8h,v1.8h,v3.h[2]; cmp %w8,#2\n\t"\ + "fmla %7.8h,v1.8h,v3.h[3]; ldr d3,[%10,#-8]\n\t"\ + "b.gt 101b\n\t"\ + "102:\n\t"\ + "cmp %w8,#2; b.ne 103f\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; ldr d1,[%9],#16\n\t"\ + "fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; ldr x0,[%9,#-8]\n\t"\ + "fmla %3.8h,v0.8h,v2.h[3]; prfm pldl1keep,[%9,#128]\n\t"\ + "fmla %4.8h,v0.8h,v3.h[0]; ldr d2,[%10],#16\n\t"\ + "fmla %5.8h,v0.8h,v3.h[1]; fmov v1.d[1],x0\n\t"\ + "fmla %6.8h,v0.8h,v3.h[2]\n\t"\ + "fmla %7.8h,v0.8h,v3.h[3]; ldr d3,[%10,#-8]\n\t"\ + "fmla %0.8h,v1.8h,v2.h[0]\n\t"\ + "fmla %1.8h,v1.8h,v2.h[1]\n\t"\ + "fmla %2.8h,v1.8h,v2.h[2]\n\t"\ + "fmla %3.8h,v1.8h,v2.h[3]\n\t"\ + "fmla %4.8h,v1.8h,v3.h[0]\n\t"\ + "fmla %5.8h,v1.8h,v3.h[1]; sub %w8,%w8,#2\n\t"\ + "fmla %6.8h,v1.8h,v3.h[2]\n\t"\ + "fmla %7.8h,v1.8h,v3.h[3]\n\t"\ + "b 104f\n\t"\ + "103:\n\t"\ + "fmla %0.8h,v0.8h,v2.h[0]; fmla %1.8h,v0.8h,v2.h[1]\n\t"\ + "fmla %2.8h,v0.8h,v2.h[2]; fmla %3.8h,v0.8h,v2.h[3]\n\t"\ + "fmla %4.8h,v0.8h,v3.h[0]; fmla %5.8h,v0.8h,v3.h[1]\n\t"\ + "fmla %6.8h,v0.8h,v3.h[2]; fmla %7.8h,v0.8h,v3.h[3]\n\t"\ + "sub %w8,%w8,#1\n\t"\ + "104:\n\t"\ + :"=w"(cq01),"=w"(cq02),"=w"(cq03),"=w"(cq04)\ + ,"=w"(cq05),"=w"(cq06),"=w"(cq07),"=w"(cq08)\ + ,"+r"(k_left),"+r"(a_ptr),"+r"(b_ptr1)\ + ::"cc","memory","v0","v1","v2","v3","x0"); + +#define KERNEL_M8N4_UNIT(a_head, b_head) \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x8_t cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f16(0.0f);\ + cq05 = cq06 = cq07 = cq08 = vdupq_n_f16(0.0f);\ + float16x8_t aq01, aq02, bq01;\ + uint32_t k_left = K;\ + if (k_left > 1) {\ + aq01 = vld1q_f16(a_ptr);\ + aq02 = vld1q_f16(a_ptr + 8); a_ptr += 16;\ + bq01 = vld1q_f16(b_ptr1); b_ptr1 += 8;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cq01 = vfmaq_laneq_f16(cq01, aq01, bq01, 0);\ + cq02 = vfmaq_laneq_f16(cq02, aq01, bq01, 1);\ + cq03 = vfmaq_laneq_f16(cq03, aq01, bq01, 2);\ + cq04 = vfmaq_laneq_f16(cq04, aq01, bq01, 3);\ + aq01 = vld1q_f16(a_ptr);\ + cq05 = vfmaq_laneq_f16(cq05, aq02, bq01, 4);\ + cq06 = vfmaq_laneq_f16(cq06, aq02, bq01, 5);\ + cq07 = vfmaq_laneq_f16(cq07, aq02, bq01, 6);\ + cq08 = vfmaq_laneq_f16(cq08, aq02, bq01, 7);\ + aq02 = vld1q_f16(a_ptr + 8); a_ptr += 16;\ + bq01 = vld1q_f16(b_ptr1); b_ptr1 += 8;\ + }\ + if (k_left > 1) {\ + cq01 = vfmaq_laneq_f16(cq01, aq01, bq01, 0);\ + cq02 = vfmaq_laneq_f16(cq02, aq01, bq01, 1);\ + cq03 = vfmaq_laneq_f16(cq03, aq01, bq01, 2);\ + cq04 = vfmaq_laneq_f16(cq04, aq01, bq01, 3);\ + cq05 = vfmaq_laneq_f16(cq05, aq02, bq01, 4);\ + cq06 = vfmaq_laneq_f16(cq06, aq02, bq01, 5);\ + cq07 = vfmaq_laneq_f16(cq07, aq02, bq01, 
6);\ + cq08 = vfmaq_laneq_f16(cq08, aq02, bq01, 7);\ + k_left -= 2;\ + }\ + cq01 = vaddq_f16(cq01, cq05);\ + cq02 = vaddq_f16(cq02, cq06);\ + cq03 = vaddq_f16(cq03, cq07);\ + cq04 = vaddq_f16(cq04, cq08);\ + if (k_left > 0) {\ + float16x4_t bd01 = vld1_f16(b_ptr1); b_ptr1 += 4;\ + aq01 = vld1q_f16(a_ptr); a_ptr += 8;\ + cq01 = vfmaq_lane_f16(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f16(cq02, aq01, bd01, 1);\ + cq03 = vfmaq_lane_f16(cq03, aq01, bd01, 2);\ + cq04 = vfmaq_lane_f16(cq04, aq01, bd01, 3);\ + } + +#define KERNEL_M8N2_UNIT(a_head, b_head) \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x8_t cq01, cq02, cq03, cq04;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f16(0.0f);\ + float16x8_t aq01, aq02; float16x4_t bd01;\ + uint32_t k_left = K;\ + if (k_left > 1) {\ + aq01 = vld1q_f16(a_ptr);\ + aq02 = vld1q_f16(a_ptr + 8); a_ptr += 16;\ + bd01 = vld1_f16(b_ptr1); b_ptr1 += 4;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cq01 = vfmaq_lane_f16(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f16(cq02, aq01, bd01, 1);\ + aq01 = vld1q_f16(a_ptr);\ + cq03 = vfmaq_lane_f16(cq03, aq02, bd01, 2);\ + cq04 = vfmaq_lane_f16(cq04, aq02, bd01, 3);\ + aq02 = vld1q_f16(a_ptr + 8); a_ptr += 16;\ + bd01 = vld1_f16(b_ptr1); b_ptr1 += 4;\ + }\ + if (k_left > 1) {\ + cq01 = vfmaq_lane_f16(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f16(cq02, aq01, bd01, 1);\ + cq03 = vfmaq_lane_f16(cq03, aq02, bd01, 2);\ + cq04 = vfmaq_lane_f16(cq04, aq02, bd01, 3);\ + k_left -= 2;\ + }\ + cq01 = vaddq_f16(cq01, cq03);\ + cq02 = vaddq_f16(cq02, cq04);\ + if (k_left > 0) {\ + aq01 = vld1q_f16(a_ptr); a_ptr += 8;\ + float16_t bs1 = b_ptr1[0];\ + float16_t bs2 = b_ptr1[1]; b_ptr1 += 2;\ + cq01 = vfmaq_n_f16(cq01, aq01, bs1);\ + cq02 = vfmaq_n_f16(cq02, aq01, bs2);\ + } + +#define KERNEL_M8N1_UNIT(a_head, b_head) \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x8_t cq01, cq02, cq03, cq04;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f16(0.0f);\ + float16x8_t aq01, aq02, aq03, aq04;\ + float16x4_t bd01;\ + uint32_t k_left = K;\ + if (k_left > 3) {\ + aq01 = vld1q_f16(a_ptr);\ + aq02 = vld1q_f16(a_ptr + 8);\ + aq03 = vld1q_f16(a_ptr + 16);\ + aq04 = vld1q_f16(a_ptr + 24); a_ptr += 32;\ + bd01 = vld1_f16(b_ptr1); b_ptr1 += 4;\ + }\ + for (; k_left > 7; k_left -= 4) {\ + cq01 = vfmaq_lane_f16(cq01, aq01, bd01, 0);\ + aq01 = vld1q_f16(a_ptr);\ + cq02 = vfmaq_lane_f16(cq02, aq02, bd01, 1);\ + aq02 = vld1q_f16(a_ptr + 8);\ + cq03 = vfmaq_lane_f16(cq03, aq03, bd01, 2);\ + aq03 = vld1q_f16(a_ptr + 16);\ + cq04 = vfmaq_lane_f16(cq04, aq04, bd01, 3);\ + aq04 = vld1q_f16(a_ptr + 24); a_ptr += 32;\ + bd01 = vld1_f16(b_ptr1); b_ptr1 += 4;\ + }\ + if (k_left > 3) {\ + cq01 = vfmaq_lane_f16(cq01, aq01, bd01, 0);\ + cq02 = vfmaq_lane_f16(cq02, aq02, bd01, 1);\ + cq03 = vfmaq_lane_f16(cq03, aq03, bd01, 2);\ + cq04 = vfmaq_lane_f16(cq04, aq04, bd01, 3);\ + k_left -= 4;\ + }\ + cq01 = vaddq_f16(cq01, cq02);\ + cq03 = vaddq_f16(cq03, cq04);\ + cq01 = vaddq_f16(cq01, cq03);\ + for (; k_left > 0; k_left--) {\ + aq01 = vld1q_f16(a_ptr); a_ptr += 8;\ + float16_t bs1 = *b_ptr1; b_ptr1++;\ + cq01 = vfmaq_n_f16(cq01, aq01, bs1);\ + } + +#define KERNEL_M8N4 KERNEL_M8N4_UNIT(a_head, b_head) +#define KERNEL_M8N2 KERNEL_M8N2_UNIT(a_head, b_head) +#define KERNEL_M8N1 KERNEL_M8N1_UNIT(a_head, b_head) +#define KERNEL_M4N8 KERNEL_M8N4_UNIT(b_head, a_head) +#define KERNEL_M2N8 KERNEL_M8N2_UNIT(b_head, a_head) +#define KERNEL_M1N8 KERNEL_M8N1_UNIT(b_head, a_head) + +#define KERNEL_M4N16_UNIT(a_head, b_head) \ + 
const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x8_t cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f16(0.0f);\ + cq05 = cq06 = cq07 = cq08 = vdupq_n_f16(0.0f);\ + float16x8_t aq01, bq01, bq02, bq03, bq04;\ + uint32_t k_left = K;\ + if (k_left > 1) {\ + aq01 = vld1q_f16(a_ptr); a_ptr += 8;\ + bq01 = vld1q_f16(b_ptr1);\ + bq02 = vld1q_f16(b_ptr1 + 8);\ + bq03 = vld1q_f16(b_ptr1 + 16);\ + bq04 = vld1q_f16(b_ptr1 + 24); b_ptr1 += 32;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cq01 = vfmaq_laneq_f16(cq01, bq01, aq01, 0);\ + cq03 = vfmaq_laneq_f16(cq03, bq01, aq01, 1);\ + cq05 = vfmaq_laneq_f16(cq05, bq01, aq01, 2);\ + cq07 = vfmaq_laneq_f16(cq07, bq01, aq01, 3);\ + bq01 = vld1q_f16(b_ptr1);\ + cq02 = vfmaq_laneq_f16(cq02, bq02, aq01, 0);\ + cq04 = vfmaq_laneq_f16(cq04, bq02, aq01, 1);\ + cq06 = vfmaq_laneq_f16(cq06, bq02, aq01, 2);\ + cq08 = vfmaq_laneq_f16(cq08, bq02, aq01, 3);\ + bq02 = vld1q_f16(b_ptr1 + 8);\ + cq01 = vfmaq_laneq_f16(cq01, bq03, aq01, 4);\ + cq03 = vfmaq_laneq_f16(cq03, bq03, aq01, 5);\ + cq05 = vfmaq_laneq_f16(cq05, bq03, aq01, 6);\ + cq07 = vfmaq_laneq_f16(cq07, bq03, aq01, 7);\ + bq03 = vld1q_f16(b_ptr1 + 16);\ + cq02 = vfmaq_laneq_f16(cq02, bq04, aq01, 4);\ + cq04 = vfmaq_laneq_f16(cq04, bq04, aq01, 5);\ + cq06 = vfmaq_laneq_f16(cq06, bq04, aq01, 6);\ + cq08 = vfmaq_laneq_f16(cq08, bq04, aq01, 7);\ + bq04 = vld1q_f16(b_ptr1 + 24); b_ptr1 += 32;\ + aq01 = vld1q_f16(a_ptr); a_ptr += 8;\ + }\ + if (k_left > 1) {\ + cq01 = vfmaq_laneq_f16(cq01, bq01, aq01, 0);\ + cq03 = vfmaq_laneq_f16(cq03, bq01, aq01, 1);\ + cq05 = vfmaq_laneq_f16(cq05, bq01, aq01, 2);\ + cq07 = vfmaq_laneq_f16(cq07, bq01, aq01, 3);\ + cq02 = vfmaq_laneq_f16(cq02, bq02, aq01, 0);\ + cq04 = vfmaq_laneq_f16(cq04, bq02, aq01, 1);\ + cq06 = vfmaq_laneq_f16(cq06, bq02, aq01, 2);\ + cq08 = vfmaq_laneq_f16(cq08, bq02, aq01, 3);\ + cq01 = vfmaq_laneq_f16(cq01, bq03, aq01, 4);\ + cq03 = vfmaq_laneq_f16(cq03, bq03, aq01, 5);\ + cq05 = vfmaq_laneq_f16(cq05, bq03, aq01, 6);\ + cq07 = vfmaq_laneq_f16(cq07, bq03, aq01, 7);\ + cq02 = vfmaq_laneq_f16(cq02, bq04, aq01, 4);\ + cq04 = vfmaq_laneq_f16(cq04, bq04, aq01, 5);\ + cq06 = vfmaq_laneq_f16(cq06, bq04, aq01, 6);\ + cq08 = vfmaq_laneq_f16(cq08, bq04, aq01, 7);\ + k_left -= 2;\ + }\ + if (k_left > 0) {\ + float16x4_t ad01 = vld1_f16(a_ptr); a_ptr += 4;\ + bq01 = vld1q_f16(b_ptr1);\ + bq02 = vld1q_f16(b_ptr1 + 8); b_ptr1 += 16;\ + cq01 = vfmaq_lane_f16(cq01, bq01, ad01, 0);\ + cq03 = vfmaq_lane_f16(cq03, bq01, ad01, 1);\ + cq05 = vfmaq_lane_f16(cq05, bq01, ad01, 2);\ + cq07 = vfmaq_lane_f16(cq07, bq01, ad01, 3);\ + cq02 = vfmaq_lane_f16(cq02, bq02, ad01, 0);\ + cq04 = vfmaq_lane_f16(cq04, bq02, ad01, 1);\ + cq06 = vfmaq_lane_f16(cq06, bq02, ad01, 2);\ + cq08 = vfmaq_lane_f16(cq08, bq02, ad01, 3);\ + } + +#define KERNEL_M2N16_UNIT(a_head, b_head) \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x8_t cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f16(0.0f);\ + cq05 = cq06 = cq07 = cq08 = vdupq_n_f16(0.0f);\ + float16x8_t bq01, bq02, bq03, bq04;\ + float16x4_t ad01;\ + uint32_t k_left = K;\ + if (k_left > 1) {\ + ad01 = vld1_f16(a_ptr); a_ptr += 4;\ + bq01 = vld1q_f16(b_ptr1);\ + bq02 = vld1q_f16(b_ptr1 + 8);\ + bq03 = vld1q_f16(b_ptr1 + 16);\ + bq04 = vld1q_f16(b_ptr1 + 24); b_ptr1 += 32;\ + }\ + for (; k_left > 3; k_left -= 2) {\ + cq01 = vfmaq_lane_f16(cq01, bq01, ad01, 0);\ + cq03 = vfmaq_lane_f16(cq03, bq01, ad01, 1);\ + 
bq01 = vld1q_f16(b_ptr1);\ + cq02 = vfmaq_lane_f16(cq02, bq02, ad01, 0);\ + cq04 = vfmaq_lane_f16(cq04, bq02, ad01, 1);\ + bq02 = vld1q_f16(b_ptr1 + 8);\ + cq05 = vfmaq_lane_f16(cq05, bq03, ad01, 2);\ + cq07 = vfmaq_lane_f16(cq07, bq03, ad01, 3);\ + bq03 = vld1q_f16(b_ptr1 + 16);\ + cq06 = vfmaq_lane_f16(cq06, bq04, ad01, 2);\ + cq08 = vfmaq_lane_f16(cq08, bq04, ad01, 3);\ + bq04 = vld1q_f16(b_ptr1 + 24); b_ptr1 += 32;\ + ad01 = vld1_f16(a_ptr); a_ptr += 4;\ + }\ + if (k_left > 1) {\ + cq01 = vfmaq_lane_f16(cq01, bq01, ad01, 0);\ + cq03 = vfmaq_lane_f16(cq03, bq01, ad01, 1);\ + cq05 = vfmaq_lane_f16(cq05, bq03, ad01, 2);\ + cq07 = vfmaq_lane_f16(cq07, bq03, ad01, 3);\ + cq02 = vfmaq_lane_f16(cq02, bq02, ad01, 0);\ + cq04 = vfmaq_lane_f16(cq04, bq02, ad01, 1);\ + cq06 = vfmaq_lane_f16(cq06, bq04, ad01, 2);\ + cq08 = vfmaq_lane_f16(cq08, bq04, ad01, 3);\ + k_left -= 2;\ + }\ + cq01 = vaddq_f16(cq01, cq05);\ + cq02 = vaddq_f16(cq02, cq06);\ + cq03 = vaddq_f16(cq03, cq07);\ + cq04 = vaddq_f16(cq04, cq08);\ + if (k_left > 0) {\ + bq01 = vld1q_f16(b_ptr1);\ + bq02 = vld1q_f16(b_ptr1 + 8); b_ptr1 += 16;\ + float16_t as1 = a_ptr[0];\ + float16_t as2 = a_ptr[1]; a_ptr += 2;\ + cq01 = vfmaq_n_f16(cq01, bq01, as1);\ + cq02 = vfmaq_n_f16(cq02, bq02, as1);\ + cq03 = vfmaq_n_f16(cq03, bq01, as2);\ + cq04 = vfmaq_n_f16(cq04, bq02, as2);\ + } + +#define KERNEL_M1N16_UNIT(a_head, b_head) \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x8_t cq01, cq02, cq03, cq04, cq05, cq06, cq07, cq08;\ + cq01 = cq02 = cq03 = cq04 = vdupq_n_f16(0.0f);\ + cq05 = cq06 = cq07 = cq08 = vdupq_n_f16(0.0f);\ + float16x8_t bq01, bq02, bq03, bq04, bq05, bq06, bq07, bq08;\ + float16x4_t ad01;\ + uint32_t k_left = K;\ + if (k_left > 3) {\ + ad01 = vld1_f16(a_ptr); a_ptr += 4;\ + bq01 = vld1q_f16(b_ptr1);\ + bq02 = vld1q_f16(b_ptr1 + 8);\ + bq03 = vld1q_f16(b_ptr1 + 16);\ + bq04 = vld1q_f16(b_ptr1 + 24);\ + bq05 = vld1q_f16(b_ptr1 + 32);\ + bq06 = vld1q_f16(b_ptr1 + 40);\ + bq07 = vld1q_f16(b_ptr1 + 48);\ + bq08 = vld1q_f16(b_ptr1 + 56); b_ptr1 += 64;\ + }\ + for (; k_left > 7; k_left -= 4) {\ + cq01 = vfmaq_lane_f16(cq01, bq01, ad01, 0);\ + bq01 = vld1q_f16(b_ptr1);\ + cq02 = vfmaq_lane_f16(cq02, bq02, ad01, 0);\ + bq02 = vld1q_f16(b_ptr1 + 8);\ + cq03 = vfmaq_lane_f16(cq03, bq03, ad01, 1);\ + bq03 = vld1q_f16(b_ptr1 + 16);\ + cq04 = vfmaq_lane_f16(cq04, bq04, ad01, 1);\ + bq04 = vld1q_f16(b_ptr1 + 24);\ + cq05 = vfmaq_lane_f16(cq05, bq05, ad01, 2);\ + bq05 = vld1q_f16(b_ptr1 + 32);\ + cq06 = vfmaq_lane_f16(cq06, bq06, ad01, 2);\ + bq06 = vld1q_f16(b_ptr1 + 40);\ + cq07 = vfmaq_lane_f16(cq07, bq07, ad01, 3);\ + bq07 = vld1q_f16(b_ptr1 + 48);\ + cq08 = vfmaq_lane_f16(cq08, bq08, ad01, 3);\ + bq08 = vld1q_f16(b_ptr1 + 56); b_ptr1 += 64;\ + ad01 = vld1_f16(a_ptr); a_ptr += 4;\ + }\ + if (k_left > 3) {\ + cq01 = vfmaq_lane_f16(cq01, bq01, ad01, 0);\ + cq03 = vfmaq_lane_f16(cq03, bq03, ad01, 1);\ + cq05 = vfmaq_lane_f16(cq05, bq05, ad01, 2);\ + cq07 = vfmaq_lane_f16(cq07, bq07, ad01, 3);\ + cq02 = vfmaq_lane_f16(cq02, bq02, ad01, 0);\ + cq04 = vfmaq_lane_f16(cq04, bq04, ad01, 1);\ + cq06 = vfmaq_lane_f16(cq06, bq06, ad01, 2);\ + cq08 = vfmaq_lane_f16(cq08, bq08, ad01, 3);\ + k_left -= 4;\ + }\ + cq01 = vaddq_f16(cq01, cq03);\ + cq05 = vaddq_f16(cq05, cq07);\ + cq02 = vaddq_f16(cq02, cq04);\ + cq06 = vaddq_f16(cq06, cq08);\ + cq01 = vaddq_f16(cq01, cq05);\ + cq02 = vaddq_f16(cq02, cq06);\ + for (; k_left > 0; k_left--) {\ + float16_t as1 = *a_ptr; a_ptr++;\ + bq01 = vld1q_f16(b_ptr1);\ + bq02 = 
vld1q_f16(b_ptr1 + 8); b_ptr1 += 16;\ + cq01 = vfmaq_n_f16(cq01, bq01, as1);\ + cq02 = vfmaq_n_f16(cq02, bq02, as1);\ + } + +#define KERNEL_M4N16 KERNEL_M4N16_UNIT(a_head, b_head) +#define KERNEL_M2N16 KERNEL_M2N16_UNIT(a_head, b_head) +#define KERNEL_M1N16 KERNEL_M1N16_UNIT(a_head, b_head) +#define KERNEL_M16N4 KERNEL_M4N16_UNIT(b_head, a_head) +#define KERNEL_M16N2 KERNEL_M2N16_UNIT(b_head, a_head) +#define KERNEL_M16N1 KERNEL_M1N16_UNIT(b_head, a_head) + +#define KERNEL_M4N4 \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x4_t cd01, cd02, cd03, cd04;\ + cd01 = cd02 = cd03 = cd04 = vdup_n_f16(0.0f);\ + float16x4_t ad01, bd01;\ + uint32_t k_left = K;\ + for (; k_left > 0; k_left--) {\ + ad01 = vld1_f16(a_ptr); a_ptr += 4;\ + bd01 = vld1_f16(b_ptr1); b_ptr1 += 4;\ + cd01 = vfma_lane_f16(cd01, ad01, bd01, 0);\ + cd02 = vfma_lane_f16(cd02, ad01, bd01, 1);\ + cd03 = vfma_lane_f16(cd03, ad01, bd01, 2);\ + cd04 = vfma_lane_f16(cd04, ad01, bd01, 3);\ + } + +#define KERNEL_M4N2_UNIT(a_head, b_head) \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x4_t cd01, cd02, cd03, cd04;\ + cd01 = cd02 = cd03 = cd04 = vdup_n_f16(0.0f);\ + float16x4_t ad01, ad02, bd01;\ + uint32_t k_left = K;\ + for (; k_left > 1; k_left -= 2) {\ + ad01 = vld1_f16(a_ptr);\ + ad02 = vld1_f16(a_ptr + 4); a_ptr += 8;\ + bd01 = vld1_f16(b_ptr1); b_ptr1 += 4;\ + cd01 = vfma_lane_f16(cd01, ad01, bd01, 0);\ + cd02 = vfma_lane_f16(cd02, ad01, bd01, 1);\ + cd03 = vfma_lane_f16(cd03, ad02, bd01, 2);\ + cd04 = vfma_lane_f16(cd04, ad02, bd01, 3);\ + }\ + cd01 = vadd_f16(cd01, cd03);\ + cd02 = vadd_f16(cd02, cd04);\ + if (k_left > 0) {\ + ad01 = vld1_f16(a_ptr); a_ptr += 4;\ + float16_t bs1 = b_ptr1[0];\ + float16_t bs2 = b_ptr1[1]; b_ptr1 += 2;\ + cd01 = vfma_n_f16(cd01, ad01, bs1);\ + cd02 = vfma_n_f16(cd02, ad01, bs2);\ + } + +#define KERNEL_M4N1_UNIT(a_head, b_head) \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x4_t cd01, cd02, cd03, cd04;\ + cd01 = cd02 = cd03 = cd04 = vdup_n_f16(0.0f);\ + float16x4_t ad01, ad02, ad03, ad04, bd01;\ + uint32_t k_left = K;\ + for (; k_left > 3; k_left -= 4) {\ + ad01 = vld1_f16(a_ptr);\ + ad02 = vld1_f16(a_ptr + 4);\ + ad03 = vld1_f16(a_ptr + 8);\ + ad04 = vld1_f16(a_ptr + 12); a_ptr += 16;\ + bd01 = vld1_f16(b_ptr1); b_ptr1 += 4;\ + cd01 = vfma_lane_f16(cd01, ad01, bd01, 0);\ + cd02 = vfma_lane_f16(cd02, ad02, bd01, 1);\ + cd03 = vfma_lane_f16(cd03, ad03, bd01, 2);\ + cd04 = vfma_lane_f16(cd04, ad04, bd01, 3);\ + }\ + cd01 = vadd_f16(cd01, cd03);\ + cd02 = vadd_f16(cd02, cd04);\ + cd01 = vadd_f16(cd01, cd02);\ + for (; k_left > 0; k_left--) {\ + ad01 = vld1_f16(a_ptr); a_ptr += 4;\ + float16_t bs1 = *b_ptr1; b_ptr1++;\ + cd01 = vfma_n_f16(cd01, ad01, bs1);\ + } + +#define KERNEL_M4N2 KERNEL_M4N2_UNIT(a_head, b_head) +#define KERNEL_M4N1 KERNEL_M4N1_UNIT(a_head, b_head) +#define KERNEL_M2N4 KERNEL_M4N2_UNIT(b_head, a_head) +#define KERNEL_M1N4 KERNEL_M4N1_UNIT(b_head, a_head) + +#define KERNEL_M2N2 \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16_t cs1, cs2, cs3, cs4;\ + cs1 = cs2 = cs3 = cs4 = 0.0f;\ + float16_t as1, as2, bs1, bs2;\ + uint32_t k_left = K;\ + for (; k_left > 0; k_left--) {\ + as1 = a_ptr[0]; as2 = a_ptr[1]; a_ptr += 2;\ + bs1 = b_ptr1[0]; bs2 = b_ptr1[1]; b_ptr1 += 2;\ + cs1 += as1 * bs1;\ + cs2 += as2 * bs1;\ + cs3 += as1 * bs2;\ + cs4 += as2 * bs2;\ + } + +#define KERNEL_M2N1_UNIT(a_head, b_head) \ + const float16_t *a_ptr = a_head;\ + const 
float16_t *b_ptr1 = b_head;\ + float16_t cs1, cs2; cs1 = cs2 = 0.0f;\ + float16_t as1, as2, bs1;\ + uint32_t k_left = K;\ + for (; k_left > 0; k_left--) {\ + as1 = a_ptr[0]; as2 = a_ptr[1]; a_ptr += 2;\ + bs1 = b_ptr1[0]; b_ptr1++;\ + cs1 += as1 * bs1;\ + cs2 += as2 * bs1;\ + } + +#define KERNEL_M2N1 KERNEL_M2N1_UNIT(a_head, b_head) +#define KERNEL_M1N2 KERNEL_M2N1_UNIT(b_head, a_head) + +#define KERNEL_M1N1 \ + const float16_t *a_ptr = a_head;\ + const float16_t *b_ptr1 = b_head;\ + float16x4_t cd01 = vdup_n_f16(0.0f);\ + float16x4_t ad01, bd01;\ + uint32_t k_left = K;\ + for (; k_left > 3; k_left -= 4) {\ + ad01 = vld1_f16(a_ptr); a_ptr += 4;\ + bd01 = vld1_f16(b_ptr1); b_ptr1 += 4;\ + cd01 = vfma_f16(cd01, ad01, bd01);\ + }\ + float16_t cs1 = vget_lane_f16(cd01, 0) + vget_lane_f16(cd01, 1) + \ + vget_lane_f16(cd01, 2) + vget_lane_f16(cd01, 3);\ + for (; k_left > 0; k_left--) {\ + cs1 += (*a_ptr) * (*b_ptr1); a_ptr++; b_ptr1++;\ + } + + +#define SAVE_M1N8_UNIT(cq01, c_tmp) {\ + float16_t cs1 = vgetq_lane_f16(cq01, 0);\ + float16_t cs2 = vgetq_lane_f16(cq01, 1);\ + float16_t cs3 = vgetq_lane_f16(cq01, 2);\ + float16_t cs4 = vgetq_lane_f16(cq01, 3);\ + float16_t cs5 = vgetq_lane_f16(cq01, 4);\ + float16_t cs6 = vgetq_lane_f16(cq01, 5);\ + float16_t cs7 = vgetq_lane_f16(cq01, 6);\ + float16_t cs8 = vgetq_lane_f16(cq01, 7);\ + *c_tmp = *c_tmp * beta + cs1; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs2; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs3; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs4; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs5; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs6; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs7; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs8; c_tmp += ldc;\ +} + +#define SAVE_M2N8_UNIT(cq01, cq02, c_tmp) {\ + float16x8x2_t cqd1;\ + cqd1.val[0] = vdupq_n_f16(0.0f);\ + cqd1.val[1] = vdupq_n_f16(0.0f);\ + cqd1 = vld2q_lane_f16(c_tmp, cqd1, 0); c_tmp += ldc;\ + cqd1 = vld2q_lane_f16(c_tmp, cqd1, 1); c_tmp += ldc;\ + cqd1 = vld2q_lane_f16(c_tmp, cqd1, 2); c_tmp += ldc;\ + cqd1 = vld2q_lane_f16(c_tmp, cqd1, 3); c_tmp += ldc;\ + cqd1 = vld2q_lane_f16(c_tmp, cqd1, 4); c_tmp += ldc;\ + cqd1 = vld2q_lane_f16(c_tmp, cqd1, 5); c_tmp += ldc;\ + cqd1 = vld2q_lane_f16(c_tmp, cqd1, 6); c_tmp += ldc;\ + cqd1 = vld2q_lane_f16(c_tmp, cqd1, 7); c_tmp -= ldc * 7;\ + cqd1.val[0] = vfmaq_n_f16(cq01, cqd1.val[0], beta);\ + cqd1.val[1] = vfmaq_n_f16(cq02, cqd1.val[1], beta);\ + vst2q_lane_f16(c_tmp, cqd1, 0); c_tmp += ldc;\ + vst2q_lane_f16(c_tmp, cqd1, 1); c_tmp += ldc;\ + vst2q_lane_f16(c_tmp, cqd1, 2); c_tmp += ldc;\ + vst2q_lane_f16(c_tmp, cqd1, 3); c_tmp += ldc;\ + vst2q_lane_f16(c_tmp, cqd1, 4); c_tmp += ldc;\ + vst2q_lane_f16(c_tmp, cqd1, 5); c_tmp += ldc;\ + vst2q_lane_f16(c_tmp, cqd1, 6); c_tmp += ldc;\ + vst2q_lane_f16(c_tmp, cqd1, 7); c_tmp += ldc;\ +} + +#define SAVE_M4N8_UNIT(cq01, cq02, cq03, cq04, c_tmp) {\ + float16x8x4_t cqq1;\ + cqq1.val[0] = vdupq_n_f16(0.0f);\ + cqq1.val[1] = vdupq_n_f16(0.0f);\ + cqq1.val[2] = vdupq_n_f16(0.0f);\ + cqq1.val[3] = vdupq_n_f16(0.0f);\ + cqq1 = vld4q_lane_f16(c_tmp, cqq1, 0); c_tmp += ldc;\ + cqq1 = vld4q_lane_f16(c_tmp, cqq1, 1); c_tmp += ldc;\ + cqq1 = vld4q_lane_f16(c_tmp, cqq1, 2); c_tmp += ldc;\ + cqq1 = vld4q_lane_f16(c_tmp, cqq1, 3); c_tmp += ldc;\ + cqq1 = vld4q_lane_f16(c_tmp, cqq1, 4); c_tmp += ldc;\ + cqq1 = vld4q_lane_f16(c_tmp, cqq1, 5); c_tmp += ldc;\ + cqq1 = vld4q_lane_f16(c_tmp, cqq1, 6); c_tmp += ldc;\ + cqq1 = vld4q_lane_f16(c_tmp, cqq1, 7); c_tmp -= ldc * 7;\ + cqq1.val[0] = vfmaq_n_f16(cq01, cqq1.val[0], 
beta);\ + cqq1.val[1] = vfmaq_n_f16(cq02, cqq1.val[1], beta);\ + cqq1.val[2] = vfmaq_n_f16(cq03, cqq1.val[2], beta);\ + cqq1.val[3] = vfmaq_n_f16(cq04, cqq1.val[3], beta);\ + vst4q_lane_f16(c_tmp, cqq1, 0); c_tmp += ldc;\ + vst4q_lane_f16(c_tmp, cqq1, 1); c_tmp += ldc;\ + vst4q_lane_f16(c_tmp, cqq1, 2); c_tmp += ldc;\ + vst4q_lane_f16(c_tmp, cqq1, 3); c_tmp += ldc;\ + vst4q_lane_f16(c_tmp, cqq1, 4); c_tmp += ldc;\ + vst4q_lane_f16(c_tmp, cqq1, 5); c_tmp += ldc;\ + vst4q_lane_f16(c_tmp, cqq1, 6); c_tmp += ldc;\ + vst4q_lane_f16(c_tmp, cqq1, 7); c_tmp += ldc;\ +} + +#define SAVE_M2N4_UNIT(cd01, cd02, c_tmp) {\ + float16x4x2_t cdd1;\ + cdd1.val[0] = vdup_n_f16(0.0f);\ + cdd1.val[1] = vdup_n_f16(0.0f);\ + cdd1 = vld2_lane_f16(c_tmp, cdd1, 0); c_tmp += ldc;\ + cdd1 = vld2_lane_f16(c_tmp, cdd1, 1); c_tmp += ldc;\ + cdd1 = vld2_lane_f16(c_tmp, cdd1, 2); c_tmp += ldc;\ + cdd1 = vld2_lane_f16(c_tmp, cdd1, 3); c_tmp -= ldc * 3;\ + cdd1.val[0] = vfma_n_f16(cd01, cdd1.val[0], beta);\ + cdd1.val[1] = vfma_n_f16(cd02, cdd1.val[1], beta);\ + vst2_lane_f16(c_tmp, cdd1, 0); c_tmp += ldc;\ + vst2_lane_f16(c_tmp, cdd1, 1); c_tmp += ldc;\ + vst2_lane_f16(c_tmp, cdd1, 2); c_tmp += ldc;\ + vst2_lane_f16(c_tmp, cdd1, 3); c_tmp += ldc;\ +} + +#define SAVE_M1N4_UNIT(cd01, c_tmp) {\ + float16_t cs1 = vget_lane_f16(cd01, 0);\ + float16_t cs2 = vget_lane_f16(cd01, 1);\ + float16_t cs3 = vget_lane_f16(cd01, 2);\ + float16_t cs4 = vget_lane_f16(cd01, 3);\ + *c_tmp = *c_tmp * beta + cs1; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs2; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs3; c_tmp += ldc;\ + *c_tmp = *c_tmp * beta + cs4; c_tmp += ldc;\ +} + +#define SAVE_M16N2_UNIT(cq01, cq02, cq03, cq04, c_tmp) \ + cq01 = vfmaq_n_f16(cq01, vld1q_f16(c_tmp), beta);\ + cq02 = vfmaq_n_f16(cq02, vld1q_f16(c_tmp + 8), beta);\ + cq03 = vfmaq_n_f16(cq03, vld1q_f16(c_tmp + ldc), beta);\ + cq04 = vfmaq_n_f16(cq04, vld1q_f16(c_tmp + ldc + 8), beta);\ + vst1q_f16(c_tmp, cq01); vst1q_f16(c_tmp + 8, cq02);\ + vst1q_f16(c_tmp + ldc, cq03); vst1q_f16(c_tmp + ldc + 8, cq04);\ + c_tmp += ldc * 2; + +#define SAVE_M8N2_UNIT(cq01, cq02, c_tmp) \ + cq01 = vfmaq_n_f16(cq01, vld1q_f16(c_tmp), beta);\ + cq02 = vfmaq_n_f16(cq02, vld1q_f16(c_tmp + ldc), beta);\ + vst1q_f16(c_tmp, cq01);\ + vst1q_f16(c_tmp + ldc, cq02); c_tmp += ldc * 2; + +#define SAVE_M4N2_UNIT(cd01, cd02, c_tmp) \ + cd01 = vfma_n_f16(cd01, vld1_f16(c_tmp), beta);\ + cd02 = vfma_n_f16(cd02, vld1_f16(c_tmp + ldc), beta);\ + vst1_f16(c_tmp, cd01);\ + vst1_f16(c_tmp + ldc, cd02); c_tmp += ldc * 2; + +#define SAVE_M8N16 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M8N2_UNIT(cq01, cq02, c_tmp)\ + SAVE_M8N2_UNIT(cq03, cq04, c_tmp)\ + SAVE_M8N2_UNIT(cq05, cq06, c_tmp)\ + SAVE_M8N2_UNIT(cq07, cq08, c_tmp)\ + SAVE_M8N2_UNIT(cq09, cq10, c_tmp)\ + SAVE_M8N2_UNIT(cq11, cq12, c_tmp)\ + SAVE_M8N2_UNIT(cq13, cq14, c_tmp)\ + SAVE_M8N2_UNIT(cq15, cq16, c_tmp) + +#define SAVE_M4N16 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M4N8_UNIT(cq01, cq03, cq05, cq07, c_tmp)\ + SAVE_M4N8_UNIT(cq02, cq04, cq06, cq08, c_tmp) + +#define SAVE_M2N16 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M2N8_UNIT(cq01, cq03, c_tmp)\ + SAVE_M2N8_UNIT(cq02, cq04, c_tmp) + +#define SAVE_M1N16 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M1N8_UNIT(cq01, c_tmp)\ + SAVE_M1N8_UNIT(cq02, c_tmp) + +#define SAVE_M16N8 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M16N2_UNIT(cq01, cq02, cq03, cq04, c_tmp)\ + SAVE_M16N2_UNIT(cq05, cq06, cq07, cq08, c_tmp)\ + SAVE_M16N2_UNIT(cq09, cq10, cq11, cq12, c_tmp)\ + SAVE_M16N2_UNIT(cq13, cq14, cq15, cq16, c_tmp) + +#define SAVE_M8N8 \ 
+ float16_t *c_tmp = c_ptr;\ + SAVE_M8N2_UNIT(cq01, cq02, c_tmp)\ + SAVE_M8N2_UNIT(cq03, cq04, c_tmp)\ + SAVE_M8N2_UNIT(cq05, cq06, c_tmp)\ + SAVE_M8N2_UNIT(cq07, cq08, c_tmp) + +#define SAVE_M4N8 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M4N8_UNIT(cq01, cq02, cq03, cq04, c_tmp) + +#define SAVE_M2N8 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M2N8_UNIT(cq01, cq02, c_tmp) + +#define SAVE_M1N8 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M1N8_UNIT(cq01, c_tmp) + +#define SAVE_M16N4 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M16N2_UNIT(cq01, cq02, cq03, cq04, c_tmp)\ + SAVE_M16N2_UNIT(cq05, cq06, cq07, cq08, c_tmp) + +#define SAVE_M8N4 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M8N2_UNIT(cq01, cq02, c_tmp)\ + SAVE_M8N2_UNIT(cq03, cq04, c_tmp) + +#define SAVE_M4N4 \ + float16_t *c_tmp = c_ptr;\ + SAVE_M4N2_UNIT(cd01, cd02, c_tmp)\ + SAVE_M4N2_UNIT(cd03, cd04, c_tmp) + +#define SAVE_M2N4 \ + float16_t *c_tmp = c_ptr; SAVE_M2N4_UNIT(cd01, cd02, c_tmp) + +#define SAVE_M1N4 \ + float16_t *c_tmp = c_ptr; SAVE_M1N4_UNIT(cd01, c_tmp) + +#define SAVE_M16N2 \ + float16_t *c_tmp = c_ptr; SAVE_M16N2_UNIT(cq01, cq02, cq03, cq04, c_tmp) + +#define SAVE_M8N2 \ + float16_t *c_tmp = c_ptr; SAVE_M8N2_UNIT(cq01, cq02, c_tmp) + +#define SAVE_M4N2 \ + float16_t *c_tmp = c_ptr; SAVE_M4N2_UNIT(cd01, cd02, c_tmp) + +#define SAVE_M2N2 \ + c_ptr[0] = c_ptr[0] * beta + cs1;\ + c_ptr[1] = c_ptr[1] * beta + cs2;\ + c_ptr[ldc] = c_ptr[ldc] * beta + cs3;\ + c_ptr[ldc + 1] = c_ptr[ldc + 1] * beta + cs4;\ + +#define SAVE_M1N2 \ + c_ptr[0] = c_ptr[0] * beta + cs1;\ + c_ptr[ldc] = c_ptr[ldc] * beta + cs2;\ + +#define SAVE_M16N1 \ + cq01 = vfmaq_n_f16(cq01, vld1q_f16(c_ptr), beta);\ + cq02 = vfmaq_n_f16(cq02, vld1q_f16(c_ptr + 8), beta);\ + vst1q_f16(c_ptr, cq01); vst1q_f16(c_ptr + 8, cq02); + +#define SAVE_M8N1 \ + cq01 = vfmaq_n_f16(cq01, vld1q_f16(c_ptr), beta);\ + vst1q_f16(c_ptr, cq01); + +#define SAVE_M4N1 \ + cd01 = vfma_n_f16(cd01, vld1_f16(c_ptr), beta);\ + vst1_f16(c_ptr, cd01); + +#define SAVE_M2N1 \ + c_ptr[0] = c_ptr[0] * beta + cs1;\ + c_ptr[1] = c_ptr[1] * beta + cs2;\ + +#define SAVE_M1N1 \ + c_ptr[0] = c_ptr[0] * beta + cs1; + +#define NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(mdim, ndim) \ +static inline void\ + inline_dualpack_gemm_afloat16_t_bfloat16_t_cfloat16_t_m##mdim##_n##ndim(\ + const float16_t *a_head, const float16_t *b_head, float16_t *c_ptr,\ + uint32_t K, float16_t beta, uint32_t ldc) {\ + KERNEL_M##mdim##N##ndim\ + SAVE_M##mdim##N##ndim\ +} + +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 1) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 2) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 1) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 2) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 4) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 4) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 1) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 2) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 4) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 8) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 8) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 8) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 1) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 2) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(8, 4) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(1, 16) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(2, 16) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(4, 16) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(16, 1) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(16, 2) +NEON_HGEMM_INLINE_DUALPACK_UNIT_FUNC(16, 4) + +#define CPUID_DETECT_MNK 1000000 + +void hgemm_kernel_lm_m8n16(uint32_t M, uint32_t N, uint32_t K, float16_t beta, + const float16_t * __restrict__ sa, const float16_t * 
__restrict__ sb,
+  float16_t * __restrict__ C, uint32_t ldc) {
+
+  uint32_t n_left = N;
+  const float16_t *b_head = sb;
+  float16_t *c_head = C;
+  uint32_t acc_mnk = CPUID_DETECT_MNK;
+  uint8_t cpuid = 0, cputype = 0;
+
+  for (; n_left > 15; n_left -= 16) {
+    /* Re-detect the executing core every CPUID_DETECT_MNK accumulated
+     * MACs, so the kernel choice tracks thread migration on big.LITTLE. */
+    if (acc_mnk >= CPUID_DETECT_MNK) {
+      cpuid = sched_getcpu();
+      cputype = blas_arm_get_cpu_type(cpuid);
+      acc_mnk = 0;
+    }
+    const float16_t *a_head = sa;
+    float16_t *c_ptr = c_head;
+    uint32_t m_left = M;
+    if (cputype == 55) { /* in-order Cortex-A55: use the load-splitting kernel */
+      for (; m_left > 7; m_left -= 8) {
+        KERNEL_M8N16_A55
+        SAVE_M8N16
+        a_head += 8 * K;
+        c_ptr += 8;
+      }
+    } else {
+      for (; m_left > 7; m_left -= 8) {
+        KERNEL_M8N16_A76
+        SAVE_M8N16
+        a_head += 8 * K;
+        c_ptr += 8;
+      }
+    }
+    MICRO_COMPUTE_LM(4, 16, float16_t, float16_t, float16_t)
+    b_head += K * 16;
+    c_head += ldc * 16;
+    acc_mnk += 16 * K * M;
+  }
+
+  for (; n_left > 7; n_left -= 8) {
+    if (acc_mnk >= CPUID_DETECT_MNK) {
+      cpuid = sched_getcpu();
+      cputype = blas_arm_get_cpu_type(cpuid);
+      acc_mnk = 0;
+    }
+    const float16_t *a_head = sa;
+    float16_t *c_ptr = c_head;
+    uint32_t m_left = M;
+    if (cputype == 55) {
+      for (; m_left > 7; m_left -= 8) {
+        KERNEL_M8N8_A55
+        SAVE_M8N8
+        a_head += 8 * K;
+        c_ptr += 8;
+      }
+    } else {
+      for (; m_left > 7; m_left -= 8) {
+        KERNEL_M8N8_A76
+        SAVE_M8N8
+        a_head += 8 * K;
+        c_ptr += 8;
+      }
+    }
+    MICRO_COMPUTE_LM(4, 8, float16_t, float16_t, float16_t)
+    b_head += K * 8;
+    c_head += ldc * 8;
+    acc_mnk += 8 * K * M;
+  }
+
+  ASSEMBLE_DUALPACK_COMPUTE_LM(4, float16_t, float16_t, float16_t, 8)
+}
+
+/* Same dispatch for the N-major (LN) packed layout, with the roles of the
+ * M and N loops swapped. */
+void hgemm_kernel_ln_m16n8(uint32_t M, uint32_t N, uint32_t K, float16_t beta,
+  const float16_t * __restrict__ sa, const float16_t * __restrict__ sb,
+  float16_t * __restrict__ C, uint32_t ldc) {
+
+  uint32_t m_left = M;
+  const float16_t *a_head = sa;
+  float16_t *c_head = C;
+  uint32_t acc_mnk = CPUID_DETECT_MNK;
+  uint8_t cpuid = 0, cputype = 0;
+  for (; m_left > 15; m_left -= 16) {
+    if (acc_mnk >= CPUID_DETECT_MNK) {
+      cpuid = sched_getcpu();
+      cputype = blas_arm_get_cpu_type(cpuid);
+      acc_mnk = 0;
+    }
+    const float16_t *b_head = sb;
+    float16_t *c_ptr = c_head;
+    uint32_t n_left = N;
+    if (cputype == 55) {
+      for (; n_left > 7; n_left -= 8) {
+        KERNEL_M16N8_A55
+        SAVE_M16N8
+        b_head += 8 * K;
+        c_ptr += 8 * ldc;
+      }
+    } else {
+      for (; n_left > 7; n_left -= 8) {
+        KERNEL_M16N8_A76
+        SAVE_M16N8
+        b_head += 8 * K;
+        c_ptr += 8 * ldc;
+      }
+    }
+    MICRO_COMPUTE_LN(16, 4, float16_t, float16_t, float16_t)
+    a_head += K * 16;
+    c_head += 16;
+    acc_mnk += 16 * N * K;
+  }
+
+  for (; m_left > 7; m_left -= 8) {
+    if (acc_mnk >= CPUID_DETECT_MNK) {
+      cpuid = sched_getcpu();
+      cputype = blas_arm_get_cpu_type(cpuid);
+      acc_mnk = 0;
+    }
+    const float16_t *b_head = sb;
+    float16_t *c_ptr = c_head;
+    uint32_t n_left = N;
+    if (cputype == 55) {
+      for (; n_left > 7; n_left -= 8) {
+        KERNEL_M8N8_A55
+        SAVE_M8N8
+        b_head += 8 * K;
+        c_ptr += 8 * ldc;
+      }
+    } else {
+      for (; n_left > 7; n_left -= 8) {
+        KERNEL_M8N8_A76
+        SAVE_M8N8
+        b_head += 8 * K;
+        c_ptr += 8 * ldc;
+      }
+    }
+    MICRO_COMPUTE_LN(8, 4, float16_t, float16_t, float16_t)
+    a_head += K * 8;
+    c_head += 8;
+    acc_mnk += 8 * N * K;
+  }
+
+  ASSEMBLE_DUALPACK_COMPUTE_LN(4, float16_t, float16_t, float16_t, 8)
+}
+
diff --git a/src/neon_armv8a/extension/HgemmSkinnyDot.c b/src/neon_armv8a/extension/HgemmSkinnyDot.c
new file mode 100644
index 0000000..0e46b23
--- /dev/null
+++ b/src/neon_armv8a/extension/HgemmSkinnyDot.c
@@ -0,0 +1,350 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc.                                                    */
+/*                                                                          */
+/* Licensed under the Apache License, Version 2.0 (the "License");          */
+/* you may not use this file except in compliance with the License.         */
+/* You may obtain a copy of the License at                                  */
+/*                                                                          */
+/*     http://www.apache.org/licenses/LICENSE-2.0                           */
+/*                                                                          */
+/* Unless required by applicable law or agreed to in writing, software      */
+/* distributed under the License is distributed on an "AS IS" BASIS,        */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and      */
+/* limitations under the License.                                           */
+/*****************************************************************************/
+
+
+#include "arm_neon/ARMCompareAndSwap.h"
+#include "common/CommonSkinnyDot.h"
+#include <arm_neon.h>
+
+/* k_mask = 13 */
+/* One row of row-major A dotted with the single skinny column of B: the
+ * assembly unrolls k by 32 across four fp16 accumulators to hide FMA
+ * latency; intrinsic and scalar code reduce and handle the remainder. */
+static inline void inline_hgemm_arowmajor_bskinny_m1n1(
+  const float16_t *a_ptr, const float16_t *b_ptr, float16_t *c_ptr,
+  uint32_t k_left, uint32_t LDK, uint32_t LDM,
+  float16_t beta, bool c_rowmajor) {
+
+  float16x8_t cq1;
+  __asm__ __volatile__ (
+    "movi %[cq1].16b,#0; movi v0.16b,#0\n\t"
+    "mov v1.16b,%[cq1].16b; mov v2.16b,v0.16b\n\t"
+    "cmp %w[k_left],#32; b.lt 3f\n\t"
+    "ldr q3,[%[a_ptr]],#64; ldr q7,[%[b_ptr]],#64\n\t"
+    "ldr q4,[%[a_ptr],#-48]; ldr q8,[%[b_ptr],#-48]\n\t"
+    "ldr q5,[%[a_ptr],#-32]; ldr q9,[%[b_ptr],#-32]\n\t"
+    "ldr q6,[%[a_ptr],#-16]; ldr q10,[%[b_ptr],#-16]\n\t"
+    "cmp %w[k_left],#64; b.lt 2f\n\t"
+    ".balign 16; 1:\n\t"
+    "fmla %[cq1].8h,v3.8h,v7.8h; ldr q3,[%[a_ptr]],#64\n\t"
+    "ldr q7,[%[b_ptr]],#64; sub %w[k_left],%w[k_left],#32\n\t"
+    "fmla v0.8h,v4.8h,v8.8h; ldr q4,[%[a_ptr],#-48]\n\t"
+    "ldr q8,[%[b_ptr],#-48]; cmp %w[k_left],#64\n\t"
+    "fmla v1.8h,v5.8h,v9.8h; ldr q5,[%[a_ptr],#-32]\n\t"
+    "ldr q9,[%[b_ptr],#-32]\n\t"
+    "fmla v2.8h,v6.8h,v10.8h; ldr q6,[%[a_ptr],#-16]\n\t"
+    "ldr q10,[%[b_ptr],#-16]; b.ge 1b\n\t"
+    "2:\n\t"
+    "fmla %[cq1].8h,v3.8h,v7.8h; sub %w[k_left],%w[k_left],#32\n\t"
+    "fmla v0.8h,v4.8h,v8.8h\n\t"
+    "fmla v1.8h,v5.8h,v9.8h\n\t"
+    "fmla v2.8h,v6.8h,v10.8h\n\t"
+    "3:\n\t"
+    "cmp %w[k_left],#16; fadd %[cq1].8h,%[cq1].8h,v1.8h\n\t"
+    "fadd v0.8h,v0.8h,v2.8h; b.lt 4f\n\t"
+    "ldr q3,[%[a_ptr]],#32; ldr q7,[%[b_ptr]],#32\n\t"
+    "ldr q4,[%[a_ptr],#-16]; ldr q8,[%[b_ptr],#-16]\n\t"
+    "sub %w[k_left],%w[k_left],#16\n\t"
+    "fmla %[cq1].8h,v3.8h,v7.8h; fmla v0.8h,v4.8h,v8.8h\n\t"
+    "4:\n\t"
+    "cmp %w[k_left],#8; fadd %[cq1].8h,%[cq1].8h,v0.8h; b.lt 5f\n\t"
+    "ldr q3,[%[a_ptr]],#16; ldr q7,[%[b_ptr]],#16\n\t"
+    "sub %w[k_left],%w[k_left],#8; fmla %[cq1].8h,v3.8h,v7.8h\n\t"
+    "5:\n\t"
+    :[cq1]"=w"(cq1), [k_left]"+r"(k_left),
+     [a_ptr]"+r"(a_ptr), [b_ptr]"+r"(b_ptr)
+    ::"cc","memory","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10");
+
+  float16x4_t cd1 = vget_low_f16(vpaddq_f16(cq1, cq1));
+  if (k_left > 3) {
+    float16x4_t ad1 = vld1_f16(a_ptr); a_ptr += 4;
+    float16x4_t bd1 = vld1_f16(b_ptr); b_ptr += 4;
+    cd1 = vfma_f16(cd1, ad1, bd1); k_left -= 4;
+  }
+
+  float16_t cs1 = vget_lane_f16(cd1, 0) + vget_lane_f16(cd1, 1) +
+    vget_lane_f16(cd1, 2) + vget_lane_f16(cd1, 3);
+  for (; k_left > 0; k_left--) {
+    float16_t as1 = *a_ptr; a_ptr++;
+    float16_t bs1 = *b_ptr; b_ptr++;
+    cs1 += as1 * bs1;
+  }
+
+  *c_ptr = c_ptr[0] * beta + cs1;
+}
+
+/* k_mask = 15 */
+static inline void inline_hgemm_arowmajor_bskinny_m1n2(
+  const float16_t *a_ptr, const float16_t *b_ptr, float16_t *c_ptr,
+  uint32_t k_left, uint32_t LDK, uint32_t LDM,
+  float16_t beta, bool c_rowmajor) {
+
+  float16x8_t cq1, cq2;
+  __asm__ __volatile__ (
+    "movi %[cq1].16b,#0; movi %[cq2].16b,#0\n\t"
+    "mov
v0.16b,%[cq1].16b; mov v1.16b,%[cq2].16b\n\t" + "cmp %w[k_left],#16; b.lt 3f\n\t" + "ldr q2,[%[a_ptr]],#32; ldr q4,[%[b_ptr]],#64; ldr q6,[%[b_ptr],#-48]\n\t" + "ldr q3,[%[a_ptr],#-16]; ldr q5,[%[b_ptr],#-32]; ldr q7,[%[b_ptr],#-16]\n\t" + "cmp %w[k_left],#32; b.lt 2f\n\t" + "1:\n\t" + "fmla %[cq1].8h,v2.8h,v4.8h; ldr q4,[%[b_ptr]],#64\n\t" + "sub %w[k_left],%w[k_left],#16\n\t" + "fmla %[cq2].8h,v2.8h,v6.8h; ldr q6,[%[b_ptr],#-48]\n\t" + "ldr q2,[%[a_ptr]],#32\n\t" + "fmla v0.8h,v3.8h,v5.8h; ldr q5,[%[b_ptr],#-32]\n\t" + "cmp %w[k_left],#32\n\t" + "fmla v1.8h,v3.8h,v7.8h; ldr q7,[%[b_ptr],#-16]\n\t" + "ldr q3,[%[a_ptr],#-16]\n\t" + "b.ge 1b\n\t" + "2:\n\t" + "fmla %[cq1].8h,v2.8h,v4.8h; sub %w[k_left],%w[k_left],#16\n\t" + "fmla %[cq2].8h,v2.8h,v6.8h\n\t" + "fmla v0.8h,v3.8h,v5.8h\n\t" + "fmla v1.8h,v3.8h,v7.8h\n\t" + "3:\n\t" + "cmp %w[k_left],#8; fadd %[cq1].8h,%[cq1].8h,v0.8h\n\t" + "fadd %[cq2].8h,%[cq2].8h,v1.8h; b.lt 4f\n\t" + "ldr q2,[%[a_ptr]],#16; ldr q4,[%[b_ptr]],#32; ldr q6,[%[b_ptr],#-16]\n\t" + "sub %w[k_left],%w[k_left],#8\n\t" + "fmla %[cq1].8h,v2.8h,v4.8h; fmla %[cq2].8h,v2.8h,v6.8h\n\t" + "4:\n\t" + :[cq1]"=w"(cq1), [cq2]"=w"(cq2), [k_left]"+r"(k_left), + [a_ptr]"+r"(a_ptr), [b_ptr]"+r"(b_ptr) + ::"cc","memory","v0","v1","v2","v3","v4","v5","v6","v7"); + + cq1 = vpaddq_f16(cq1, cq2); + if (k_left > 3) { + float16x4_t ad1 = vld1_f16(a_ptr); a_ptr += 4; + float16x8_t aq1 = vcombine_f16(ad1, ad1); + float16x8_t bq1 = vld1q_f16(b_ptr); b_ptr += 8; + cq1 = vfmaq_f16(cq1, aq1, bq1); k_left -= 4; + } + + const float16x8_t cz1 = vdupq_n_f16(0); + float16x4_t cd1 = vget_low_f16(vpaddq_f16(cq1, cz1)); + if (k_left > 1) { + float16x4_t ad1; + __asm__("ld1r {%0.2s},[%1],#4":"=w"(ad1),"+r"(a_ptr)::"memory"); + float16x4_t bd1 = vld1_f16(b_ptr); b_ptr += 4; + cd1 = vfma_f16(cd1, ad1, bd1); k_left -= 2; + } + + cd1 = vpadd_f16(cd1, vget_low_f16(cz1)); + if (k_left > 0) { + float16x4_t ad1, bd1; + __asm__("ld1r {%0.4h},[%1],#2":"=w"(ad1),"+r"(a_ptr)::"memory"); + __asm__("ldr %s0,[%1],#4":"=w"(bd1),"+r"(b_ptr)::"memory"); + cd1 = vfma_f16(cd1, ad1, bd1); + } + + if (c_rowmajor) { + c_ptr[0] = c_ptr[0] * beta + vget_lane_f16(cd1, 0); + c_ptr[1] = c_ptr[1] * beta + vget_lane_f16(cd1, 1); + } else { + c_ptr[0] = c_ptr[0] * beta + vget_lane_f16(cd1, 0); + c_ptr[LDM] = c_ptr[LDM] * beta + vget_lane_f16(cd1, 1); + } +} + +/* k_mask = 13 */ +static inline void inline_hgemm_arowmajor_bskinny_m1n3( + const float16_t *a_ptr, const float16_t *b_ptr, float16_t *c_ptr, + uint32_t k_left, uint32_t LDK, uint32_t LDM, + float16_t beta, bool c_rowmajor) { + + float16x8_t cq1, cq2, cq3; + __asm__ __volatile__ ( + "movi %[cq1].16b,#0; movi %[cq2].16b,#0; movi %[cq3].16b,#0\n\t" + "mov v0.16b,%[cq1].16b; mov v1.16b,%[cq2].16b; mov v2.16b,%[cq3].16b\n\t" + "cmp %w[k_left],#16; b.lt 3f\n\t" + "ldr q3,[%[a_ptr]],#32; ldr q5,[%[b_ptr]],#96\n\t" + "ldr q7,[%[b_ptr],#-80]; ldr q9,[%[b_ptr],#-64]\n\t" + "ldr q4,[%[a_ptr],#-16]; ldr q6,[%[b_ptr],#-48]\n\t" + "ldr q8,[%[b_ptr],#-32]; ldr q10,[%[b_ptr],#-16]\n\t" + "cmp %w[k_left],#32; b.lt 2f\n\t" + "1:\n\t" + "fmla %[cq1].8h,v3.8h,v5.8h; ldr q5,[%[b_ptr]],#96\n\t" + "sub %w[k_left],%w[k_left],#16\n\t" + "fmla %[cq2].8h,v3.8h,v7.8h; ldr q7,[%[b_ptr],#-80]\n\t" + "fmla %[cq3].8h,v3.8h,v9.8h; ldr q9,[%[b_ptr],#-64]\n\t" + "ldr q3,[%[a_ptr]],#32\n\t" + "fmla v0.8h,v4.8h,v6.8h; ldr q6,[%[b_ptr],#-48]\n\t" + "cmp %w[k_left],#32\n\t" + "fmla v1.8h,v4.8h,v8.8h; ldr q8,[%[b_ptr],#-32]\n\t" + "fmla v2.8h,v4.8h,v10.8h; ldr q10,[%[b_ptr],#-16]\n\t" + "ldr 
q4,[%[a_ptr],#-16]\n\t" + "b.ge 1b\n\t" + "2:\n\t" + "fmla %[cq1].8h,v3.8h,v5.8h; sub %w[k_left],%w[k_left],#16\n\t" + "fmla %[cq2].8h,v3.8h,v7.8h\n\t" + "fmla %[cq3].8h,v3.8h,v9.8h\n\t" + "fmla v0.8h,v4.8h,v6.8h\n\t" + "fmla v1.8h,v4.8h,v8.8h\n\t" + "fmla v2.8h,v4.8h,v10.8h\n\t" + "3:\n\t" + "cmp %w[k_left],#8\n\t" + "fadd %[cq1].8h,%[cq1].8h,v0.8h\n\t" + "fadd %[cq2].8h,%[cq2].8h,v1.8h\n\t" + "fadd %[cq3].8h,%[cq3].8h,v2.8h; b.lt 4f\n\t" + "ldr q3,[%[a_ptr]],#16; ldr q5,[%[b_ptr]],#48\n\t" + "ldr q7,[%[b_ptr],#-32]; ldr q9,[%[b_ptr],#-16]\n\t" + "sub %w[k_left],%w[k_left],#8\n\t" + "fmla %[cq1].8h,v3.8h,v5.8h\n\t" + "fmla %[cq2].8h,v3.8h,v7.8h\n\t" + "fmla %[cq3].8h,v3.8h,v9.8h\n\t" + "4:\n\t" + :[cq1]"=w"(cq1), [cq2]"=w"(cq2), [cq3]"=w"(cq3), + [k_left]"+r"(k_left), [a_ptr]"+r"(a_ptr), [b_ptr]"+r"(b_ptr) + ::"cc","memory","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10"); + + float16x4_t cd1 = vadd_f16(vget_low_f16(cq1), vget_high_f16(cq1)); + float16x4_t cd2 = vadd_f16(vget_low_f16(cq2), vget_high_f16(cq2)); + float16x4_t cd3 = vadd_f16(vget_low_f16(cq3), vget_high_f16(cq3)); + if (k_left > 3) { + float16x4_t ad1 = vld1_f16(a_ptr); a_ptr += 4; + float16x4_t bd1 = vld1_f16(b_ptr); + float16x4_t bd2 = vld1_f16(b_ptr + 4); + float16x4_t bd3 = vld1_f16(b_ptr + 8); b_ptr += 12; + cd1 = vfma_f16(cd1, ad1, bd1); + cd2 = vfma_f16(cd2, ad1, bd2); + cd3 = vfma_f16(cd3, ad1, bd3); k_left -= 4; + } + + float16_t cs1 = vget_lane_f16(cd1, 0) + vget_lane_f16(cd1, 1) + + vget_lane_f16(cd1, 2) + vget_lane_f16(cd1, 3); + float16_t cs2 = vget_lane_f16(cd2, 0) + vget_lane_f16(cd2, 1) + + vget_lane_f16(cd2, 2) + vget_lane_f16(cd2, 3); + float16_t cs3 = vget_lane_f16(cd3, 0) + vget_lane_f16(cd3, 1) + + vget_lane_f16(cd3, 2) + vget_lane_f16(cd3, 3); + for (; k_left > 0; k_left--) { + float16_t as1 = *a_ptr; a_ptr++; + cs1 += as1 * b_ptr[0]; + cs2 += as1 * b_ptr[1]; + cs3 += as1 * b_ptr[2]; b_ptr += 3; + } + + if (c_rowmajor) { + c_ptr[0] = c_ptr[0] * beta + cs1; + c_ptr[1] = c_ptr[1] * beta + cs2; + c_ptr[2] = c_ptr[2] * beta + cs3; + } else { + c_ptr[0] = c_ptr[0] * beta + cs1; + c_ptr[LDM] = c_ptr[LDM] * beta + cs2; + c_ptr[LDM * 2] = c_ptr[LDM * 2] * beta + cs3; + } +} + +typedef float16_t hgemm_skinnydot_ascalar; +typedef float16_t hgemm_skinnydot_bscalar; +typedef float16_t hgemm_skinnydot_cscalar; + +static inline bool unroll_test_m1n1(uint32_t M, uint32_t K) { + return true; +} + +static inline bool unroll_test_m1n2(uint32_t M, uint32_t K) { + return true; +} + +static inline bool unroll_test_m1n3(uint32_t M, uint32_t K) { + return true; +} + +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(hgemm, 1, 13, 1, 65536, float16_t, float16_t, unroll_test) +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(hgemm, 2, 15, 1, 65536, float16_t, float16_t, unroll_test) +GEMM_SKINNY_DOT_PARALLEL_FUNC_NOINCINLINE(hgemm, 3, 13, 1, 65536, float16_t, float16_t, unroll_test) + +typedef float16_t hgemm_skinnydot_avec1; +typedef float16_t hgemm_skinnydot_bvec1; +typedef float16_t hgemm_skinnydot_cvec1; + +typedef float16x4_t hgemm_skinnydot_avec4; +typedef float16x4_t hgemm_skinnydot_bvec4; +typedef float16x4_t hgemm_skinnydot_cvec4; + +typedef float16x8_t hgemm_skinnydot_avec8; +typedef float16x8_t hgemm_skinnydot_bvec8; +typedef float16x8_t hgemm_skinnydot_cvec8; + +GEMM_SKINNY_DOT_CALC_UNIT(hgemm, 8) { + return vfmaq_f16(c_vec, a_vec, b_vec); +} + +GEMM_SKINNY_DOT_CALC_UNIT(hgemm, 4) { + return vfma_f16(c_vec, a_vec, b_vec); +} + +GEMM_SKINNY_DOT_CALC_UNIT(hgemm, 1) { + return c_vec + a_vec * b_vec; +} + 
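+/* A rough sketch of how the template in common/CommonSkinnyDot.h is
+ * expected to compose the units defined in this file for one output
+ * element (an assumption from the unit names, not the actual template):
+ *
+ *   cv8 = initc_8();                        // zeroed 8-lane accumulator
+ *   for (; k >= 8; k -= 8) cv8 = calc_8(cv8, loada_8(a), loadb_8(b));
+ *   cv4 = reduc_8to4(cv8);                  // pairwise add, 8 -> 4 lanes
+ *   for (; k >= 4; k -= 4) cv4 = calc_4(cv4, loada_4(a), loadb_4(b));
+ *   cs  = reduc_4to1(cv4);                  // horizontal sum to scalar
+ *   for (; k >  0; k--)    cs  = calc_1(cs, loada_1(a), loadb_1(b));
+ *
+ * with a and b advancing by the consumed element count after each load. */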
+GEMM_SKINNY_DOT_LOADA_UNIT(hgemm, 8) { + __asm__("prfm pldl1keep,[%0,#80]"::"r"(a_ptr):); + return vld1q_f16(a_ptr); +} + +GEMM_SKINNY_DOT_LOADA_UNIT(hgemm, 4) { + __asm__("prfm pldl1keep,[%0,#72]"::"r"(a_ptr):); + return vld1_f16(a_ptr); +} + +GEMM_SKINNY_DOT_LOADA_UNIT(hgemm, 1) { + return *a_ptr; +} + +GEMM_SKINNY_DOT_LOADB_UNIT(hgemm, 8) { + return vld1q_f16(b_ptr); +} + +GEMM_SKINNY_DOT_LOADB_UNIT(hgemm, 4) { + return vld1_f16(b_ptr); +} + +GEMM_SKINNY_DOT_LOADB_UNIT(hgemm, 1) { + return *b_ptr; +} + +GEMM_SKINNY_DOT_REDUC_UNIT(hgemm, 8, 4) { + return vget_low_f16(vpaddq_f16(c_vec, c_vec)); +} + +GEMM_SKINNY_DOT_REDUC_UNIT(hgemm, 4, 1) { + float cs1 = vget_lane_f16(c_vec, 0); + float cs2 = vget_lane_f16(c_vec, 1); + float cs3 = vget_lane_f16(c_vec, 2); + float cs4 = vget_lane_f16(c_vec, 3); + cs1 += cs2; cs3 += cs4; + return cs1 + cs3; +} + +GEMM_SKINNY_DOT_INITC_UNIT(hgemm, 8) { + return vdupq_n_f16(0); +} + +GEMM_SKINNY_DOT_INITC_UNIT(hgemm, 4) { + return vdup_n_f16(0); +} + +GEMM_SKINNY_DOT_INITC_UNIT(hgemm, 1) { + return 0; +} + +GEMM_SKINNY_DOT_PARALLEL_FUNC(hgemm, 4, 13, 7, 65536, float16_t, float16_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(hgemm, 5, 13, 7, 65536, float16_t, float16_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(hgemm, 6, 13, 7, 65536, float16_t, float16_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(hgemm, 7, 13, 3, 65536, float16_t, float16_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(hgemm, 8, 13, 3, 65536, float16_t, float16_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(hgemm, 9, 13, 3, 65536, float16_t, float16_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(hgemm, 10, 13, 3, 65536, float16_t, float16_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(hgemm, 11, 13, 3, 65536, float16_t, float16_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(hgemm, 12, 13, 3, 65536, float16_t, float16_t) diff --git a/src/neon_armv8a/extension/HgemmSkinnyGer.c b/src/neon_armv8a/extension/HgemmSkinnyGer.c new file mode 100644 index 0000000..ec47b8e --- /dev/null +++ b/src/neon_armv8a/extension/HgemmSkinnyGer.c @@ -0,0 +1,232 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/
+/*****************************************************************************/
+
+
+#include "arm_neon/ARMCompareAndSwap.h"
+#include "common/CommonSkinnyGer.h"
+
+#include <arm_neon.h>
+
+/* Unit definitions consumed by the GER-style skinny-GEMM template in
+ * common/CommonSkinnyGer.h: each CALC unit accumulates an M-direction
+ * vector of A times one broadcast element of B into the block of C that
+ * is held in registers. */
+typedef float16_t hgemm_skinnyger_ascalar;
+typedef float16_t hgemm_skinnyger_bscalar;
+typedef float16_t hgemm_skinnyger_cscalar;
+
+typedef float16_t hgemm_skinnyger_avec1;
+typedef float16_t hgemm_skinnyger_bvec1;
+typedef float16_t hgemm_skinnyger_cvec1;
+
+typedef float16x4_t hgemm_skinnyger_avec4;
+typedef float16x4_t hgemm_skinnyger_bvec4;
+typedef float16x4_t hgemm_skinnyger_cvec4;
+
+typedef float16x8_t hgemm_skinnyger_avec8;
+typedef float16x8_t hgemm_skinnyger_bvec8;
+typedef float16x8_t hgemm_skinnyger_cvec8;
+
+typedef float16x8x2_t hgemm_skinnyger_avec16;
+typedef float16x8x2_t hgemm_skinnyger_bvec16;
+typedef float16x8x2_t hgemm_skinnyger_cvec16;
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 16, 4, 1) {
+  float16x8x2_t ret;
+  ret.val[0] = vfmaq_lane_f16(c_vec.val[0], a_vec.val[0], b_vec, 0);
+  ret.val[1] = vfmaq_lane_f16(c_vec.val[1], a_vec.val[1], b_vec, 0);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 16, 4, 2) {
+  float16x8x2_t ret;
+  ret.val[0] = vfmaq_lane_f16(c_vec.val[0], a_vec.val[0], b_vec, 1);
+  ret.val[1] = vfmaq_lane_f16(c_vec.val[1], a_vec.val[1], b_vec, 1);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 16, 4, 3) {
+  float16x8x2_t ret;
+  ret.val[0] = vfmaq_lane_f16(c_vec.val[0], a_vec.val[0], b_vec, 2);
+  ret.val[1] = vfmaq_lane_f16(c_vec.val[1], a_vec.val[1], b_vec, 2);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 16, 4, 4) {
+  float16x8x2_t ret;
+  ret.val[0] = vfmaq_lane_f16(c_vec.val[0], a_vec.val[0], b_vec, 3);
+  ret.val[1] = vfmaq_lane_f16(c_vec.val[1], a_vec.val[1], b_vec, 3);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 16, 1, 1) {
+  float16x8x2_t ret;
+  ret.val[0] = vfmaq_n_f16(c_vec.val[0], a_vec.val[0], b_vec);
+  ret.val[1] = vfmaq_n_f16(c_vec.val[1], a_vec.val[1], b_vec);
+  return ret;
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 8, 4, 1) {
+  return vfmaq_lane_f16(c_vec, a_vec, b_vec, 0);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 8, 4, 2) {
+  return vfmaq_lane_f16(c_vec, a_vec, b_vec, 1);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 8, 4, 3) {
+  return vfmaq_lane_f16(c_vec, a_vec, b_vec, 2);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 8, 4, 4) {
+  return vfmaq_lane_f16(c_vec, a_vec, b_vec, 3);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 8, 1, 1) {
+  return vfmaq_n_f16(c_vec, a_vec, b_vec);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 4, 4, 1) {
+  return vfma_lane_f16(c_vec, a_vec, b_vec, 0);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 4, 4, 2) {
+  return vfma_lane_f16(c_vec, a_vec, b_vec, 1);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 4, 4, 3) {
+  return vfma_lane_f16(c_vec, a_vec, b_vec, 2);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 4, 4, 4) {
+  return vfma_lane_f16(c_vec, a_vec, b_vec, 3);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 4, 1, 1) {
+  return vfma_n_f16(c_vec, a_vec, b_vec);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 1, 4, 1) {
+  return c_vec + a_vec * vget_lane_f16(b_vec, 0);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 1, 4, 2) {
+  return c_vec + a_vec * vget_lane_f16(b_vec, 1);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 1, 4, 3) {
+  return c_vec + a_vec * vget_lane_f16(b_vec, 2);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 1, 4, 4) {
+  return c_vec + a_vec * vget_lane_f16(b_vec, 3);
+}
+
+GEMM_SKINNY_GER_CALC_UNIT(hgemm, 1, 1, 1) {
+  return c_vec + a_vec * b_vec;
+}
+
+GEMM_SKINNY_GER_LOADA_UNIT(hgemm, 16) {
+  float16x8x2_t ret;
+  ret.val[0] = vld1q_f16(a_ptr);
+  ret.val[1] = vld1q_f16(a_ptr + 8);
__asm__("prfm pldl1keep,[%0,#96]"::"r"(a_ptr):); + return ret; +} + +GEMM_SKINNY_GER_LOADA_UNIT(hgemm, 8) { + __asm__("prfm pldl1keep,[%0,#80]"::"r"(a_ptr):); + return vld1q_f16(a_ptr); +} + +GEMM_SKINNY_GER_LOADA_UNIT(hgemm, 4) { + return vld1_f16(a_ptr); +} + +GEMM_SKINNY_GER_LOADA_UNIT(hgemm, 1) { + return *a_ptr; +} + +GEMM_SKINNY_GER_LOADC_UNIT(hgemm, 16) { + float16x8x2_t ret; + ret.val[0] = vld1q_f16(c_ptr); + ret.val[1] = vld1q_f16(c_ptr + 8); + return ret; +} + +GEMM_SKINNY_GER_LOADC_UNIT(hgemm, 8) { + return vld1q_f16(c_ptr); +} + +GEMM_SKINNY_GER_LOADC_UNIT(hgemm, 4) { + return vld1_f16(c_ptr); +} + +GEMM_SKINNY_GER_LOADC_UNIT(hgemm, 1) { + return *c_ptr; +} + +GEMM_SKINNY_GER_STOREC_UNIT(hgemm, 16) { + vst1q_f16(c_ptr, c_vec.val[0]); + vst1q_f16(c_ptr + 8, c_vec.val[1]); +} + +GEMM_SKINNY_GER_STOREC_UNIT(hgemm, 8) { + vst1q_f16(c_ptr, c_vec); +} + +GEMM_SKINNY_GER_STOREC_UNIT(hgemm, 4) { + vst1_f16(c_ptr, c_vec); +} + +GEMM_SKINNY_GER_STOREC_UNIT(hgemm, 1) { + *c_ptr = c_vec; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(hgemm, 4) { + float16x4_t ret = vdup_n_f16(0); + float16_t b1 = *b_ptr; b_ptr += ldb; + float16_t b2 = *b_ptr; b_ptr += ldb; + float16_t b3 = *b_ptr; b_ptr += ldb; + float16_t b4 = *b_ptr; + ret = vset_lane_f16(b1, ret, 0); + ret = vset_lane_f16(b2, ret, 1); + ret = vset_lane_f16(b3, ret, 2); + ret = vset_lane_f16(b4, ret, 3); + return ret; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BROWMAJOR(hgemm, 1) { + return *b_ptr; +} + +GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(hgemm, 4) { + return vld1_f16(b_ptr); +} + +GEMM_SKINNY_GER_LOADB_UNIT_BCOLMAJOR(hgemm, 1) { + return *b_ptr; +} + +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 1, 5, 29, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 2, 5, 29, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 3, 5, 29, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 4, 5, 29, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 5, 5, 29, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 6, 5, 29, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 7, 5, 29, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 8, 5, 13, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 9, 5, 13, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 10, 5, 13, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 11, 5, 13, 16384, float16_t, float16_t) +GEMM_SKINNY_GER_PARALLEL_FUNC(hgemm, 12, 5, 13, 16384, float16_t, float16_t) + diff --git a/src/neon_armv8a/extension/S8S32DotGemmCopy.c b/src/neon_armv8a/extension/S8S32DotGemmCopy.c new file mode 100644 index 0000000..3cb9665 --- /dev/null +++ b/src/neon_armv8a/extension/S8S32DotGemmCopy.c @@ -0,0 +1,30 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/
+/*****************************************************************************/
+
+
+#ifdef GEMM_UNSIGNED_INT
+#undef GEMM_UNSIGNED_INT
+#endif
+
+#include "common/CommonCopy.h"
+#include "neon_armv8a/I8I32DotGemmCopy.h"
+
+GENERIC_NCOPY_FUNC(s8s32dotgemm, int8_t, int32_t, 8)
+GENERIC_NCOPY_FUNC(s8s32dotgemm, int8_t, int32_t, 12)
+
+TCOPY_FUNC_TEMPLATE(s8s32dotgemm_int8_t_int32_t_tcopy_unroll, 8)
+TCOPY_FUNC_TEMPLATE(s8s32dotgemm_int8_t_int32_t_tcopy_unroll, 12)
+
diff --git a/src/neon_armv8a/extension/S8S32DotGemmKernel.c b/src/neon_armv8a/extension/S8S32DotGemmKernel.c
new file mode 100644
index 0000000..409ca4f
--- /dev/null
+++ b/src/neon_armv8a/extension/S8S32DotGemmKernel.c
@@ -0,0 +1,116 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+#ifdef GEMM_UNSIGNED_INT
+#undef GEMM_UNSIGNED_INT
+#endif
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "common/CommonKernel.h"
+#include "arm_neon/ARMCpuType.h"
+#include <sched.h>
+#include "neon_armv8a/I8I32DotGemmKernel.h"
+
+#define CPUID_DETECT_MNK 1000000
+
+/* Re-detect the executing core after roughly CPUID_DETECT_MNK accumulated
+ * multiply-add operations, so the A55/A76 kernel choice tracks thread
+ * migration between big and little cores. */
+void s8s32dotgemm_kernel_lm_m8n12(uint32_t M, uint32_t N, uint32_t K, int32_t beta,
+  const int32_t * __restrict__ sa, const int32_t * __restrict__ sb,
+  int32_t * __restrict__ C, uint32_t ldc) {
+
+  uint32_t n_left = N;
+  const int32_t *b_head = sb;
+  int32_t *c_head = C;
+  uint32_t acc_mnk = CPUID_DETECT_MNK;
+  uint8_t cpuid = 0, cputype = 0;
+
+  for (; n_left > 11; n_left -= 12) {
+    if (acc_mnk >= CPUID_DETECT_MNK) {
+      cpuid = sched_getcpu();
+      cputype = blas_arm_get_cpu_type(cpuid);
+      acc_mnk = 0;
+    }
+    const int32_t *a_head = sa;
+    int32_t *c_ptr = c_head;
+    uint32_t m_left = M;
+    if (cputype == 55) {
+      for (; m_left > 7; m_left -= 8) {
+        KERNEL_M8N12_TEMPLATE(A55)
+        SAVE_M8N12
+        a_head += 8 * K;
+        c_ptr += 8;
+      }
+    } else {
+      for (; m_left > 7; m_left -= 8) {
+        KERNEL_M8N12_TEMPLATE(A76)
+        SAVE_M8N12
+        a_head += 8 * K;
+        c_ptr += 8;
+      }
+    }
+    MICRO_COMPUTE_LM(4, 12, int32_t, int32_t, int32_t)
+    b_head += K * 12;
+    c_head += ldc * 12;
+    acc_mnk += 12 * K * M;
+  }
+
+  ASSEMBLE_DUALPACK_COMPUTE_LM(8, int32_t, int32_t, int32_t, 8)
+}
+
+void s8s32dotgemm_kernel_ln_m12n8(uint32_t M, uint32_t N, uint32_t K, int32_t beta,
+  const int32_t * __restrict__ sa, const int32_t * __restrict__ sb,
+  int32_t * __restrict__ C, uint32_t ldc) {
+
+  uint32_t m_left = M;
+  const int32_t *a_head = sa;
+  int32_t *c_head = C;
+  uint32_t acc_mnk = CPUID_DETECT_MNK;
+  uint8_t cpuid = 0, cputype = 0;
+  for (; m_left > 11; m_left -= 12) {
+    if (acc_mnk >= CPUID_DETECT_MNK) {
+      cpuid = sched_getcpu();
+      cputype = blas_arm_get_cpu_type(cpuid);
+      acc_mnk = 0;
+    }
+    const int32_t *b_head = sb;
+    int32_t *c_ptr = c_head;
+    uint32_t n_left = N;
+    if (cputype == 55) {
+      for (; n_left > 7; n_left -= 8) {
+        KERNEL_M12N8_TEMPLATE(A55)
+        SAVE_M12N8
+        b_head += 8 * K;
+        c_ptr
+= 8 * ldc; + } + } else { + for (; n_left > 7; n_left -= 8) { + KERNEL_M12N8_TEMPLATE(A76) + SAVE_M12N8 + b_head += 8 * K; + c_ptr += 8 * ldc; + } + } + MICRO_COMPUTE_LN(12, 4, int32_t, int32_t, int32_t) + a_head += K * 12; + c_head += 12; + acc_mnk += 12 * N * K; + } + + ASSEMBLE_DUALPACK_COMPUTE_LN(8, int32_t, int32_t, int32_t, 8) +} + diff --git a/src/neon_armv8a/extension/S8S32DotGemmSkinnyDot.c b/src/neon_armv8a/extension/S8S32DotGemmSkinnyDot.c new file mode 100644 index 0000000..c2fab56 --- /dev/null +++ b/src/neon_armv8a/extension/S8S32DotGemmSkinnyDot.c @@ -0,0 +1,37 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifdef GEMM_UNSIGNED_INT +#undef GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonSkinnyDot.h" +#include "arm_neon/NeonI8I32DotGemmSkinnyDot.h" + +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 1, 29, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 2, 29, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 3, 29, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 4, 29, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 5, 29, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 6, 29, 7, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 7, 29, 3, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 8, 29, 3, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 9, 29, 3, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 10, 29, 3, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 11, 29, 3, 131072, int8_t, int8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(s8s32dotgemm, 12, 29, 3, 131072, int8_t, int8_t) diff --git a/src/neon_armv8a/extension/U8U32DotGemmCopy.c b/src/neon_armv8a/extension/U8U32DotGemmCopy.c new file mode 100644 index 0000000..16f42ca --- /dev/null +++ b/src/neon_armv8a/extension/U8U32DotGemmCopy.c @@ -0,0 +1,30 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/
+/*****************************************************************************/
+
+
+#ifndef GEMM_UNSIGNED_INT
+#define GEMM_UNSIGNED_INT
+#endif
+
+#include "common/CommonCopy.h"
+#include "neon_armv8a/I8I32DotGemmCopy.h"
+
+GENERIC_NCOPY_FUNC(u8u32dotgemm, uint8_t, uint32_t, 8)
+GENERIC_NCOPY_FUNC(u8u32dotgemm, uint8_t, uint32_t, 12)
+
+TCOPY_FUNC_TEMPLATE(u8u32dotgemm_uint8_t_uint32_t_tcopy_unroll, 8)
+TCOPY_FUNC_TEMPLATE(u8u32dotgemm_uint8_t_uint32_t_tcopy_unroll, 12)
+
diff --git a/src/neon_armv8a/extension/U8U32DotGemmKernel.c b/src/neon_armv8a/extension/U8U32DotGemmKernel.c
new file mode 100644
index 0000000..ca2b01c
--- /dev/null
+++ b/src/neon_armv8a/extension/U8U32DotGemmKernel.c
@@ -0,0 +1,116 @@
+/*****************************************************************************/
+/* Copyright YouDao, Inc. */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+#ifndef GEMM_UNSIGNED_INT
+#define GEMM_UNSIGNED_INT
+#endif
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "common/CommonKernel.h"
+#include "arm_neon/ARMCpuType.h"
+#include <sched.h>
+#include "neon_armv8a/I8I32DotGemmKernel.h"
+
+#define CPUID_DETECT_MNK 1000000
+
+/* Same core-type re-detection scheme as the signed-int kernel above. */
+void u8u32dotgemm_kernel_lm_m8n12(uint32_t M, uint32_t N, uint32_t K, uint32_t beta,
+  const uint32_t * __restrict__ sa, const uint32_t * __restrict__ sb,
+  uint32_t * __restrict__ C, uint32_t ldc) {
+
+  uint32_t n_left = N;
+  const uint32_t *b_head = sb;
+  uint32_t *c_head = C;
+  uint32_t acc_mnk = CPUID_DETECT_MNK;
+  uint8_t cpuid = 0, cputype = 0;
+
+  for (; n_left > 11; n_left -= 12) {
+    if (acc_mnk >= CPUID_DETECT_MNK) {
+      cpuid = sched_getcpu();
+      cputype = blas_arm_get_cpu_type(cpuid);
+      acc_mnk = 0;
+    }
+    const uint32_t *a_head = sa;
+    uint32_t *c_ptr = c_head;
+    uint32_t m_left = M;
+    if (cputype == 55) {
+      for (; m_left > 7; m_left -= 8) {
+        KERNEL_M8N12_TEMPLATE(A55)
+        SAVE_M8N12
+        a_head += 8 * K;
+        c_ptr += 8;
+      }
+    } else {
+      for (; m_left > 7; m_left -= 8) {
+        KERNEL_M8N12_TEMPLATE(A76)
+        SAVE_M8N12
+        a_head += 8 * K;
+        c_ptr += 8;
+      }
+    }
+    MICRO_COMPUTE_LM(4, 12, uint32_t, uint32_t, uint32_t)
+    b_head += K * 12;
+    c_head += ldc * 12;
+    acc_mnk += 12 * K * M;
+  }
+
+  ASSEMBLE_DUALPACK_COMPUTE_LM(8, uint32_t, uint32_t, uint32_t, 8)
+}
+
+void u8u32dotgemm_kernel_ln_m12n8(uint32_t M, uint32_t N, uint32_t K, uint32_t beta,
+  const uint32_t * __restrict__ sa, const uint32_t * __restrict__ sb,
+  uint32_t * __restrict__ C, uint32_t ldc) {
+
+  uint32_t m_left = M;
+  const uint32_t *a_head = sa;
+  uint32_t *c_head = C;
+  uint32_t acc_mnk = CPUID_DETECT_MNK;
+  uint8_t cpuid = 0, cputype = 0;
+  for (; m_left > 11; m_left -= 12) {
+    if (acc_mnk >= CPUID_DETECT_MNK) {
+      cpuid = sched_getcpu();
+      cputype = blas_arm_get_cpu_type(cpuid);
+      acc_mnk = 0;
+    }
+    const uint32_t *b_head = sb;
+    uint32_t *c_ptr = c_head;
+    uint32_t n_left = N;
+    if (cputype == 55) {
+      for (; n_left > 7; n_left -= 8) {
+        KERNEL_M12N8_TEMPLATE(A55)
SAVE_M12N8 + b_head += 8 * K; + c_ptr += 8 * ldc; + } + } else { + for (; n_left > 7; n_left -= 8) { + KERNEL_M12N8_TEMPLATE(A76) + SAVE_M12N8 + b_head += 8 * K; + c_ptr += 8 * ldc; + } + } + MICRO_COMPUTE_LN(12, 4, uint32_t, uint32_t, uint32_t) + a_head += K * 12; + c_head += 12; + acc_mnk += 12 * N * K; + } + + ASSEMBLE_DUALPACK_COMPUTE_LN(8, uint32_t, uint32_t, uint32_t, 8) +} + diff --git a/src/neon_armv8a/extension/U8U32DotGemmSkinnyDot.c b/src/neon_armv8a/extension/U8U32DotGemmSkinnyDot.c new file mode 100644 index 0000000..c11849e --- /dev/null +++ b/src/neon_armv8a/extension/U8U32DotGemmSkinnyDot.c @@ -0,0 +1,37 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#ifndef GEMM_UNSIGNED_INT +#define GEMM_UNSIGNED_INT +#endif + +#include "arm_neon/ARMCompareAndSwap.h" +#include "common/CommonSkinnyDot.h" +#include "arm_neon/NeonI8I32DotGemmSkinnyDot.h" + +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 1, 29, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 2, 29, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 3, 29, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 4, 29, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 5, 29, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 6, 29, 7, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 7, 29, 3, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 8, 29, 3, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 9, 29, 3, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 10, 29, 3, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 11, 29, 3, 131072, uint8_t, uint8_t) +GEMM_SKINNY_DOT_PARALLEL_FUNC(u8u32dotgemm, 12, 29, 3, 131072, uint8_t, uint8_t) diff --git a/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.c b/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.c new file mode 100644 index 0000000..e8295e6 --- /dev/null +++ b/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA35.c @@ -0,0 +1,72 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA35.h" +#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.h" +#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotDriver.h" + +DRIVER_PURE_PACK(a35, 4, 10240, 3, 4) +DRIVER_PURE_PACK(a35, 5, 8192, 3, 4) +DRIVER_PURE_PACK(a35, 6, 8192, 3, 4) +DRIVER_PURE_PACK(a35, 7, 6144, 3, 3) +DRIVER_PURE_PACK(a35, 8, 6144, 3, 3) +DRIVER_PURE_PACK(a35, 9, 5120, 4, 4) +DRIVER_PURE_PACK(a35, 10, 5120, 0, 4) +DRIVER_PURE_PACK(a35, 11, 4096, 4, 4) +DRIVER_PURE_PACK(a35, 12, 4096, 0, 4) +DRIVER_PURE_PACK(a35, 13, 3584, 4, 3) +DRIVER_PURE_PACK(a35, 14, 3584, 0, 3) +DRIVER_PURE_PACK(a35, 15, 3072, 4, 3) +DRIVER_PURE_PACK(a35, 16, 3072, 0, 3) +DRIVER_PURE_PACK(a35, 17, 2560, 4, 3) +DRIVER_PURE_PACK(a35, 18, 2560, 4, 3) + +DRIVER_MIX2_PACK(a35, 19, 2560, 0, 4, 10, 9, 4) +DRIVER_MIX2_PACK(a35, 20, 2560, 4, 4, 11, 9, 4) +DRIVER_MIX2_PACK(a35, 21, 2048, 0, 4, 12, 9, 4) +DRIVER_MIX2_PACK(a35, 22, 2048, 0, 3, 14, 8, 3) +DRIVER_MIX2_PACK(a35, 23, 2048, 4, 3, 15, 8, 3) +DRIVER_MIX2_PACK(a35, 24, 2048, 0, 3, 16, 8, 3) +DRIVER_MIX2_PACK(a35, 25, 2048, 4, 3, 17, 8, 3) +DRIVER_MIX2_PACK(a35, 26, 1536, 4, 3, 18, 8, 3) +DRIVER_MIX2_PACK(a35, 27, 1536, 0, 4, 14, 13, 3) +DRIVER_MIX2_PACK(a35, 28, 1536, 4, 4, 15, 13, 3) +DRIVER_MIX2_PACK(a35, 29, 1536, 0, 4, 16, 13, 3) +DRIVER_MIX2_PACK(a35, 30, 1536, 4, 4, 17, 13, 3) +DRIVER_MIX2_PACK(a35, 31, 1536, 4, 4, 18, 13, 3) +DRIVER_MIX2_PACK(a35, 32, 1536, 4, 0, 18, 14, 3) +DRIVER_MIX2_PACK(a35, 33, 1536, 4, 4, 18, 15, 3) +DRIVER_MIX2_PACK(a35, 34, 1280, 4, 0, 18, 16, 3) +DRIVER_MIX2_PACK(a35, 35, 1280, 4, 4, 18, 17, 3) +DRIVER_MIX2_PACK(a35, 36, 1280, 4, 4, 18, 18, 3) + +DRIVER_MIX3_PACK(a35, 37, 1280, 0, 4, 3, 16, 13, 8, 3) +DRIVER_MIX3_PACK(a35, 38, 1280, 4, 4, 3, 17, 13, 8, 3) +DRIVER_MIX3_PACK(a35, 39, 1280, 4, 4, 3, 18, 13, 8, 3) +DRIVER_MIX3_PACK(a35, 40, 1280, 4, 0, 3, 18, 14, 8, 3) +DRIVER_MIX3_PACK(a35, 41, 1024, 4, 4, 3, 18, 15, 8, 3) +DRIVER_MIX3_PACK(a35, 42, 1024, 4, 0, 3, 18, 16, 8, 3) +DRIVER_MIX3_PACK(a35, 43, 1024, 4, 4, 3, 18, 17, 8, 3) +DRIVER_MIX3_PACK(a35, 44, 1024, 4, 4, 3, 18, 18, 8, 3) +DRIVER_MIX3_PACK(a35, 45, 1024, 4, 0, 4, 18, 14, 13, 3) +DRIVER_MIX3_PACK(a35, 46, 1024, 4, 0, 0, 18, 14, 14, 3) +DRIVER_MIX3_PACK(a35, 47, 1024, 4, 4, 0, 18, 15, 14, 3) +DRIVER_MIX3_PACK(a35, 48, 1024, 4, 4, 4, 18, 15, 15, 3) +DRIVER_MIX3_PACK(a35, 49, 1024, 4, 0, 4, 18, 16, 15, 3) +DRIVER_MIX3_PACK(a35, 50, 1024, 4, 0, 0, 18, 16, 16, 3) + + diff --git a/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA53.c b/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA53.c new file mode 100644 index 0000000..8587cd1 --- /dev/null +++ b/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA53.c @@ -0,0 +1,71 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA53.h" +#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.h" +#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotDriver.h" + +DRIVER_PURE_PACK(a53, 4, 10240, 1, 4) +DRIVER_PURE_PACK(a53, 5, 8192, 1, 4) +DRIVER_PURE_PACK(a53, 6, 8192, 1, 4) +DRIVER_PURE_PACK(a53, 7, 6144, 1, 4) +DRIVER_PURE_PACK(a53, 8, 6144, 1, 4) +DRIVER_PURE_PACK(a53, 9, 5120, 1, 4) +DRIVER_PURE_PACK(a53, 10, 5120, 1, 4) +DRIVER_PURE_PACK(a53, 11, 4096, 1, 4) +DRIVER_PURE_PACK(a53, 12, 4096, 1, 4) +DRIVER_PURE_PACK(a53, 13, 3584, 1, 4) +DRIVER_PURE_PACK(a53, 14, 3584, 1, 4) +DRIVER_PURE_PACK(a53, 15, 3072, 2, 4) +DRIVER_PURE_PACK(a53, 16, 3072, 2, 4) +DRIVER_PURE_PACK(a53, 17, 2048, 2, 4) +DRIVER_PURE_PACK(a53, 18, 2048, 2, 4) +DRIVER_PURE_PACK(a53, 19, 2048, 2, 4) +DRIVER_PURE_PACK(a53, 20, 2048, 2, 4) +DRIVER_PURE_PACK(a53, 21, 2048, 2, 4) +DRIVER_PURE_PACK(a53, 22, 2048, 2, 4) +DRIVER_PURE_PACK(a53, 23, 2048, 0, 4) +DRIVER_PURE_PACK(a53, 24, 2048, 0, 4) +DRIVER_PURE_PACK(a53, 25, 1536, 0, 4) +DRIVER_PURE_PACK(a53, 26, 1536, 0, 4) + +DRIVER_MIX2_PACK(a53, 27, 1536, 2, 1, 15, 12, 4) +DRIVER_MIX2_PACK(a53, 28, 1536, 2, 1, 16, 12, 4) +DRIVER_MIX2_PACK(a53, 29, 1536, 2, 1, 17, 12, 4) +DRIVER_MIX2_PACK(a53, 30, 1536, 2, 1, 18, 12, 4) +DRIVER_MIX2_PACK(a53, 31, 1536, 2, 1, 19, 12, 4) +DRIVER_MIX2_PACK(a53, 32, 1536, 2, 1, 20, 12, 4) +DRIVER_MIX2_PACK(a53, 33, 1536, 2, 1, 21, 12, 4) +DRIVER_MIX2_PACK(a53, 34, 1280, 2, 1, 22, 12, 4) +DRIVER_MIX2_PACK(a53, 35, 1280, 0, 1, 23, 12, 4) +DRIVER_MIX2_PACK(a53, 36, 1280, 0, 1, 24, 12, 4) +DRIVER_MIX2_PACK(a53, 37, 1280, 0, 1, 25, 12, 4) +DRIVER_MIX2_PACK(a53, 38, 1280, 0, 1, 26, 12, 4) +DRIVER_MIX2_PACK(a53, 39, 1280, 0, 2, 24, 15, 4) +DRIVER_MIX2_PACK(a53, 40, 1280, 0, 2, 24, 16, 4) +DRIVER_MIX2_PACK(a53, 41, 1024, 0, 2, 24, 17, 4) +DRIVER_MIX2_PACK(a53, 42, 1024, 0, 2, 24, 18, 4) +DRIVER_MIX2_PACK(a53, 43, 1024, 0, 2, 24, 19, 4) +DRIVER_MIX2_PACK(a53, 44, 1024, 0, 2, 24, 20, 4) +DRIVER_MIX2_PACK(a53, 45, 1024, 0, 2, 24, 21, 4) +DRIVER_MIX2_PACK(a53, 46, 1024, 0, 2, 24, 22, 4) +DRIVER_MIX2_PACK(a53, 47, 1024, 0, 0, 24, 23, 4) +DRIVER_MIX2_PACK(a53, 48, 1024, 0, 0, 24, 24, 4) +DRIVER_MIX2_PACK(a53, 49, 1024, 0, 0, 25, 24, 4) +DRIVER_MIX2_PACK(a53, 50, 1024, 0, 0, 26, 24, 4) + + diff --git a/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA7x.c b/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA7x.c new file mode 100644 index 0000000..b1299ca --- /dev/null +++ b/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotA7x.c @@ -0,0 +1,71 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotKernelA7x.h" +#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.h" +#include "neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotDriver.h" + +DRIVER_PURE_PACK(a7x, 4, 10240, 1, 4) +DRIVER_PURE_PACK(a7x, 5, 8192, 1, 4) +DRIVER_PURE_PACK(a7x, 6, 8192, 1, 4) +DRIVER_PURE_PACK(a7x, 7, 6144, 1, 4) +DRIVER_PURE_PACK(a7x, 8, 6144, 1, 4) +DRIVER_PURE_PACK(a7x, 9, 5120, 1, 4) +DRIVER_PURE_PACK(a7x, 10, 5120, 1, 4) +DRIVER_PURE_PACK(a7x, 11, 4096, 1, 4) +DRIVER_PURE_PACK(a7x, 12, 4096, 1, 4) +DRIVER_PURE_PACK(a7x, 13, 3584, 1, 3) +DRIVER_PURE_PACK(a7x, 14, 3584, 1, 3) +DRIVER_PURE_PACK(a7x, 15, 3072, 1, 3) +DRIVER_PURE_PACK(a7x, 16, 3072, 1, 3) +DRIVER_PURE_PACK(a7x, 17, 2560, 1, 3) +DRIVER_PURE_PACK(a7x, 18, 2560, 1, 3) +DRIVER_PURE_PACK(a7x, 19, 2560, 1, 3) +DRIVER_PURE_PACK(a7x, 20, 2560, 1, 3) +DRIVER_PURE_PACK(a7x, 21, 2048, 1, 3) +DRIVER_PURE_PACK(a7x, 22, 2048, 1, 3) +DRIVER_PURE_PACK(a7x, 23, 2048, 1, 3) +DRIVER_PURE_PACK(a7x, 24, 2048, 1, 3) +DRIVER_PURE_PACK(a7x, 25, 2048, 1, 3) +DRIVER_PURE_PACK(a7x, 26, 1536, 1, 3) + +DRIVER_MIX2_PACK(a7x, 27, 1536, 1, 1, 14, 13, 3) +DRIVER_MIX2_PACK(a7x, 28, 1536, 1, 1, 15, 13, 3) +DRIVER_MIX2_PACK(a7x, 29, 1536, 1, 1, 16, 13, 3) +DRIVER_MIX2_PACK(a7x, 30, 1536, 1, 1, 17, 13, 3) +DRIVER_MIX2_PACK(a7x, 31, 1536, 1, 1, 18, 13, 3) +DRIVER_MIX2_PACK(a7x, 32, 1536, 1, 1, 19, 13, 3) +DRIVER_MIX2_PACK(a7x, 33, 1536, 1, 1, 20, 13, 3) +DRIVER_MIX2_PACK(a7x, 34, 1280, 1, 1, 21, 13, 3) +DRIVER_MIX2_PACK(a7x, 35, 1280, 1, 1, 22, 13, 3) +DRIVER_MIX2_PACK(a7x, 36, 1280, 1, 1, 23, 13, 3) +DRIVER_MIX2_PACK(a7x, 37, 1280, 1, 1, 24, 13, 3) +DRIVER_MIX2_PACK(a7x, 38, 1280, 1, 1, 25, 13, 3) +DRIVER_MIX2_PACK(a7x, 39, 1280, 1, 1, 26, 13, 3) +DRIVER_MIX2_PACK(a7x, 40, 1280, 1, 1, 26, 14, 3) +DRIVER_MIX2_PACK(a7x, 41, 1024, 1, 1, 26, 15, 3) +DRIVER_MIX2_PACK(a7x, 42, 1024, 1, 1, 26, 16, 3) +DRIVER_MIX2_PACK(a7x, 43, 1024, 1, 1, 26, 17, 3) +DRIVER_MIX2_PACK(a7x, 44, 1024, 1, 1, 26, 18, 3) +DRIVER_MIX2_PACK(a7x, 45, 1024, 1, 1, 26, 19, 3) +DRIVER_MIX2_PACK(a7x, 46, 1024, 1, 1, 26, 20, 3) +DRIVER_MIX2_PACK(a7x, 47, 1024, 1, 1, 26, 21, 3) +DRIVER_MIX2_PACK(a7x, 48, 1024, 1, 1, 26, 22, 3) +DRIVER_MIX2_PACK(a7x, 49, 1024, 1, 1, 26, 23, 3) +DRIVER_MIX2_PACK(a7x, 50, 1024, 1, 1, 26, 24, 3) + + diff --git a/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.c b/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.c new file mode 100644 index 0000000..0d2769b --- /dev/null +++ b/src/neon_armv8a/sgemm_skinny_dot_kernel/SgemmSkinnyDotCopy.c @@ -0,0 +1,547 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
+/* See the License for the specific language governing permissions and */
+/* limitations under the License. */
+/*****************************************************************************/
+
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+static inline void pref_b(const float *src) {
+  __asm__("prfm pldl1keep,[%0,#64]\n\t"::"r"(src):);
+}
+
+/* Repack 4 columns of a column-major B into the row-major scratch buffer
+ * consumed by the skinny-dot kernels; the ninc* parameters give the output
+ * offsets for the 2nd to 4th K positions of an interleaved store, so each
+ * lane lands directly in place. */
+static inline void pack_rm_from_cm_4col(float * __restrict__ b_wt,
+  const float * __restrict__ b_rd, uint32_t K, uint32_t LDB,
+  uint32_t N, uint32_t ninc1_2,
+  uint32_t ninc1_4, uint32_t ninc2_4, uint32_t ninc3_4) {
+
+  const float *b_l1 = b_rd;
+  const float *b_l2 = b_rd + LDB;
+  const float *b_l3 = b_rd + LDB * 2;
+  const float *b_l4 = b_rd + LDB * 3;
+  float *b_w1 = b_wt;
+
+  uint32_t k_left = K;
+
+  for (; k_left > 3; k_left -= 4) {
+    float32x4x4_t tmp;
+    tmp.val[0] = vld1q_f32(b_l1); b_l1 += 4; pref_b(b_l1);
+    tmp.val[1] = vld1q_f32(b_l2); b_l2 += 4; pref_b(b_l2);
+    tmp.val[2] = vld1q_f32(b_l3); b_l3 += 4; pref_b(b_l3);
+    tmp.val[3] = vld1q_f32(b_l4); b_l4 += 4; pref_b(b_l4);
+    vst4q_lane_f32(b_w1, tmp, 0);
+    vst4q_lane_f32(b_w1 + ninc1_4, tmp, 1);
+    vst4q_lane_f32(b_w1 + ninc2_4, tmp, 2);
+    vst4q_lane_f32(b_w1 + ninc3_4, tmp, 3);
+    b_w1 += N * 4;
+  }
+  if (k_left > 1) {
+    float32x2x4_t tmp;
+    tmp.val[0] = vld1_f32(b_l1); b_l1 += 2;
+    tmp.val[1] = vld1_f32(b_l2); b_l2 += 2;
+    tmp.val[2] = vld1_f32(b_l3); b_l3 += 2;
+    tmp.val[3] = vld1_f32(b_l4); b_l4 += 2;
+    vst4_lane_f32(b_w1, tmp, 0);
+    vst4_lane_f32(b_w1 + ninc1_2, tmp, 1);
+    b_w1 += N * 2;
+    k_left -= 2;
+  }
+  if (k_left > 0) {
+    b_w1[0] = *b_l1;
+    b_w1[1] = *b_l2;
+    b_w1[2] = *b_l3;
+    b_w1[3] = *b_l4;
+  }
+}
+
+void pack_0_from_cm(float * __restrict__ b_scr,
+  const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) {
+
+  const float *b_rd = B;
+  uint32_t n_left = N;
+  for (; n_left > 3; n_left -= 4) {
+    pack_rm_from_cm_4col(b_scr + N - n_left, b_rd, K, LDB, N,
+      N, N, N * 2, N * 3);
+    b_rd += 4 * LDB;
+  }
+  float *b_wt = b_scr + N - n_left;
+  if (n_left == 3) {
+    const float *b_rd2 = b_rd + LDB;
+    const float *b_rd3 = b_rd + LDB * 2;
+    uint32_t k_left = K;
+    for (; k_left > 3; k_left -= 4) {
+      float32x4x3_t tmp;
+      tmp.val[0] = vld1q_f32(b_rd); b_rd += 4; pref_b(b_rd);
+      tmp.val[1] = vld1q_f32(b_rd2); b_rd2 += 4; pref_b(b_rd2);
+      tmp.val[2] = vld1q_f32(b_rd3); b_rd3 += 4; pref_b(b_rd3);
+      vst3q_lane_f32(b_wt, tmp, 0); b_wt += N;
+      vst3q_lane_f32(b_wt, tmp, 1); b_wt += N;
+      vst3q_lane_f32(b_wt, tmp, 2); b_wt += N;
+      vst3q_lane_f32(b_wt, tmp, 3); b_wt += N;
+    }
+    if (k_left > 1) {
+      float32x2x3_t tmp;
+      tmp.val[0] = vld1_f32(b_rd); b_rd += 2;
+      tmp.val[1] = vld1_f32(b_rd2); b_rd2 += 2;
+      tmp.val[2] = vld1_f32(b_rd3); b_rd3 += 2;
+      vst3_lane_f32(b_wt, tmp, 0); b_wt += N;
+      vst3_lane_f32(b_wt, tmp, 1); b_wt += N;
+      k_left -= 2;
+    }
+    if (k_left > 0) {
+      b_wt[0] = *b_rd; b_wt[1] = *b_rd2; b_wt[2] = *b_rd3;
+    }
+  } else if (n_left == 2) {
+    const float *b_rd2 = b_rd + LDB;
+    uint32_t k_left = K;
+    for (; k_left > 3; k_left -= 4) {
+      float32x4x2_t tmp;
+      tmp.val[0] = vld1q_f32(b_rd); b_rd += 4; pref_b(b_rd);
+      tmp.val[1] = vld1q_f32(b_rd2); b_rd2 += 4; pref_b(b_rd2);
+      vst2q_lane_f32(b_wt, tmp, 0); b_wt += N;
+      vst2q_lane_f32(b_wt, tmp, 1); b_wt += N;
vst2q_lane_f32(b_wt, tmp, 2); b_wt += N; + vst2q_lane_f32(b_wt, tmp, 3); b_wt += N; + } + if (k_left > 1) { + float32x2x2_t tmp; + tmp.val[0] = vld1_f32(b_rd); b_rd += 2; + tmp.val[1] = vld1_f32(b_rd2); b_rd2 += 2; + vst2_lane_f32(b_wt, tmp, 0); b_wt += N; + vst2_lane_f32(b_wt, tmp, 1); b_wt += N; + k_left -= 2; + } + if (k_left > 0) { + b_wt[0] = *b_rd; b_wt[1] = *b_rd2; + } + } else if (n_left == 1) { + for (uint32_t k_pos = 0; k_pos < K; ++k_pos) { + *b_wt = b_rd[k_pos]; b_wt += N; + } + } +} + +void pack_1_from_cm(float * __restrict__ b_scr, + const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) { + + const float *b_rd = B; + const uint32_t n_4z = N & 0xFFFFFFFC; + uint32_t n_left = N; + for (; n_left > 3; n_left -= 4) { + pack_rm_from_cm_4col(b_scr + N - n_left, b_rd, K, LDB, N, + N, n_4z, n_4z * 2, n_4z * 3); + b_rd += 4 * LDB; + } + float *b_wt = b_scr + (N - n_left) * 4; + if (n_left == 3) { + const float *b_rd2 = b_rd + LDB; + const float *b_rd3 = b_rd + LDB * 2; + uint32_t k_left = K; + for (; k_left > 3; k_left -= 4) { + float32x4_t tmp1 = vld1q_f32(b_rd); b_rd += 4; pref_b(b_rd); + float32x4_t tmp2 = vld1q_f32(b_rd2); b_rd2 += 4; pref_b(b_rd2); + float32x4_t tmp3 = vld1q_f32(b_rd3); b_rd3 += 4; pref_b(b_rd3); + vst1q_f32(b_wt, tmp1); vst1q_f32(b_wt + 4, tmp2); + vst1q_f32(b_wt + 8, tmp3); b_wt += N * 4; + } + b_wt -= (N - n_left) * 3; + for (; k_left > 0; k_left--) { + b_wt[0] = *b_rd++; b_wt[1] = *b_rd2++; b_wt[2] = *b_rd3++; + b_wt += N; + } + } else if (n_left == 2) { + const float *b_rd2 = b_rd + LDB; + uint32_t k_left = K; + for (; k_left > 3; k_left -= 4) { + float32x4_t tmp1 = vld1q_f32(b_rd); b_rd += 4; pref_b(b_rd); + float32x4_t tmp2 = vld1q_f32(b_rd2); b_rd2 += 4; pref_b(b_rd2); + vst1q_f32(b_wt, tmp1); vst1q_f32(b_wt + 4, tmp2); + b_wt += N * 4; + } + b_wt -= (N - n_left) * 3; + for (; k_left > 0; k_left--) { + b_wt[0] = *b_rd++; b_wt[1] = *b_rd2++; + b_wt += N; + } + } else if (n_left == 1) { + uint32_t k_left = K; + for (; k_left > 3; k_left -= 4) { + float32x4_t tmp1 = vld1q_f32(b_rd); b_rd += 4; pref_b(b_rd); + vst1q_f32(b_wt, tmp1); b_wt += N * 4; + } + b_wt -= (N - n_left) * 3; + for (; k_left > 0; k_left--) { + b_wt[0] = *b_rd++; + b_wt += N; + } + } +} + +void pack_2_from_cm(float * __restrict__ b_scr, + const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) { + + const float *b_rd = B; + const uint32_t n_2z = N & 0xFFFFFFFC; + uint32_t n_left = N; + for (; n_left > 3; n_left -= 4) { + pack_rm_from_cm_4col(b_scr + N - n_left, b_rd, K, LDB, N, + n_2z, n_2z, N * 2, N * 2 + n_2z); + b_rd += 4 * LDB; + } + float *b_wt = b_scr + (N - n_left) * 2; + if (n_left == 3) { + const float *b_rd2 = b_rd + LDB; + const float *b_rd3 = b_rd + LDB * 2; + uint32_t k_left = K; + for (; k_left > 1; k_left -= 2) { + float32x2_t tmp1 = vld1_f32(b_rd); b_rd += 2; + float32x2_t tmp2 = vld1_f32(b_rd2); b_rd2 += 2; + float32x2_t tmp3 = vld1_f32(b_rd3); b_rd3 += 2; + vst1_f32(b_wt, tmp1); vst1_f32(b_wt + 2, tmp2); + vst1_f32(b_wt + 4, tmp3); b_wt += N * 2; + } + b_wt -= N - n_left; + if (k_left > 0) { + b_wt[0] = *b_rd; b_wt[1] = *b_rd2; b_wt[2] = *b_rd3; + } + } else if (n_left == 2) { + const float *b_rd2 = b_rd + LDB; + uint32_t k_left = K; + for (; k_left > 1; k_left -= 2) { + float32x2_t tmp1 = vld1_f32(b_rd); b_rd += 2; + float32x2_t tmp2 = vld1_f32(b_rd2); b_rd2 += 2; + vst1_f32(b_wt, tmp1); vst1_f32(b_wt + 2, tmp2); + b_wt += N * 2; + } + b_wt -= N - n_left; + if (k_left > 0) { + b_wt[0] = *b_rd; b_wt[1] = *b_rd2; + } + } else if (n_left == 1) { + 
uint32_t k_left = K; + for (; k_left > 1; k_left -= 2) { + float32x2_t tmp1 = vld1_f32(b_rd); b_rd += 2; + vst1_f32(b_wt, tmp1); + b_wt += N * 2; + } + b_wt -= N - n_left; + if (k_left > 0) { + b_wt[0] = *b_rd; + } + } +} + +void pack_3_from_cm(float * __restrict__ b_scr, + const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) { + + const float *b_rd = B; + uint32_t n_left = N; + for (; n_left > 3; n_left -= 4) { + const float *b_rd1 = b_rd; + const float *b_rd2 = b_rd + LDB; + const float *b_rd3 = b_rd + LDB * 2; + const float *b_rd4 = b_rd2 + LDB * 2; + float *b_wt = b_scr + (N - n_left) * 2; + b_rd += LDB * 4; + uint32_t k_left = K; + for (; k_left > 3; k_left -= 4) { + float32x4_t t1 = vld1q_f32(b_rd1); b_rd1 += 4; pref_b(b_rd1); + float32x4_t t2 = vld1q_f32(b_rd2); b_rd2 += 4; pref_b(b_rd2); + float32x4_t t3 = vld1q_f32(b_rd3); b_rd3 += 4; pref_b(b_rd3); + float32x4_t t4 = vld1q_f32(b_rd4); b_rd4 += 4; pref_b(b_rd4); + vst1_f32(b_wt, vget_low_f32(t1)); + vst1_f32(b_wt + 2, vget_low_f32(t2)); + vst1_f32(b_wt + 4, vget_low_f32(t3)); + vst1_f32(b_wt + 6, vget_low_f32(t4)); b_wt += 2 * N; + vst1_f32(b_wt, vget_high_f32(t1)); + vst1_f32(b_wt + 2, vget_high_f32(t2)); + vst1_f32(b_wt + 4, vget_high_f32(t3)); + vst1_f32(b_wt + 6, vget_high_f32(t4)); b_wt += 2 * N; + } + if (k_left > 1) { + float32x2_t t1 = vld1_f32(b_rd1); b_rd1 += 2; + float32x2_t t2 = vld1_f32(b_rd2); b_rd2 += 2; + float32x2_t t3 = vld1_f32(b_rd3); b_rd3 += 2; + float32x2_t t4 = vld1_f32(b_rd4); b_rd4 += 2; + vst1_f32(b_wt, t1); vst1_f32(b_wt + 2, t2); + vst1_f32(b_wt + 4, t3); vst1_f32(b_wt + 6, t4); b_wt += 2 * N; + k_left -= 2; + } + b_wt -= N - n_left; + if (k_left > 0) { + b_wt[0] = *b_rd1; b_wt[1] = *b_rd2; + b_wt[2] = *b_rd3; b_wt[3] = *b_rd4; + } + } + if (n_left > 1) { + const float *b_rd1 = b_rd; + const float *b_rd2 = b_rd + LDB; + float *b_wt = b_scr + (N - n_left) * 2; + b_rd += LDB * 2; + uint32_t k_left = K; + for (; k_left > 1; k_left -= 2) { + float32x2_t t1 = vld1_f32(b_rd1); b_rd1 += 2; + float32x2_t t2 = vld1_f32(b_rd2); b_rd2 += 2; + vst1_f32(b_wt, t1); vst1_f32(b_wt + 2, t2); b_wt += 2 * N; + } + b_wt -= N - n_left; + if (k_left > 0) { + b_wt[0] = *b_rd1; b_wt[1] = *b_rd2; + } + n_left -= 2; + } + if (n_left > 0) { + float *b_wt = b_scr + (N - n_left) * 2; + uint32_t k_left = K; + for (; k_left > 1; k_left -= 2) { + float32x2_t t1 = vld1_f32(b_rd); b_rd += 2; + vst1_f32(b_wt, t1); b_wt += 2 * N; + } + b_wt -= N - n_left; + if (k_left > 0) { + b_wt[0] = *b_rd; + } + } +} + +void pack_4_from_cm(float * __restrict__ b_scr, + const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) { + + const float *b_rd = B; + const uint32_t n_2z = (N << 1) - (N & 0xFFFFFFFE); + uint32_t n_left = N; + for (; n_left > 3; n_left -= 4) { + const float *b_rd1 = b_rd; + const float *b_rd2 = b_rd + LDB; + const float *b_rd3 = b_rd + LDB * 2; + const float *b_rd4 = b_rd2 + LDB * 2; + float *b_wt = b_scr + N - n_left; + b_rd += LDB * 4; + uint32_t k_left = K; + for (; k_left > 3; k_left -= 4) { + float32x4x4_t tmp; + tmp.val[0] = vld1q_f32(b_rd1); b_rd1 += 4; pref_b(b_rd1); + tmp.val[1] = vld1q_f32(b_rd2); b_rd2 += 4; pref_b(b_rd2); + tmp.val[2] = vld1q_f32(b_rd3); b_rd3 += 4; pref_b(b_rd3); + tmp.val[3] = vld1q_f32(b_rd4); b_rd4 += 4; pref_b(b_rd4); + tmp.val[1] = vrev64q_f32(tmp.val[1]); + tmp.val[3] = vrev64q_f32(tmp.val[3]); + vst4q_lane_f32(b_wt, tmp, 0); + vst4q_lane_f32(b_wt + n_2z, tmp, 1); b_wt += 2 * N; + vst4q_lane_f32(b_wt, tmp, 2); + vst4q_lane_f32(b_wt + n_2z, tmp, 3); b_wt += 2 * N; + } + 
if (k_left > 1) { + float32x2_t t1 = vld1_f32(b_rd1); b_rd1 += 2; + float32x2_t t2 = vld1_f32(b_rd2); b_rd2 += 2; + float32x2_t t3 = vld1_f32(b_rd3); b_rd3 += 2; + float32x2_t t4 = vld1_f32(b_rd4); b_rd4 += 2; + t2 = vrev64_f32(t2); t4 = vrev64_f32(t4); + float32x2_t d1 = vtrn1_f32(t1, t2); + float32x2_t d2 = vtrn1_f32(t3, t4); + float32x2_t d3 = vtrn2_f32(t1, t2); + float32x2_t d4 = vtrn2_f32(t3, t4); + vst1_f32(b_wt, d1); vst1_f32(b_wt + 2, d2); + vst1_f32(b_wt + n_2z, d3); vst1_f32(b_wt + n_2z + 2, d4); + b_wt += 2 * N; k_left -= 2; + } + if (k_left > 0) { + b_wt[0] = *b_rd1; b_wt[1] = *b_rd2; + b_wt[2] = *b_rd3; b_wt[3] = *b_rd4; + } + } + if (n_left > 1) { + const float *b_rd1 = b_rd; + const float *b_rd2 = b_rd + LDB; + float *b_wt = b_scr + N - n_left; + b_rd += LDB * 2; + uint32_t k_left = K; + for (; k_left > 1; k_left -= 2) { + float32x2_t t1 = vld1_f32(b_rd1); b_rd1 += 2; + float32x2_t t2 = vld1_f32(b_rd2); b_rd2 += 2; + t2 = vrev64_f32(t2); + float32x2_t d1 = vtrn1_f32(t1, t2); + float32x2_t d2 = vtrn2_f32(t1, t2); + vst1_f32(b_wt, d1); vst1_f32(b_wt + n_2z, d2); + b_wt += 2 * N; + } + if (k_left > 0) { + b_wt[0] = *b_rd1; b_wt[1] = *b_rd2; + } + n_left -= 2; + } + if (n_left > 0) { + float *b_wt = b_scr + N - n_left; + uint32_t k_left = K; + for (; k_left > 1; k_left -= 2) { + float32x2_t t1 = vld1_f32(b_rd); b_rd += 2; + vst1_f32(b_wt, t1); b_wt += 2 * N; + } + if (k_left > 0) { + b_wt[0] = *b_rd; + } + } +} + +void pack_0_from_rm(float * __restrict__ b_scr, + const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) { + + uint32_t k_left = K; + const float *b_rd = B; + float *b_wt = b_scr; + for (; k_left > 0; k_left--) { + const float *b_rd1 = b_rd; b_rd += LDB; + uint32_t n_left = N; + for (; n_left > 3; n_left -= 4) { + float32x4_t t1 = vld1q_f32(b_rd1); b_rd1 += 4; + vst1q_f32(b_wt, t1); b_wt += 4; + } + if (n_left > 1) { + float32x2_t t1 = vld1_f32(b_rd1); b_rd1 += 2; + vst1_f32(b_wt, t1); b_wt += 2; + n_left -= 2; + } + if (n_left > 0) { + *b_wt = *b_rd1; b_wt++; + } + } +} + +void pack_2_from_rm(float * __restrict__ b_scr, + const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) { + + uint32_t k_left = K; + const uint32_t n_4z = N & 0xFFFFFFFC; + const float *b_rd = B; + float *b_wt = b_scr; + for (; k_left > 1; k_left -= 2) { + const float *b_rd1 = b_rd; + const float *b_rd2 = b_rd + LDB; + b_rd += LDB * 2; + float *b_wt1 = b_wt; + float *b_wt2 = b_wt + n_4z; + b_wt += N * 2; + uint32_t n_left = N; + for (; n_left > 3; n_left -= 4) { + float32x4_t t1 = vld1q_f32(b_rd1); b_rd1 += 4; + float32x4_t t2 = vld1q_f32(b_rd2); b_rd2 += 4; + vst1q_f32(b_wt1, t1); b_wt1 += 4; + vst1q_f32(b_wt2, t2); b_wt2 += 4; + } + for (; n_left > 0; n_left--) { + b_wt2[0] = *b_rd1++; + b_wt2[1] = *b_rd2++; + b_wt2 += 2; + } + } + pack_0_from_rm(b_wt, b_rd, LDB, k_left, N); +} + +void pack_1_from_rm(float * __restrict__ b_scr, + const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) { + + uint32_t k_left = K; + const float *b_rd = B; + float *b_wt = b_scr; + const uint32_t n_4z = N & 0xFFFFFFFC; + for (; k_left > 3; k_left -= 4) { + const float *b_rd1 = b_rd; + const float *b_rd2 = b_rd + LDB; + const float *b_rd3 = b_rd + LDB * 2; + const float *b_rd4 = b_rd2 + LDB * 2; + b_rd += LDB * 4; + float *b_wt1 = b_wt; + float *b_wt2 = b_wt + n_4z; + float *b_wt3 = b_wt + n_4z * 2; + float *b_wt4 = b_wt2 + n_4z * 2; + b_wt += N * 4; + uint32_t n_left = N; + for (; n_left > 3; n_left -= 4) { + float32x4_t t1 = vld1q_f32(b_rd1); b_rd1 += 4; + float32x4_t t2 = 
vld1q_f32(b_rd2); b_rd2 += 4; + float32x4_t t3 = vld1q_f32(b_rd3); b_rd3 += 4; + float32x4_t t4 = vld1q_f32(b_rd4); b_rd4 += 4; + vst1q_f32(b_wt1, t1); b_wt1 += 4; + vst1q_f32(b_wt2, t2); b_wt2 += 4; + vst1q_f32(b_wt3, t3); b_wt3 += 4; + vst1q_f32(b_wt4, t4); b_wt4 += 4; + } + for (; n_left > 0; n_left--) { + b_wt4[0] = *b_rd1++; b_wt4[1] = *b_rd2++; + b_wt4[2] = *b_rd3++; b_wt4[3] = *b_rd4++; b_wt4 += 4; + } + } + pack_0_from_rm(b_wt, b_rd, LDB, k_left, N); +} + +void pack_3_from_rm(float * __restrict__ b_scr, + const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) { + + uint32_t k_left = K; + const float *b_rd = B; + float *b_wt = b_scr; + for (; k_left > 1; k_left -= 2) { + const float *b_rd1 = b_rd; + const float *b_rd2 = b_rd + LDB; + b_rd += LDB * 2; + float *b_wt1 = b_wt; + b_wt += N * 2; + uint32_t n_left = N; + for (; n_left > 1; n_left -= 2) { + float32x2_t t1 = vld1_f32(b_rd1); b_rd1 += 2; + float32x2_t t2 = vld1_f32(b_rd2); b_rd2 += 2; + float32x2_t d1 = vzip1_f32(t1, t2); + float32x2_t d2 = vzip2_f32(t1, t2); + vst1_f32(b_wt1, d1); + vst1_f32(b_wt1 + 2, d2); b_wt1 += 4; + } + if (n_left > 0) { + b_wt1[0] = *b_rd1; b_wt1[1] = *b_rd2; + } + } + pack_0_from_rm(b_wt, b_rd, LDB, k_left, N); +} + +void pack_4_from_rm(float * __restrict__ b_scr, + const float * __restrict__ B, uint32_t LDB, uint32_t K, uint32_t N) { + + uint32_t k_left = K; + const float *b_rd = B; + float *b_wt = b_scr; + const uint32_t n_2z = (N << 1) - (N & 0xFFFFFFFE); + for (; k_left > 1; k_left -= 2) { + const float *b_rd1 = b_rd; + const float *b_rd2 = b_rd + LDB; + b_rd += LDB * 2; + float *b_wt1 = b_wt; + float *b_wt2 = b_wt + n_2z; + b_wt += N * 2; + uint32_t n_left = N; + for (; n_left > 1; n_left -= 2) { + float32x2_t t1 = vld1_f32(b_rd1); b_rd1 += 2; + float32x2_t t2 = vld1_f32(b_rd2); b_rd2 += 2; + t2 = vrev64_f32(t2); + float32x2_t d1 = vzip1_f32(t1, t2); + float32x2_t d2 = vzip2_f32(t2, t1); + vst1_f32(b_wt1, d1); b_wt1 += 2; + vst1_f32(b_wt2, d2); b_wt2 += 2; + } + if (n_left > 0) { + b_wt1[0] = *b_rd1; b_wt1[1] = *b_rd2; + } + } + pack_0_from_rm(b_wt, b_rd, LDB, k_left, N); +} + diff --git a/test/TestBias.c b/test/TestBias.c new file mode 100644 index 0000000..10517a1 --- /dev/null +++ b/test/TestBias.c @@ -0,0 +1,281 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#if __aarch64__ +#include "neon_armv8a/Bias.h" +#else +#include "neon_armv7a/Bias.h" +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <time.h> + +#define TEST_BIAS(type) \ +static void test_bias_##type(uint32_t dim1, uint32_t dim2, uint8_t status) {\ + bool dim0_bias = status & 1;\ + bool dim1_bias = status & 2;\ + bool dim2_bias = status & 4;\ + printf("Test info for bias:\n");\ + printf("data type = "#type"\n");\ + printf("dim1 = %u, dim2 = %u\n", dim1, dim2);\ + printf("dim0_bias = %d\n", dim0_bias ? 1 : 0);\ + printf("dim1_bias = %d\n", dim1_bias ? 1 : 0);\ + printf("dim2_bias = %d\n", dim2_bias ? 1 : 0);\ +\ + const uint64_t size_dat = (dim1 + 4) * (dim2 + 4);\ + const uint32_t num_iters = 40000000 / size_dat;\ + if (num_iters <= 2) {\ + printf("Problem size too large.\n");\ + return;\ + }\ + type * const ref = (type *)malloc(sizeof(type) * size_dat);\ + type * const dat = (type *)malloc(sizeof(type) *\ + size_dat * num_iters);\ + type * const bias_dim1 = dim1_bias ? (type *)malloc(sizeof(type) *\ + (dim1 + 4) * num_iters) : NULL;\ + type * const bias_dim2 = dim2_bias ? (type *)malloc(sizeof(type) *\ + (dim2 + 4) * num_iters) : NULL;\ +\ + srand(time(NULL));\ + for (uint64_t pos = 0; pos < size_dat; ++pos) {\ + ref[pos] = rand() % 256;\ + }\ + for (uint32_t pos = 0; pos < num_iters; ++pos) {\ + memcpy(dat + pos * size_dat, ref, size_dat * sizeof(type));\ + }\ + if (dim1_bias) {\ + for (uint32_t pos = 0; pos < dim1 + 4; ++pos) {\ + bias_dim1[pos] = rand() % 256;\ + }\ + for (uint32_t pos = 1; pos < num_iters; ++pos) {\ + memcpy(bias_dim1 + pos * (dim1 + 4), bias_dim1, (dim1 + 4) * sizeof(type));\ + }\ + }\ + if (dim2_bias) {\ + for (uint32_t pos = 0; pos < dim2 + 4; ++pos) {\ + bias_dim2[pos] = rand() % 256;\ + }\ + for (uint32_t pos = 1; pos < num_iters; ++pos) {\ + memcpy(bias_dim2 + pos * (dim2 + 4), bias_dim2, (dim2 + 4) * sizeof(type));\ + }\ + }\ +\ + const type bias_v0 = dim0_bias ? (rand() % 256 + 1) : 0;\ + if (dim0_bias) {\ + for (uint32_t pos = 0; pos < dim1 * dim2; ++pos) ref[pos] += bias_v0;\ + }\ +\ + if (dim1_bias) {\ + for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\ + type *curr = ref + dim2_pos * dim1;\ + for (uint32_t dim1_pos = 0; dim1_pos < dim1; ++dim1_pos) {\ + curr[dim1_pos] += (type)2.0 * bias_dim1[dim1_pos];\ + }\ + }\ + }\ +\ + if (dim2_bias) {\ + for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\ + const type bias = (type)3.0 * bias_dim2[dim2_pos];\ + type *curr = ref + dim2_pos * dim1;\ + for (uint32_t dim1_pos = 0; dim1_pos < dim1; ++dim1_pos) {\ + curr[dim1_pos] += bias;\ + }\ + }\ + }\ +\ + bias_##type(dat, bias_v0, bias_dim1, 2.0, bias_dim2, 3.0, dim1, dim2);\ + double max_diff = 0.0;\ + for (uint64_t pos = 0; pos < size_dat; ++pos) {\ + double tmp = (double)dat[pos] - (double)ref[pos];\ + if (tmp < 0) tmp *= -1.0;\ + if (tmp > max_diff) max_diff = tmp;\ + }\ + printf("Max diff. between calc. and ref.: %.2e\n", max_diff);\ +\ + struct timespec st, et;\ + clock_gettime(CLOCK_MONOTONIC, &st);\ + for (uint32_t pos = 1; pos < num_iters; ++pos) {\ + bias_##type(dat + pos * size_dat, bias_v0,\ + bias_dim1 ? bias_dim1 + pos * dim1 : NULL, 2.0,\ + bias_dim2 ? bias_dim2 + pos * dim2 : NULL, 3.0,\ + dim1, dim2);\ + }\ + clock_gettime(CLOCK_MONOTONIC, &et);\ + double nsec = (double)(et.tv_nsec - st.tv_nsec) + 1.0e9 * (double)\ + (et.tv_sec - st.tv_sec);\ + printf("Avg. 
perf.: %.2e G elements per second.\n", (double)dim1 * \ + (double)dim2 * (double)(num_iters - 1) / nsec);\ +\ + free(ref);\ + free(dat);\ + free(bias_dim1);\ + free(bias_dim2);\ +} + +#define TEST_SUM(signint, sumfunc) \ +void test_sum_##signint##8to32(uint32_t dim1, uint32_t dim2,\ + uint32_t status) {\ +\ + printf("Test info for sum:\n");\ + printf("data type = "#signint"8 -> "#signint"32\n");\ + printf("dim1 = %u, dim2 = %u\n", dim1, dim2);\ + if (status) {\ + printf("sum along dim1 direction, output length = dim2\n");\ + } else {\ + printf("sum along dim2 direction, output length = dim1\n");\ + }\ +\ + const uint64_t size_dat = (dim1 + 4) * (dim2 + 4);\ + const uint32_t num_iters = 40000000 / size_dat;\ + if (num_iters <= 2) {\ + printf("Problem size too large.\n");\ + return;\ + }\ + signint##8_t * const dat = (signint##8_t *)malloc(size_dat * num_iters);\ +\ + const uint32_t size_out = status ? (dim2 + 4) : (dim1 + 4);\ + signint##32_t * const ref = (signint##32_t *)malloc(size_out * 4);\ + signint##32_t * const tst = (signint##32_t *)malloc(size_out * num_iters * 4);\ +\ + srand(time(NULL));\ + for (uint64_t pos = 0; pos < size_dat; ++pos) {\ + dat[pos] = rand();\ + }\ + for (uint32_t pos = 1; pos < num_iters; ++pos) {\ + memcpy(dat + pos * size_dat, dat, size_dat);\ + }\ +\ + if (status) {\ + for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\ + const signint##8_t *src = dat + dim2_pos * dim1;\ + signint##32_t sum = 0;\ + for (uint32_t dim1_pos = 0; dim1_pos < dim1; ++dim1_pos) {\ + sum += src[dim1_pos];\ + }\ + ref[dim2_pos] = sum;\ + }\ + for (uint32_t dim2_pos = dim2; dim2_pos < size_out; ++dim2_pos) {\ + ref[dim2_pos] = tst[dim2_pos] = rand();\ + }\ + } else {\ + for (uint32_t dim1_pos = 0; dim1_pos < dim1; ++dim1_pos) {\ + ref[dim1_pos] = 0;\ + }\ + for (uint32_t dim1_pos = dim1; dim1_pos < size_out; dim1_pos++) {\ + ref[dim1_pos] = tst[dim1_pos] = rand();\ + }\ + for (uint32_t dim2_pos = 0; dim2_pos < dim2; ++dim2_pos) {\ + const signint##8_t *src = dat + dim2_pos * dim1;\ + for (uint32_t dim1_pos = 0; dim1_pos < dim1; ++dim1_pos) {\ + ref[dim1_pos] += src[dim1_pos];\ + }\ + }\ + }\ +\ + sumfunc(dat, tst, dim1, dim2, status);\ + int consistent = 1;\ + for (uint32_t pos = 0; pos < size_out; ++pos) {\ + if (consistent != 0 && ref[pos] != tst[pos]) {\ + consistent = 0;\ + printf("elements at pos %u are unequal.\n", pos);\ + }\ + }\ + if (consistent != 0) {\ + printf("all elements are equal between ref. and tst.\n");\ + struct timespec st, et;\ + clock_gettime(CLOCK_MONOTONIC, &st);\ + for (uint32_t pos = 1; pos < num_iters; ++pos) {\ + sumfunc(dat + pos * size_dat, tst + pos * size_out,\ + dim1, dim2, status);\ + }\ + clock_gettime(CLOCK_MONOTONIC, &et);\ + double nsec = (double)(et.tv_nsec - st.tv_nsec) + 1.0e9 * \ + (double)(et.tv_sec - st.tv_sec);\ + printf("Avg. Perf.: %.2e G elements read per second.\n",\ + (double)dim1 * (double)(dim2) * (double)(num_iters - 1) / nsec);\ + }\ + free(dat);\ + free(ref);\ + free(tst);\ +} + + +TEST_BIAS(float) + +TEST_BIAS(int32_t) + +TEST_SUM(uint, u8u32_sum) + +/************************************************************************ + * cmd usage of the test program for bias functions + * + * dim1: the length of the first dimension of the matrix, + * equal to the number of columns for row-major matrices, + * equal to the number of rows for column-major matrices. + * dim2: the length of the second dimension of the matrix, + * equal to the number of rows for row-major matrices, + * equal to the number of columns for column-major matrices. 
+ * bias_status: a number indicating which function to test. + * 0 - 7 for bias function: + * 0: no bias is performed. + * 1: only scalar bias is applied. the same value is added + * to every element. + * 2: bias only along the first dimension, the size of bias + * vector equals dim1. for row-major matrix, this means + * elem(col_id, row_id) += bias(col_id); + * 3: scalar & first-dimension bias operations. + * 4: bias only along the second dimension, the size of bias + * vector equals dim2. for row-major matrix, this means + * elem(col_id, row_id) += bias(row_id); + * 5: scalar & second-dimension bias operations. + * 6: first-dimension & second-dimension bias operations. + * 7: scalar & first-dimension & second-dimension bias operations. + * 8 - 9 for sum function: + * 8: sum along dim2. for a row-major matrix, the sum of elements + * in each column is calculated. + * 9: sum along dim1. for a row-major matrix, the sum of elements + * in each row is calculated. + * data_type: a string indicating the data type of the matrix and biases + * float: fp32 + * int32: int32_t + ************************************************************************/ + +int main(int argc, char **argv) { + + const uint32_t dim1 = (argc > 1) ? atoi(argv[1]) : 63; + const uint32_t dim2 = (argc > 2) ? atoi(argv[2]) : 143; + const uint8_t bias_status = (argc > 3) ? atoi(argv[3]) : 7; + const char * const data_type = (argc > 4) ? argv[4] : "float"; + + if (bias_status > 7) { + test_sum_uint8to32(dim1, dim2, bias_status & 1); + return 0; + } + + if (data_type[0] == 'f' || data_type[0] == 'F') { + test_bias_float(dim1, dim2, bias_status); + return 0; + } + + test_bias_int32_t(dim1, dim2, bias_status); + return 0; +} + diff --git a/test/TestCompilerOpenMP.c b/test/TestCompilerOpenMP.c new file mode 100644 index 0000000..b248218 --- /dev/null +++ b/test/TestCompilerOpenMP.c @@ -0,0 +1,12 @@ +#include <omp.h> +#include <stdlib.h> + +int main() { + int *id = (int*)malloc(omp_get_max_threads() * sizeof(int)); +#pragma omp parallel + { + id[omp_get_thread_num()] = omp_get_thread_num(); + } + free(id); + return 0; +} diff --git a/test/TestGemm.c b/test/TestGemm.c new file mode 100644 index 0000000..19d6c9e --- /dev/null +++ b/test/TestGemm.c @@ -0,0 +1,98 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. 
*/ +/*****************************************************************************/ + + +#include "common/CommonTest.h" +#include "Gemm.h" + +STD_TEST(sgemm, float, float, float, (RAND_MAX >> 2), (RAND_MAX >> 2)) + +STD_TEST(s8s32gemm, int8_t, int8_t, int32_t, 64, 1) + +STD_TEST(u8u32gemm, uint8_t, uint8_t, uint32_t, -64, 1) + +#if __aarch64__ +STD_TEST(hgemm, float16_t, float16_t, float16_t, 6, 32) +#endif + +/************************************************************************* + * cmd usage of test program for GEMM functions + * + * GEMM operation: C[MxN] = A[MxK] B[KxN] + beta * C[MxN] + * Parameters: + * M: the number of rows in matrix A. + * N: the number of columns in matrix B. + * K: the number of columns in matrix A. + * transAB: a number indicating the storage order of source matrices: + * 0: A column-major, B column-major + * 1: A row-major, B column-major + * 2: A column-major, B row-major + * 3: A row-major, B row-major + * num_threads: number of threads for GEMM. + * gemm_type: a string indicating the type of GEMM: + * sgemm: fp32 GEMM + * hgemm: fp16 GEMM + * u8u32: uint8 * uint8 -> uint32 GEMM + * s8s32: int8 * int8 -> int32 GEMM + * beta: the scaling factor applied to matrix C prior to GEMM operation, + * C = AB + beta * C. + *************************************************************************/ +int main(int argc, char **argv) { + uint32_t M = 383; + uint32_t N = 479; + uint32_t K = 319; + uint8_t transAB = 0; + uint32_t num_threads = 0; + const char *gemm_type = "sgemm"; + double beta = 0.5; + if (argc > 1) M = atoi(argv[1]); + if (argc > 2) N = atoi(argv[2]); + if (argc > 3) K = atoi(argv[3]); + if (argc > 4) transAB = atoi(argv[4]); + if (argc > 5) num_threads = atoi(argv[5]); + if (argc > 6) gemm_type = argv[6]; + if (argc > 7) beta = atof(argv[7]); + printf("Test info: M = %u, N = %u, K = %u\n", M, N, K); + printf("Test info: a_rowmajor = %d, b_rowmajor = %d\n", + transAB & 1, (transAB & 2) >> 1); + printf("Test info: num_threads = %u, beta = %.2e\n", num_threads, beta); + +#if __aarch64__ + if (strcmp(gemm_type, "hgemm") == 0) { + printf("Test info: gemmtype = hgemm.\n"); + std_test_hgemm(hgemm, M, N, K, transAB, beta, num_threads); + return 0; + } +#endif + + if (strcmp(gemm_type, "u8u32") == 0) { + printf("Test info: gemmtype = u8u32gemm.\n"); + std_test_u8u32gemm(u8u32gemm, M, N, K, transAB, beta, num_threads); + return 0; + } + + if (strcmp(gemm_type, "s8s32") == 0) { + printf("Test info: gemmtype = s8s32gemm.\n"); + std_test_s8s32gemm(s8s32gemm, M, N, K, transAB, beta, num_threads); + return 0; + } + + printf("Test info: gemmtype = sgemm.\n"); + std_test_sgemm(sgemm, M, N, K, transAB, beta, num_threads); + return 0; +} + diff --git a/test/TestQuant.c b/test/TestQuant.c new file mode 100644 index 0000000..a67f808 --- /dev/null +++ b/test/TestQuant.c @@ -0,0 +1,95 @@ +/*****************************************************************************/ +/* Copyright YouDao, Inc. */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ +/* See the License for the specific language governing permissions and */ +/* limitations under the License. */ +/*****************************************************************************/ + + +#include "common/CommonTest.h" +#include "Quant.h" + +TEST_QUANT_UNSYM(32, 8) + +TEST_QUANT_SYM(32, 8) + +TEST_QUANT_UNSYM(32, 16) + +TEST_QUANT_SYM(32, 16) + +TEST_DEQUANT_SYM(32, 32) + +TEST_REQUANT_UNSYM(float, 32, 8) + +TEST_REQUANT_SYM(float, 32, 8) + +TEST_REQUANT_UNSYM(float, 32, 16) + +TEST_REQUANT_SYM(float, 32, 16) + +TEST_REQUANT_UNSYM(float, 16, 8) + +TEST_REQUANT_SYM(float, 16, 8) + +int main(int argc, char **argv) { + + uint32_t size = argc > 1 ? atoi(argv[1]) : 4; + const char * const type = argc > 2 ? argv[2] : "qu"; + + if (type[0] == 'q') { + if (type[1] == 'u') { + if (type[2] == '1') { + test_quant_asym_f32_u16(size); + } else { + test_quant_asym_f32_u8(size); + } + } else if (type[1] == 's') { + if (type[2] == '1') { + test_quant_sym_f32_s16(size); + } else { + test_quant_sym_f32_s8(size); + } + } + } else if (type[0] == 'd') { + test_dequant_sym_f32_s32(size); + } else if (type[0] == 'r') { + if (type[1] == 'u') { + int32_t max = argc > 3 ? atoi(argv[3]) : 20000000; + int32_t min = argc > 4 ? atoi(argv[4]) : -10000000; + float org_scale = argc > 5 ? atof(argv[5]) : 2.0; + if (type[2] == '1') { + test_requant_int16_t_float_uint8_t(size, + (int16_t)(min & 0xFFFF), (int16_t)(max & 0xFFFF), org_scale); + } else { + if (type[2] == '3' && type[3] == '2' && type[4] == '1') { + test_requant_int32_t_float_uint16_t(size, min, max, org_scale); + } else { + test_requant_int32_t_float_uint8_t(size, min, max, org_scale); + } + } + } else if (type[1] == 's') { + uint32_t max_abs = argc > 3 ? atoi(argv[3]) : 2000000; + float org_scale = argc > 4 ? atof(argv[4]) : 2.0; + if (type[2] == '1') { + test_requant_int16_t_float_int8_t(size, + (uint16_t)(max_abs & 0xFFFF), org_scale); + } else { + if (type[2] == '3' && type[3] == '2' && type[4] == '1') { + test_requant_int32_t_float_int16_t(size, max_abs, org_scale); + } else { + test_requant_int32_t_float_int8_t(size, max_abs, org_scale); + } + } + } + } + return 0; +}
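
Note on the pack_*_from_rm routines earlier in this patch: each one repacks a row-major K x N panel of B (leading dimension LDB) into the contiguous order that a particular GEMM kernel variant consumes, processing two or four source rows per pass with NEON loads for the full 4- or 2-column blocks and scalar code for the tails. The sketch below is a scalar re-derivation of the pack_2_from_rm output layout, added for illustration only; pack_2_from_rm_scalar and its main are hypothetical names, not part of the library.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar equivalent of pack_2_from_rm: per pair of source rows, the full
 * 4-column blocks of row k come first, then those of row k+1, then the
 * leftover columns interleaved as {row k, row k+1} pairs; an odd trailing
 * row is copied contiguously (the pack_0_from_rm path). */
static void pack_2_from_rm_scalar(float *b_scr, const float *B,
                                  uint32_t LDB, uint32_t K, uint32_t N) {
    const uint32_t n_4z = N & 0xFFFFFFFC; /* N rounded down to a multiple of 4 */
    uint32_t k = 0;
    for (; k + 1 < K; k += 2) {
        const float *r1 = B + (size_t)k * LDB;
        const float *r2 = r1 + LDB;
        float *w = b_scr + (size_t)k * N;
        for (uint32_t j = 0; j < n_4z; ++j) w[j] = r1[j];        /* row k blocks */
        for (uint32_t j = 0; j < n_4z; ++j) w[n_4z + j] = r2[j]; /* row k+1 blocks */
        float *t = w + 2 * n_4z;                /* leftover columns, interleaved */
        for (uint32_t j = n_4z; j < N; ++j) { *t++ = r1[j]; *t++ = r2[j]; }
    }
    for (; k < K; ++k) /* 0 or 1 row left */
        for (uint32_t j = 0; j < N; ++j)
            b_scr[(size_t)k * N + j] = B[(size_t)k * LDB + j];
}

int main(void) {
    float B[15], scr[15]; /* 3 x 5 panel, LDB = 5, values 0..14 */
    for (int i = 0; i < 15; ++i) B[i] = (float)i;
    pack_2_from_rm_scalar(scr, B, 5, 3, 5);
    for (int i = 0; i < 15; ++i) printf("%.0f ", scr[i]);
    printf("\n"); /* prints: 0 1 2 3 5 6 7 8 4 9 10 11 12 13 14 */
    return 0;
}

Keeping each row's 4-column blocks contiguous lets the NEON routine issue whole vld1q_f32/vst1q_f32 transfers for both rows before falling back to interleaved scalar stores for the leftover columns.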
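
The TEST_BIAS macro in test/TestBias.c builds its reference result across several loops; condensed, the operation it validates is the one sketched below. The signature is inferred from the macro's call site (bias_##type(dat, bias_v0, bias_dim1, 2.0, bias_dim2, 3.0, dim1, dim2)), so the parameter types here are an assumption, and bias_ref is an illustrative name rather than the entry point declared in Bias.h.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Every element of the dim1 x dim2 matrix (dim1 contiguous in memory)
 * receives a scalar bias plus two scaled per-dimension bias vectors;
 * a NULL vector means that bias term is skipped. */
static void bias_ref(float *dat, float v0,
                     const float *b1, float s1,
                     const float *b2, float s2,
                     uint32_t dim1, uint32_t dim2) {
    for (uint32_t j = 0; j < dim2; ++j) {
        float *line = dat + (size_t)j * dim1;
        const float add2 = b2 ? s2 * b2[j] : 0.0f; /* second-dimension bias */
        for (uint32_t i = 0; i < dim1; ++i) {
            const float add1 = b1 ? s1 * b1[i] : 0.0f; /* first-dimension bias */
            line[i] += v0 + add1 + add2;
        }
    }
}

int main(void) {
    float m[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    const float b1[2] = {1.0f, 2.0f}, b2[2] = {10.0f, 20.0f};
    bias_ref(m, 0.5f, b1, 2.0f, b2, 3.0f, 2, 2);
    printf("%g %g %g %g\n", m[0], m[1], m[2], m[3]); /* 32.5 34.5 62.5 64.5 */
    return 0;
}

TEST_SUM exercises the companion reduction: with a non-zero status, u8u32_sum emits dim2 outputs, each the sum of one contiguous dim1-long run of 8-bit inputs widened to 32 bits; with status zero it emits dim1 outputs accumulated across the dim2 direction.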