From 6dc571d2d0de77a7d8f964e0488e613d0539b421 Mon Sep 17 00:00:00 2001
From: Cms Build <cmsbuild@cern.ch>
Date: Thu, 16 Jan 2020 18:01:56 +0100
Subject: [PATCH 1/2] Add changes for powerpc

---
 onnxruntime/core/mlas/lib/ppc64/NEON_2_SSE.h | 16790 +++++++++++++++++
 onnxruntime/core/mlas/lib/ppc64/emmintrin.h  |  2324 +++
 onnxruntime/core/mlas/lib/ppc64/mm_malloc.h  |    50 +
 onnxruntime/core/mlas/lib/ppc64/mmintrin.h   |  1450 ++
 onnxruntime/core/mlas/lib/ppc64/pmmintrin.h  |   150 +
 onnxruntime/core/mlas/lib/ppc64/smmintrin.h  |    85 +
 onnxruntime/core/mlas/lib/ppc64/tmmintrin.h  |   495 +
 onnxruntime/core/mlas/lib/ppc64/xmmintrin.h  |  1844 ++
 onnxruntime/core/mlas/lib/sgemm.cpp          |   518 +
 9 files changed, 23706 insertions(+)
 create mode 100644 onnxruntime/core/mlas/lib/ppc64/NEON_2_SSE.h
 create mode 100644 onnxruntime/core/mlas/lib/ppc64/emmintrin.h
 create mode 100644 onnxruntime/core/mlas/lib/ppc64/mm_malloc.h
 create mode 100644 onnxruntime/core/mlas/lib/ppc64/mmintrin.h
 create mode 100644 onnxruntime/core/mlas/lib/ppc64/pmmintrin.h
 create mode 100644 onnxruntime/core/mlas/lib/ppc64/smmintrin.h
 create mode 100644 onnxruntime/core/mlas/lib/ppc64/tmmintrin.h
 create mode 100644 onnxruntime/core/mlas/lib/ppc64/xmmintrin.h

diff --git a/onnxruntime/core/mlas/lib/ppc64/NEON_2_SSE.h b/onnxruntime/core/mlas/lib/ppc64/NEON_2_SSE.h
new file mode 100644
index 0000000000000..ddf7c71adc691
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/ppc64/NEON_2_SSE.h
@@ -0,0 +1,16790 @@
+//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation,  victoria.zhislina@intel.com
+
+//*** Copyright (C) 2012-2019 Intel Corporation.  All rights reserved.
+
+//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+
+//By downloading, copying, installing or using the software you agree to this license.
+//If you do not agree to this license, do not download, install, copy or use the software.
+
+//                              License Agreement
+//Redistribution and use in source and binary forms, with or without modification,
+//are permitted provided that the following conditions are met:
+
+//  * Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+
+//  * The name of the copyright holders may not be used to endorse or promote products
+//    derived from this software without specific prior written permission.
+
+//This software is provided by the copyright holders and contributors "as is" and
+//any express or implied warranties, including, but not limited to, the implied
+//warranties of merchantability and fitness for a particular purpose are disclaimed.
+//In no event shall the Intel Corporation or contributors be liable for any direct,
+//indirect, incidental, special, exemplary, or consequential damages
+//(including, but not limited to, procurement of substitute goods or services;
+//loss of use, data, or profits; or business interruption) however caused
+//and on any theory of liability, whether in contract, strict liability,
+//or tort (including negligence or otherwise) arising in any way out of
+//the use of this software, even if advised of the possibility of such damage.
+
+//*****************************************************************************************
+// This file is intended to simplify ARM->IA32 porting
+// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
+// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below
+//MMX instruction set is not used due to non availability on x64 systems,
+//performance overhead and the necessity to use the EMMS instruction (_mm_empty())for mmx-x87 floating point  switching
+//*****************************************************************************************
+
+//!!!!!!!!!!!!!!  To use this file just include it in your project that uses ARM NEON intinsics instead of "arm_neon.h" and compile it as usual
+//!!!!!!!!!!!!!!  but please pay attention at #define USE_SSE4 below - you might need to define it manualy for newest Intel Atom or any Intel Core platforms for greater performance.
+
+#ifndef NEON2SSE_H
+#define NEON2SSE_H
+
+/*********************************************************************************************************************/
+//!!!!!!!!!!!!!!
+//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions, when undefined - SIMD up to SSSE3 are used
+//For older devices without SSE4 support it should be undefined,  for newer devices - defined, probably manualy if your compiler doesn't set __SSE4_2__ predefine
+#ifndef USE_SSE4
+#   if defined(__SSE4_2__)
+#       define USE_SSE4
+#   endif
+#endif
+/*********************************************************************************************************************/
+
+#include "xmmintrin.h"     //SSE
+#include "emmintrin.h"     //SSE2
+#include "pmmintrin.h"     //SSE3
+#include "tmmintrin.h"     //SSSE3
+#pragma message ( "Intrinsic headers loaded " )
+#ifdef USE_SSE4
+#   include <smmintrin.h> //SSE4.1
+#   include <nmmintrin.h> //SSE4.2
+#endif
+
+#include <math.h>
+
+//***************  functions and data attributes, compiler dependent  *********************************
+//***********************************************************************************
+#ifdef __GNUC__
+#   define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#   define _NEON2SSESTORAGE static
+#   define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
+#   define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#   ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
+#       if _GCC_VERSION <  40500
+#           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
+#       else
+#           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
+#       endif
+#   else
+#       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
+#   endif
+#   if defined(__x86_64__)
+#       define _NEON2SSE_64BIT  __x86_64__
+#   endif
+#else
+#   define _NEON2SSESTORAGE static
+#   define _NEON2SSE_ALIGN_16  __declspec(align(16))
+#   define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
+#   if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
+#       define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
+#       if defined(_M_X64)
+#           define _NEON2SSE_64BIT  _M_X64
+#       endif
+#   else
+#       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
+#   endif
+#endif
+
+#if defined  (_NEON2SSE_64BIT) && defined (USE_SSE4)
+#   define _NEON2SSE_64BIT_SSE4
+#endif
+
+/*********************************************************************************************************************/
+//    data types conversion
+/*********************************************************************************************************************/
+#if defined(_MSC_VER) && (_MSC_VER < 1300)
+    typedef signed char int8_t;
+    typedef unsigned char uint8_t;
+    typedef signed short int16_t;
+    typedef unsigned short uint16_t;
+    typedef signed int int32_t;
+    typedef unsigned int uint32_t;
+    typedef signed long long int64_t;
+    typedef unsigned long long uint64_t;
+#elif defined(_MSC_VER)
+    typedef signed __int8 int8_t;
+    typedef unsigned __int8 uint8_t;
+    typedef signed __int16 int16_t;
+    typedef unsigned __int16 uint16_t;
+    typedef signed __int32 int32_t;
+    typedef unsigned __int32 uint32_t;
+
+    typedef signed long long int64_t;
+    typedef unsigned long long uint64_t;
+#else
+#   include <stdint.h>
+#   include <limits.h>
+#endif
+
+
+typedef   float float32_t;
+#if !defined(__clang__)
+typedef   float __fp16;
+#endif
+
+typedef   double float64_t;
+
+typedef union   __m64_128 {
+    uint64_t m64_u64[1];
+    int64_t m64_i64[1];
+	float64_t m64_d64[1];
+    uint32_t m64_u32[2];
+    int32_t m64_i32[2];
+    float32_t m64_f32[2];
+    int16_t m64_i16[4];
+    uint16_t m64_u16[4];
+    int8_t m64_i8[8];
+    uint8_t m64_u8[8];
+} __m64_128;
+
+typedef __m64_128 int8x8_t;
+typedef __m64_128 uint8x8_t;
+typedef __m64_128 int16x4_t;
+typedef __m64_128 uint16x4_t;
+typedef __m64_128 int32x2_t;
+typedef __m64_128 uint32x2_t;
+typedef __m64_128 int64x1_t;
+typedef __m64_128 uint64x1_t;
+typedef __m64_128 poly8x8_t;
+typedef __m64_128 poly16x4_t;
+
+typedef __m64_128 float32x2_t;
+typedef __m128 float32x4_t;
+
+typedef __m128 float16x4_t; //not supported by IA, for compartibility
+typedef __m128 float16x8_t; //not supported by IA, for compartibility
+
+typedef __m64_128 float64x1_t;
+typedef __m128d float64x2_t;
+
+typedef __m128i int8x16_t;
+typedef __m128i int16x8_t;
+typedef __m128i int32x4_t;
+typedef __m128i int64x2_t;
+typedef __m128i uint8x16_t;
+typedef __m128i uint16x8_t;
+typedef __m128i uint32x4_t;
+typedef __m128i uint64x2_t;
+typedef __m128i poly8x16_t;
+typedef __m128i poly16x8_t;
+
+#if defined(_MSC_VER)
+#   define SINT_MIN     (-2147483647 - 1) /* min signed int value */
+#   define SINT_MAX       2147483647 /* max signed int value */
+#else
+#   define SINT_MIN     INT_MIN /* min signed int value */
+#   define SINT_MAX     INT_MAX /* max signed int value */
+#endif
+
+typedef  uint8_t poly8_t;
+typedef  uint16_t poly16_t;
+
+
+//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type  as functions arguments resulting in
+//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned.  To avoid it we need the special trick for functions that use these types
+struct int8x16x2_t {
+    int8x16_t val[2];
+};
+struct int16x8x2_t {
+    int16x8_t val[2];
+};
+struct int32x4x2_t {
+    int32x4_t val[2];
+};
+struct int64x2x2_t {
+    int64x2_t val[2];
+};
+//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
+struct int8x8x2_t {
+    int8x8_t val[2];
+};
+struct int16x4x2_t {
+    int16x4_t val[2];
+};
+struct int32x2x2_t {
+    int32x2_t val[2];
+};
+struct int64x1x2_t {
+    int64x1_t val[2];
+};
+
+typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
+typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
+typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
+typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
+
+typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
+typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
+typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
+typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
+
+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
+typedef struct int8x16x2_t uint8x16x2_t;
+typedef struct int16x8x2_t uint16x8x2_t;
+typedef struct int32x4x2_t uint32x4x2_t;
+typedef struct int64x2x2_t uint64x2x2_t;
+typedef struct int8x16x2_t poly8x16x2_t;
+typedef struct int16x8x2_t poly16x8x2_t;
+
+typedef struct int8x8x2_t uint8x8x2_t;
+typedef struct int16x4x2_t uint16x4x2_t;
+typedef struct int32x2x2_t uint32x2x2_t;
+typedef struct int64x1x2_t uint64x1x2_t;
+typedef struct int8x8x2_t poly8x8x2_t;
+typedef struct int16x4x2_t poly16x4x2_t;
+
+//float
+struct float32x4x2_t {
+    float32x4_t val[2];
+};
+struct float16x8x2_t {
+    float16x8_t val[2];
+};
+struct float32x2x2_t {
+    float32x2_t val[2];
+};
+
+typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
+typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
+typedef struct  float32x2x2_t float32x2x2_t; //for C compilers to make them happy
+typedef  float16x8x2_t float16x4x2_t;
+
+//4
+struct int8x16x4_t {
+    int8x16_t val[4];
+};
+struct int16x8x4_t {
+    int16x8_t val[4];
+};
+struct int32x4x4_t {
+    int32x4_t val[4];
+};
+struct int64x2x4_t {
+    int64x2_t val[4];
+};
+
+struct int8x8x4_t {
+    int8x8_t val[4];
+};
+struct int16x4x4_t {
+    int16x4_t val[4];
+};
+struct int32x2x4_t {
+    int32x2_t val[4];
+};
+struct int64x1x4_t {
+    int64x1_t val[4];
+};
+
+typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
+typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
+typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
+typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
+
+typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
+typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
+typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
+typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
+
+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
+typedef struct int8x8x4_t uint8x8x4_t;
+typedef struct int16x4x4_t uint16x4x4_t;
+typedef struct int32x2x4_t uint32x2x4_t;
+typedef struct int64x1x4_t uint64x1x4_t;
+typedef struct int8x8x4_t poly8x8x4_t;
+typedef struct int16x4x4_t poly16x4x4_t;
+
+typedef struct int8x16x4_t uint8x16x4_t;
+typedef struct int16x8x4_t uint16x8x4_t;
+typedef struct int32x4x4_t uint32x4x4_t;
+typedef struct int64x2x4_t uint64x2x4_t;
+typedef struct int8x16x4_t poly8x16x4_t;
+typedef struct int16x8x4_t poly16x8x4_t;
+
+struct float32x4x4_t {
+    float32x4_t val[4];
+};
+struct float16x8x4_t {
+    float16x8_t val[4];
+};
+struct float32x2x4_t {
+    float32x2_t val[4];
+};
+
+typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
+typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
+typedef struct  float32x2x4_t float32x2x4_t; //for C compilers to make them happy
+typedef  float16x8x4_t float16x4x4_t;
+
+//3
+struct int16x8x3_t {
+    int16x8_t val[3];
+};
+struct int32x4x3_t {
+    int32x4_t val[3];
+};
+struct int64x2x3_t {
+    int64x2_t val[3];
+};
+struct int8x16x3_t {
+    int8x16_t val[3];
+};
+
+struct int16x4x3_t {
+    int16x4_t val[3];
+};
+struct int32x2x3_t {
+    int32x2_t val[3];
+};
+struct int64x1x3_t {
+    int64x1_t val[3];
+};
+struct int8x8x3_t {
+    int8x8_t val[3];
+};
+typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
+typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
+typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
+typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
+
+typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
+typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
+typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
+typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
+
+
+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
+typedef struct int8x16x3_t uint8x16x3_t;
+typedef struct int16x8x3_t uint16x8x3_t;
+typedef struct int32x4x3_t uint32x4x3_t;
+typedef struct int64x2x3_t uint64x2x3_t;
+typedef struct int8x16x3_t poly8x16x3_t;
+typedef struct int16x8x3_t poly16x8x3_t;
+typedef struct  int8x8x3_t uint8x8x3_t;
+typedef struct  int16x4x3_t uint16x4x3_t;
+typedef struct  int32x2x3_t uint32x2x3_t;
+typedef struct  int64x1x3_t uint64x1x3_t;
+typedef struct  int8x8x3_t poly8x8x3_t;
+typedef struct  int16x4x3_t poly16x4x3_t;
+
+//float
+struct float32x4x3_t {
+    float32x4_t val[3];
+};
+struct float32x2x3_t {
+    float32x2_t val[3];
+};
+struct float16x8x3_t {
+    float16x8_t val[3];
+};
+
+typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
+typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
+typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
+typedef  float16x8x3_t float16x4x3_t;
+
+
+//****************************************************************************
+//****** Porting auxiliary macros ********************************************
+
+//** floating point related macros **
+#define _M128i(a) _mm_castps_si128(a)
+#define _M128(a) _mm_castsi128_ps(a)
+//here the most performance effective implementation is compiler and 32/64 bits build dependent
+#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER  >= 1500) )
+#   define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
+#   define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
+#   define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
+#else
+   //for 32bit gcc and Microsoft compilers builds
+#   define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
+#   define _M64(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), inp)
+#   define _M64f(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
+#endif
+#define _pM128(a) _mm_castsi128_ps(_pM128i(a))
+
+#define return64(a)  _M64(res64,a); return res64;
+#define return64f(a)  _M64f(res64,a); return res64;
+
+#define _Ui64(a) (*(uint64_t*)&(a))
+#define _UNSIGNED_T(a) u ## a
+
+#define _SIGNBIT64 ((uint64_t)1 << 63)
+#define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
+#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
+
+#define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
+#define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#define __constrange(min,max)  const
+#define __transfersize(size)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
+_NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7,  9, 11, 13, 15 };
+_NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8,  9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
+//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
+
+//*************************************************************************
+//*************************************************************************
+//*********  Functions declarations as declared in original arm_neon.h *****
+//*************************************************************************
+//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
+_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
+//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
+//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i]
+_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
+_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
+_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
+_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
+_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
+_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
+//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
+_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
+//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
+_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
+//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
+_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
+//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
+_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
+_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
+//Vector rounding add high half: vraddhn
+_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
+_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
+//Multiplication
+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
+_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
+_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
+_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
+//multiply lane
+_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
+_NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
+_NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
+_NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
+//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
+_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
+//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
+_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
+//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
+_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
+//Vector multiply subtract long
+_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
+//Vector saturating doubling multiply high
+_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
+_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
+//Vector saturating rounding doubling multiply high
+_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
+//Vector saturating doubling multiply accumulate long
+_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
+//Vector saturating doubling multiply subtract long
+_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
+//Vector long multiply
+_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
+_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
+//Vector saturating doubling long multiply
+_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
+//Subtraction
+//Vector subtract
+_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
+//Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
+_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
+//Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
+_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
+_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
+_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
+_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
+_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
+_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
+//Vector saturating subtract
+_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
+//Vector halving subtract
+_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
+//Vector subtract high half
+_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
+_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
+//Vector rounding subtract high half
+_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
+_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
+//Comparison
+//Vector compare equal
+_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
+//Vector compare greater-than or equal
+_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+//Vector compare less-than or equal
+_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+//Vector compare greater-than
+_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+_NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+//Vector compare less-than
+_NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+_NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+//Vector compare absolute greater-than or equal
+_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+//Vector compare absolute less-than or equal
+_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+//Vector compare absolute greater-than
+_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+//Vector compare absolute less-than
+_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+//Vector test bits
+_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
+_NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
+_NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
+_NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
+_NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
+_NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
+//Absolute difference
+//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
+_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
+//Absolute difference - long
+_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
+//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
+_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
+//Absolute difference and accumulate - long
+_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
+_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
+_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
+_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
+_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
+_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
+//Max/Min
+//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
+_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
+
+_NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
+
+//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
+_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
+
+_NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
+
+//Pairwise addition
+//Pairwise add
+_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
+//Long pairwise add
+_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
+_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
+_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
+_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
+_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
+_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
+_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
+_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
+_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
+_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
+_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
+_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
+//Long pairwise add and accumulate
+_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
+_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
+_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
+_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
+_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
+_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
+_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
+_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
+_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
+_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
+_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
+_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
+//Folding maximum vpmax -> takes maximum of adjacent pairs
+_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
+//Folding minimum vpmin -> takes minimum of adjacent pairs
+_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
+//Reciprocal/Sqrt
+_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
+_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
+_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
+_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
+//Shifts by signed variable
+//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
+_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
+//Vector saturating shift left: (negative values shift right)
+_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
+//Vector rounding shift left: (negative values shift right)
+_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
+//Vector saturating rounding shift left: (negative values shift right)
+_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
+//Shifts by a constant
+//Vector shift right by constant
+_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
+_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
+_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
+_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
+_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
+_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
+_NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
+_NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
+_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
+_NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
+_NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
+_NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
+//Vector shift left by constant
+_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+_NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+_NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+_NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+_NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+_NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+_NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+_NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+_NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+_NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+//Vector rounding shift right by constant
+_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
+_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
+_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
+_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
+_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
+_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
+_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
+_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
+_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
+_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
+_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
+_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
+//Vector shift right by constant and accumulate
+_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
+_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
+_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
+_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
+_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
+_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
+_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
+_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
+_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
+_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
+_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
+_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
+//Vector rounding shift right by constant and accumulate
+_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
+_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
+_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
+_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
+_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
+_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
+_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
+_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
+_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
+_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
+_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
+_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
+//Vector saturating shift left by constant
+_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
+_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
+_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
+_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
+_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
+_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
+_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
+_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
+_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
+_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
+_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
+_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
+_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
+_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
+_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
+_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
+//Vector signed->unsigned saturating shift left by constant
+_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
+_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
+_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
+_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
+_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
+_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
+_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
+_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
+//Vector narrowing shift right by constant
+_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+//Vector signed->unsigned narrowing saturating shift right by constant
+_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
+//Vector signed->unsigned rounding narrowing saturating shift right by constant
+_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
+//Vector narrowing saturating shift right by constant
+_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
+_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
+_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
+_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
+//Vector rounding narrowing shift right by constant
+_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+//Vector rounding narrowing saturating shift right by constant
+_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
+_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
+_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
+_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
+_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
+_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
+//Vector widening shift left by constant
+_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
+_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
+_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
+_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
+_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
+_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
+//Shifts with insert
+//Vector shift right and insert
+_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+_NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+_NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+_NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+_NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+_NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+_NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+_NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+//Vector shift left and insert
+_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+_NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+_NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+_NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+_NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+_NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+_NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+_NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
+//Load a single vector from memory
+_NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
+_NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
+_NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
+_NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
+_NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
+_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
+_NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
+
+_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+
+//Load a single lane from memory
+_NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
+_NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
+_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
+_NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
+_NEON2SSESTORAGE float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
+_NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+//Load all lanes of vector with same value from memory
+_NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+//Store a single vector or lane. Stores all lanes or a single lane of a vector.
+//Store a single vector into memory
+_NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
+//Store a lane of a vector into memory
+//Loads of an N-element structure
+//Load N-element structure from memory
+_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+//Load all lanes of N-element structure with same value from memory
+_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+//Load a single lane of N-element structure from memory
+//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
+_NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
+//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t  src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t  src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+//Store N-element structure to memory
+_NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t const * val); // VST2.32 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t const * val); // VST2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t const * val); // VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
+//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t const * val); // VST2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t const * val); // VST2.32 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+_NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t const * val); // VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t const * val); // VST3.8 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t const * val); // VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t const * val); // VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t const * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t const * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t const * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t const * val); // VST4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
+//Store a single lane of N-element structure to memory
+_NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t const * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t const * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t const * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t const * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t const * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t const * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t const * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t const * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t const * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t const * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t const * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t const * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
+_NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+_NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
+_NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
+_NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
+_NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+_NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
+_NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+_NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
+_NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
+_NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
+_NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+_NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
+_NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+_NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+_NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+_NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
+_NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+_NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+_NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+_NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+//Initialize a vector from a literal bit pattern.
+_NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
+//Set all lanes to same value
+//Load all lanes of vector to the same literal value
+_NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
+_NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
+_NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
+_NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
+_NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
+_NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
+_NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
+_NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
+_NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
+_NEON2SSESTORAGE uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
+_NEON2SSESTORAGE uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
+_NEON2SSESTORAGE uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
+_NEON2SSESTORAGE int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
+_NEON2SSESTORAGE int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
+_NEON2SSESTORAGE int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
+_NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
+_NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
+_NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
+_NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
+_NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
+_NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
+_NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
+_NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
+_NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
+_NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
+_NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
+_NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
+_NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
+_NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
+_NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
+_NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
+_NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
+_NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
+_NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
+_NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
+_NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
+_NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
+//Load all lanes of the vector to the value of a lane of a vector
+_NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+_NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+_NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+_NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+_NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+_NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+_NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+_NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+_NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+_NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+_NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+_NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+_NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+_NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+_NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+_NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
+_NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
+_NEON2SSESTORAGE int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
+_NEON2SSESTORAGE int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
+_NEON2SSESTORAGE int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
+_NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
+_NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+_NEON2SSESTORAGE uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
+_NEON2SSESTORAGE uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
+_NEON2SSESTORAGE uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
+_NEON2SSESTORAGE uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
+_NEON2SSESTORAGE poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
+_NEON2SSESTORAGE poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
+//Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
+_NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
+_NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
+_NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
+_NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
+_NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
+_NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
+_NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
+_NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
+_NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
+_NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
+//Converting vectors. These intrinsics are used to convert vectors.
+//Convert from float
+_NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
+_NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
+_NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
+_NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
+_NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
+_NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
+_NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
+_NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
+_NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
+//Convert to float
+_NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
+_NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
+_NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
+_NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
+_NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
+_NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
+_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
+_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
+//Convert between floats
+_NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
+_NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
+//Vector narrow integer
+_NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
+_NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
+_NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
+_NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
+_NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
+_NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
+//Vector long move
+_NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
+_NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
+_NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
+_NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
+_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
+_NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
+//Vector saturating narrow integer
+_NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
+_NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
+_NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
+_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
+_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
+_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
+//Vector saturating narrow integer signed->unsigned
+_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
+_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
+_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
+//Table look up
+_NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+_NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
+_NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+//Extended table look up intrinsics
+_NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+_NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
+_NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+_NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+_NEON2SSESTORAGE int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+_NEON2SSESTORAGE poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+_NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+_NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+_NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+_NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+//Operations with a scalar value
+//Vector multiply accumulate with scalar
+_NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
+_NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
+_NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
+_NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
+_NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
+_NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
+_NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
+//Vector widening multiply accumulate with scalar
+_NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
+_NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
+//Vector widening saturating doubling multiply accumulate with scalar
+_NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
+//Vector multiply subtract with scalar
+_NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
+_NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
+_NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
+_NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
+_NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
+_NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
+_NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
+//Vector widening multiply subtract with scalar
+_NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
+_NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
+//Vector widening saturating doubling multiply subtract with scalar
+_NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
+//Vector multiply by scalar
+_NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
+_NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
+_NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
+_NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
+//Vector long multiply with scalar
+_NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
+_NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
+//Vector long multiply by scalar
+_NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
+_NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
+//Vector saturating doubling long multiply with scalar
+_NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
+//Vector saturating doubling long multiply by scalar
+_NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
+//Vector saturating doubling multiply high with scalar
+_NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
+//Vector saturating doubling multiply high by scalar
+_NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
+//Vector saturating rounding doubling multiply high with scalar
+_NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
+//Vector rounding saturating doubling multiply high by scalar
+_NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
+//Vector multiply accumulate with scalar
+_NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
+_NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
+//Vector widening multiply accumulate with scalar
+_NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
+_NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
+//Vector widening saturating doubling multiply accumulate with scalar
+_NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
+//Vector multiply subtract with scalar
+_NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
+_NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
+//Vector widening multiply subtract with scalar
+_NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
+_NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
+_NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
+//Vector widening saturating doubling multiply subtract with scalar
+_NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
+_NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
+//Vector extract
+_NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+_NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+_NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+_NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+_NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+_NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+_NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+_NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+_NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+_NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+_NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+_NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+_NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+_NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+_NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+_NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+_NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+_NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+_NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0
+//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
+_NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
+_NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
+_NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
+_NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
+_NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
+_NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
+_NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
+_NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
+_NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
+_NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
+_NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
+_NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
+_NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
+_NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
+_NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
+_NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
+_NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
+_NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
+_NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
+_NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
+_NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
+_NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
+_NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
+_NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
+_NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
+_NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
+_NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
+_NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
+_NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
+_NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
+_NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
+_NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
+_NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
+//Other single operand arithmetic
+//Absolute: Vd[i] = |Va[i]|
+_NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
+_NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
+_NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
+_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
+_NEON2SSESTORAGE int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
+_NEON2SSESTORAGE int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
+_NEON2SSESTORAGE int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
+_NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
+
+_NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
+_NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
+
+//Saturating absolute: Vd[i] = sat(|Va[i]|)
+_NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
+_NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
+_NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
+_NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
+_NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
+_NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
+//Negate: Vd[i] = - Va[i]
+_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0
+_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0
+_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0
+_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0
+_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
+_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
+_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
+_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
+//Saturating Negate: sat(Vd[i] = - Va[i])
+_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0
+_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0
+_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0
+_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
+_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
+_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
+//Count leading sign bits
+_NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
+_NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
+_NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
+_NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
+_NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
+_NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
+//Count leading zeros
+_NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
+_NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
+_NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
+_NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
+_NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
+_NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
+_NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
+_NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
+_NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
+_NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
+_NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
+_NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
+//Count number of set bits
+_NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
+_NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
+_NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
+_NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
+_NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
+_NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
+//Reciprocal estimate
+_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
+_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
+_NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
+_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
+//Reciprocal square root estimate
+_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
+_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
+_NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
+_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
+//Logical operations
+//Bitwise not
+_NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
+_NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
+_NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
+_NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
+_NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
+_NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
+_NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
+_NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
+_NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
+_NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
+_NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
+_NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
+_NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
+_NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
+//Bitwise and
+_NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
+//Bitwise or
+_NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
+//Bitwise exclusive or (EOR or XOR)
+_NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
+_NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
+_NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
+_NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
+_NEON2SSESTORAGE int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
+_NEON2SSESTORAGE int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
+_NEON2SSESTORAGE int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
+_NEON2SSESTORAGE int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
+//Bit Clear
+_NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
+//Bitwise OR complement
+_NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
+//Bitwise Select
+_NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
+_NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
+_NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
+//Transposition operations
+//Transpose elements
+_NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
+_NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
+_NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
+_NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
+_NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
+_NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
+_NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
+_NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
+_NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
+_NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
+_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
+_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
+_NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
+_NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
+_NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
+_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
+_NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
+_NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
+//Interleave elements
+_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
+_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
+_NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
+_NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
+_NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
+_NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
+_NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
+_NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
+_NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
+_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
+_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
+_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
+_NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
+_NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
+_NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
+_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
+_NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
+_NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
+//De-Interleave elements
+_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
+_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
+_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
+_NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
+_NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
+_NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
+_NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
+_NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
+_NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
+_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
+_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
+_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
+_NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
+_NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
+_NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
+_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
+_NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
+_NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
+
+_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0
+
+_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0
+
+//Sqrt
+_NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
+
+_NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
+
+
+
+//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+// the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics.
+// we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal
+//
+#if  ( defined (__INTEL_COMPILER)  || defined (__GNUC__) && !defined(__llvm__) )
+#   define _MM_ALIGNR_EPI8 _mm_alignr_epi8
+#   define _MM_EXTRACT_EPI16  (int16_t) _mm_extract_epi16
+#   define _MM_INSERT_EPI16 _mm_insert_epi16
+#   ifdef USE_SSE4
+#       define _MM_EXTRACT_EPI8  _mm_extract_epi8
+#       define _MM_EXTRACT_EPI32  _mm_extract_epi32
+#       define _MM_EXTRACT_PS  _mm_extract_ps
+#       define _MM_INSERT_EPI8  _mm_insert_epi8
+#       define _MM_INSERT_EPI32 _mm_insert_epi32
+#       define _MM_INSERT_PS    _mm_insert_ps
+#       ifdef  _NEON2SSE_64BIT
+#           define _MM_INSERT_EPI64 _mm_insert_epi64
+#           define _MM_EXTRACT_EPI64 _mm_extract_epi64
+#       endif
+#   endif //SSE4
+#else
+#   define _NEON2SSE_COMMA ,
+#   define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
+        switch(LANE)         \
+        {                \
+        case 0:     return NAME(a b, 0); \
+        case 1:     return NAME(a b, 1); \
+        case 2:     return NAME(a b, 2); \
+        case 3:     return NAME(a b, 3); \
+        case 4:     return NAME(a b, 4); \
+        case 5:     return NAME(a b, 5); \
+        case 6:     return NAME(a b, 6); \
+        case 7:     return NAME(a b, 7); \
+        case 8:     return NAME(a b, 8); \
+        case 9:     return NAME(a b, 9); \
+        case 10:    return NAME(a b, 10); \
+        case 11:    return NAME(a b, 11); \
+        case 12:    return NAME(a b, 12); \
+        case 13:    return NAME(a b, 13); \
+        case 14:    return NAME(a b, 14); \
+        case 15:    return NAME(a b, 15); \
+        default:    return NAME(a b, 0); \
+        }
+
+#   define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
+        switch(LANE)              \
+        {                          \
+        case 0:  return NAME(vec p,0); \
+        case 1:  return NAME(vec p,1); \
+        case 2:  return NAME(vec p,2); \
+        case 3:  return NAME(vec p,3); \
+        case 4:  return NAME(vec p,4); \
+        case 5:  return NAME(vec p,5); \
+        case 6:  return NAME(vec p,6); \
+        case 7:  return NAME(vec p,7); \
+        default: return NAME(vec p,0); \
+        }
+
+#   define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
+        switch(LANE)              \
+        {                          \
+        case case0:  return NAME(vec p,case0); \
+        case case1:  return NAME(vec p,case1); \
+        case case2:  return NAME(vec p,case2); \
+        case case3:  return NAME(vec p,case3); \
+        default:     return NAME(vec p,case0); \
+        }
+
+    _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
+    {
+        _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
+    }
+
+    _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
+    {
+        _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
+    }
+
+#ifdef USE_SSE4
+        _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
+        }
+
+        _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
+        }
+
+        _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
+        {
+            _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
+        }
+
+        _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
+        }
+
+        _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
+        {
+            _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
+        }
+
+#ifdef  _NEON2SSE_64BIT
+            //the special case of functions available only for SSE4 and 64-bit build.
+            _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
+            {
+                switch(LANE) {
+                case 0:
+                    return _mm_insert_epi64(vec,  p, 0);
+                case 1:
+                    return _mm_insert_epi64(vec,  p, 1);
+                default:
+                    return _mm_insert_epi64(vec,  p, 0);
+                }
+            }
+
+            _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
+            {
+                if (LANE ==0) return _mm_extract_epi64(val, 0);
+                else return _mm_extract_epi64(val, 1);
+            }
+#endif
+
+        _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
+        }
+
+#endif //USE_SSE4
+
+#endif     //#ifdef NDEBUG
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices
+// or for some specific commonly used operations implementation missing in SSE
+#ifdef USE_SSE4
+#   define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
+#   define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
+#   define _MM_CVTEPU32_EPI64  _mm_cvtepu32_epi64
+
+#   define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
+#   define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
+#   define _MM_CVTEPI32_EPI64  _mm_cvtepi32_epi64
+
+#   define _MM_MAX_EPI8  _mm_max_epi8
+#   define _MM_MAX_EPI32 _mm_max_epi32
+#   define _MM_MAX_EPU16 _mm_max_epu16
+#   define _MM_MAX_EPU32 _mm_max_epu32
+
+#   define _MM_MIN_EPI8  _mm_min_epi8
+#   define _MM_MIN_EPI32 _mm_min_epi32
+#   define _MM_MIN_EPU16 _mm_min_epu16
+#   define _MM_MIN_EPU32 _mm_min_epu32
+
+#   define _MM_BLENDV_EPI8 _mm_blendv_epi8
+#   define _MM_PACKUS_EPI32 _mm_packus_epi32
+#   define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
+
+#   define _MM_MULLO_EPI32 _mm_mullo_epi32
+#   define _MM_MUL_EPI32  _mm_mul_epi32
+
+#   define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
+#else     //no SSE4 !!!!!!
+    _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_unpacklo_epi8(a, zero);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_unpacklo_epi16(a, zero);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_unpacklo_epi32(a, zero);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        __m128i sign = _mm_cmpgt_epi8(zero, a);
+        return _mm_unpacklo_epi8(a, sign);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        __m128i sign = _mm_cmpgt_epi16(zero, a);
+        return _mm_unpacklo_epi16(a, sign);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        __m128i sign = _mm_cmpgt_epi32(zero, a);
+        return _mm_unpacklo_epi32(a, sign);
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t tmp[4];
+        _mm_store_si128((__m128i*)tmp, vec);
+        return tmp[LANE];
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int8_t tmp[16];
+        _mm_store_si128((__m128i*)tmp, vec);
+        return (int)tmp[LANE];
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t tmp[4];
+        _mm_store_si128((__m128i*)tmp, _M128i(vec));
+        return tmp[LANE];
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
+        _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
+        __m128i vec_masked, p_masked;
+        pvec[LANE] = p;
+        mask[LANE] = 0x0;
+        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+        return _mm_or_si128(vec_masked, p_masked);
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
+        _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
+        __m128i vec_masked, p_masked;
+        pvec[LANE] = (int8_t)p;
+        mask[LANE] = 0x0;
+        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+        p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+        return _mm_or_si128(vec_masked, p_masked);
+    }
+
+    _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
+        __m128 tmp, vec_masked, p_masked;
+        mask[LANE >> 4] = 0x0; //here the LANE is not actural lane, need to deal with it
+        vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
+        p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
+        tmp = _mm_or_ps(vec_masked, p_masked);
+        return tmp;
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi8 (a, b);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi32(a, b);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
+    {
+        __m128i c8000, b_s, a_s, cmp;
+        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
+        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
+        b_s = _mm_sub_epi16 (b, c8000);
+        a_s = _mm_sub_epi16 (a, c8000);
+        cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
+    {
+        __m128i c80000000, b_s, a_s, cmp;
+        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
+        b_s = _mm_sub_epi32 (b, c80000000);
+        a_s = _mm_sub_epi32 (a, c80000000);
+        cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi8 (b, a);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi32(b, a);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
+    {
+        __m128i c8000, b_s, a_s, cmp;
+        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
+        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
+        b_s = _mm_sub_epi16 (b, c8000);
+        a_s = _mm_sub_epi16 (a, c8000);
+        cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
+    {
+        __m128i c80000000, b_s, a_s, cmp;
+        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
+        b_s = _mm_sub_epi32 (b, c80000000);
+        a_s = _mm_sub_epi32 (a, c80000000);
+        cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8  !!!!! - please see below
+    {
+        //it assumes mask is either 0xff or 0  always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters.
+        __m128i a_masked, b_masked;
+        b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
+        a_masked = _mm_andnot_si128 (mask,a);
+        return _mm_or_si128(a_masked, b_masked);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
+    {
+        __m128i a16, b16, res, reshi,cmp, zero;
+        zero = _mm_setzero_si128();
+        a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
+        b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
+        res = _mm_unpacklo_epi64(a16, b16); //result without saturation
+        reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
+        cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
+        res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+        cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
+        return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
+    {
+        __m128i a16, res, reshi,cmp, zero;
+        zero = _mm_setzero_si128();
+        a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
+        reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
+        cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
+        res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+        cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
+        return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff
+    }
+
+    // method used by GCC with generic vector extensions
+    _NEON2SSE_INLINE __m128i _MM_MULLO_EPI32(__m128i a, __m128i b)
+    {
+        __m128i a_high = _mm_srli_epi64(a, 32);
+        __m128i low = _mm_mul_epu32(a, b);
+        __m128i b_high = _mm_srli_epi64(b, 32);
+        __m128i high = _mm_mul_epu32(a_high, b_high);
+        low = _mm_shuffle_epi32(low, _MM_SHUFFLE(0, 0, 2, 0));
+        high = _mm_shuffle_epi32(high, _MM_SHUFFLE(0, 0, 2, 0));
+        return _mm_unpacklo_epi32(low, high);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
+    {
+        __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
+        sign = _mm_xor_si128 (a, b);
+        sign =  _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive
+        sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //promote sign bit to 3 and 1st data lanes
+        zero = _mm_setzero_si128();
+        a_neg = _mm_abs_epi32 (a); //negate a and b
+        b_neg = _mm_abs_epi32 (b); //negate a and b
+        mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
+        mul_us_neg = _mm_sub_epi64(zero, mul_us);
+        mul_us_neg = _mm_and_si128(sign, mul_us_neg);
+        mul_us = _mm_andnot_si128(sign, mul_us);
+        return _mm_or_si128 (mul_us, mul_us_neg);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
+    {
+        __m128i res;
+        res = _mm_cmpeq_epi32 (a, b);
+        return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
+    }
+#endif     //SSE4
+
+//the special case of functions working only for 32 bits, no SSE4
+_NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
+{
+    _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
+    _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
+    __m128i vec_masked, p_masked;
+    pvec[LANE] = p;
+    mask[LANE] = 0x0;
+    vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+    p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+    return _mm_or_si128(vec_masked, p_masked);
+}
+
+_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
+{
+    _NEON2SSE_ALIGN_16 int64_t tmp[2];
+    _mm_store_si128((__m128i*)tmp, val);
+    return tmp[LANE];
+}
+
+#ifndef _NEON2SSE_64BIT_SSE4
+#   define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
+#   define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
+#endif
+
+_NEON2SSESTORAGE int32x4_t  vqd_s32(int32x4_t a); //Doubling saturation for signed ints
+_NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
+{
+    //Overflow happens only if a and sum have the opposite signs
+    __m128i c7fffffff, res, res_sat, res_xor_a;
+    c7fffffff = _mm_set1_epi32(0x7fffffff);
+    res = _mm_slli_epi32 (a, 1); // res = a*2
+    res_sat = _mm_srli_epi32(a, 31);
+    res_sat = _mm_add_epi32(res_sat, c7fffffff);
+    res_xor_a = _mm_xor_si128(res, a);
+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
+    res_sat = _mm_and_si128(res_xor_a, res_sat);
+    res = _mm_andnot_si128(res_xor_a, res);
+    return _mm_or_si128(res, res_sat);
+}
+
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//*************************************************************************
+//*************************************************************************
+//*****************  Functions redefinition\implementatin starts here *****
+//*************************************************************************
+//*************************************************************************
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+/*If the unified intrinsics solutions is necessary please define your SSE intrinsics wrap here like in the following sample:
+#ifdef ARM
+#define vector_addq_s32 _mm_add_epi32
+#else //if we have IA
+#define vector_addq_s32 vadd_s32
+#endif
+
+********************************************************************************************
+Functions below are organised in the following way:
+
+Each NEON intrinsic function has one of the following options:
+1.  its x86 full equivalent SSE intrinsic - in this case x86 version just follows the NEON one under the corresponding #define statement
+2.  x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement
+3.  the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition
+4.  for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance
+the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path
+- please consider such functions removal from your code.
+*/
+
+//***********************************************************************
+//************************      Vector add   *****************************
+//***********************************************************************
+_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSE_INLINE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res64;
+    res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
+    return res64;
+}
+
+
+_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
+{
+    __m128 res;
+    __m64_128 res64;
+    res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint8x8_t  vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
+#define vadd_u8 vadd_s8
+
+_NEON2SSESTORAGE uint16x4_t  vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
+#define vadd_u16 vadd_s16
+
+_NEON2SSESTORAGE uint32x2_t  vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
+#define vadd_u32 vadd_s32
+
+_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b)
+{
+    uint64x1_t res64;
+    res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
+    return res64;
+}
+
+
+_NEON2SSESTORAGE int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
+#define vaddq_s8 _mm_add_epi8
+
+_NEON2SSESTORAGE int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
+#define vaddq_s16 _mm_add_epi16
+
+_NEON2SSESTORAGE int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
+#define vaddq_s32 _mm_add_epi32
+
+_NEON2SSESTORAGE int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
+#define vaddq_s64 _mm_add_epi64
+
+_NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
+#define vaddq_f32 _mm_add_ps
+
+_NEON2SSESTORAGE uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
+#define vaddq_u8 _mm_add_epi8
+
+_NEON2SSESTORAGE uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
+#define vaddq_u16 _mm_add_epi16
+
+_NEON2SSESTORAGE uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
+#define vaddq_u32 _mm_add_epi32
+
+_NEON2SSESTORAGE uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
+#define vaddq_u64 _mm_add_epi64
+
+//**************************** Vector long add *****************************:
+//***********************************************************************
+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+_NEON2SSESTORAGE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_add_epi16 (a16, b16);
+}
+
+_NEON2SSESTORAGE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi32 (a32, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
+{
+    //may be not optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 ( a64, b64);
+}
+
+_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi16 (a16, b16);
+}
+
+_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi32 (a32, b32);
+}
+
+_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
+{
+    //may be not optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 (a64, b64);
+}
+
+//***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
+//*************** *********************************************************************
+_NEON2SSESTORAGE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
+_NEON2SSE_INLINE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_add_epi16 (a, b16);
+}
+
+_NEON2SSESTORAGE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
+_NEON2SSE_INLINE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
+{
+    __m128i b32;
+    b32 =  _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
+    return _mm_add_epi32 (a, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
+_NEON2SSE_INLINE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 (a, b64);
+}
+
+_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
+_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi16 (a, b16);
+}
+
+_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
+_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
+{
+    __m128i b32;
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi32 (a, b32);
+}
+
+_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
+_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 (a, b64);
+}
+
+//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
+//*************************************************************************************************************************
+_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b); // VHADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b); // VHADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b); // VHADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b); // VHADD.w d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b); // VHADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b); // VHADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = vshrq_n_s8(tmp2,1);
+    return _mm_add_epi8(tmp1,tmp2);
+}
+
+_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = _mm_srai_epi16(tmp2,1);
+    return _mm_add_epi16(tmp1,tmp2);
+}
+
+_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = _mm_srai_epi32(tmp2,1);
+    return _mm_add_epi32(tmp1,tmp2);
+}
+
+_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
+{
+    __m128i c1, sum, res;
+    c1 = _mm_set1_epi8(1);
+    sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
+    res = _mm_xor_si128(a, b); //for rounding compensation
+    res = _mm_and_si128(res,c1); //for rounding compensation
+    return _mm_sub_epi8 (sum, res); //actual rounding compensation
+}
+
+_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
+{
+    __m128i sum, res;
+    sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
+    res = _mm_xor_si128(a, b); //for rounding compensation
+    res = _mm_slli_epi16 (res,15); //shift left  then back right to
+    res = _mm_srli_epi16 (res,15); //get 1 or zero
+    return _mm_sub_epi16 (sum, res); //actual rounding compensation
+}
+
+_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = _mm_srli_epi32(tmp2,1);
+    return _mm_add_epi32(tmp1,tmp2);
+}
+
+//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
+//*****************************************************************************************************************************
+_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b); // VRHADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b); // VRHADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b); // VRHADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b); // VRHADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
+{
+    //no signed average in x86 SIMD, go to unsigned
+    __m128i c128, au, bu, sum;
+    c128 = _mm_set1_epi8(-128); //(int8_t)0x80
+    au = _mm_sub_epi8(a, c128); //add 128
+    bu = _mm_sub_epi8(b, c128); //add 128
+    sum = _mm_avg_epu8(au, bu);
+    return _mm_add_epi8 (sum, c128); //sub 128
+}
+
+_NEON2SSESTORAGE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
+{
+    //no signed average in x86 SIMD, go to unsigned
+    __m128i cx8000, au, bu, sum;
+    cx8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
+    au = _mm_sub_epi16(a, cx8000); //add 32768
+    bu = _mm_sub_epi16(b, cx8000); //add 32768
+    sum = _mm_avg_epu16(au, bu);
+    return _mm_add_epi16 (sum, cx8000); //sub 32768
+}
+
+_NEON2SSESTORAGE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
+{
+    //need to avoid overflow
+    __m128i a2, b2, res, sum;
+    a2 = _mm_srai_epi32(a,1); //a2=a/2;
+    b2 = _mm_srai_epi32(b,1); // b2=b/2;
+    res = _mm_or_si128(a,b); //for rounding
+    res = _mm_slli_epi32 (res,31); //shift left  then back right to
+    res = _mm_srli_epi32 (res,31); //get 1 or zero
+    sum = _mm_add_epi32(a2,b2);
+    return _mm_add_epi32(sum,res);
+}
+
+_NEON2SSESTORAGE uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
+#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
+
+_NEON2SSESTORAGE uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
+#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
+
+
+_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
+{
+    //need to avoid overflow
+    __m128i a2, b2, res, sum;
+    a2 = _mm_srli_epi32(a,1); //a2=a/2;
+    b2 = _mm_srli_epi32(b,1); // b2=b/2;
+    res = _mm_or_si128(a,b); //for rounding
+    res = _mm_slli_epi32 (res,31); //shift left  then back right to
+    res = _mm_srli_epi32 (res,31); //get 1 or zero
+    sum = _mm_add_epi32(a2,b2);
+    return _mm_add_epi32(sum,res);
+}
+
+//****************** VQADD: Vector saturating add ************************
+//************************************************************************
+_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b); // VQADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t  vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int64x1_t res;
+    uint64_t a64, b64;
+    a64 = a.m64_u64[0];
+    b64 = b.m64_u64[0];
+    res.m64_u64[0] = a64 + b64;
+    a64 = (a64 >> 63) + (~_SIGNBIT64);
+    if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
+        res.m64_u64[0] = a64;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b); // VQADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint64_t a64, b64;
+    uint64x1_t res;
+    a64 = a.m64_u64[0];
+    b64 = b.m64_u64[0];
+    res.m64_u64[0] = a64 + b64;
+    if (res.m64_u64[0] < a64) {
+        res.m64_u64[0] = ~(uint64_t)0;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
+#define vqaddq_s8 _mm_adds_epi8
+
+_NEON2SSESTORAGE int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
+#define vqaddq_s16 _mm_adds_epi16
+
+_NEON2SSESTORAGE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
+{
+    //no corresponding x86 SIMD soulution, special tricks are necessary. Overflow happens only if a and b have the same sign and sum has the opposite sign
+    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
+    c7fffffff = _mm_set1_epi32(0x7fffffff);
+    res = _mm_add_epi32(a, b);
+    res_sat = _mm_srli_epi32(a, 31);
+    res_sat = _mm_add_epi32(res_sat, c7fffffff);
+    res_xor_a = _mm_xor_si128(res, a);
+    b_xor_a_ = _mm_xor_si128(b, a);
+    res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
+    res_sat = _mm_and_si128(res_xor_a, res_sat);
+    res = _mm_andnot_si128(res_xor_a, res);
+    return _mm_or_si128(res, res_sat);
+}
+
+_NEON2SSESTORAGE int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    _mm_store_si128((__m128i*)btmp, b);
+    res[0] = atmp[0] + btmp[0];
+    res[1] = atmp[1] + btmp[1];
+
+    atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
+    atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
+
+    if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
+        res[0] = atmp[0];
+    }
+    if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
+        res[1] = atmp[1];
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+_NEON2SSESTORAGE uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
+#define vqaddq_u8 _mm_adds_epu8
+
+_NEON2SSESTORAGE uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
+#define vqaddq_u16 _mm_adds_epu16
+
+_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
+{
+    __m128i c80000000, cmp, subsum, suba, sum;
+    c80000000 = _mm_set1_epi32 (0x80000000);
+    sum = _mm_add_epi32 (a, b);
+    subsum = _mm_sub_epi32 (sum, c80000000);
+    suba = _mm_sub_epi32 (a, c80000000);
+    cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
+    return _mm_or_si128 (sum, cmp); //saturation
+}
+
+_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
+    {
+        __m128i c80000000, sum, cmp, suba, subsum;
+        c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
+        sum = _mm_add_epi64 (a, b);
+        subsum = _mm_sub_epi64 (sum, c80000000);
+        suba = _mm_sub_epi64 (a, c80000000);
+        cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
+        return _mm_or_si128 (sum, cmp); //saturation
+    }
+#else
+    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+    {
+        _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
+        _mm_store_si128((__m128i*)atmp, a);
+        _mm_store_si128((__m128i*)btmp, b);
+        res[0] = atmp[0] + btmp[0];
+        res[1] = atmp[1] + btmp[1];
+        if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
+        if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
+        return _mm_load_si128((__m128i*)(res));
+    }
+#endif
+
+
+//******************* Vector add high half (truncated)  ******************
+//************************************************************************
+_NEON2SSESTORAGE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sum;
+    sum = _mm_add_epi16 (a, b);
+    sum = _mm_srai_epi16 (sum, 8);
+    sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
+    return64(sum);
+}
+
+_NEON2SSESTORAGE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
+{
+    int16x4_t res64;
+    __m128i sum;
+    sum = _mm_add_epi32 (a, b);
+    sum = _mm_srai_epi32(sum, 16);
+    sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
+    return64(sum);
+}
+
+_NEON2SSESTORAGE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b)
+{
+    int32x2_t res64;
+    __m128i sum;
+    sum = _mm_add_epi64 (a, b);
+    sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (0 << 4) | (2 << 6));
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sum;
+    sum = _mm_add_epi16 (a, b);
+    sum = _mm_srli_epi16 (sum, 8);
+    sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
+{
+    uint16x4_t res64;
+     __m128i sum;
+    sum = _mm_add_epi32 (a, b);
+    sum = _mm_srli_epi32 (sum, 16);
+#ifdef USE_SSE4
+    sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
+#else
+    sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
+#endif
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
+#define vaddhn_u64 vaddhn_s64
+
+//*********** Vector rounding add high half: vraddhn_<type> ******************.
+//***************************************************************************
+_NEON2SSESTORAGE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sum = _mm_srai_epi16 (sum, 8); //get high half
+    sum = _mm_add_epi16 (sum, mask1); //actual rounding
+    sum = _mm_packs_epi16 (sum, sum);
+    return64(sum);
+}
+
+_NEON2SSESTORAGE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
+{
+    //SIMD may be not optimal, serial may be faster
+    int16x4_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sum = _mm_srai_epi32 (sum, 16); //get high half
+    sum = _mm_add_epi32 (sum, mask1); //actual rounding
+    sum = _mm_packs_epi32 (sum, sum);
+    return64(sum);
+}
+
+_NEON2SSESTORAGE int32x2_t  vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
+{
+    //SIMD may be not optimal, serial may be faster
+    int32x2_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi64 (a, b);
+    mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
+    sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
+    sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (1 << 4) | (3 << 6));
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sum = _mm_srai_epi16 (sum, 8); //get high half
+    sum = _mm_add_epi16 (sum, mask1); //actual rounding
+    sum = _mm_packus_epi16 (sum, sum);
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
+{
+    //SIMD may be not optimal, serial may be faster
+    uint16x4_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sum = _mm_srai_epi32 (sum, 16); //get high half
+    sum = _mm_add_epi32 (sum, mask1); //actual rounding
+    sum = _MM_PACKUS1_EPI32 (sum);
+    return64(sum);
+}
+
+_NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
+#define vraddhn_u64 vraddhn_s64
+
+//**********************************************************************************
+//*********             Multiplication            *************************************
+//**************************************************************************************
+
+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
+//As we don't go to wider result functions are equal to "multiply low" in x86
+_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
+{
+    // no 8 bit simd multiply, need to go to 16 bits in SSE
+    int8x8_t res64;
+    __m128i a128, b128, res;
+    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
+    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+    res = _mm_mullo_epi16 (a128, b128);
+    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a,  int16x4_t b); // VMUL.I16 d0,d0,d0
+#define vmul_s16 vmul_u16
+
+_NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a,  int32x2_t b); // VMUL.I32 d0,d0,d0
+#define vmul_s32 vmul_u32
+
+_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
+{
+    float32x4_t tmp;
+    __m64_128 res64;
+    tmp =  _mm_mul_ps(_pM128(a),_pM128(b));
+    _M64f(res64, tmp); //use low 64 bits
+    return res64;
+}
+
+_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
+{
+    // no 8 bit simd multiply, need to go to 16 bits in SSE
+    uint8x8_t res64;
+    __m128i mask, a128, b128, res;
+    mask = _mm_set1_epi16(0xff);
+    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
+    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
+    res = _mm_mullo_epi16 (a128, b128);
+    res = _mm_and_si128(res, mask); //to avoid saturation
+    res = _mm_packus_epi16 (res,res); //use only low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
+    res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
+_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
+{
+    //may be optimized
+    poly8x8_t res64;
+    __m128i a64, b64, c1, res, tmp, bmasked;
+    int i;
+    a64 = _pM128i(a);
+    b64 = _pM128i(b);
+    c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7); //0x1
+    bmasked = _mm_and_si128(b64, c1); //0x1
+    res = vmulq_u8(a64, bmasked);
+    for(i = 1; i<8; i++) {
+        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b64, c1); //0x1
+        tmp = vmulq_u8(a64, bmasked);
+        res = _mm_xor_si128(res, tmp);
+    }
+    return64 (res);
+}
+
+_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
+{
+    // no 8 bit simd multiply, need to go to 16 bits
+    //solution may be not optimal
+    __m128i a16, b16, r16_1, r16_2;
+    a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (a16, b16);
+    //swap hi and low part of a and b to process the remaining data
+    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1  __m128i r16_2
+
+    r16_2 = _mm_mullo_epi16 (a16, b16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
+
+    return _mm_unpacklo_epi64(r16_1,  r16_2);
+}
+
+_NEON2SSESTORAGE int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
+#define vmulq_s16 _mm_mullo_epi16
+
+_NEON2SSESTORAGE int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
+#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
+
+_NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
+#define vmulq_f32 _mm_mul_ps
+
+_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
+{
+    // no 8 bit simd multiply, need to go to 16 bits
+    //solution may be not optimal
+    __m128i maskff, a16, b16, r16_1, r16_2;
+    maskff = _mm_set1_epi16(0xff);
+    a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (a16, b16);
+    r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
+    //swap hi and low part of a and b to process the remaining data
+    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (a16, b16);
+    r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
+    return _mm_packus_epi16 (r16_1,  r16_2);
+}
+
+_NEON2SSESTORAGE uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
+#define vmulq_u16 _mm_mullo_epi16
+
+_NEON2SSESTORAGE uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
+#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
+
+_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
+_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
+{
+    //may be optimized
+    __m128i c1, res, tmp, bmasked;
+    int i;
+    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7); //0x1
+    bmasked = _mm_and_si128(b, c1); //0x1
+    res = vmulq_u8(a, bmasked);
+    for(i = 1; i<8; i++) {
+        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b, c1); //0x1
+        tmp = vmulq_u8(a, bmasked);
+        res = _mm_xor_si128(res, tmp);
+    }
+    return res;
+}
+
+//************************* Vector long multiply ***********************************
+//****************************************************************************
+_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
+{
+    //no 8 bit simd multiply, need to go to 16 bits
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
+    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
+}
+
+_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
+{
+#ifdef USE_SSE4
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
+    return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
+#else
+    __m128i low, hi, a128,b128;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    low =  _mm_mullo_epi16(a128,b128);
+    hi =   _mm_mulhi_epi16(a128,b128);
+    return _mm_unpacklo_epi16(low,hi);
+#endif
+}
+
+_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
+{
+    __m128i ab, ba, a128, b128;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
+    return _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+}
+
+_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
+{
+    //no 8 bit simd multiply, need to go to 16 bits
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
+    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
+}
+
+_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
+{
+#ifdef USE_SSE4
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
+    return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
+#else
+    __m128i a128,b128,low, hi;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    low =  _mm_mullo_epi16(a128,b128);
+    hi =   _mm_mulhi_epu16(a128,b128);
+    return _mm_unpacklo_epi16(low,hi);
+#endif
+}
+
+_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
+{
+    ///may be not optimal compared with serial implementation
+    __m128i ab, ba, a128, b128;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
+    return _mm_mul_epu32 (ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+}
+
+_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
+_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
+{
+    //may be optimized
+    __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
+    int i;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7); //0x1
+    bmasked = _mm_and_si128(b128, c1); //0x1
+
+    a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
+    bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
+    res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
+    for(i = 1; i<8; i++) {
+        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b128, c1); //0x1
+        bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
+        tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
+        res = _mm_xor_si128(res, tmp);
+    }
+    return res;
+}
+
+//****************Vector saturating doubling long multiply **************************
+//*****************************************************************
+_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
+{
+    //the serial soulution may be faster due to saturation
+    __m128i res;
+    res = vmull_s16(a, b);
+    return vqd_s32(res);
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //the serial soulution may be faster due to saturation
+    __m128i res;
+    res = vmull_s32(a,b);
+    return vqaddq_s64(res,res); //slow serial function!!!!
+}
+
+//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
+//******************************************************************************************
+_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
+{
+    // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
+    int8x8_t res64;
+    __m128i b128, c128, res;
+    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+    c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
+    res = _mm_mullo_epi16 (c128, b128);
+    res  =  _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
+    res  = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
+{
+    int16x4_t res64;
+    return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
+    res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+    //fma is coming soon, but right now:
+    __m128 res;
+    __m64_128 res64;
+    res = _mm_mul_ps (_pM128(c), _pM128(b));
+    res = _mm_add_ps (_pM128(a), res);
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
+{
+    // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
+    uint8x8_t res64;
+    __m128i mask, b128, c128, res;
+    mask = _mm_set1_epi16(0xff);
+    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+    c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
+    res = _mm_mullo_epi16 (c128, b128);
+    res = _mm_and_si128(res, mask); //to avoid saturation
+    res = _mm_packus_epi16 (res, res);
+    res =  _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
+#define vmla_u16 vmla_s16
+
+_NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
+#define vmla_u32 vmla_s32
+
+_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
+{
+    //solution may be not optimal
+    // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2,r16_2;
+    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
+    r16_1 = _mm_add_epi8 (r16_1, a);
+    //swap hi and low part of a, b and c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_add_epi8(r16_2, a_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+
+_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
+{
+    __m128i res;
+    res = _mm_mullo_epi16 (c, b);
+    return _mm_add_epi16 (res, a);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
+{
+    __m128i res;
+    res = _MM_MULLO_EPI32 (c,  b); //SSE4.1
+    return _mm_add_epi32 (res, a);
+}
+
+_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
+{
+    //fma is coming soon, but right now:
+    __m128 res;
+    res = _mm_mul_ps (c, b);
+    return _mm_add_ps (a, res);
+}
+
+_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
+{
+    //solution may be not optimal
+    // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2, r16_2;
+    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
+    r16_1 = _mm_add_epi8 (r16_1, a);
+    //swap hi and low part of a, b and c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_add_epi8(r16_2, a_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
+#define vmlaq_u16 vmlaq_s16
+
+_NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
+#define vmlaq_u32 vmlaq_s32
+
+//**********************  Vector widening multiply accumulate (long multiply accumulate):
+//                          vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
+//********************************************************************************************
+_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
+{
+    int16x8_t res;
+    res = vmull_s8(b, c);
+    return _mm_add_epi16 (res, a);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int32x4_t res;
+    res = vmull_s16(b,  c);
+    return _mm_add_epi32 (res, a);
+}
+
+_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int64x2_t res;
+    res = vmull_s32( b, c);
+    return _mm_add_epi64 (res, a);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
+{
+    uint16x8_t res;
+    res = vmull_u8(b, c);
+    return _mm_add_epi16 (res, a);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    uint32x4_t res;
+    res = vmull_u16(b, c);
+    return _mm_add_epi32 (res, a);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int64x2_t res;
+    res = vmull_u32( b,c);
+    return _mm_add_epi64 (res, a);
+}
+
+//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
+//********************************************************************************************
+_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
+{
+    // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
+    int8x8_t res64;
+    __m128i res;
+    res64 = vmul_s8(b,c);
+    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
+{
+    int16x4_t res64;
+    return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
+    res =  _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+    __m128 res;
+    __m64_128 res64;
+    res = _mm_mul_ps (_pM128(c), _pM128(b));
+    res = _mm_sub_ps (_pM128(a), res);
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+    // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
+    uint8x8_t res64;
+    __m128i res;
+    res64 = vmul_u8(b,c);
+    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
+#define vmls_u16 vmls_s16
+
+_NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
+#define vmls_u32 vmls_s32
+
+
+_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
+{
+    //solution may be not optimal
+    // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2, r16_2;
+    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
+    r16_1 = _mm_sub_epi8 (a, r16_1);
+    //swap hi and low part of a, b, c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_sub_epi8 (a_2, r16_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+
+_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
+{
+    __m128i res;
+    res = _mm_mullo_epi16 (c, b);
+    return _mm_sub_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
+{
+    __m128i res;
+    res = _MM_MULLO_EPI32 (c, b); //SSE4.1
+    return _mm_sub_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
+{
+    __m128 res;
+    res = _mm_mul_ps (c, b);
+    return _mm_sub_ps (a, res);
+}
+
+_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
+{
+    //solution may be not optimal
+    // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2, r16_2;
+    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
+    r16_1 = _mm_sub_epi8 (a, r16_1);
+    //swap hi and low part of a, b and c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_sub_epi8(a_2, r16_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
+#define vmlsq_u16 vmlsq_s16
+
+_NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
+#define vmlsq_u32 vmlsq_s32
+
+//******************** Vector multiply subtract long (widening multiply subtract) ************************************
+//*************************************************************************************************************
+_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
+{
+    int16x8_t res;
+    res = vmull_s8(b, c);
+    return _mm_sub_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int32x4_t res;
+    res = vmull_s16(b,  c);
+    return _mm_sub_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int64x2_t res;
+    res = vmull_s32( b,c);
+    return _mm_sub_epi64 (a, res);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
+{
+    uint16x8_t res;
+    res = vmull_u8(b, c);
+    return _mm_sub_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    uint32x4_t res;
+    res = vmull_u16(b, c);
+    return _mm_sub_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int64x2_t res;
+    res = vmull_u32( b,c);
+    return _mm_sub_epi64 (a, res);
+}
+
+//******  Vector saturating doubling multiply high **********************
+//*************************************************************************
+_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b); // VQDMULH.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int16x4_t res;
+    int32_t a32, b32, i;
+    for (i = 0; i<4; i++) {
+        a32 = (int32_t) a.m64_i16[i];
+        b32 = (int32_t) b.m64_i16[i];
+        a32 = (a32 * b32) >> 15;
+        res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
+{
+    //may be not optimal compared with a serial solution
+    int32x2_t res64;
+    __m128i mask;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    int64x2_t mul;
+    mul = vmull_s32(a,b);
+    mul = _mm_slli_epi64(mul,1); //double the result
+    //at this point start treating 2 64-bit numbers as 4 32-bit
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+    return64(mul);
+}
+
+_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
+{
+    __m128i res, res_lo, mask;
+    _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
+    res = _mm_mulhi_epi16 (a, b);
+    res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
+    res_lo = _mm_mullo_epi16 (a, b);
+    res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
+    res = _mm_add_epi16(res, res_lo); //combine results
+    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
+    return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
+}
+
+_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
+    __m128i ab, ba, mask, mul, mul1;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
+    mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul = _mm_slli_epi64(mul,1); //double the result
+    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
+    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
+    mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul1 = _mm_slli_epi64(mul1,1); //double the result
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+    mul = _mm_unpacklo_epi64(mul, mul1);
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+}
+
+//********* Vector saturating rounding doubling multiply high ****************
+//****************************************************************************
+//If use _mm_mulhrs_xx functions  the result may differ from NEON one a little  due to different rounding rules and order
+_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b); // VQRDMULH.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //may be not optimal compared with a serial solution
+    int32x2_t res64;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    __m128i res_sat, mask, mask1;
+    int64x2_t mul;
+    mul = vmull_s32(a,b);
+    res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
+    mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
+    mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
+    //at this point start treating 2 64-bit numbers as 4 32-bit
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+    return64(mul);
+}
+
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
+{
+    __m128i mask, res;
+    _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
+    res = _mm_mulhrs_epi16 (a, b);
+    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
+    return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
+}
+
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
+    __m128i ab, ba,  mask, mul, mul1, mask1;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
+    mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
+    mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
+    mul = _mm_add_epi32 (mul, mask1); //actual rounding
+
+    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
+    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
+    mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
+    mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
+    mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
+    //at this point start treating 2 64-bit numbers as 4 32-bit
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+    mul = _mm_unpacklo_epi64(mul, mul1);
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+}
+
+//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
+//*************************************************************************************************************************
+_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
+{
+    //not optimal SIMD soulution, serial may be faster
+    __m128i res32;
+    res32 = vmull_s16(b,  c);
+    res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
+    return vqaddq_s32(res32, a); //saturation
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res64;
+    res64 = vmull_s32(b,c);
+    res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1);
+    return vqaddq_s64(res64, a); //saturation
+}
+
+//************************************************************************************
+//******************  Vector subtract ***********************************************
+//************************************************************************************
+_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res64;
+    res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
+    return res64;
+}
+
+
+_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
+{
+    float32x2_t res;
+    res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
+    res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
+#define vsub_u8 vsub_s8
+
+_NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
+#define vsub_u16 vsub_s16
+
+_NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
+#define vsub_u32 vsub_s32
+
+
+_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b)
+{
+    int64x1_t res64;
+    res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
+    return res64;
+}
+
+
+_NEON2SSESTORAGE int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
+#define vsubq_s8 _mm_sub_epi8
+
+_NEON2SSESTORAGE int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
+#define vsubq_s16 _mm_sub_epi16
+
+_NEON2SSESTORAGE int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
+#define vsubq_s32 _mm_sub_epi32
+
+_NEON2SSESTORAGE int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
+#define vsubq_s64 _mm_sub_epi64
+
+_NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
+#define vsubq_f32 _mm_sub_ps
+
+_NEON2SSESTORAGE uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
+#define vsubq_u8 _mm_sub_epi8
+
+_NEON2SSESTORAGE uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
+#define vsubq_u16 _mm_sub_epi16
+
+_NEON2SSESTORAGE uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
+#define vsubq_u32 _mm_sub_epi32
+
+_NEON2SSESTORAGE uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
+#define vsubq_u64 _mm_sub_epi64
+
+//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
+//***********************************************************************************
+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a16, b16);
+}
+
+_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a32, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
+{
+    //may be not optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi64 (a64, b64);
+}
+
+_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a16, b16);
+}
+
+_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a32, b32);
+}
+
+_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
+{
+    //may be not optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi64 (a64, b64);
+}
+
+//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
+//*****************************************************************************************************
+_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
+_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a, b16);
+}
+
+_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
+_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
+{
+    __m128i b32;
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
+_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_sub_epi64 (a, b64);
+}
+
+_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
+_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a, b16);
+}
+
+_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
+_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
+{
+    __m128i b32;
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a, b32);
+}
+
+_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
+_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_sub_epi64 (a, b64);
+}
+
+//************************Vector saturating subtract *********************************
+//*************************************************************************************
+_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b); // VQSUB.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution
+{
+    uint64x1_t res;
+    uint64_t a64,b64;
+    a64 = a.m64_u64[0];
+    b64 = b.m64_u64[0];
+    res.m64_u64[0] = a64 - b64;
+
+    a64 =  (a64 >> 63) + (~_SIGNBIT64);
+    if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
+        res.m64_u64[0] = a64;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b); // VQSUB.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint64x1_t res;
+    uint64_t a64, b64;
+    a64 = _Ui64(a);
+    b64 = _Ui64(b);
+    if (a64 > b64) {
+        res.m64_u64[0] = a64 - b64;
+    } else {
+        res.m64_u64[0] = 0;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
+#define vqsubq_s8 _mm_subs_epi8
+
+_NEON2SSESTORAGE int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
+#define vqsubq_s16 _mm_subs_epi16
+
+_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
+{
+    //no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a
+    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
+    c7fffffff = _mm_set1_epi32(0x7fffffff);
+    res = _mm_sub_epi32(a, b);
+    res_sat = _mm_srli_epi32(a, 31);
+    res_sat = _mm_add_epi32(res_sat, c7fffffff);
+    res_xor_a = _mm_xor_si128(res, a);
+    b_xor_a = _mm_xor_si128(b, a);
+    res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
+    res_sat = _mm_and_si128(res_xor_a, res_sat);
+    res = _mm_andnot_si128(res_xor_a, res);
+    return _mm_or_si128(res, res_sat);
+}
+
+_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution
+{
+    _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    _mm_store_si128((__m128i*)btmp, b);
+    res[0] = atmp[0] - btmp[0];
+    res[1] = atmp[1] - btmp[1];
+    if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
+        res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
+    }
+    if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
+        res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+_NEON2SSESTORAGE uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
+#define vqsubq_u8 _mm_subs_epu8
+
+_NEON2SSESTORAGE uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
+#define vqsubq_u16 _mm_subs_epu16
+
+_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
+{
+    __m128i min, mask, sub;
+    min = _MM_MIN_EPU32(a, b); //SSE4.1
+    mask = _mm_cmpeq_epi32 (min,  b);
+    sub = _mm_sub_epi32 (a, b);
+    return _mm_and_si128 ( sub, mask);
+}
+
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
+    {
+        __m128i c80000000, subb, suba, cmp, sub;
+        c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
+        sub  = _mm_sub_epi64 (a, b);
+        suba = _mm_sub_epi64 (a, c80000000);
+        subb = _mm_sub_epi64 (b, c80000000);
+        cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
+        return _mm_and_si128 (sub, cmp); //saturation
+    }
+#else
+    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+    {
+        _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
+        _mm_store_si128((__m128i*)atmp, a);
+        _mm_store_si128((__m128i*)btmp, b);
+        res[0] = (atmp[0] > btmp[0]) ? atmp[0] -  btmp[0] : 0;
+        res[1] = (atmp[1] > btmp[1]) ? atmp[1] -  btmp[1] : 0;
+        return _mm_load_si128((__m128i*)(res));
+    }
+#endif
+
+//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
+//****************************************************************
+_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
+{
+    //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit,
+    int8x8_t res64;
+    __m128i r16;
+    int8x8_t r;
+    r = vsub_s8 (a, b);
+    r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
+    r16 = _mm_srai_epi16 (r16, 1); //SSE2
+    r16 =  _mm_packs_epi16 (r16,r16); //use low 64 bits
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b); // VHSUB.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+
+_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b); // VHSUB.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b); // VHSUB.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b); // VHSUB.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b); // VHSUB.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
+    __m128i c128, au,bu;
+    c128 = _mm_set1_epi8(-128); //(int8_t)0x80
+    au = _mm_add_epi8( a, c128);
+    bu = _mm_add_epi8( b, c128);
+    return vhsubq_u8(au,bu);
+}
+
+_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
+    __m128i c8000, au,bu;
+    c8000 = _mm_set1_epi16(-32768); //(int16_t)0x8000
+    au = _mm_add_epi16( a, c8000);
+    bu = _mm_add_epi16( b, c8000);
+    return vhsubq_u16(au,bu);
+}
+
+_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
+    __m128i a2, b2,r, b_1;
+    a2 = _mm_srai_epi32 (a,1);
+    b2 = _mm_srai_epi32 (b,1);
+    r = _mm_sub_epi32 (a2, b2);
+    b_1 = _mm_andnot_si128(a, b); //!a and b
+    b_1 = _mm_slli_epi32 (b_1,31);
+    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
+    return _mm_sub_epi32(r,b_1);
+}
+
+_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
+{
+    __m128i avg;
+    avg = _mm_avg_epu8 (a, b);
+    return _mm_sub_epi8(a, avg);
+}
+
+_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
+{
+    __m128i avg;
+    avg = _mm_avg_epu16 (a, b);
+    return _mm_sub_epi16(a, avg);
+}
+
+_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
+    __m128i a2, b2,r, b_1;
+    a2 = _mm_srli_epi32 (a,1);
+    b2 = _mm_srli_epi32 (b,1);
+    r = _mm_sub_epi32 (a2, b2);
+    b_1 = _mm_andnot_si128(a, b); //!a and b
+    b_1 = _mm_slli_epi32 (b_1,31);
+    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
+    return _mm_sub_epi32(r,b_1);
+}
+
+//******* Vector subtract high half (truncated) ** ************
+//************************************************************
+_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sum, sum8;
+    sum = _mm_sub_epi16 (a, b);
+    sum8 = _mm_srai_epi16 (sum, 8);
+    sum8 = _mm_packs_epi16(sum8,sum8);
+    return64(sum8);
+}
+
+_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
+{
+    int16x4_t res64;
+    __m128i sum, sum16;
+    sum = _mm_sub_epi32 (a, b);
+    sum16 = _mm_srai_epi32 (sum, 16);
+    sum16 = _mm_packs_epi32(sum16,sum16);
+    return64(sum16);
+}
+
+_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
+{
+    int32x2_t res64;
+    __m128i sub;
+    sub = _mm_sub_epi64 (a, b);
+    sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
+    return64(sub);
+}
+
+_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sum, sum8;
+    sum = _mm_sub_epi16 (a, b);
+    sum8 = _mm_srli_epi16 (sum, 8);
+    sum8 =  _mm_packus_epi16(sum8,sum8);
+    return64(sum8);
+}
+
+_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
+{
+    uint16x4_t res64;
+     __m128i sum, sum16;
+    sum = _mm_sub_epi32 (a, b);
+    sum16 = _mm_srli_epi32 (sum, 16);
+#ifdef USE_SSE4
+    sum16 =  _MM_PACKUS1_EPI32(sum16);
+#else
+    sum16  = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
+#endif
+    return64(sum16);
+}
+
+_NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
+#define vsubhn_u64 vsubhn_s64
+
+//************ Vector rounding subtract high half *********************
+//*********************************************************************
+_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sub = _mm_srai_epi16 (sub, 8); //get high half
+    sub = _mm_add_epi16 (sub, mask1); //actual rounding
+    sub =  _mm_packs_epi16 (sub, sub);
+    return64(sub);
+}
+
+_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
+{
+    //SIMD may be not optimal, serial may be faster
+    int16x4_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sub = _mm_srai_epi32 (sub, 16); //get high half
+    sub = _mm_add_epi32 (sub, mask1); //actual rounding
+    sub = _mm_packs_epi32 (sub, sub);
+    return64(sub);
+}
+
+_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
+{
+    //SIMD may be not optimal, serial may be faster
+    int32x2_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi64 (a, b);
+    mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
+    sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
+    sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
+    return64(sub);
+}
+
+_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sub = _mm_srai_epi16 (sub, 8); //get high half
+    sub = _mm_add_epi16 (sub, mask1); //actual rounding
+    sub = _mm_packus_epi16 (sub, sub);
+    return64(sub);
+}
+
+_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
+{
+    //SIMD may be not optimal, serial may be faster
+    uint16x4_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sub = _mm_srai_epi32 (sub, 16); //get high half
+    sub = _mm_add_epi32 (sub, mask1); //actual rounding
+#ifdef USE_SSE4
+    sub =  _MM_PACKUS1_EPI32 (sub);
+#else
+    sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
+#endif
+    return64(sub);
+}
+
+_NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
+#define vrsubhn_u64 vrsubhn_s64
+
+//*********** Vector saturating doubling multiply subtract long ********************
+//************************************************************************************
+_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
+{
+    //not optimal SIMD soulution, serial may be faster
+    __m128i res32, mask;
+    int32x4_t res;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    res = vmull_s16(b,  c);
+    res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
+    mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
+    res32 = _mm_xor_si128 (res32,  mask); //res32 saturated for 0x80000000
+    return vqsubq_s32(a, res32); //saturation
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res64, mask;
+    int64x2_t res;
+    _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
+    res = vmull_s32(b,  c);
+    res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
+    mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
+    res64 = _mm_xor_si128 (res64,  mask); //res32 saturated for 0x80000000
+    return vqsubq_s64(a, res64); //saturation
+}
+
+//******************  COMPARISON ***************************************
+//******************* Vector compare equal *************************************
+//****************************************************************************
+_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
+    return64f(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t   vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
+#define vceq_p8 vceq_u8
+
+
+_NEON2SSESTORAGE uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
+#define vceqq_s8 _mm_cmpeq_epi8
+
+_NEON2SSESTORAGE uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
+#define vceqq_s16 _mm_cmpeq_epi16
+
+_NEON2SSESTORAGE uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
+#define vceqq_s32 _mm_cmpeq_epi32
+
+_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmpeq_ps(a,b);
+    return _M128i(res);
+}
+
+_NEON2SSESTORAGE uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
+#define vceqq_u8 _mm_cmpeq_epi8
+
+_NEON2SSESTORAGE uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
+#define vceqq_u16 _mm_cmpeq_epi16
+
+_NEON2SSESTORAGE uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
+#define vceqq_u32 _mm_cmpeq_epi32
+
+_NEON2SSESTORAGE uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
+#define vceqq_p8 _mm_cmpeq_epi8
+
+//******************Vector compare greater-than or equal*************************
+//*******************************************************************************
+//in IA SIMD no greater-than-or-equal comparison for integers,
+// there is greater-than available only, so we need the following tricks
+
+_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries
+    return64f(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b)
+{
+    //serial solution looks faster
+    uint32x2_t res64;
+    return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
+}
+
+
+
+_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
+{
+    __m128i m1, m2;
+    m1 = _mm_cmpgt_epi8 ( a, b);
+    m2 = _mm_cmpeq_epi8 ( a, b);
+    return _mm_or_si128  ( m1, m2);
+}
+
+_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
+{
+    __m128i m1, m2;
+    m1 = _mm_cmpgt_epi16 ( a, b);
+    m2 = _mm_cmpeq_epi16 ( a, b);
+    return _mm_or_si128   ( m1,m2);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
+{
+    __m128i m1, m2;
+    m1 = _mm_cmpgt_epi32 (a, b);
+    m2 = _mm_cmpeq_epi32 (a, b);
+    return _mm_or_si128   (m1, m2);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmpge_ps(a,b); //use only 2 first entries
+    return *(__m128i*)&res;
+}
+
+_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
+{
+    //no unsigned chars comparison, only signed available,so need the trick
+    __m128i cmp;
+    cmp = _mm_max_epu8(a, b);
+    return _mm_cmpeq_epi8(cmp, a); //a>=b
+}
+
+_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
+{
+    //no unsigned shorts comparison, only signed available,so need the trick
+#ifdef USE_SSE4
+    __m128i cmp;
+    cmp = _mm_max_epu16(a, b);
+    return _mm_cmpeq_epi16(cmp, a); //a>=b
+#else
+   __m128i zero = _mm_setzero_si128();
+   __m128i  as = _mm_subs_epu16(b, a);
+   return _mm_cmpeq_epi16(as, zero);  
+#endif
+}
+
+_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
+{
+    //no unsigned ints comparison, only signed available,so need the trick
+#ifdef USE_SSE4
+    __m128i cmp;
+    cmp = _mm_max_epu32(a, b);
+    return _mm_cmpeq_epi32(cmp, a); //a>=b
+#else
+    //serial solution may be faster
+    __m128i c80000000, as, bs, m1, m2;
+    c80000000 = _mm_set1_epi32 (0x80000000);
+    as = _mm_sub_epi32(a,c80000000);
+    bs = _mm_sub_epi32(b,c80000000);
+    m1 = _mm_cmpgt_epi32 (as, bs);
+    m2 = _mm_cmpeq_epi32 (as, bs);
+    return _mm_or_si128 ( m1,  m2);
+#endif
+}
+
+//**********************Vector compare less-than or equal******************************
+//***************************************************************************************
+//in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks
+
+_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vcleq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vcleq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vcleq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
+_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmple_ps(_pM128(a),_pM128(b));
+    return64f(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
+#define vcle_u8(a,b) vcge_u8(b,a)
+
+
+_NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
+#define vcle_u16(a,b) vcge_u16(b,a)
+
+
+_NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
+#define vcle_u32(a,b) vcge_u32(b,a)
+
+_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
+    res = _mm_cmpgt_epi8 ( a,  b);
+    return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
+}
+
+_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
+    res = _mm_cmpgt_epi16 ( a,  b);
+    return _mm_andnot_si128 (res, c1);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
+    res = _mm_cmpgt_epi32 ( a,  b);
+    return _mm_andnot_si128 (res, c1);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmple_ps(a,b);
+    return *(__m128i*)&res;
+}
+
+_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
+    {
+        //no unsigned chars comparison in SSE, only signed available,so need the trick
+        __m128i cmp;
+        cmp = _mm_min_epu8(a, b);
+        return _mm_cmpeq_epi8(cmp, a); //a<=b
+    }
+#else
+#   define vcleq_u8(a,b) vcgeq_u8(b,a)
+#endif
+
+
+_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
+    {
+        //no unsigned shorts comparison in SSE, only signed available,so need the trick
+        __m128i cmp;
+        cmp = _mm_min_epu16(a, b);
+        return _mm_cmpeq_epi16(cmp, a); //a<=b
+    }
+#else
+#   define vcleq_u16(a,b) vcgeq_u16(b,a)
+#endif
+
+
+_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
+    {
+        //no unsigned chars comparison in SSE, only signed available,so need the trick
+        __m128i cmp;
+        cmp = _mm_min_epu32(a, b);
+        return _mm_cmpeq_epi32(cmp, a); //a<=b
+    }
+#else
+//solution may be not optimal compared with the serial one
+#   define vcleq_u32(a,b) vcgeq_u32(b,a)
+#endif
+
+
+//****** Vector compare greater-than ******************************************
+//**************************************************************************
+_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries
+    return64f(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b); // VCGT.U8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b); // VCGT.s16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b); // VCGT.U32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+#define vcgtq_s8 _mm_cmpgt_epi8
+
+_NEON2SSESTORAGE uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+#define vcgtq_s16 _mm_cmpgt_epi16
+
+_NEON2SSESTORAGE uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+#define vcgtq_s32 _mm_cmpgt_epi32
+
+_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmpgt_ps(a,b); //use only 2 first entries
+    return *(__m128i*)&res;
+}
+
+_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
+{
+      //no unsigned chars comparison, only signed available,so need the trick
+        __m128i c128, as, bs;
+        c128 = _mm_set1_epi8(0x80);
+        as = _mm_sub_epi8(a, c128);
+        bs = _mm_sub_epi8(b, c128);
+        return _mm_cmpgt_epi8(as, bs);
+}
+
+_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
+{   
+    //no unsigned short comparison, only signed available,so need the trick
+    __m128i c8000, as, bs;
+    c8000 = _mm_set1_epi16(0x8000);
+    as = _mm_sub_epi16(a, c8000);
+    bs = _mm_sub_epi16(b, c8000);
+    return _mm_cmpgt_epi16(as, bs);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
+{
+    //no unsigned int comparison, only signed available,so need the trick
+    __m128i c80000000, as, bs;
+    c80000000 = _mm_set1_epi32 (0x80000000);
+    as = _mm_sub_epi32(a,c80000000);
+    bs = _mm_sub_epi32(b,c80000000);
+    return _mm_cmpgt_epi32 ( as, bs);
+}
+
+//********************* Vector compare less-than **************************
+//*************************************************************************
+_NEON2SSESTORAGE uint8x8_t   vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
+
+
+_NEON2SSESTORAGE uint16x4_t   vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
+
+
+_NEON2SSESTORAGE uint32x2_t   vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+#define vclt_s32(a,b)  vcgt_s32(b,a) //swap the arguments!!
+
+
+_NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
+#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
+#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
+
+_NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
+
+//*****************Vector compare absolute greater-than or equal ************
+//***************************************************************************
+_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmpge_ps ( a0, b0);
+    return64f(a0);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmpge_ps ( a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//********Vector compare absolute less-than or equal ******************
+//********************************************************************
+_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmple_ps (a0, b0);
+    return64f(a0);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmple_ps (a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//********  Vector compare absolute greater-than    ******************
+//******************************************************************
+_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmpgt_ps (a0, b0);
+    return64f(a0);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmpgt_ps (a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//***************Vector compare absolute less-than  ***********************
+//*************************************************************************
+_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmplt_ps (a0, b0);
+    return64f(a0);
+}
+
+_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmplt_ps (a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//*************************Vector test bits************************************
+//*****************************************************************************
+/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
+with the corresponding element of a second vector. If the result is not zero, the
+corresponding element in the destination vector is set to all ones. Otherwise, it is set to
+all zeros. */
+
+_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vtstq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b); // VTST.16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vtstq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b); // VTST.32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vtstq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a,  uint8x8_t b); // VTST.8 d0, d0, d0
+#define vtst_u8 vtst_s8
+
+_NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a,  uint16x4_t b); // VTST.16 d0, d0, d0
+#define vtst_u16 vtst_s16
+
+_NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a,  uint32x2_t b); // VTST.32 d0, d0, d0
+#define vtst_u32 vtst_s32
+
+
+_NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
+#define vtst_p8 vtst_u8
+
+_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
+{
+    __m128i zero, one, res;
+    zero = _mm_setzero_si128 ();
+    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
+    res = _mm_and_si128 (a, b);
+    res =  _mm_cmpeq_epi8 (res, zero);
+    return _mm_xor_si128(res, one); //invert result
+}
+
+_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
+{
+    __m128i zero, one, res;
+    zero = _mm_setzero_si128 ();
+    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
+    res = _mm_and_si128 (a, b);
+    res =  _mm_cmpeq_epi16 (res, zero);
+    return _mm_xor_si128(res, one); //invert result
+}
+
+_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
+{
+    __m128i zero, one, res;
+    zero = _mm_setzero_si128 ();
+    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
+    res = _mm_and_si128 (a, b);
+    res =  _mm_cmpeq_epi32 (res, zero);
+    return _mm_xor_si128(res, one); //invert result
+}
+
+_NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
+#define vtstq_u8 vtstq_s8
+
+_NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
+#define vtstq_u16 vtstq_s16
+
+_NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
+#define vtstq_u32 vtstq_s32
+
+_NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
+#define vtstq_p8 vtstq_u8
+
+//****************** Absolute difference ********************
+//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
+//************************************************************
+_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b); // VABD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vabdq_s8(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b); // VABD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vabdq_s16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b); // VABD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b)
+{//need to deal with an intermediate overflow
+    int32x2_t res;
+    res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] -  b.m64_i32[0]: b.m64_i32[0] -  a.m64_i32[0];
+    res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] -  b.m64_i32[1]: b.m64_i32[1] -  a.m64_i32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b); // VABD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vabdq_u8(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b); // VABD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vabdq_u16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b); // VABD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vabdq_u32(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = vabdq_f32(_pM128(a), _pM128(b));
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
+{ //need to deal with an intermediate overflow
+   __m128i cmp, difab, difba;
+   cmp = vcgtq_s8(a,b);
+   difab = _mm_sub_epi8(a,b);
+   difba = _mm_sub_epi8(b,a);
+   difab = _mm_and_si128(cmp, difab);
+   difba = _mm_andnot_si128(cmp, difba);
+   return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
+{//need to deal with an intermediate overflow
+    __m128i cmp, difab, difba;
+    cmp = vcgtq_s16(a,b);
+    difab = _mm_sub_epi16(a,b);
+    difba = _mm_sub_epi16 (b,a);
+    difab = _mm_and_si128(cmp, difab);
+    difba = _mm_andnot_si128(cmp, difba);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
+{//need to deal with an intermediate overflow
+    __m128i cmp, difab, difba;
+    cmp = vcgtq_s32(a,b);
+    difab = _mm_sub_epi32(a,b);
+    difba = _mm_sub_epi32(b,a);
+    difab = _mm_and_si128(cmp, difab);
+    difba = _mm_andnot_si128(cmp, difba);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
+{
+    __m128i  difab, difba;
+    difab = _mm_subs_epu8(a,b);
+    difba = _mm_subs_epu8 (b,a);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
+{
+    __m128i difab, difba;
+    difab = _mm_subs_epu16(a,b);
+    difba = _mm_subs_epu16 (b,a);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
+{
+    __m128i cmp, difab, difba;
+    cmp = vcgtq_u32(a,b);
+    difab = _mm_sub_epi32(a,b);
+    difba = _mm_sub_epi32 (b,a);
+    difab = _mm_and_si128(cmp, difab);
+    difba = _mm_andnot_si128(cmp, difba);
+    return _mm_or_si128(difab, difba);
+}
+
+_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
+{
+    __m128i c1;
+    __m128 res;
+    c1 =  _mm_set1_epi32(0x7fffffff);
+    res = _mm_sub_ps (a, b);
+    return _mm_and_ps (res, *(__m128*)&c1);
+}
+
+//************  Absolute difference - long **************************
+//********************************************************************
+_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return vabdq_s16(a16, b16);
+
+}
+
+_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+    return vabdq_s32(a32, b32);
+}
+
+_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //no optimal SIMD solution, serial looks faster
+    _NEON2SSE_ALIGN_16 int64_t res[2];
+    if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
+    else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
+    if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
+    else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
+    return _mm_load_si128((__m128i*)res);
+}
+
+_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
+{
+    __m128i res;
+    res = vsubl_u8(a,b);
+    return _mm_abs_epi16(res);
+}
+
+_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
+{
+    __m128i res;
+    res = vsubl_u16(a,b);
+    return _mm_abs_epi32(res);
+}
+
+_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
+    else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
+    if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
+    else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
+    return _mm_load_si128((__m128i*)res);
+}
+
+//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
+//*********************************************************************************************
+_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c)
+{
+    int8x8_t res64;
+    return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
+{
+    int16x4_t res64;
+    return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c)
+{
+    int32x2_t res64;
+    return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c)
+{
+    int8x8_t res64;
+    return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c)
+{
+    int16x4_t res64;
+    return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c)
+{
+    uint32x2_t res64;
+    return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
+{
+    int8x16_t sub;
+    sub = vabdq_s8(b, c);
+    return vaddq_s8( a, sub);
+}
+
+_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
+{
+    int16x8_t sub;
+    sub = vabdq_s16(b, c);
+    return vaddq_s16( a, sub);
+}
+
+_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
+{
+    int32x4_t sub;
+    sub = vabdq_s32(b, c);
+    return vaddq_s32( a, sub);
+}
+
+_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
+{
+    uint8x16_t sub;
+    sub = vabdq_u8(b, c);
+    return vaddq_u8( a, sub);
+}
+
+_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
+{
+    uint16x8_t sub;
+    sub = vabdq_u16(b, c);
+    return vaddq_u16( a, sub);
+}
+
+_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
+{
+    uint32x4_t sub;
+    sub = vabdq_u32(b, c);
+    return vaddq_u32( a, sub);
+}
+
+//************** Absolute difference and accumulate - long ********************************
+//*************************************************************************************
+_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
+{
+    __m128i b16, c16, res;
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
+    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
+    return _mm_add_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
+{
+    __m128i b32, c32, res;
+    b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
+    c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
+    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
+    return _mm_add_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res;
+    res = vabdl_s32(b,c);
+    return _mm_add_epi64(a, res);
+}
+
+_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
+{
+    __m128i b16, c16, res;
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+    c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
+    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
+    return _mm_add_epi16 (a, res);
+}
+
+_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
+{
+    __m128i b32, c32, res;
+    b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
+    c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
+    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
+    return _mm_add_epi32 (a, res);
+}
+
+_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res;
+    res = vabdl_u32(b,c);
+    return _mm_add_epi64(a, res);
+}
+
+//***********************************************************************************
+//****************  Maximum and minimum operations **********************************
+//***********************************************************************************
+//************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
+//***********************************************************************************
+_NEON2SSESTORAGE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
+}
+
+_NEON2SSESTORAGE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    __m128i res;
+    res =  _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i res;
+    res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
+{
+    //serial solution looks faster than  SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
+    res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
+#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
+
+_NEON2SSESTORAGE int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
+#define vmaxq_s16 _mm_max_epi16
+
+_NEON2SSESTORAGE int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
+#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
+
+_NEON2SSESTORAGE uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
+#define vmaxq_u8 _mm_max_epu8
+
+_NEON2SSESTORAGE uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
+#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
+
+_NEON2SSESTORAGE uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
+#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
+
+
+_NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
+#define vmaxq_f32 _mm_max_ps
+
+
+_NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
+#define vmaxq_f64 _mm_max_pd
+
+
+//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
+//***********************************************************************************************************
+_NEON2SSESTORAGE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i res;
+    res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
+{
+    //serial solution looks faster than  SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
+    res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE int8x16_t   vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
+#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
+
+_NEON2SSESTORAGE int16x8_t   vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
+#define vminq_s16 _mm_min_epi16
+
+_NEON2SSESTORAGE int32x4_t   vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
+#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
+
+_NEON2SSESTORAGE uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
+#define vminq_u8 _mm_min_epu8
+
+_NEON2SSESTORAGE uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
+#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
+
+_NEON2SSESTORAGE uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
+#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
+
+_NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
+#define vminq_f32 _mm_min_ps
+
+
+_NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
+#define vminq_f64 _mm_min_pd
+
+
+//*************  Pairwise addition operations. **************************************
+//************************************************************************************
+//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
+_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit and then pack
+    int8x8_t res64;
+    __m128i a16, b16, res;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
+    res = _mm_hadd_epi16 (a16, b16);
+    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    __m128i hadd128;
+    hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
+    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(hadd128);
+}
+
+
+_NEON2SSESTORAGE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    __m128i hadd128;
+    hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
+    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(hadd128);
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
+{
+    //  no 8 bit hadd in IA32, need to go to 16 bit and then pack
+    uint8x8_t res64;
+//  no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work
+    __m128i mask8, a16, b16, res;
+    mask8 = _mm_set1_epi16(0xff);
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
+    res = _mm_hadd_epi16 (a16, b16);
+    res = _mm_and_si128(res, mask8); //to avoid saturation
+    res = _mm_packus_epi16 (res,res); //use low 64 bits
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
+{
+    // solution may be not optimal, serial execution may be faster
+    // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
+    uint16x4_t res64;
+    __m128i c32767,  cfffe, as, bs, res;
+    c32767 = _mm_set1_epi16 (32767);
+    cfffe = _mm_set1_epi16 (-2); //(int16_t)0xfffe
+    as = _mm_sub_epi16 (_pM128i(a), c32767);
+    bs = _mm_sub_epi16 (_pM128i(b), c32767);
+    res = _mm_hadd_epi16 (as, bs);
+    res = _mm_add_epi16 (res, cfffe);
+    res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
+{
+    //hadd doesn't work for unsigned values
+    uint32x2_t res64;
+    __m128i ab, ab_sh, res;
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
+    ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
+    res = _mm_add_epi32(ab, ab_sh);
+    res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
+{
+    __m128 hadd128;
+    __m64_128 res64;
+    hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
+    hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
+    _M64f(res64, hadd128);
+    return res64;
+}
+
+
+//**************************  Long pairwise add  **********************************
+//*********************************************************************************
+//Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width,
+// and places the final results in the destination vector.
+
+_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
+_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit anyway
+    __m128i a16;
+    int16x4_t res64;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+    a16 = _mm_hadd_epi16 (a16,  a16); //use low 64 bits
+    return64(a16);
+}
+
+_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
+_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
+{
+    // solution may be not optimal, serial execution may be faster
+    int32x2_t res64;
+    __m128i r32_1;
+    r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
+    r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
+    return64(r32_1);
+}
+
+_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
+{
+    int64x1_t res;
+    res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
+_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
+{
+    //  no 8 bit hadd in IA32, need to go to 16 bit
+//  no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work
+    uint16x4_t res64;
+    __m128i a16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
+    a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
+    return64(a16);
+}
+
+_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than a SIMD one
+    uint32x2_t res;
+    res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
+    res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
+    return res;
+}
+
+_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
+{
+    uint64x1_t res;
+    res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit
+    __m128i r16_1, r16_2;
+    r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
+    //swap hi and low part of r to process the remaining data
+    r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
+    return _mm_hadd_epi16 (r16_1, r16_2);
+}
+
+_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit
+    __m128i r32_1, r32_2;
+    r32_1 = _MM_CVTEPI16_EPI32(a);
+    //swap hi and low part of r to process the remaining data
+    r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
+    return _mm_hadd_epi32 (r32_1, r32_2);
+}
+
+_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
+_NEON2SSE_INLINE int64x2_t vpaddlq_s32(int32x4_t a)
+{
+    __m128i top, bot;
+    bot = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+    bot = _MM_CVTEPI32_EPI64(bot);
+    top = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 1));
+    top = _MM_CVTEPI32_EPI64(top);
+    return _mm_add_epi64(top, bot);
+}
+
+_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
+{
+    const __m128i ff = _mm_set1_epi16(0xFF);
+    __m128i low = _mm_and_si128(a, ff);
+    __m128i high = _mm_srli_epi16(a, 8);
+    return _mm_add_epi16(low, high);
+}
+
+#ifdef USE_SSE4
+_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
+_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
+{
+    const __m128i zero = _mm_setzero_si128();
+    __m128i low = _mm_blend_epi16(zero, a, 0x55); // 0b1010101
+    __m128i high = _mm_srli_epi32(a, 16);
+    return _mm_add_epi32(low, high);
+}
+
+_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
+_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
+{
+    const __m128i zero = _mm_setzero_si128();
+    __m128i low = _mm_blend_epi16(zero, a, 0x33); // 0b00110011
+    __m128i high = _mm_srli_epi64(a, 32);
+    return _mm_add_epi64(low, high);
+}
+#else
+_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
+_NEON2SSE_INLINE uint32x4_t vpaddlq_u16(uint16x8_t a)
+{
+    const __m128i ff = _mm_set1_epi32(0xFFFF);
+    __m128i low = _mm_and_si128(a, ff);
+    __m128i high = _mm_srli_epi32(a, 16);
+    return _mm_add_epi32(low, high);
+}
+
+_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
+_NEON2SSE_INLINE uint64x2_t vpaddlq_u32(uint32x4_t a)
+{
+    const __m128i ff = _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF);
+    __m128i low = _mm_and_si128(a, ff);
+    __m128i high = _mm_srli_epi64(a, 32);
+    return _mm_add_epi64(low, high);
+}
+#endif
+
+//************************  Long pairwise add and accumulate **************************
+//****************************************************************************************
+//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
+// and accumulates the  values of the results into the elements of the destination (wide) vector
+_NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b); // VPADAL.S8 d0,d0
+_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b)
+{
+    int16x4_t res64;
+    return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b); // VPADAL.S16 d0,d0
+_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b)
+{
+    int32x2_t res64;
+    return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
+_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b); // VPADAL.U8 d0,d0
+_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b)
+{
+    uint16x4_t res64;
+    return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b); // VPADAL.s16 d0,d0
+_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b)
+{
+    uint32x2_t res64;
+    return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
+}
+
+_NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
+_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
+    return res;
+}
+
+_NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
+{
+    int16x8_t pad;
+    pad = vpaddlq_s8(b);
+    return _mm_add_epi16 (a, pad);
+}
+
+_NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
+{
+    int32x4_t pad;
+    pad = vpaddlq_s16(b);
+    return _mm_add_epi32(a, pad);
+}
+
+_NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
+_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
+{
+    int64x2_t pad;
+    pad = vpaddlq_s32(b);
+    return _mm_add_epi64 (a, pad);
+}
+
+_NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
+{
+    uint16x8_t pad;
+    pad = vpaddlq_u8(b);
+    return _mm_add_epi16 (a, pad);
+}
+
+_NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
+_NEON2SSE_INLINE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b)
+{
+    uint32x4_t pad;
+    pad = vpaddlq_u16(b);
+    return _mm_add_epi32(a, pad);
+} //no optimal SIMD solution, serial is faster
+
+_NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
+_NEON2SSE_INLINE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b)
+{
+    uint64x2_t pad;
+    pad = vpaddlq_u32(b);
+    return _mm_add_epi64(a, pad);
+}
+
+//**********  Folding maximum   *************************************
+//*******************************************************************
+//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
+//and copies the larger of each pair into the corresponding element in the destination
+//    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
+_NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
+{
+    int8x8_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding
+    max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
+    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(max); //we need 64 bits only
+}
+
+_NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
+{
+    //solution may be not optimal compared with the serial one
+    int16x4_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
+    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
+    max = _mm_max_epi16 (ab, ab1);
+    max =  _mm_shuffle_epi8 (max, *(__m128i*)  mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
+    return64(max);
+}
+
+_NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    int32x2_t res;
+    res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
+    res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
+{
+    uint8x8_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding
+    max = _mm_max_epu8 (ab, ab1); // SSE4.1
+    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(max);
+}
+
+_NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
+{
+    //solution may be not optimal compared with the serial one
+    uint16x4_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
+    max = _MM_MAX_EPU16 (ab, ab1);
+    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
+    return64(max);
+}
+
+_NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    uint32x2_t res;
+    res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
+    res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than  SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
+    res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
+    return res;
+}
+
+// ***************** Folding minimum  ****************************
+// **************************************************************
+//vpmin -> takes minimum of adjacent pairs
+_NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
+{
+    int8x8_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical min finding
+    min =  _MM_MIN_EPI8 (ab, ab1); // SSE4.1
+    min =  _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(min);
+}
+
+_NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
+{
+    //solution may be not optimal compared with the serial one
+    int16x4_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
+    ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
+    min = _mm_min_epi16 (ab, ab1);
+    min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
+    return64(min);
+}
+
+_NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    int32x2_t res;
+    res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
+    res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
+{
+    uint8x8_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding
+    min = _mm_min_epu8 (ab, ab1); // SSE4.1
+    min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(min);
+}
+
+_NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
+{
+    //solution may be not optimal compared with the serial one
+    uint16x4_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
+    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
+    min = _MM_MIN_EPU16 (ab, ab1);
+    min =    _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
+    return64(min);
+}
+
+_NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    uint32x2_t res;
+    res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
+    res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
+    res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
+    return res;
+}
+
+//***************************************************************
+//***********  Reciprocal/Sqrt ************************************
+//***************************************************************
+//****************** Reciprocal estimate *******************************
+//the ARM NEON and x86 SIMD results may be slightly different
+_NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = _mm_rcp_ps(_pM128(a));
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //Input is  fixed point number!!! No reciprocal for ints in IA32 available
+    uint32x2_t res;
+    float resf, r;
+    int i, q, s;
+    for (i =0; i<2; i++){
+        if((a.m64_u32[i] & 0x80000000) == 0) {
+            res.m64_u32[i] = 0xffffffff;
+        }else{
+            resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
+            q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
+            r = (float)(1.0f / (((float)q + 0.5f) / 512.0f)); /* reciprocal r */
+            s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
+            r =  (float)s / 256.0f;
+            res.m64_u32[i] = (uint32_t)(r * (uint32_t)(1 << 31));
+        }
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
+#define vrecpeq_f32 _mm_rcp_ps
+
+
+_NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //Input is  fixed point number!!!
+    //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
+    _NEON2SSE_ALIGN_16 uint32_t atmp[4];
+    _NEON2SSE_ALIGN_16 uint32_t res[4];
+    _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
+    float resf, r;
+    int i, q, s;
+    __m128i res128, mask, zero;
+    _mm_store_si128((__m128i*)atmp, a);
+    zero = _mm_setzero_si128();
+    for (i =0; i<4; i++){
+        resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31)));  //  2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
+        q = (int)(resf * 512.0f); /* a in units of 1/512 rounded down */
+        r = 1.0f / (((float)q + 0.5f) / 512.0f); /* reciprocal r */
+        s = (int)(256.0f * r + 0.5f); /* r in units of 1/256 rounded to nearest */
+        r =  (float)s / 256.0f;
+        res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
+    }
+    res128 = _mm_load_si128((__m128i*)res);
+    mask = _mm_and_si128(a, *(__m128i*)c80000000);
+    mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x7fffffff
+    return _mm_or_si128(res128, mask);
+}
+
+//**********Reciprocal square root estimate ****************
+//**********************************************************
+//no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster
+//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers
+////the ARM NEON and x86 SIMD results may be slightly different
+_NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = _mm_rsqrt_ps(_pM128(a));
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+  // Input is  fixed point number!!!
+  // We implement the recip_sqrt_estimate function as described in ARMv7
+  // reference manual (VRSQRTE instruction) But results may be slightly different
+  // from ARM implementation due to _mm_rsqrt_ps precision
+  uint32x2_t res;
+  __m64_128 res64[2];
+  int i;
+  _NEON2SSE_ALIGN_16 float coeff[2];
+  for (i = 0; i < 2; i++) {
+    // Generate double-precision value = operand * 2^(-32). This has zero sign
+    // bit, with:
+    //     exponent = 1022 or 1021 = double-precision representation of 2^(-1)
+    //     or 2^(-2) fraction taken from operand, excluding its most significant
+    //     one or two bits.
+    uint64_t dp_operand;
+    if (a.m64_u32[i] & 0x80000000) {
+      dp_operand =
+          (0x3feLL << 52) | (((uint64_t)a.m64_u32[i] & 0x7FFFFFFF) << 21);
+    } else {
+      dp_operand =
+          (0x3fdLL << 52) | (((uint64_t)a.m64_u32[i] & 0x3FFFFFFF) << 22);
+    }
+	res64[i].m64_u64[0] = dp_operand;
+	coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0 : 256.0; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
+  }
+  __m128 coeff_f = _mm_load_ps(coeff);
+  __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
+  __m128i q0_i = _mm_cvttpd_epi32(q0_d);
+  __m128 c05_f = _mm_set1_ps(0.5);
+  __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
+  __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
+  __m128 c256_f = _mm_set1_ps(256.0);
+  __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
+#ifdef USE_SSE4
+  s_f = _mm_floor_ps(s_f);
+#else
+  s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
+#endif
+  s_f = _mm_div_ps(s_f, c256_f);
+  _M64f(res64[0], s_f);
+
+  for (i = 0; i < 2; i++) {
+    if ((a.m64_u32[i] & 0xc0000000) == 0) { // a <=0x3fffffff
+      res.m64_u32[i] = 0xffffffff;
+    } else {
+      res.m64_u32[i] = res64[0].m64_f32[i] * (((uint32_t)1) << 31);
+    }
+  }
+  return res;
+}
+
+_NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
+#define vrsqrteq_f32 _mm_rsqrt_ps
+
+_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+  // Input is  fixed point number!!!
+  // We implement the recip_sqrt_estimate function as described in ARMv7
+  // reference manual (VRSQRTE instruction) But results may be slightly different
+  // from ARM implementation due to _mm_rsqrt_ps precision
+  int i;
+  _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
+  _NEON2SSE_ALIGN_16 float coeff[4], rr[4];
+  __m64_128 res64[4];
+  _mm_store_si128((__m128i *)atmp, a);
+  for (i = 0; i < 4; i++) {
+    // Generate double-precision value = operand * 2^(-32). This has zero sign
+    // bit, with:
+    //     exponent = 1022 or 1021 = double-precision representation of 2^(-1)
+    //     or 2^(-2) fraction taken from operand, excluding its most significant
+    //     one or two bits.
+    uint64_t dp_operand;
+    if (atmp[i] & 0x80000000) {
+      dp_operand = (0x3feLL << 52) | (((uint64_t)atmp[i] & 0x7FFFFFFF) << 21);
+    } else {
+      dp_operand = (0x3fdLL << 52) | (((uint64_t)atmp[i] & 0x3FFFFFFF) << 22);
+    }
+	res64[i].m64_u64[0] = dp_operand;
+	coeff[i] = (res64[i].m64_d64[0] < 0.5) ? 512.0 : 256.0; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
+  }
+  __m128 c05_f = _mm_set1_ps(0.5);
+  __m128 coeff_f = _mm_load_ps(coeff);
+  __m128d q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[0].m64_d64[0]), _mm_cvtps_pd(coeff_f));
+  __m128i q0_i = _mm_cvttpd_epi32(q0_d);
+
+  __m128 coeff_f2 = _M128(_pM128i(coeff[2]));
+  q0_d = _mm_mul_pd(_mm_loadu_pd(&res64[2].m64_d64[0]), _mm_cvtps_pd(coeff_f2));
+  __m128i q0_i2 = _mm_cvttpd_epi32(q0_d);
+  coeff_f = _M128(_mm_unpacklo_epi64(_M128i(coeff_f), _M128i(coeff_f2)));
+  q0_i = _mm_unpacklo_epi64(q0_i, q0_i2);
+
+  __m128 r_f = _mm_div_ps(_mm_add_ps(_mm_cvtepi32_ps(q0_i), c05_f), coeff_f);
+  __m128 rsqrt_f = _mm_rsqrt_ps(r_f);
+  __m128 c256_f = _mm_set1_ps(256.0);
+  __m128 s_f = _mm_add_ps(_mm_mul_ps(rsqrt_f, c256_f), c05_f);
+#ifdef USE_SSE4
+  s_f = _mm_floor_ps(s_f);
+#else
+  s_f = _mm_cvtepi32_ps(_mm_cvttps_epi32(s_f));
+#endif
+  s_f = _mm_div_ps(s_f, c256_f);
+  _mm_store_ps(rr, s_f);
+
+  for (i = 0; i < 4; i++) {
+    if ((atmp[i] & 0xc0000000) == 0) { // a <=0x3fffffff
+      res[i] = 0xffffffff;
+    } else {
+      res[i] = rr[i] * (((uint32_t)1) << 31);
+    }
+  }
+  return _mm_load_si128((__m128i *)res);
+}
+
+//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
+//******************************************************************************************
+//******VRECPS (Vector Reciprocal Step) ***************************************************
+//multiplies the elements of one vector by the corresponding elements of another vector,
+//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
+
+_NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
+_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = vrecpsq_f32(_pM128(a), _pM128(b));
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
+{
+    __m128 f2, mul;
+    f2 =  _mm_set1_ps(2.);
+    mul = _mm_mul_ps(a,b);
+    return _mm_sub_ps(f2,mul);
+}
+
+//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
+//multiplies the elements of one vector by the corresponding elements of another vector,
+//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
+
+_NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
+_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
+{
+    float32x2_t res;
+    res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
+    res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
+    return res;
+}
+
+_NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
+{
+    __m128 f3, f05, mul;
+    f3 =  _mm_set1_ps(3.f);
+    f05 =  _mm_set1_ps(0.5f);
+    mul = _mm_mul_ps(a,b);
+    f3 = _mm_sub_ps(f3,mul);
+    return _mm_mul_ps (f3, f05);
+}
+//********************************************************************************************
+//***************************** Shifts by signed variable ***********************************
+//********************************************************************************************
+//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
+//********************************************************************************************
+//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
+//helper macro. It matches ARM implementation for big shifts
+#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
+        else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
+        int ## TYPE ## x ## LEN ## _t res;  int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
+        for (i = 0; i<LEN; i++) { \
+        if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
+        else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
+        return res;
+
+_NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(8, i, 8)
+}
+
+_NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(16, i, 4)
+}
+
+_NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(32, i, 2)
+}
+
+_NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(64, i, 1)
+}
+
+_NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(8, u, 8)
+}
+
+_NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(16, u, 4)
+}
+
+_NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(32, u, 2)
+}
+
+_NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing  for large numbers
+{
+    SERIAL_SHIFT_64(64, u, 1)
+}
+
+_NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int8_t, int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int16_t, int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int32_t, int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int64_t, int64_t, 2, 2)
+}
+
+_NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
+}
+
+
+//*********** Vector saturating shift left: (negative values shift right) **********************
+//********************************************************************************************
+//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
+#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
+        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if ((atmp[i] ==0)||(btmp[i] ==0)) res[i] = atmp[i]; \
+        else{ \
+            if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
+            else{ \
+                if (btmp[i]>lanesize_1) { \
+                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
+                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
+                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                    else res[i] = atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
+        TYPE lanesize = (sizeof(TYPE) << 3); \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if ((atmp[i] ==0)||(btmp[i] ==0)) { res[i] = atmp[i]; \
+        }else{ \
+            if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
+            else{ \
+                if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
+                else{ \
+                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
+                    res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
+        int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
+        int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
+        for (i = 0; i<LEN; i++) { \
+        if ((a.m64_i ## TYPE[i] == 0) ||(b.m64_i ## TYPE[i] == 0)) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i]; \
+        else{ \
+            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize_1) { \
+                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
+                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
+                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
+                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+        return res;
+
+#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
+        int ## TYPE ## x ## LEN ## _t res;  _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
+        int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
+        for (i = 0; i<LEN; i++) { \
+        if ((a.m64_u ## TYPE[i] == 0) ||(b.m64_u ## TYPE[i] == 0)) {res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i]; \
+        }else{ \
+            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
+                else{ \
+                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
+                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
+        return res;
+
+_NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
+}
+
+_NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
+}
+
+_NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
+}
+
+_NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
+}
+
+_NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
+}
+
+_NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
+}
+
+_NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
+}
+
+_NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
+}
+
+_NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
+}
+
+
+//******** Vector rounding shift left: (negative values shift right) **********
+//****************************************************************************
+//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
+//rounding makes sense for right shifts only.
+#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if( btmp[i] >= 0) { \
+            if(btmp[i] >= lanesize) res[i] = 0; \
+            else res[i] = (atmp[i] << btmp[i]); \
+        }else{ \
+            res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \
+                            (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
+                            (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
+        return _mm_load_si128((__m128i*)res);
+
+
+#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
+        int ## TYPE ## x ## LEN ## _t res;  int i;  int lanesize = sizeof(int ## TYPE ## _t) << 3; \
+        for (i = 0; i<LEN; i++) { \
+        if( b.m64_i ## TYPE[i] >= 0) { \
+            if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
+            else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
+        }else{ \
+            res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? res.m64_ ## SIGN ## TYPE[i] = 0 : \
+                            (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
+                            (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) );    }} \
+        return res;
+
+
+_NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(8,i,8)
+}
+
+_NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(16,i,4)
+}
+
+_NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(32,i,2)
+}
+
+_NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(64,i,1)
+}
+
+_NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(8,u,8)
+}
+
+_NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(16,u,4)
+}
+
+_NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(32,u,2)
+}
+
+_NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(64,u,1)
+}
+
+_NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
+}
+
+_NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
+}
+
+
+//********** Vector saturating rounding shift left: (negative values shift right) ****************
+//*************************************************************************************************
+//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
+//Saturation happens for left shifts only while rounding makes sense for right shifts only.
+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
+        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if (atmp[i] ==0) res[i] = 0; \
+        else{ \
+            if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
+            else{ \
+                if (btmp[i]>lanesize_1) { \
+                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
+                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
+                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                    else res[i] = atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
+        int lanesize = (sizeof(TYPE) << 3); \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if (atmp[i] ==0) {res[i] = 0; \
+        }else{ \
+            if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
+            else{ \
+                if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
+                else{ \
+                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
+                    res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
+        __m64_128 res; int ## TYPE ## _t limit; int i; \
+        int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
+        for (i = 0; i<LEN; i++) { \
+        if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
+        else{ \
+            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize_1) { \
+                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
+                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
+                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
+                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+        return res;
+
+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
+        __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
+        int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
+        for (i = 0; i<LEN; i++) { \
+        if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
+        }else{ \
+            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
+                else{ \
+                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
+                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+        return res;
+
+_NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
+}
+
+_NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
+}
+
+_NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
+}
+
+_NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
+}
+
+_NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
+}
+
+_NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
+}
+
+_NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
+}
+
+_NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
+}
+
+_NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
+}
+
+_NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
+}
+
+_NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
+}
+
+_NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
+}
+
+_NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
+}
+
+// *********************************************************************************
+// *****************************  Shifts by a constant *****************************
+// *********************************************************************************
+//**************** Vector shift right by constant*************************************
+//************************************************************************************
+_NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit
+    int8x8_t res64;
+    __m128i r;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_srai_epi16 (r, b); //SSE2
+    r = _mm_packs_epi16 (r,r); //we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
+{
+    int16x4_t res64;
+    return64(_mm_srai_epi16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    return64(_mm_srai_epi32(_pM128i(a), b));
+}
+
+_NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //no arithmetic shift for 64bit values, serial solution used
+    int64x1_t res;
+    if(b>=64) res.m64_i64[0] = 0;
+    else res.m64_i64[0] = (*(int64_t*)&a) >> b;
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit
+    uint8x8_t res64;
+    __m128i r;
+    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
+    r = _mm_packus_epi16 (r,r); //we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
+{
+    uint16x4_t res64;
+    return64(_mm_srli_epi16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
+{
+    uint32x2_t res64;
+    return64(_mm_srli_epi32(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b)
+{
+    uint64x1_t res64;
+    return64(_mm_srli_epi64(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
+{
+    //no 8 bit shift available, go to 16 bit trick
+    __m128i zero, mask0, a_sign, r, a_sign_mask;
+    _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
+    zero = _mm_setzero_si128();
+    mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
+    a_sign =  _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
+    r = _mm_srai_epi16 (a, b);
+    a_sign_mask =  _mm_and_si128 (mask0, a_sign);
+    r =  _mm_andnot_si128 (mask0, r);
+    return _mm_or_si128 (r, a_sign_mask);
+}
+
+_NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
+#define vshrq_n_s16 _mm_srai_epi16
+
+_NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
+#define vshrq_n_s32 _mm_srai_epi32
+
+_NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
+{
+    //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD
+    __m128i c1, signmask,a0,  res64;
+    _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
+    c1 =  _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
+    signmask  =  _mm_slli_epi64 (c1, (64 - b));
+    a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
+    a0 = _MM_CMPEQ_EPI64 (a, a0);
+    signmask = _mm_and_si128(a0, signmask);
+    res64 = _mm_srli_epi64 (a, b);
+    return _mm_or_si128(res64, signmask);
+}
+
+_NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
+{
+    //no 8 bit shift available, need the special trick
+    __m128i mask0, r;
+    _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
+    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
+    r = _mm_srli_epi16 ( a, b);
+    return _mm_and_si128 (r,  mask0);
+}
+
+_NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
+#define vshrq_n_u16 _mm_srli_epi16
+
+_NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
+#define vshrq_n_u32 _mm_srli_epi32
+
+_NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
+#define vshrq_n_u64 _mm_srli_epi64
+
+//*************************** Vector shift left by constant *************************
+//*********************************************************************************
+_NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
+{
+    //no 8 bit shift available, go to 16 bit
+    int8x8_t res64;
+    __m128i r;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_slli_epi16 (r, b); //SSE2
+    r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b)
+{
+    int16x4_t res64;
+    return64(_mm_slli_epi16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
+{
+    int32x2_t res64;
+    return64(_mm_slli_epi32(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b)
+{
+    int64x1_t res64;
+    return64(_mm_slli_epi64(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
+{
+    //no 8 bit shift available, go to 16 bit
+    uint8x8_t res64;
+    __m128i mask8;
+    __m128i r;
+    mask8 = _mm_set1_epi16(0xff);
+    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_slli_epi16 (r, b); //SSE2
+    r = _mm_and_si128(r, mask8); //to avoid saturation
+    r = _mm_packus_epi16 (r,r); //we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+#define vshl_n_u16 vshl_n_s16
+
+
+_NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+#define vshl_n_u32 vshl_n_s32
+
+_NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+#define vshl_n_u64 vshl_n_s64
+
+_NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+#define vshlq_n_s8 vshlq_n_u8
+
+_NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+#define vshlq_n_s16 _mm_slli_epi16
+
+_NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+#define vshlq_n_s32 _mm_slli_epi32
+
+_NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+#define vshlq_n_s64 _mm_slli_epi64
+
+_NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
+{
+    //no 8 bit shift available, need the special trick
+    __m128i mask0, r;
+    _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
+    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
+    r = _mm_slli_epi16 ( a, b);
+    return _mm_and_si128 (r,  mask0);
+}
+
+_NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+#define vshlq_n_u16 vshlq_n_s16
+
+_NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+#define vshlq_n_u32 vshlq_n_s32
+
+_NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+#define vshlq_n_u64 vshlq_n_s64
+
+//************* Vector rounding shift right by constant ******************
+//*************************************************************************
+//No corresponding  x86 intrinsics exist, need to do some tricks
+_NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit
+    int8x8_t res64;
+    __m128i r, maskb;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
+    r = _mm_srai_epi16 (r, b);
+    r = _mm_add_epi16 (r, maskb); //actual rounding
+    r = _mm_packs_epi16 (r,r); ////we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
+{
+    int16x4_t res64;
+    return64(vrshrq_n_s16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    return64(vrshrq_n_s32(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution is faster
+    int64x1_t res;
+    int64_t a_i64 = *( int64_t*)&a;
+    if(b==64) {
+        res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63;
+    } else {
+        int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
+        res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit, solution may be not optimal compared with the serial one
+    uint8x8_t res64;
+    __m128i r, maskb;
+    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
+    r = _mm_srli_epi16 (r, b);
+    r = _mm_add_epi16 (r, maskb); //actual rounding
+    r =  _mm_packus_epi16 (r,r); ////we need 64 bits only
+    return64(r);
+}
+
+_NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
+{
+    uint16x4_t res64;
+    return64(vrshrq_n_u16(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
+{
+    uint32x2_t res64;
+    return64(vrshrq_n_u32(_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
+{
+    uint64x1_t res64;
+    return64(vrshrq_n_u64(_pM128i(a), b));
+}
+
+_NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
+{
+    //no 8 bit shift available, go to 16 bit trick
+    __m128i r, mask1, maskb;
+    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
+    r = vshrq_n_s8 (a, b);
+    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
+    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
+    maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
+    return _mm_add_epi8(r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
+{
+    __m128i maskb, r;
+    maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
+    r = _mm_srai_epi16 (a, b);
+    return _mm_add_epi16 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
+{
+    __m128i maskb,  r;
+    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
+    r = _mm_srai_epi32(a, b);
+    return _mm_add_epi32 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
+{
+    //solution may be not optimal compared with a serial one
+    __m128i maskb;
+    int64x2_t r;
+    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
+    r = vshrq_n_s64(a, b);
+    return _mm_add_epi64 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
+{
+    //no 8 bit shift available, go to 16 bit trick
+    __m128i r, mask1, maskb;
+    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
+    r = vshrq_n_u8 (a, b);
+    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
+    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
+    maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
+    return _mm_add_epi8(r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
+{
+    __m128i maskb, r;
+    maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
+    r = _mm_srli_epi16 (a, b);
+    return _mm_add_epi16 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
+{
+    __m128i maskb,  r;
+    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
+    r = _mm_srli_epi32(a, b);
+    return _mm_add_epi32 (r, maskb); //actual rounding
+}
+
+_NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
+{
+    //solution may be not optimal compared with a serial one
+    __m128i maskb,  r;
+    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
+    r = _mm_srli_epi64(a, b);
+    return _mm_add_epi64 (r, maskb); //actual rounding
+}
+
+//************* Vector shift right by constant and accumulate *********
+//*********************************************************************
+_NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
+{
+    int8x8_t shift;
+    shift = vshr_n_s8(b, c);
+    return vadd_s8( a, shift);
+}
+
+_NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
+{
+    int16x4_t shift;
+    shift = vshr_n_s16( b, c);
+    return vadd_s16(a, shift);
+}
+
+_NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    int32x2_t shift;
+    shift = vshr_n_s32(b, c);
+    return vadd_s32( a, shift);
+}
+
+_NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
+_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
+{
+    //may be not optimal compared with a serial solution
+    int64x1_t shift;
+    shift = vshr_n_s64(b, c);
+    return vadd_s64( a, shift);
+}
+
+_NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
+{
+    uint8x8_t shift;
+    shift = vshr_n_u8(b, c);
+    return vadd_u8(a, shift);
+}
+
+_NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
+{
+    uint16x4_t shift;
+    shift = vshr_n_u16(b, c);
+    return vadd_u16(a,shift);
+}
+
+_NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    uint32x2_t shift;
+    shift = vshr_n_u32(b, c);
+    return vadd_u32( a, shift);
+}
+
+_NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
+{
+    //may be not optimal compared with the serial execution
+    uint64x1_t shift;
+    shift = vshr_n_u64(b, c);
+    return vadd_u64(a, shift);
+}
+
+_NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
+{
+    int8x16_t shift;
+    shift = vshrq_n_s8(b, c);
+    return vaddq_s8(a, shift);
+}
+
+_NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
+{
+    int16x8_t shift;
+    shift = vshrq_n_s16(b, c);
+    return vaddq_s16(a, shift);
+}
+
+_NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
+{
+    int32x4_t shift;
+    shift = vshrq_n_s32(b, c);
+    return vaddq_s32(a, shift);
+}
+
+_NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
+{
+    int64x2_t shift;
+    shift = vshrq_n_s64(b, c);
+    return vaddq_s64( a, shift);
+}
+
+_NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
+{
+    uint8x16_t shift;
+    shift = vshrq_n_u8(b, c);
+    return vaddq_u8(a, shift);
+}
+
+_NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
+{
+    uint16x8_t shift;
+    shift = vshrq_n_u16(b, c);
+    return vaddq_u16(a,  shift);
+}
+
+_NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
+{
+    uint32x4_t shift;
+    shift = vshrq_n_u32(b, c);
+    return vaddq_u32(a, shift);
+}
+
+_NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
+{
+    uint64x2_t shift;
+    shift = vshrq_n_u64(b, c);
+    return vaddq_u64(a, shift);
+}
+
+//************* Vector rounding shift right by constant and accumulate ****************************
+//************************************************************************************************
+_NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
+{
+    int8x8_t shift;
+    shift = vrshr_n_s8(b, c);
+    return vadd_s8( a, shift);
+}
+
+_NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
+{
+    int16x4_t shift;
+    shift = vrshr_n_s16( b, c);
+    return vadd_s16(a, shift);
+}
+
+_NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    int32x2_t shift;
+    shift = vrshr_n_s32(b, c);
+    return vadd_s32( a, shift);
+}
+
+_NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
+{
+    int64x1_t shift;
+    shift = vrshr_n_s64(b, c);
+    return vadd_s64( a, shift);
+}
+
+_NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
+{
+    uint8x8_t shift;
+    shift = vrshr_n_u8(b, c);
+    return vadd_u8(a, shift);
+}
+
+_NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
+{
+    uint16x4_t shift;
+    shift = vrshr_n_u16(b, c);
+    return vadd_u16(a,shift);
+}
+
+_NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    uint32x2_t shift;
+    shift = vrshr_n_u32(b, c);
+    return vadd_u32( a, shift);
+}
+
+_NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
+{
+    //may be not optimal compared with the serial execution
+    uint64x1_t shift;
+    shift = vrshr_n_u64(b, c);
+    return vadd_u64( a, shift);
+}
+
+_NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
+{
+    int8x16_t shift;
+    shift = vrshrq_n_s8(b, c);
+    return vaddq_s8(a, shift);
+}
+
+_NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
+{
+    int16x8_t shift;
+    shift = vrshrq_n_s16(b, c);
+    return vaddq_s16(a, shift);
+}
+
+_NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
+{
+    int32x4_t shift;
+    shift = vrshrq_n_s32(b, c);
+    return vaddq_s32(a, shift);
+}
+
+_NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
+{
+    int64x2_t shift;
+    shift = vrshrq_n_s64(b, c);
+    return vaddq_s64(a, shift);
+}
+
+_NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
+{
+    uint8x16_t shift;
+    shift = vrshrq_n_u8(b, c);
+    return vaddq_u8(a, shift);
+}
+
+_NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
+{
+    uint16x8_t shift;
+    shift = vrshrq_n_u16(b, c);
+    return vaddq_u16(a,  shift);
+}
+
+_NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
+{
+    uint32x4_t shift;
+    shift = vrshrq_n_u32(b, c);
+    return vaddq_u32(a, shift);
+}
+
+_NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
+{
+    uint64x2_t shift;
+    shift = vrshrq_n_u64(b, c);
+    return vaddq_u64(a, shift);
+}
+
+//**********************Vector saturating shift left by constant *****************************
+//********************************************************************************************
+//we don't check const ranges  assuming they are met
+_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
+{
+    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
+    int8x8_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi16 (a128, b);
+    r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
+    return64(r128);
+}
+
+_NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
+{
+    // go to 32 bit to get the auto saturation (in packs function)
+    int16x4_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi32 (a128, b); //shift_res
+    r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
+    return64(r128);
+}
+
+_NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
+{
+    //serial execution may be faster
+    int32x2_t res64;
+    return64(vqshlq_n_s32 (_pM128i(a), b));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    int64x1_t res;
+    int64_t bmask;
+    int64_t a_i64 = *( int64_t*)&a;
+    bmask = ( int64_t)1 << (63 - b); //positive
+    if (a_i64 >= bmask) {
+        res.m64_i64[0] = ~(_SIGNBIT64);
+    } else {
+        res.m64_i64[0]  = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
+    }
+    return res;
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
+{
+    //no 8 bit shift available in IA32 SIMD, go to 16 bit
+    uint8x8_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi16 (a128, b); //shift_res
+    r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
+    return64(r128);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
+_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
+{
+    // go to 32 bit to get the auto saturation (in packus function)
+    uint16x4_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi32 (a128, b); //shift_res
+    r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
+    return64(r128);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
+_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b)
+{
+    uint32x2_t res64;
+    return64(vqshlq_n_u32(_pM128i(a), b));
+}
+
+_NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    uint64x1_t res;
+    uint64_t bmask;
+    uint64_t a_i64 = *(uint64_t*)&a;
+    bmask = ( uint64_t)1 << (64 - b);
+    res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
+    return res;
+}
+
+_NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
+{
+    // go to 16 bit to get the auto saturation (in packs function)
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi16 (a128, b);
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI8_EPI16 (a128);
+    r128_2 = _mm_slli_epi16 (a128, b);
+    return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
+}
+
+_NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
+{
+    // manual saturation solution looks LESS optimal than 32 bits conversion one
+    // go to 32 bit to get the auto saturation (in packs function)
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI16_EPI32 (a128);
+    r128_2 = _mm_slli_epi32 (a128, b);
+    return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
+}
+
+_NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
+{
+    // no 64 bit saturation option available, special tricks necessary
+    __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
+    c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
+    maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers (32-b+1) zeros and b-1 ones
+    saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0  otherwise
+    c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
+    shift_res = _mm_slli_epi32 (a, b);
+    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
+    //result with positive numbers saturated
+    shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
+    //treat negative numbers
+    maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones  and (32-b+1)  zeros
+    saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0  otherwise
+    c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
+    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
+    return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
+}
+
+_NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
+    int64_t bmask;
+    int i;
+    bmask = ( int64_t)1 << (63 - b); //positive
+    _mm_store_si128((__m128i*)atmp, a);
+    for (i = 0; i<2; i++) {
+        if (atmp[i] >= bmask) {
+            res[i] = ~(_SIGNBIT64);
+        } else {
+            res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
+        }
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+_NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
+{
+    // go to 16 bit to get the auto saturation (in packs function)
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi16 (a128, b);
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPU8_EPI16 (a128);
+    r128_2 = _mm_slli_epi16 (a128, b);
+    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
+}
+
+_NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
+{
+    // manual saturation solution looks more optimal than 32 bits conversion one
+    __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
+    cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
+    c8000 = _mm_set1_epi16 (-32768); // (int16_t)0x8000
+//no unsigned shorts comparison in SSE, only signed available, so need the trick
+    a_signed = _mm_sub_epi16(a, c8000); //go to signed
+    saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
+    shift_res = _mm_slli_epi16 (a, b);
+    return _mm_or_si128 (shift_res, saturation_mask);
+}
+
+_NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
+{
+    // manual saturation solution, no 64 bit saturation option, the serial version may be faster
+    __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
+    cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
+    c80000000 = _mm_set1_epi32 (0x80000000);
+//no unsigned ints comparison in SSE, only signed available, so need the trick
+    a_signed = _mm_sub_epi32(a, c80000000); //go to signed
+    saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
+    shift_res = _mm_slli_epi32 (a, b);
+    return _mm_or_si128 (shift_res, saturation_mask);
+}
+
+_NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
+    uint64_t bmask;
+    int i;
+    bmask = ( uint64_t)1 << (64 - b);
+    _mm_store_si128((__m128i*)atmp, a);
+    for (i = 0; i<2; i++) {
+        res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+//**************Vector signed->unsigned saturating shift left by constant *************
+//*************************************************************************************
+_NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
+{
+    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
+    uint8x8_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi16 (a128, b);
+    r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
+    return64(r128);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
+_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
+{
+    uint16x4_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi32 (a128, b); //shift_res
+    r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
+    return64(r128);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b)
+{
+    int32x2_t res64;
+    return64( vqshluq_n_s32(_pM128i(a), b));
+}
+
+_NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
+{
+    uint64x1_t res;
+    uint64_t limit;
+    if (a.m64_i64[0]<=0) {
+        res.m64_u64[0] = 0;
+    } else {
+        limit = (uint64_t) 1 << (64 - b);
+        res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? res.m64_u64[0] = ~((uint64_t)0) : a.m64_i64[0] << b;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
+{
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi16 (a128, b);
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI8_EPI16 (a128);
+    r128_2 = _mm_slli_epi16 (a128, b);
+    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
+}
+
+_NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
+{
+    // manual saturation solution looks LESS optimal than 32 bits conversion one
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI16_EPI32 (a128);
+    r128_2 = _mm_slli_epi32 (a128, b);
+    return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
+}
+
+_NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
+{
+    //solution may be  not optimal compared with the serial one
+    __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
+    zero = _mm_setzero_si128();
+    maskA = _mm_cmpeq_epi32(a, a);
+    maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
+    //saturate negative numbers to zero
+    maskGT0   = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
+    a0 = _mm_and_si128 (a,  maskGT0); //negative are zeros now
+    //saturate positive to 0xffffffff
+    a_masked = _mm_and_si128 (a0, maskA);
+    a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
+    a_shift = _mm_slli_epi32 (a0, b);
+    return _mm_or_si128 (a_shift, a_masked); //actual saturation
+}
+
+_NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here, serial execution looks faster
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    uint64_t limit;
+    int i;
+    _mm_store_si128((__m128i*)atmp, a);
+    for (i = 0; i<2; i++) {
+        if (atmp[i]<=0) {
+            res[i] = 0;
+        } else {
+            limit = (uint64_t) 1 << (64 - b);
+            res[i] = ( ((uint64_t)atmp[i]) >= limit) ? res[i] = ~((uint64_t)0) : atmp[i] << b;
+        }
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+//************** Vector narrowing  shift right by constant **************
+//**********************************************************************
+_NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_s16(a,b);
+    r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_s32(a,b);
+    r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r32);
+}
+
+_NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    __m128i r64;
+    r64  = vshrq_n_s64(a,b);
+    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+_NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i mask, r16;
+    mask = _mm_set1_epi16(0xff);
+    r16  = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16 = _mm_and_si128(r16, mask); //to avoid saturation
+    r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i mask, r32;
+    mask = _mm_set1_epi32(0xffff);
+    r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
+    r32 = _mm_and_si128(r32, mask); //to avoid saturation
+    r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+    uint32x2_t res64;
+    __m128i r64;
+    r64  = vshrq_n_u64(a,b);
+    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+//************** Vector signed->unsigned narrowing saturating shift right by constant ********
+//*********************************************************************************************
+_NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_s16(a,b);
+    r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_s32(a,b);
+    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow(signed to unsigned), use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
+{
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    uint32x2_t res;
+    int64_t res64;
+    _mm_store_si128((__m128i*)atmp, a);
+    if (atmp[0] < 0) {
+        res.m64_u32[0] = 0;
+    } else {
+        res64 = (atmp[0] >> b);
+        res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
+    }
+    if (atmp[1] < 0) {
+        res.m64_u32[1] = 0;
+    } else {
+        res64 = (atmp[1] >> b);
+        res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
+    }
+    return res;
+}
+
+//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
+_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
+{
+    //solution may be not optimal compared with the serial one
+    __m128i r16;
+    uint8x8_t res64;
+    r16 = vrshrq_n_s16(a,b);
+    r16 =  _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
+{
+    //solution may be not optimal compared with the serial one
+    __m128i r32;
+    uint16x4_t res64;
+    r32 = vrshrq_n_s32(a,b);
+    r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow (signed to unsigned), use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
+{
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    uint32x2_t res;
+    int64_t res64;
+    _mm_store_si128((__m128i*)atmp, a);
+    if (atmp[0] < 0) {
+        res.m64_u32[0] = 0;
+    } else {
+        res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
+        res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
+    }
+    if (atmp[1] < 0) {
+        res.m64_u32[1] = 0;
+    } else {
+        res64 = (atmp[1] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
+        res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
+    }
+    return res;
+}
+
+//***** Vector narrowing saturating shift right by constant ******
+//*****************************************************************
+_NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_s16(a,b);
+    r16  = _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_s32(a,b);
+    r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //no optimal SIMD solution found
+    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
+    int32x2_t res;
+    _mm_store_si128((__m128i*)atmp, a);
+    res64[0] = (atmp[0] >> b);
+    res64[1] = (atmp[1] >> b);
+    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
+    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
+    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
+    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
+    res.m64_i32[0] = (int32_t)res64[0];
+    res.m64_i32[1] = (int32_t)res64[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
+    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+    //serial solution may be faster
+    uint32x2_t res64;
+    __m128i r64, res_hi, zero;
+    zero = _mm_setzero_si128();
+    r64  = vshrq_n_u64(a,b);
+    res_hi = _mm_srli_epi64(r64,  32);
+    res_hi = _mm_cmpgt_epi32(res_hi, zero);
+    r64 = _mm_or_si128(r64, res_hi);
+    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+
+//********* Vector rounding narrowing shift right by constant *************************
+//****************************************************************************************
+_NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+     r16  = vrshrq_n_s16(a,b);
+    r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vrshrq_n_s32(a,b);
+    r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r32);
+}
+
+_NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    __m128i r64;
+    r64  = vrshrq_n_s64(a,b);
+    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+_NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i mask, r16;
+    mask = _mm_set1_epi16(0xff);
+    r16  = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16 = _mm_and_si128(r16, mask); //to avoid saturation
+    r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i mask, r32;
+    mask = _mm_set1_epi32(0xffff);
+    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
+    r32 = _mm_and_si128(r32, mask); //to avoid saturation
+    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
+{
+    uint32x2_t res64;
+    __m128i r64;
+    r64  = vrshrq_n_u64(a,b);
+    r64  =  _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+//************* Vector rounding narrowing saturating shift right by constant ************
+//****************************************************************************************
+_NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    r16  = vrshrq_n_s16(a,b);
+    r16  =  _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vrshrq_n_s32(a,b);
+    r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //no optimal SIMD solution found
+    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
+    int32x2_t res;
+    _mm_store_si128((__m128i*)atmp, a);
+    maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
+    res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
+    maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
+    res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
+    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
+    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
+    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
+    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
+    res.m64_i32[0] = (int32_t)res64[0];
+    res.m64_i32[1] = (int32_t)res64[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i r16;
+    r16  = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i r32;
+    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
+    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+    //serial solution may be faster
+    uint32x2_t res64;
+    __m128i r64, res_hi, zero;
+    zero = _mm_setzero_si128();
+    r64  = vrshrq_n_u64(a,b);
+    res_hi = _mm_srli_epi64(r64,  32);
+    res_hi = _mm_cmpgt_epi32(res_hi, zero);
+    r64 = _mm_or_si128(r64, res_hi);
+    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+//************** Vector widening shift left by constant ****************
+//************************************************************************
+_NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
+_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
+{
+    __m128i r;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    return _mm_slli_epi16 (r, b);
+}
+
+_NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
+_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
+{
+    __m128i r;
+    r =  _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
+    return _mm_slli_epi32 (r, b);
+}
+
+_NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
+_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
+{
+    __m128i r;
+    r =  _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
+    return _mm_slli_epi64 (r, b);
+}
+
+_NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
+_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
+{
+    //no uint8 to uint16 conversion available, manual conversion used
+    __m128i zero,  r;
+    zero = _mm_setzero_si128 ();
+    r = _mm_unpacklo_epi8(_pM128i(a), zero);
+    return _mm_slli_epi16 (r, b);
+}
+
+_NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
+_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
+{
+    //no uint16 to uint32 conversion available, manual conversion used
+    __m128i zero,  r;
+    zero = _mm_setzero_si128 ();
+    r = _mm_unpacklo_epi16(_pM128i(a), zero);
+    return _mm_slli_epi32 (r, b);
+}
+
+_NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
+_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
+{
+    //no uint32 to uint64 conversion available, manual conversion used
+    __m128i zero,  r;
+    zero = _mm_setzero_si128 ();
+    r = _mm_unpacklo_epi32(_pM128i(a), zero);
+    return _mm_slli_epi64 (r, b);
+}
+
+//************************************************************************************
+//**************************** Shifts with insert ************************************
+//************************************************************************************
+//takes each element in a vector,  shifts them by an immediate value,
+//and inserts the results in the destination vector. Bits shifted out of the each element are lost.
+
+//**************** Vector shift right and insert ************************************
+//Actually the "c" left bits from "a" are the only bits remained from "a"  after the shift.
+//All other bits are taken from b shifted.
+_NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c)
+{
+    int8x8_t res64;
+    return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c)
+{
+    int16x4_t res64;
+    return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c)
+{
+    int32x2_t res64;
+    return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
+{
+    int64x1_t res;
+    if (c ==64)
+        res = a;
+    else{
+        res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+#define vsri_n_u8 vsri_n_s8
+
+_NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+#define vsri_n_u16 vsri_n_s16
+
+_NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+#define vsri_n_u32 vsri_n_s32
+
+
+_NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+#define vsri_n_u64 vsri_n_s64
+
+_NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+#define vsri_n_p8 vsri_n_u8
+
+_NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+#define vsri_n_p16 vsri_n_u16
+
+_NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
+{
+    __m128i maskA, a_masked;
+    uint8x16_t b_shift;
+    _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
+    maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
+    a_masked = _mm_and_si128 (a, maskA);
+    b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
+    return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
+{
+    //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
+    uint16x8_t b_shift;
+    uint16x8_t a_c;
+    b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
+    a_c = vshrq_n_u16( a, (16 - c));
+    a_c  = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
+{
+    //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
+    uint32x4_t b_shift;
+    uint32x4_t a_c;
+    b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
+    a_c = vshrq_n_u32( a, (32 - c));
+    a_c  = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
+{
+    //serial solution may be faster
+    uint64x2_t b_shift;
+    uint64x2_t a_c;
+    b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
+    a_c = _mm_srli_epi64(a, (64 - c));
+    a_c  = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+#define vsriq_n_u8 vsriq_n_s8
+
+_NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+#define vsriq_n_u16 vsriq_n_s16
+
+_NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+#define vsriq_n_u32 vsriq_n_s32
+
+_NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+#define vsriq_n_u64 vsriq_n_s64
+
+_NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+#define vsriq_n_p8 vsriq_n_u8
+
+_NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+#define vsriq_n_p16 vsriq_n_u16
+
+//***** Vector shift left and insert *********************************************
+//*********************************************************************************
+//Actually the "c" right bits from "a" are the only bits remained from "a"  after the shift.
+//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
+_NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c)
+{
+    int8x8_t res64;
+    return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c)
+{
+    int16x4_t res64;
+    return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c)
+{
+    int32x2_t res64;
+    return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
+}
+
+_NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
+{
+    int64x1_t res;
+    res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
+    return res;
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+#define vsli_n_u8 vsli_n_s8
+
+_NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+#define vsli_n_u16 vsli_n_s16
+
+_NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+#define vsli_n_u32 vsli_n_s32
+
+_NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+#define vsli_n_u64 vsli_n_s64
+
+_NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+#define vsli_n_p8 vsli_n_u8
+
+_NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+#define vsli_n_p16 vsli_n_u16
+
+_NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
+{
+    __m128i maskA, a_masked;
+    int8x16_t b_shift;
+    _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
+    maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
+    b_shift = vshlq_n_s8( b, c);
+    a_masked = _mm_and_si128 (a, maskA);
+    return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
+{
+    //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
+    int16x8_t b_shift;
+    int16x8_t a_c;
+    b_shift = vshlq_n_s16( b, c);
+    a_c = vshlq_n_s16( a, (16 - c));
+    a_c  = _mm_srli_epi16(a_c, (16 - c));
+    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
+{
+    //solution may be  not optimal compared with the serial one
+    //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
+    int32x4_t b_shift;
+    int32x4_t a_c;
+    b_shift = vshlq_n_s32( b, c);
+    a_c = vshlq_n_s32( a, (32 - c));
+    a_c  = _mm_srli_epi32(a_c, (32 - c));
+    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
+{
+    //solution may be  not optimal compared with the serial one
+    //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
+    int64x2_t b_shift;
+    int64x2_t a_c;
+    b_shift = vshlq_n_s64( b, c);
+    a_c = vshlq_n_s64( a, (64 - c));
+    a_c  = _mm_srli_epi64(a_c, (64 - c));
+    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
+}
+
+_NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+#define vsliq_n_u8 vsliq_n_s8
+
+_NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+#define vsliq_n_u16 vsliq_n_s16
+
+_NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+#define vsliq_n_u32 vsliq_n_s32
+
+_NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+#define vsliq_n_u64 vsliq_n_s64
+
+_NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+#define vsliq_n_p8 vsliq_n_u8
+
+_NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+#define vsliq_n_p16 vsliq_n_u16
+
+// ***********************************************************************************************
+// ****************** Loads and stores of a single vector ***************************************
+// ***********************************************************************************************
+//Performs loads and stores of a single vector of some type.
+//*******************************  Loads ********************************************************
+// ***********************************************************************************************
+//We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);.
+//also for SSE3  supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous.
+// it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access
+//If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead;
+#define LOAD_SI128(ptr) \
+        ( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr))
+
+_NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+#define vld1q_u8 LOAD_SI128
+
+_NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+#define vld1q_u16 LOAD_SI128
+
+_NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+#define vld1q_u32 LOAD_SI128
+
+_NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+#define vld1q_u64 LOAD_SI128
+
+_NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+#define vld1q_s8 LOAD_SI128
+
+_NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+#define vld1q_s16 LOAD_SI128
+
+_NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+#define vld1q_s32 LOAD_SI128
+
+_NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+#define vld1q_s64 LOAD_SI128
+
+_NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
+/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
+{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
+__m128 f2;
+f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
+}*/
+
+_NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
+{
+    if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16 bits aligned
+        return _mm_load_ps(ptr);
+    else
+        return _mm_loadu_ps(ptr);
+}
+
+_NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+#define vld1q_p8  LOAD_SI128
+
+_NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+#define vld1q_p16 LOAD_SI128
+
+_NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_u8(ptr)  *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
+
+_NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_u16 vld1_u8
+
+_NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
+#define vld1_u32 vld1_u8
+
+
+_NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_u64 vld1_u8
+
+_NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_s8 vld1_u8
+
+_NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_s16 vld1_u16
+
+_NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
+#define vld1_s32 vld1_u32
+
+_NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_s64 vld1_u64
+
+_NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
+
+_NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
+{
+    float32x2_t res;
+    res.m64_f32[0] = *(ptr);
+    res.m64_f32[1] = *(ptr + 1);
+    return res;
+}
+
+_NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_p8 vld1_u8
+
+_NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_p16 vld1_u16
+
+
+_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr)
+{
+    if ((((uintptr_t)(ptr)) & 15) == 0) //16 bits aligned
+        return _mm_load_pd(ptr);
+    else
+        return _mm_loadu_pd(ptr);
+}
+
+
+//***********************************************************************************************************
+//******* Lane load functions - insert the data at  vector's given position (lane) *************************
+//***********************************************************************************************************
+_NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p;
+
+
+_NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
+{
+    //we need to deal with  ptr  16bit NOT aligned case
+    __m128 p;
+    p = _mm_set1_ps(*(ptr));
+    return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
+}
+
+_NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
+
+_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
+{
+    uint8x8_t res;
+    res = vec;
+    res.m64_u8[lane] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
+{
+    uint16x4_t res;
+    res = vec;
+    res.m64_u16[lane] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
+{
+    uint32x2_t res;
+    res = vec;
+    res.m64_u32[lane] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = *(ptr);
+    return res;
+}
+
+
+_NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
+
+_NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
+
+_NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
+
+_NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
+{
+    float32x2_t res;
+    res = vec;
+    res.m64_f32[lane] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
+#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
+
+_NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1_lane_p8 vld1_lane_u8
+
+_NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1_lane_p16 vld1_lane_s16
+
+// ****************** Load single value ( set all lanes of vector with same value from memory)**********************
+// ******************************************************************************************************************
+_NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
+
+_NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
+
+_NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
+
+_NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
+{
+    _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
+    return LOAD_SI128(val);
+}
+
+_NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
+
+_NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
+
+_NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
+
+_NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
+
+_NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+//current IA SIMD doesn't support float16, need to go to 32 bits
+
+_NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
+
+_NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
+
+_NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
+
+_NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint8x8_t res;
+    int i;
+    for(i = 0; i<8; i++) {
+        res.m64_u8[i] =  *(ptr);
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint16x4_t res;
+    int i;
+    for(i = 0; i<4; i++) {
+        res.m64_u16[i] =  *(ptr);
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = *(ptr);
+    res.m64_u32[1] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = *(ptr);
+    return res;
+}
+
+_NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
+
+
+_NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
+
+
+_NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
+
+
+_NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
+
+_NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
+{
+    float32x2_t res;
+    res.m64_f32[0] = *(ptr);
+    res.m64_f32[1] = res.m64_f32[0];
+    return res; // use last 64bits only
+}
+
+_NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1_dup_p8 vld1_dup_u8
+
+
+_NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1_dup_p16 vld1_dup_u16
+
+
+//*************************************************************************************
+//********************************* Store **********************************************
+//*************************************************************************************
+// If ptr is 16bit aligned and you  need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
+//here we assume the case of  NOT 16bit aligned ptr possible. If it is aligned we could to use _mm_store_si128 like shown in the following macro
+#define STORE_SI128(ptr, val) \
+        (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
+
+_NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
+#define vst1q_u8 STORE_SI128
+
+_NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
+#define vst1q_u16 STORE_SI128
+
+_NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
+#define vst1q_u32 STORE_SI128
+
+_NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
+#define vst1q_u64 STORE_SI128
+
+_NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
+#define vst1q_s8 STORE_SI128
+
+_NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
+#define vst1q_s16 STORE_SI128
+
+_NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
+#define vst1q_s32 STORE_SI128
+
+_NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
+#define vst1q_s64 STORE_SI128
+
+_NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently
+
+_NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
+{
+    if( ((uintptr_t)(ptr) & 15)  == 0 ) //16 bits aligned
+        _mm_store_ps (ptr, val);
+    else
+        _mm_storeu_ps (ptr, val);
+}
+
+_NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
+#define vst1q_p8  vst1q_u8
+
+_NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
+#define vst1q_p16 vst1q_u16
+
+_NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
+{
+    int i;
+    for (i = 0; i<8; i++) {
+        *(ptr + i) = ((uint8_t*)&val)[i];
+    }
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+_NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
+{
+    int i;
+    for (i = 0; i<4; i++) {
+        *(ptr + i) = ((uint16_t*)&val)[i];
+    }
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+_NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
+{
+    int i;
+    for (i = 0; i<2; i++) {
+        *(ptr + i) = ((uint32_t*)&val)[i];
+    }
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+_NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
+{
+    *(ptr) = *((uint64_t*)&val);
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+_NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
+#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
+
+_NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
+#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
+
+_NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
+#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
+
+_NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
+#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
+
+_NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
+{
+    *(ptr) =   val.m64_f32[0];
+    *(ptr + 1) = val.m64_f32[1];
+    return;
+}
+
+_NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
+#define vst1_p8 vst1_u8
+
+_NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
+#define vst1_p16 vst1_u16
+
+//***********Store a lane of a vector into memory (extract given lane) *********************
+//******************************************************************************************
+_NEON2SSESTORAGE void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
+
+_NEON2SSESTORAGE void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
+
+_NEON2SSESTORAGE void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
+#define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
+
+_NEON2SSESTORAGE void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
+#define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
+
+_NEON2SSESTORAGE void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
+
+_NEON2SSESTORAGE void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
+
+_NEON2SSESTORAGE void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
+#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
+
+_NEON2SSESTORAGE void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
+#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
+
+_NEON2SSESTORAGE void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
+{
+    int32_t ilane;
+    ilane = _MM_EXTRACT_PS(val,lane);
+    *(ptr) =  *((float*)&ilane);
+}
+
+_NEON2SSESTORAGE void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1q_lane_p8   vst1q_lane_u8
+
+_NEON2SSESTORAGE void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1q_lane_p16   vst1q_lane_s16
+
+_NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
+{
+    *(ptr) = val.m64_u8[lane];
+}
+
+_NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
+{
+    *(ptr) = val.m64_u16[lane];
+}
+
+_NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.m64_u32[lane];
+}
+
+_NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
+{
+    *(ptr) = val.m64_u64[0];
+}
+
+_NEON2SSESTORAGE void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+#define  vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
+
+_NEON2SSESTORAGE void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
+
+_NEON2SSESTORAGE void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+#define vst1_lane_s32(ptr, val, lane)  vst1_lane_u32((uint32_t*)ptr, val, lane)
+
+
+_NEON2SSESTORAGE void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
+#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
+
+
+_NEON2SSESTORAGE void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.m64_f32[lane];
+}
+
+_NEON2SSESTORAGE void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1_lane_p8 vst1_lane_u8
+
+_NEON2SSESTORAGE void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1_lane_p16 vst1_lane_s16
+
+//***********************************************************************************************
+//**************** Loads and stores of an N-element structure **********************************
+//***********************************************************************************************
+//These intrinsics load or store an n-element structure. The array structures are defined in the beginning
+//We assume ptr is NOT aligned in general case, for more details see  "Loads and stores of a single vector functions"
+//****************** 2 elements load  *********************************************
+_NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
+{
+    uint8x16x2_t v;
+    v.val[0] = vld1q_u8(ptr);
+    v.val[1] = vld1q_u8((ptr + 16));
+    v = vuzpq_s8(v.val[0], v.val[1]);
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
+{
+    uint16x8x2_t v;
+    v.val[0] = vld1q_u16( ptr);
+    v.val[1] = vld1q_u16( (ptr + 8));
+    v = vuzpq_s16(v.val[0], v.val[1]);
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
+{
+    uint32x4x2_t v;
+    v.val[0] = vld1q_u32 ( ptr);
+    v.val[1] = vld1q_u32 ( (ptr + 4));
+    v = vuzpq_s32(v.val[0], v.val[1]);
+    return v;
+}
+
+_NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
+#define  vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
+
+_NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
+
+_NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
+
+
+_NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
+{
+    float32x4x2_t v;
+    v.val[0] =  vld1q_f32 (ptr);
+    v.val[1] =  vld1q_f32 ((ptr + 4));
+    v = vuzpq_f32(v.val[0], v.val[1]);
+    return v;
+}
+
+_NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+#define  vld2q_p8 vld2q_u8
+
+_NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+#define vld2q_p16 vld2q_u16
+
+_NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
+{
+    uint8x8x2_t v;
+    __m128i ld128;
+    ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
+    ld128 =  _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd);
+    vst1q_u8((v.val), ld128); //  v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
+{
+    _NEON2SSE_ALIGN_16 uint16x4x2_t v;
+    __m128i ld128;
+    ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
+    ld128 = _mm_shuffle_epi8(ld128, *(__m128i*) mask8_32_even_odd);
+    vst1q_u16((v.val), ld128);
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
+{
+    _NEON2SSE_ALIGN_16 uint32x2x2_t v;
+    __m128i ld128;
+    ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
+    ld128 = _mm_shuffle_epi32(ld128,  0 | (2 << 2) | (1 << 4) | (3 << 6));
+    vst1q_u32((v.val), ld128);
+    return v;
+}
+
+_NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
+{
+    uint64x1x2_t v;
+    v.val[0].m64_u64[0] = *(ptr);
+    v.val[1].m64_u64[0] = *(ptr + 1);
+    return v;
+}
+
+_NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
+
+_NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
+
+_NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
+
+_NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
+
+_NEON2SSESTORAGE float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
+
+_NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
+{
+    float32x2x2_t v;
+    v.val[0].m64_f32[0] = *(ptr);
+    v.val[0].m64_f32[1] = *(ptr + 2);
+    v.val[1].m64_f32[0] = *(ptr + 1);
+    v.val[1].m64_f32[1] = *(ptr + 3);
+    return v;
+}
+
+_NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+#define vld2_p8 vld2_u8
+
+_NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+#define vld2_p16 vld2_u16
+
+//******************** Triplets ***************************************
+//*********************************************************************
+_NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
+{
+    //a0,a1,a2,a3,...a7,a8,...a15,  b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
+    //a:0,3,6,9,12,15,b:2,5,8,11,14,  c:1,4,7,10,13
+    //a:1,4,7,10,13,  b:0,3,6,9,12,15,c:2,5,8,11,14,
+    //a:2,5,8,11,14,  b:1,4,7,10,13,  c:0,3,6,9,12,15
+    uint8x16x3_t v;
+    __m128i tmp0, tmp1,tmp2, tmp3;
+    _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
+    _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
+    _NEON2SSE_ALIGN_16 static const int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
+
+    v.val[0] =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
+    v.val[1] =  vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
+    v.val[2] =  vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
+
+    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11
+    tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
+    tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15
+
+    tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
+    tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
+    tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
+    tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
+    v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
+    v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
+
+    tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
+    tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
+    v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
+    v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
+    v.val[1] =  _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
+    v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
+    tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
+    tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
+
+    tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
+    tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
+    v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
+    v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
+    v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
+    tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
+    v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
+{
+    //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
+    uint16x8x3_t v;
+    __m128i tmp0, tmp1,tmp2, tmp3;
+    _NEON2SSE_ALIGN_16 static const int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
+    _NEON2SSE_ALIGN_16 static const int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
+    _NEON2SSE_ALIGN_16 static const int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
+
+    v.val[0] =  vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
+    v.val[1] =  vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
+    v.val[2] =  vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
+
+    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
+    tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
+    tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
+
+    tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
+    tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
+    tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
+    tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
+    v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
+    v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
+
+    tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
+    tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
+    v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
+    v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
+    v.val[1] =  _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
+    v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
+    tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
+    tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
+
+    tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
+    tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
+    v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
+    v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
+    v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
+    tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
+    v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
+{
+    //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
+    uint32x4x3_t v;
+    __m128i tmp0, tmp1,tmp2, tmp3;
+    v.val[0] =  vld1q_u32 (ptr); //a0,a1,a2,a3,
+    v.val[1] =  vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
+    v.val[2] =  vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
+
+    tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
+    tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
+    tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
+
+    tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
+    v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
+    tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
+    v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
+    v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
+    v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
+    return v;
+}
+
+_NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+#define  vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
+
+_NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+#define  vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
+
+_NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+#define  vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
+
+_NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
+{
+    //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
+    float32x4x3_t v;
+    __m128 tmp0, tmp1,tmp2, tmp3;
+    v.val[0] =  vld1q_f32 (ptr); //a0,a1,a2,a3,
+    v.val[1] =  vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
+    v.val[2] =  vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
+
+    tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
+    tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
+    tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
+    tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
+
+    v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
+    tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
+    v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
+    v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
+    v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
+    return v;
+}
+
+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+#define vld3q_p8 vld3q_u8
+
+_NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+#define vld3q_p16 vld3q_u16
+
+_NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
+{
+    //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
+    uint8x8x3_t v;
+    __m128i val0, val1, val2, tmp0, tmp1;
+    _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
+    _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
+    val0 =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
+    val2 =  _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
+
+    tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
+    tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
+    val0 = _mm_slli_si128(tmp0,10);
+    val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
+    val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
+    val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
+    _M64(v.val[0], val0);
+    val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
+    val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
+    val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
+    val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
+    val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
+    _M64(v.val[1], val1);
+
+    tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
+    val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
+    val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
+    val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
+    _M64(v.val[2], val2);
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
+{
+    //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
+    uint16x4x3_t v;
+    __m128i val0, val1, val2, tmp0, tmp1;
+    _NEON2SSE_ALIGN_16 static const int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
+    val0 =  vld1q_u16 (ptr); //a0,a1,a2,a3,  b0,b1,b2,b3
+    val2 =  _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
+
+    tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
+    tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01     : c1, c2, c0, c3,
+    val0 = _mm_slli_si128(tmp0,10);
+    val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
+    val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
+    val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
+    val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
+    _M64(v.val[0], val0);
+
+    val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
+    val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
+    val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
+    val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
+    val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
+    _M64(v.val[1], val1);
+
+    tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
+    tmp1 = _mm_srli_si128(tmp1,4);
+    tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
+    val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
+    _M64(v.val[2], val2);
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
+{
+    //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
+    uint32x2x3_t v;
+    __m128i val0, val1, val2;
+    val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
+    val2 =   _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
+
+    val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
+    _M64(v.val[0], val0);
+    val2 =  _mm_slli_si128(val2, 8); //x, x,c0,c1,
+    val1 =  _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
+    _M64(v.val[1], val1);
+    val2 =  _mm_srli_si128(val1, 8); //b0, c1, x, x,
+    _M64(v.val[2], val2);
+    return v;
+}
+_NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
+{
+    uint64x1x3_t v;
+    v.val[0].m64_u64[0] = *(ptr);
+    v.val[1].m64_u64[0] = *(ptr + 1);
+    v.val[2].m64_u64[0] = *(ptr + 2);
+    return v;
+}
+
+_NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
+
+_NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
+
+_NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
+
+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
+
+_NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
+{
+    //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
+    float32x2x3_t v;
+    v.val[0].m64_f32[0] = *(ptr);
+    v.val[0].m64_f32[1] = *(ptr + 3);
+
+    v.val[1].m64_f32[0] = *(ptr + 1);
+    v.val[1].m64_f32[1] = *(ptr + 4);
+
+    v.val[2].m64_f32[0] = *(ptr + 2);
+    v.val[2].m64_f32[1] = *(ptr + 5);
+    return v;
+}
+
+_NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+#define vld3_p8 vld3_u8
+
+_NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+#define vld3_p16 vld3_u16
+
+//***************  Quadruples load ********************************
+//*****************************************************************
+_NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
+{
+    uint8x16x4_t v;
+    __m128i tmp3, tmp2, tmp1, tmp0;
+
+    v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
+    v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
+    v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
+    v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
+
+    tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
+    tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
+    tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
+    tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
+
+    v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8,  a1,a9, b1,b9, ....a3,a11, b3,b11
+    v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
+    v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
+    v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
+
+    tmp0 =  _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8,  d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
+    tmp1 =  _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
+    tmp2 =  _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
+    tmp3 =  _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
+
+    v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
+    v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
+    v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
+    v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
+{
+    uint16x8x4_t v;
+    __m128i tmp3, tmp2, tmp1, tmp0;
+    tmp0  =  vld1q_u16 (ptr); //a0,a1,a2,...a7
+    tmp1  =  vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
+    tmp2  =  vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
+    tmp3  =  vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
+    v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
+    v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
+    v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
+    v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
+    tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
+    tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
+    tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
+    tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
+    v.val[0] =  _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
+    v.val[1] =  _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
+    v.val[2] =  _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
+    v.val[3] =  _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
+{
+    uint32x4x4_t v;
+    __m128i tmp3, tmp2, tmp1, tmp0;
+    v.val[0] =  vld1q_u32 (ptr);
+    v.val[1] =  vld1q_u32 ((ptr + 4));
+    v.val[2] =  vld1q_u32 ((ptr + 8));
+    v.val[3] =  vld1q_u32 ((ptr + 12));
+    tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
+    tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
+    tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
+    tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
+    v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
+    v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
+    v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
+    v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
+    return v;
+}
+
+_NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
+
+_NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+#define  vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
+
+_NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+#define  vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
+
+_NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
+{
+    float32x4x4_t v;
+    __m128 tmp3, tmp2, tmp1, tmp0;
+
+    v.val[0] =  vld1q_f32 ((float*) ptr);
+    v.val[1] =  vld1q_f32 ((float*) (ptr + 4));
+    v.val[2] =  vld1q_f32 ((float*) (ptr + 8));
+    v.val[3] =  vld1q_f32 ((float*) (ptr + 12));
+    tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
+    tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
+    tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
+    tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
+    v.val[0] = _mm_movelh_ps(tmp0, tmp2);
+    v.val[1] = _mm_movehl_ps(tmp2, tmp0);
+    v.val[2] = _mm_movelh_ps(tmp1, tmp3);
+    v.val[3] = _mm_movehl_ps(tmp3, tmp1);
+    return v;
+}
+
+_NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+#define vld4q_p8 vld4q_u8
+
+_NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+#define vld4q_p16 vld4q_s16
+
+_NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
+{
+    uint8x8x4_t v;
+    __m128i sh0, sh1;
+    __m128i val0,  val2;
+    _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
+
+    val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1]
+    val2 = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3]
+
+    sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
+    sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
+    val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
+    vst1q_u8(&v.val[0], val0 );
+    val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
+    vst1q_u8(&v.val[2], val2 );
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
+{
+    uint16x4x4_t v;
+    __m128i sh0, sh1;
+    __m128i val0, val2;
+    _NEON2SSE_ALIGN_16 static const int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
+    val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1]
+    val2 = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3]
+    sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
+    sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
+    val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
+    vst1q_u16(&v.val[0], val0 );
+    val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
+    vst1q_u16(&v.val[2], val2 );
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
+{
+    //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
+    uint32x2x4_t v;
+    __m128i val0, val01, val2;
+    val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
+    val2 =  vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
+    val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
+    val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
+    vst1q_u32(&v.val[0], val01);
+    vst1q_u32(&v.val[2], val2 );
+    return v;
+}
+
+_NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
+{
+    uint64x1x4_t v;
+    v.val[0].m64_u64[0] = *(ptr); //load first 64-bits in val[0] and val[1]
+    v.val[1].m64_u64[0] = *(ptr + 1); //load first 64-bits in val[0] and val[1]
+    v.val[2].m64_u64[0] = *(ptr + 2); //load third and forth 64-bits in val[2], val[3]
+    v.val[3].m64_u64[0] = *(ptr + 3); //load third and forth 64-bits in val[2], val[3]
+    return v;
+}
+
+_NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+#define  vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
+
+_NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
+
+_NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
+
+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
+
+_NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
+{
+    //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
+    float32x2x4_t res;
+    res.val[0].m64_f32[0] = *(ptr);
+    res.val[0].m64_f32[1] = *(ptr + 4);
+    res.val[1].m64_f32[0] = *(ptr + 1);
+    res.val[1].m64_f32[1] = *(ptr + 5);
+    res.val[2].m64_f32[0] = *(ptr + 2);
+    res.val[2].m64_f32[1] = *(ptr + 6);
+    res.val[3].m64_f32[0] = *(ptr + 3);
+    res.val[3].m64_f32[1] = *(ptr + 7);
+    return res;
+}
+
+_NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+#define vld4_p8 vld4_u8
+
+_NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+#define vld4_p16 vld4_u16
+
+//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
+//*******************************************************************************************************************
+_NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
+{
+    uint8x8x2_t v;
+    __m128i val0, val1;
+    val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
+    val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
+    val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
+    val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+    vst1q_u8(v.val, val0);
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
+{
+    uint16x4x2_t v;
+    __m128i val0, val1;
+    val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
+    val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
+    _M64(v.val[0], val0);
+    val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
+    _M64(v.val[1], val1);
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
+{
+    uint32x2x2_t v;
+    __m128i val0;
+    val0 = LOAD_SI128(ptr); //0,1,x,x
+    val0 = _mm_shuffle_epi32(val0,   0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
+    vst1q_u32(v.val, val0);
+    return v;
+}
+
+_NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+#define vld2_dup_u64 vld2_u64
+
+_NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
+
+_NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
+
+_NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
+
+_NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
+
+_NEON2SSESTORAGE float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
+{
+    float32x2x2_t v;
+    v.val[0].m64_f32[0] = *(ptr); //0,0
+    v.val[0].m64_f32[1] = *(ptr); //0,0
+    v.val[1].m64_f32[0] = *(ptr + 1); //1,1
+    v.val[1].m64_f32[1] = *(ptr + 1); //1,1
+    return v;
+}
+
+_NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+#define vld2_dup_p8 vld2_dup_u8
+
+_NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+#define vld2_dup_p16 vld2_dup_s16
+
+//************* Duplicate (or propagate)triplets: *******************
+//********************************************************************
+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
+_NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
+{
+    uint8x8x3_t v;
+    __m128i val0, val1, val2;
+    val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
+    val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
+    val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
+    val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+    val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
+    vst1q_u8(v.val, val0);
+    _M64(v.val[2], val2);
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
+{
+    uint16x4x3_t v;
+    __m128i val0, val1, val2;
+    val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
+    val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
+    val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
+    val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
+    _M64(v.val[0], val0);
+    _M64(v.val[1], val1);
+    _M64(v.val[2], val2);
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
+{
+    uint32x2x3_t v;
+    __m128i val0, val1, val2;
+    val2 = LOAD_SI128(ptr); //0,1,2,x
+    val0 = _mm_shuffle_epi32(val2,   0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
+    val1 = _mm_shuffle_epi32(val2,   1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
+    val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
+    _M64(v.val[0], val0);
+    _M64(v.val[1], val1);
+    _M64(v.val[2], val2);
+    return v;
+}
+
+_NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
+{
+    uint64x1x3_t v;
+    v.val[0].m64_u64[0] = *(ptr);
+    v.val[1].m64_u64[0] = *(ptr + 1);
+    v.val[2].m64_u64[0] = *(ptr + 2);
+    return v;
+}
+
+_NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
+
+_NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
+
+_NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
+
+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
+
+
+_NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
+{
+    float32x2x3_t v;
+    int i;
+    for (i = 0; i<3; i++) {
+        v.val[i].m64_f32[0] = *(ptr + i);
+        v.val[i].m64_f32[1] = *(ptr + i);
+    }
+    return v;
+}
+
+_NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_p8 vld3_dup_u8
+
+_NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_p16 vld3_dup_s16
+
+
+//************* Duplicate (or propagate) quadruples: *******************
+//***********************************************************************
+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes  and  ptr[3] to all val[3] lanes
+_NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+{
+    uint8x8x4_t v;
+    __m128i val0, val1, val2;
+    val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
+    val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
+    val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
+    val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+    val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
+    vst1q_u8(&v.val[0], val0);
+    vst1q_u8(&v.val[2], val2);
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+{
+    uint16x4x4_t v;
+    __m128i val0, val1, val2, val3;
+    val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
+    val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
+    val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
+    val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
+    val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
+    _M64(v.val[0], val0);
+    _M64(v.val[1], val1);
+    _M64(v.val[2], val2);
+    _M64(v.val[3], val3);
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+{
+    uint32x2x4_t v;
+    __m128i val0, val1, val2, val3;
+    val3 = LOAD_SI128(ptr); //0,1,2,3
+    val0 = _mm_shuffle_epi32(val3,   0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
+    val1 = _mm_shuffle_epi32(val3,   1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
+    val2 = _mm_shuffle_epi32(val3,   2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
+    val3 = _mm_shuffle_epi32(val3,   3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2
+    _M64(v.val[0], val0);
+    _M64(v.val[1], val1);
+    _M64(v.val[2], val2);
+    _M64(v.val[3], val3);
+    return v;
+}
+
+_NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
+{
+    uint64x1x4_t v;
+    v.val[0].m64_u64[0] = *(ptr);
+    v.val[1].m64_u64[0] = *(ptr + 1);
+    v.val[2].m64_u64[0] = *(ptr + 2);
+    v.val[3].m64_u64[0] = *(ptr + 3);
+    return v;
+}
+
+_NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
+
+_NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
+
+_NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
+
+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
+
+_NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+{
+    float32x2x4_t v;
+    int i;
+    for (i = 0; i<4; i++) {
+        v.val[i].m64_f32[0] = *(ptr + i);
+        v.val[i].m64_f32[1] = *(ptr + i);
+    }
+    return v;
+}
+
+_NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const  * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_p8 vld4_dup_u8
+
+_NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_p16 vld4_dup_u16
+
+
+//**********************************************************************************
+//*******************Lane loads for  an N-element structures ***********************
+//**********************************************************************************
+//********************** Lane pairs  ************************************************
+//does vld1_lane_xx ptr[0] to src->val[0] at lane positon and ptr[1] to src->val[1] at lane positon
+//we assume  src is 16 bit aligned
+
+//!!!!!! Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error
+//to fix it the all functions below work with  xxxxxx_2t pointers and the corresponding original functions are redefined
+
+//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
+{
+    uint16x8x2_t v;
+    v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
+    v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
+    return v;
+}
+#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
+
+//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
+{
+    uint32x4x2_t v;
+    v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
+    return v;
+}
+#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
+
+//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
+{
+    int16x8x2_t v;
+    v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
+    v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
+    return v;
+}
+#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
+
+//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
+{
+    int32x4x2_t v;
+    v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
+    return v;
+}
+#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
+
+//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
+{
+    float32x4x2_t v;
+    v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
+    v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
+    return v;
+}
+#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
+
+//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
+#define vld2q_lane_p16 vld2q_lane_u16
+
+_NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
+{
+    uint8x8x2_t v;
+    v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
+    v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane)
+{
+    uint16x4x2_t v;
+    v.val[0]  =  vld1_lane_u16(ptr, src.val[0], lane);
+    v.val[1]  = vld1_lane_u16((ptr + 1), src.val[1], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane)
+{
+    uint32x2x2_t v;
+    v.val[0]  =  vld1_lane_u32(ptr, src.val[0], lane);
+    v.val[1]  = vld1_lane_u32((ptr + 1), src.val[1], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
+#define vld2_lane_s8(ptr, src, lane)  vld2_lane_u8(( uint8_t*) ptr, src, lane)
+
+_NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
+#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
+
+_NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
+#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
+
+//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src,__constrange(0,1) int lane)
+{
+    float32x2x2_t v;
+    v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
+    v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
+    return v;
+}
+
+//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
+#define vld2_lane_p8 vld2_lane_u8
+
+//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+#define vld2_lane_p16 vld2_lane_u16
+
+//*********** Lane triplets **********************
+//*************************************************
+//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at lane positon
+//we assume src is 16 bit aligned
+
+//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+{
+    uint16x8x3_t v;
+    v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
+    return v;
+}
+#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
+
+//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+{
+    uint32x4x3_t v;
+    v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
+    return v;
+}
+#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
+
+//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+{
+    int16x8x3_t v;
+    v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
+    return v;
+}
+#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
+
+//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+{
+    int32x4x3_t v;
+    v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
+    return v;
+}
+#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
+
+_NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+//current IA SIMD doesn't support float16
+#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
+
+
+//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+{
+    float32x4x3_t v;
+    v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
+    v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
+    v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
+    return v;
+}
+#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
+
+_NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+#define vld3q_lane_p16 vld3q_lane_u16
+
+_NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+{
+    uint8x8x3_t v;
+    v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
+    v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
+    v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t   const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+{
+    uint16x4x3_t v;
+    v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
+    v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
+    v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+{
+    //need to merge into 128 bit anyway
+    uint32x2x3_t v;
+    v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);;
+    v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);;
+    v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);;
+    return v;
+}
+
+_NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t  src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_s8(ptr, src, lane)  vld3_lane_u8(( uint8_t*) ptr, src, lane)
+
+_NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t  src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_s16(ptr, src, lane)  vld3_lane_u16(( uint16_t*) ptr, src, lane)
+
+_NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t  src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_s32(ptr, src, lane)  vld3_lane_u32(( uint32_t*) ptr, src, lane)
+
+_NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+{
+    float32x2x3_t v;
+    v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
+    v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
+    v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_p8 vld3_lane_u8
+
+_NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_p16 vld3_lane_u16
+
+//******************* Lane Quadruples  load ***************************
+//*********************************************************************
+//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at lane positon
+//we assume src is 16 bit aligned
+
+//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
+{
+    uint16x8x4_t v;
+    v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
+    v.val[3] = _MM_INSERT_EPI16 ( src->val[3],  ptr[3], lane);
+    return v;
+}
+#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
+
+//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
+{
+    uint32x4x4_t v;
+    v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
+    v.val[3] = _MM_INSERT_EPI32 ( src->val[3],  ptr[3], lane);
+    return v;
+}
+#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
+
+//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
+
+//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+#define vld4q_lane_s32(ptr, src, lane)  vld4q_lane_u32(( uint32_t*) ptr, src, lane)
+
+//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
+{
+    float32x4x4_t v;
+    v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
+    v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
+    v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
+    v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
+    return v;
+}
+#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
+
+//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+#define vld4q_lane_p16 vld4q_lane_u16
+
+_NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)
+{
+    uint8x8x4_t v;
+    v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
+    v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
+    v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
+    v.val[3] = vld1_lane_u8((ptr + 3), src.val[3], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane)
+{
+    uint16x4x4_t v;
+    v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
+    v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
+    v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
+    v.val[3] = vld1_lane_u16((ptr + 3), src.val[3], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane)
+{
+    uint32x2x4_t v;
+    v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
+    v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
+    v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
+    v.val[3] = vld1_lane_u32((ptr + 3), src.val[3], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
+
+_NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
+
+_NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
+
+//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)
+{
+    //serial solution may be faster
+    float32x2x4_t v;
+    v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
+    v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
+    v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
+    v.val[3] = vld1_lane_f32((ptr + 3), src.val[3], lane);
+    return v;
+}
+
+_NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vld4_lane_p8 vld4_lane_u8
+
+_NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vld4_lane_p16 vld4_lane_u16
+
+//******************* Store duplets *********************************************
+//********************************************************************************
+//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
+_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t const * val)
+{
+    uint8x16x2_t v;
+    v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
+    v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
+    vst1q_u8 (ptr, v.val[0]);
+    vst1q_u8 ((ptr + 16),  v.val[1]);
+}
+#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
+
+//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
+_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t const * val)
+{
+    uint16x8x2_t v;
+    v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
+    v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
+    vst1q_u16 (ptr, v.val[0]);
+    vst1q_u16 ((ptr + 8),  v.val[1]);
+}
+#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
+
+//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t const * val)
+{
+    uint32x4x2_t v;
+    v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
+    v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
+    vst1q_u32 (ptr, v.val[0]);
+    vst1q_u32 ((ptr + 4),  v.val[1]);
+}
+#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
+
+//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t const * val);
+#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
+
+//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t const * val);
+#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
+
+//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t const * val);
+#define vst2q_s32(ptr, val)  vst2q_u32((uint32_t*)(ptr), val)
+
+//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t const * val);
+// IA32 SIMD doesn't work with 16bit floats currently
+
+//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t const * val)
+{
+    float32x4x2_t v;
+    v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
+    v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
+    vst1q_f32 (ptr, v.val[0]);
+    vst1q_f32 ((ptr + 4),  v.val[1]);
+}
+#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
+
+//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t const * val);
+#define vst2q_p8 vst2q_u8
+
+//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
+_NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t const * val);
+#define vst2q_p16 vst2q_u16
+
+_NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val)
+{
+    __m128i v0;
+    v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1]));
+    vst1q_u8 (ptr, v0);
+}
+
+_NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val)
+{
+    __m128i v0;
+    v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1]));
+    vst1q_u16 (ptr, v0);
+}
+
+_NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val)
+{
+    __m128i v0;
+    v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1]));
+    vst1q_u32 (ptr, v0);
+}
+
+_NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val)
+{
+    *(ptr) = val.val[0].m64_u64[0];
+    *(ptr + 1) = val.val[1].m64_u64[0];
+}
+
+_NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
+#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
+
+_NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
+
+_NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
+#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
+
+_NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
+#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
+
+//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst2_f32(__transfersize(4) float32_t* ptr, float32x2x2_t val)
+{
+    *(ptr) =   val.val[0].m64_f32[0];
+    *(ptr + 1) = val.val[1].m64_f32[0];
+    *(ptr + 2) = val.val[0].m64_f32[1];
+    *(ptr + 3) = val.val[1].m64_f32[1];
+}
+
+_NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t  val); // VST2.8 {d0, d1}, [r0]
+#define vst2_p8 vst2_u8
+
+_NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t  val); // VST2.16 {d0, d1}, [r0]
+#define vst2_p16 vst2_u16
+
+//******************** Triplets store  *****************************************
+//******************************************************************************
+//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t const * val)
+{
+    uint8x16x3_t v;
+    __m128i v0,v1,v2, cff, bldmask;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask0[16]   = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask1[16]   = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] =    {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
+
+    v0 =  _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
+    v2 =  _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25,  27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
+    v1 =  _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25,  27,28, 30,31, 33,34
+    v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
+    cff = _mm_cmpeq_epi8(v0, v0); //all ff
+    bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
+    v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
+    vst1q_u8(ptr,   v.val[0]);
+    v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
+    bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
+    v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
+    vst1q_u8((ptr + 16),  v.val[1]);
+    v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
+    bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
+    v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
+    vst1q_u8((ptr + 32),  v.val[2]);
+}
+#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
+
+//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t const * val)
+{
+    uint16x8x3_t v;
+    __m128i v0,v1,v2, cff, bldmask;
+    _NEON2SSE_ALIGN_16 static const uint8_t mask0[16]   = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask1[16]   = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] =    {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
+
+    v0 =  _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
+    v2 =  _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
+    v1 =  _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
+    v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
+    cff = _mm_cmpeq_epi16(v0, v0); //all ff
+    bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
+    v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
+    vst1q_u16(ptr,      v.val[0]);
+    v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
+    bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
+    v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
+    vst1q_u16((ptr + 8),  v.val[1]);
+    v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
+    bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
+    v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
+    vst1q_u16((ptr + 16), v.val[2]);
+}
+#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
+
+//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t const * val)
+{
+    //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
+    uint32x4x3_t v;
+    __m128i tmp0, tmp1,tmp2;
+    tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
+    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
+    tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
+    v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
+    v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
+    v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
+    tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
+    v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
+
+    vst1q_u32(ptr,      v.val[0]);
+    vst1q_u32((ptr + 4),  v.val[1]);
+    vst1q_u32((ptr + 8),  v.val[2]);
+}
+#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
+
+//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
+_NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t const * val);
+#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
+
+//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
+_NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t const * val);
+#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
+
+//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
+_NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t const * val);
+#define vst3q_s32(ptr, val)  vst3q_u32((uint32_t*)(ptr), val)
+
+//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t const * val);
+// IA32 SIMD doesn't work with 16bit floats currently
+
+//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t const * val)
+{
+    float32x4x3_t v;
+    __m128 tmp0, tmp1,tmp2;
+    tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
+    tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
+    tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
+    v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
+    v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
+    v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
+    tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
+    v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
+
+    vst1q_f32( ptr,    v.val[0]);
+    vst1q_f32( (ptr + 4),  v.val[1]);
+    vst1q_f32( (ptr + 8),  v.val[2]);
+}
+#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
+
+//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t const * val);
+#define vst3q_p8 vst3q_u8
+
+//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t const * val);
+#define vst3q_p16 vst3q_u16
+
+_NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)
+{
+    __m128i tmp, sh0, sh1, val0, val2;
+    _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
+    _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
+    _NEON2SSE_ALIGN_16 static const uint8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
+    tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]) );
+    sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15)
+    val2 = _pM128i(val.val[2]);
+    sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask0);
+    val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
+    vst1q_u8(ptr,   val0); //store as 128 bit structure
+    sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15)
+    sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask1);
+    val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
+    _M64((*(__m64_128*)(ptr + 16)),  val2); //need it to fit into *ptr memory
+}
+
+_NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)
+{
+    __m128i tmp, val0, val1, val2;
+    _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
+    _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23,   0,0,0,0,0,0,0,0};
+    _NEON2SSE_ALIGN_16 static const uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0]  otherwise from v.val[1]
+    _NEON2SSE_ALIGN_16 static const uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1]  otherwise from v.val[0]
+    tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]));
+    val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
+    val2 = _pM128i(val.val[2]);
+    val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
+    val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
+    vst1q_u16(ptr,    val0); //store as 128 bit structure
+    val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
+    val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
+    val1 = _MM_BLENDV_EPI8(val0, val1,  *(__m128i*)mask1f); //change the operands order
+    _M64((*(__m64_128*)(ptr + 8)),  val1); //need it to fit into *ptr memory
+}
+
+_NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)
+{
+    //val.val[0]:0,3,val.val[1]:1,4; val.val[2]:2,5,x,x;
+    __m128i val0, val1;
+    val0 = _mm_unpacklo_epi64(_pM128i(val.val[1]), _pM128i(val.val[2])); //val[0]: 1,4,2,5
+    val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
+    val1 = _mm_srli_si128(val0, 8); //4,5, x,x
+    _M64((*(__m64_128*)(ptr + 4)),  val1);
+    val0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), val0); //0,1,3,2
+    val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
+    vst1q_u32(ptr, val0); //store as 128 bit structure
+}
+
+_NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val);// VST1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)
+{
+    *(ptr) = val.val[0].m64_u64[0];
+    *(ptr + 1) = val.val[1].m64_u64[0];
+    *(ptr + 2) = val.val[2].m64_u64[0];
+}
+
+_NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val);  // VST3.8 {d0, d1, d2}, [r0]
+#define vst3_s8(ptr, val) vst3_u8((uint8_t*)ptr, val)
+
+_NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val);  // VST3.16 {d0, d1, d2}, [r0]
+#define vst3_s16(ptr, val) vst3_u16((uint16_t*)ptr, val)
+
+_NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
+#define vst3_s32(ptr, val) vst3_u32((uint32_t*)ptr, val)
+
+_NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
+#define vst3_s64(ptr, val) vst3_u64((uint64_t*)ptr, val)
+
+//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
+_NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t const * val); // VST3.16 {d0, d1, d2}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)
+{
+    //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;   -> 0,2, 4,1, 3,5
+    *(ptr) =   val.val[0].m64_f32[0];
+    *(ptr + 1) = val.val[1].m64_f32[0];
+    *(ptr + 2) = val.val[2].m64_f32[0];
+    *(ptr + 3) = val.val[0].m64_f32[1];
+    *(ptr + 4) = val.val[1].m64_f32[1];
+    *(ptr + 5) = val.val[2].m64_f32[1];
+}
+
+_NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
+#define vst3_p8 vst3_u8
+
+_NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
+#define vst3_p16 vst3_u16
+
+//***************  Quadruples store ********************************
+//*********************************************************************
+//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t const * val)
+{
+    __m128i tmp1, tmp2, res;
+    tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //  0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
+    tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); //  2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
+    res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
+    vst1q_u8(ptr,  res);
+    res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
+    vst1q_u8((ptr + 16), res);
+    tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
+    tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
+    res = _mm_unpacklo_epi16(tmp1, tmp2); //
+    vst1q_u8((ptr + 32), res);
+    res = _mm_unpackhi_epi16(tmp1, tmp2); //
+    vst1q_u8((ptr + 48), res);
+}
+#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
+
+//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t const * val)
+{
+    uint16x8x4_t v;
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
+    v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
+    v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
+    tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
+    v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
+    v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
+    vst1q_u16(ptr,     v.val[0]);
+    vst1q_u16((ptr + 8), v.val[1]);
+    vst1q_u16((ptr + 16),v.val[2]);
+    vst1q_u16((ptr + 24), v.val[3]);
+}
+#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
+
+//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t const * val)
+{
+    uint16x8x4_t v;
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
+    v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
+    v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
+    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
+    v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
+    v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
+    vst1q_u32(ptr,      v.val[0]);
+    vst1q_u32((ptr + 4),  v.val[1]);
+    vst1q_u32((ptr + 8),  v.val[2]);
+    vst1q_u32((ptr + 12), v.val[3]);
+}
+#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
+
+//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
+_NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t const * val);
+#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
+
+//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
+_NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t const * val);
+#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
+
+//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
+_NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t const * val);
+#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
+
+//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t const * val);
+// IA32 SIMD doesn't work with 16bit floats currently
+
+//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t const * val)
+{
+    __m128 tmp3, tmp2, tmp1, tmp0;
+    float32x4x4_t v;
+    tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
+    tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
+    tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
+    tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
+    v.val[0] = _mm_movelh_ps(tmp0, tmp2);
+    v.val[1] = _mm_movehl_ps(tmp2, tmp0);
+    v.val[2] = _mm_movelh_ps(tmp1, tmp3);
+    v.val[3] = _mm_movehl_ps(tmp3, tmp1);
+    vst1q_f32(ptr,   v.val[0]);
+    vst1q_f32((ptr + 4), v.val[1]);
+    vst1q_f32((ptr + 8), v.val[2]);
+    vst1q_f32((ptr + 12), v.val[3]);
+}
+#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
+
+//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t const * val);
+#define vst4q_p8 vst4q_u8
+
+//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t const * val);
+#define vst4q_p16 vst4q_s16
+
+_NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)
+{
+    __m128i sh0, sh1, val0, val2;
+    sh0 = _mm_unpacklo_epi8(_pM128i(val.val[0]),_pM128i(val.val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
+    sh1 = _mm_unpacklo_epi8(_pM128i(val.val[2]),_pM128i(val.val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
+    val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
+    val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
+    vst1q_u8(ptr,    val0);
+    vst1q_u8((ptr + 16),  val2);
+}
+
+_NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)
+{
+    __m128i sh0, sh1, val0, val2;
+    sh0 = _mm_unpacklo_epi16(_pM128i(val.val[0]),_pM128i(val.val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
+    sh1 = _mm_unpacklo_epi16(_pM128i(val.val[2]),_pM128i(val.val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
+    val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
+    val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
+    vst1q_u16(ptr,      val0); //store as 128 bit structure
+    vst1q_u16((ptr + 8),  val2);
+}
+
+_NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)
+{
+    //0,4,   1,5,  2,6,  3,7
+    __m128i sh0, sh1, val0, val1;
+    sh0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); //0,1,4,5
+    sh1 = _mm_unpacklo_epi32(_pM128i(val.val[2]), _pM128i(val.val[3])); //2,3,6,7
+    val0 = _mm_unpacklo_epi64(sh0,sh1); //
+    val1 = _mm_unpackhi_epi64(sh0,sh1); //
+    vst1q_u32(ptr,     val0); //store as 128 bit structure
+    vst1q_u32((ptr + 4),  val1);
+}
+
+_NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val);// VST1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)
+{
+    *(ptr) =  val.val[0].m64_u64[0];
+    *(ptr + 1) =  val.val[1].m64_u64[0];
+    *(ptr + 2) =  val.val[2].m64_u64[0];
+    *(ptr + 3) =  val.val[3].m64_u64[0];
+}
+
+//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val)  //VST4.8 {d0, d1, d2, d3}, [r0]
+#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
+
+//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val)  // VST4.16 {d0, d1, d2, d3}, [r0]
+#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
+
+//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
+#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
+
+//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t const * val);
+#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
+
+//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t const * val);
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+_NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)
+{
+    //0,4,   1,5,  2,6,  3,7 -> 0,1, 2,3, 4,5, 6,7
+    *(ptr) =   val.val[0].m64_f32[0];
+    *(ptr + 1) = val.val[1].m64_f32[0];
+    *(ptr + 2) = val.val[2].m64_f32[0];
+    *(ptr + 3) = val.val[3].m64_f32[0];
+    *(ptr + 4) = val.val[0].m64_f32[1];
+    *(ptr + 5) = val.val[1].m64_f32[1];
+    *(ptr + 6) = val.val[2].m64_f32[1];
+    *(ptr + 7) = val.val[3].m64_f32[1];
+}
+
+_NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
+#define vst4_p8 vst4_u8
+
+_NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
+#define vst4_p16 vst4_u16
+
+//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors  *********************
+//********************************************************************************************************************
+//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t const * val, __constrange(0,7) int lane)
+{
+    vst1q_lane_s16(ptr, val->val[0], lane);
+    vst1q_lane_s16((ptr + 1), val->val[1], lane);
+}
+#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
+
+//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t const * val, __constrange(0,3) int lane)
+{
+    vst1q_lane_u32(ptr, val->val[0], lane);
+    vst1q_lane_u32((ptr + 1), val->val[1], lane);
+}
+#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
+
+//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t const * val, __constrange(0,7) int lane);
+#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
+
+//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t const * val, __constrange(0,3) int lane);
+#define vst2q_lane_s32(ptr, val, lane)  vst2q_lane_u32((uint32_t*)ptr, val, lane)
+
+//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t const * val, __constrange(0,7) int lane);
+//current IA SIMD doesn't support float16
+
+//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t const * val, __constrange(0,3) int lane)
+{
+    vst1q_lane_f32(ptr, val->val[0], lane);
+    vst1q_lane_f32((ptr + 1), val->val[1], lane);
+}
+#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
+
+//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t const * val, __constrange(0,7) int lane);
+#define vst2q_lane_p16 vst2q_lane_s16
+
+_NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
+{
+    *(ptr) = val.val[0].m64_u8[lane];
+    *(ptr + 1) = val.val[1].m64_u8[lane];
+}
+
+_NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane)
+{
+    *(ptr) = val.val[0].m64_u16[lane];
+    *(ptr + 1) = val.val[1].m64_u16[lane];
+}
+
+_NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.val[0].m64_u32[lane];
+    *(ptr + 1) = val.val[1].m64_u32[lane];
+}
+
+_NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
+#define vst2_lane_s8(ptr, val, lane)  vst2_lane_u8((uint8_t*)ptr, val, lane)
+
+_NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
+#define vst2_lane_s16(ptr, val, lane)  vst2_lane_u16((uint16_t*)ptr, val, lane)
+
+_NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
+#define vst2_lane_s32(ptr, val, lane)  vst2_lane_u32((uint32_t*)ptr, val, lane)
+
+//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.val[0].m64_f32[lane];
+    *(ptr + 1) = val.val[1].m64_f32[lane];
+}
+
+_NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
+#define vst2_lane_p8 vst2_lane_u8
+
+_NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
+#define vst2_lane_p16 vst2_lane_u16
+
+//************************* Triple lanes  stores *******************************************************
+//*******************************************************************************************************
+//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t const * val, __constrange(0,7) int lane)
+{
+    vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
+    vst1q_lane_u16((ptr + 2), val->val[2], lane);
+}
+#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
+
+//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t const * val, __constrange(0,3) int lane)
+{
+    vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
+    vst1q_lane_u32((ptr + 2), val->val[2], lane);
+}
+#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
+
+//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t const * val, __constrange(0,7) int lane);
+#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
+
+//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t const * val, __constrange(0,3) int lane);
+#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
+
+//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t const * val, __constrange(0,7) int lane);
+//current IA SIMD doesn't support float16
+
+//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t const * val, __constrange(0,3) int lane)
+{
+    vst1q_lane_f32(ptr,   val->val[0], lane);
+    vst1q_lane_f32((ptr + 1),   val->val[1], lane);
+    vst1q_lane_f32((ptr + 2), val->val[2], lane);
+}
+#define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
+
+//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t const * val, __constrange(0,7) int lane);
+#define vst3q_lane_p16 vst3q_lane_s16
+
+_NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)
+{
+    *(ptr) =     val.val[0].m64_u8[lane];
+    *(ptr + 1) = val.val[1].m64_u8[lane];
+    *(ptr + 2) = val.val[2].m64_u8[lane];
+}
+
+_NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)
+{
+    *(ptr) =     val.val[0].m64_u16[lane];
+    *(ptr + 1) = val.val[1].m64_u16[lane];
+    *(ptr + 2) = val.val[2].m64_u16[lane];
+}
+
+_NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)
+{
+    *(ptr) =     val.val[0].m64_u32[lane];
+    *(ptr + 1) = val.val[1].m64_u32[lane];
+    *(ptr + 2) = val.val[2].m64_u32[lane];
+}
+
+_NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
+#define  vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
+
+_NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
+#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
+
+_NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
+#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
+
+//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t const * val, __constrange(0,3) int lane);
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.val[0].m64_f32[lane];
+    *(ptr + 1) = val.val[1].m64_f32[lane];
+    *(ptr + 2) = val.val[2].m64_f32[lane];
+}
+
+_NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
+#define vst3_lane_p8 vst3_lane_u8
+
+_NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
+#define vst3_lane_p16 vst3_lane_u16
+
+//******************************** Quadruple lanes stores ***********************************************
+//*******************************************************************************************************
+//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t const * val4, __constrange(0,7) int lane)
+{
+    vst2q_lane_u16_ptr(ptr,    (uint16x8x2_t*)val4->val, lane);
+    vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
+}
+#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
+
+//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t const * val4, __constrange(0,3) int lane)
+{
+    vst2q_lane_u32_ptr(ptr,     (uint32x4x2_t*)val4->val, lane);
+    vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
+}
+#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
+
+//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t const * val, __constrange(0,7) int lane);
+#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
+
+//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t const * val, __constrange(0,3) int lane);
+#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
+
+//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t const * val, __constrange(0,7) int lane);
+//current IA SIMD doesn't support float16
+
+//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t const * val, __constrange(0,3) int lane)
+{
+    vst1q_lane_f32(ptr,   val->val[0], lane);
+    vst1q_lane_f32((ptr + 1), val->val[1], lane);
+    vst1q_lane_f32((ptr + 2), val->val[2], lane);
+    vst1q_lane_f32((ptr + 3), val->val[3], lane);
+}
+#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
+
+//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t const * val, __constrange(0,7) int lane);
+#define vst4q_lane_p16 vst4q_lane_u16
+
+_NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)
+{
+    *(ptr) =     val.val[0].m64_u8[lane];
+    *(ptr + 1) = val.val[1].m64_u8[lane];
+    *(ptr + 2) = val.val[2].m64_u8[lane];
+    *(ptr + 3) = val.val[3].m64_u8[lane];
+}
+
+_NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)
+{
+    *(ptr) =     val.val[0].m64_u16[lane];
+    *(ptr + 1) = val.val[1].m64_u16[lane];
+    *(ptr + 2) = val.val[2].m64_u16[lane];
+    *(ptr + 3) = val.val[3].m64_u16[lane];
+}
+
+_NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)
+{
+    *(ptr) =     val.val[0].m64_u32[lane];
+    *(ptr + 1) = val.val[1].m64_u32[lane];
+    *(ptr + 2) = val.val[2].m64_u32[lane];
+    *(ptr + 3) = val.val[3].m64_u32[lane];
+}
+
+_NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
+
+_NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
+
+_NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
+
+//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t const * val, __constrange(0,3) int lane);
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t  val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.val[0].m64_f32[lane];
+    *(ptr + 1) = val.val[1].m64_f32[lane];
+    *(ptr + 2) = val.val[2].m64_f32[lane];
+    *(ptr + 3) = val.val[3].m64_f32[lane];
+}
+
+_NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vst4_lane_p8 vst4_lane_u8
+
+_NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vst4_lane_p16 vst4_lane_u16
+
+//**************************************************************************************************
+//************************ Extract lanes from a vector ********************************************
+//**************************************************************************************************
+//These intrinsics extract a single lane (element) from a vector.
+_NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+#define vget_lane_u8(vec, lane) vec.m64_u8[lane]
+
+_NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
+#define vget_lane_u16(vec, lane) vec.m64_u16[lane]
+
+
+_NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+#define vget_lane_u32(vec, lane) vec.m64_u32[lane]
+
+_NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
+#define vget_lane_s8(vec, lane) vec.m64_i8[lane]
+
+_NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
+#define vget_lane_s16(vec, lane) vec.m64_i16[lane]
+
+_NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+#define vget_lane_s32(vec, lane) vec.m64_i32[lane]
+
+_NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+#define vget_lane_p8 vget_lane_u8
+
+_NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
+#define vget_lane_p16 vget_lane_u16
+
+_NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+#define vget_lane_f32(vec, lane) vec.m64_f32[lane]
+
+_NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+#define vgetq_lane_u8 (uint8_t) _MM_EXTRACT_EPI8
+
+_NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
+#define  vgetq_lane_u16 (uint16_t) _MM_EXTRACT_EPI16
+
+_NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+#define vgetq_lane_u32 (uint32_t) _MM_EXTRACT_EPI32
+
+_NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
+#define vgetq_lane_s8 _MM_EXTRACT_EPI8
+
+_NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
+#define vgetq_lane_s16 _MM_EXTRACT_EPI16
+
+_NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+#define vgetq_lane_s32 _MM_EXTRACT_EPI32
+
+_NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+#define vgetq_lane_p8 vgetq_lane_u8
+
+_NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
+#define vgetq_lane_p16 vgetq_lane_u16
+
+_NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
+{
+    int32_t ilane;
+    ilane = _MM_EXTRACT_PS(vec,lane);
+    return *(float*)&ilane;
+}
+
+_NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+#define vget_lane_s64(vec, lane) vec.m64_i64[0]
+
+_NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+#define vget_lane_u64(vec, lane) vec.m64_u64[0]
+
+
+_NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+#define vgetq_lane_s64 _MM_EXTRACT_EPI64
+
+_NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+#define vgetq_lane_u64 (uint64_t) _MM_EXTRACT_EPI64
+
+// ***************** Set lanes within a vector ********************************************
+// **************************************************************************************
+//These intrinsics set a single lane (element) within a vector.
+//same functions as vld1_lane_xx ones, but take the value to be set directly.
+
+_NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
+{
+    uint8_t val;
+    val = value;
+    return vld1_lane_u8(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
+{
+    uint16_t val;
+    val = value;
+    return vld1_lane_u16(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
+{
+    uint32_t val;
+    val = value;
+    return vld1_lane_u32(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
+{
+    int8_t val;
+    val = value;
+    return vld1_lane_s8(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
+{
+    int16_t val;
+    val = value;
+    return vld1_lane_s16(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
+{
+    int32_t val;
+    val = value;
+    return vld1_lane_s32(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+#define vset_lane_p8  vset_lane_u8
+
+_NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+#define vset_lane_p16  vset_lane_u16
+
+_NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
+{
+    float32_t val;
+    val = value;
+    return vld1_lane_f32(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
+{
+    uint8_t val;
+    val = value;
+    return vld1q_lane_u8(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
+{
+    uint16_t val;
+    val = value;
+    return vld1q_lane_u16(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
+{
+    uint32_t val;
+    val = value;
+    return vld1q_lane_u32(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
+{
+    int8_t val;
+    val = value;
+    return vld1q_lane_s8(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
+{
+    int16_t val;
+    val = value;
+    return vld1q_lane_s16(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
+{
+    int32_t val;
+    val = value;
+    return vld1q_lane_s32(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+#define vsetq_lane_p8 vsetq_lane_u8
+
+_NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+#define vsetq_lane_p16 vsetq_lane_u16
+
+_NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
+{
+    float32_t val;
+    val = value;
+    return vld1q_lane_f32(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
+{
+    int64_t val;
+    val = value;
+    return vld1_lane_s64(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
+{
+    uint64_t val;
+    val = value;
+    return vld1_lane_u64(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
+{
+    uint64_t val;
+    val = value;
+    return vld1q_lane_s64(&val, vec,  lane);
+}
+
+_NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+#define vsetq_lane_u64 vsetq_lane_s64
+
+// *******************************************************************************
+// **************** Initialize a vector from bit pattern ***************************
+// *******************************************************************************
+//These intrinsics create a vector from a literal bit pattern.
+_NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_INLINE int8x8_t vcreate_s8(uint64_t a)
+{
+    return (*(__m64_128*)&(a)); //here we couldn't use macro due to possible immediate value usage
+}
+
+_NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s16  vcreate_s8
+
+_NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s32  vcreate_s8
+
+_NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
+//no IA32 SIMD avalilable
+
+_NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
+_NEON2SSE_INLINE float32x2_t vcreate_f32(uint64_t a)
+{
+    return (*(__m64_128*)&(a)); //here we couldn't use macro due to possible immediate value usage
+}
+
+_NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u8 vcreate_s8
+
+_NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u16 vcreate_s16
+
+_NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u32 vcreate_s32
+
+_NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u64  vcreate_s8
+
+
+_NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_p8 vcreate_u8
+
+_NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_p16 vcreate_u16
+
+_NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s64 vcreate_u64
+
+//********************* Set all lanes to same value ********************************
+//*********************************************************************************
+//These intrinsics set all lanes to the same value.
+_NEON2SSESTORAGE uint8x8_t   vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t  vdup_n_u8(uint8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint8x8_t res;
+    int i;
+    for (i = 0; i<8; i++) {
+        res.m64_u8[i] = value;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t   vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t  vdup_n_u16(uint16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint16x4_t res;
+    int i;
+    for (i = 0; i<4; i++) {
+        res.m64_u16[i] = value;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint32x2_t   vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t  vdup_n_u32(uint32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = value;
+    res.m64_u32[1] = value;
+    return res;
+}
+
+_NEON2SSESTORAGE int8x8_t   vdup_n_s8(int8_t value); // VDUP.8 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t  vdup_n_s8(int8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int8x8_t res;
+    int i;
+    for (i = 0; i<8; i++) {
+        res.m64_i8[i] = value;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE int16x4_t   vdup_n_s16(int16_t value); // VDUP.16 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t  vdup_n_s16(int16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int16x4_t res;
+    int i;
+    for (i = 0; i<4; i++) {
+        res.m64_i16[i] = value;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE int32x2_t   vdup_n_s32(int32_t value); // VDUP.32 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t  vdup_n_s32(int32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t res;
+    res.m64_i32[0] = value;
+    res.m64_i32[1] = value;
+    return res;
+}
+
+_NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
+#define vdup_n_p8 vdup_n_u8
+
+_NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
+#define vdup_n_p16 vdup_n_s16
+
+_NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
+_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
+{
+    float32x2_t res;
+    res.m64_f32[0] = value;
+    res.m64_f32[1] = value;
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x16_t   vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
+#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
+
+_NEON2SSESTORAGE uint16x8_t   vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
+#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
+
+_NEON2SSESTORAGE uint32x4_t   vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
+#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
+
+_NEON2SSESTORAGE int8x16_t   vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
+#define vdupq_n_s8 _mm_set1_epi8
+
+_NEON2SSESTORAGE int16x8_t   vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
+#define vdupq_n_s16 _mm_set1_epi16
+
+_NEON2SSESTORAGE int32x4_t   vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
+#define vdupq_n_s32 _mm_set1_epi32
+
+_NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
+#define  vdupq_n_p8 vdupq_n_u8
+
+_NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
+#define  vdupq_n_p16 vdupq_n_u16
+
+_NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
+#define vdupq_n_f32 _mm_set1_ps
+
+_NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
+{
+    int64x1_t res;
+    res.m64_i64[0] = value;
+    return res;
+}
+
+_NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSE_INLINE uint64x1_t  vdup_n_u64(uint64_t value)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = value;
+    return res;
+}
+
+_NEON2SSESTORAGE int64x2_t   vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x2_t   vdupq_n_s64(int64_t value)
+{
+    _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
+    return LOAD_SI128(value2);
+}
+
+_NEON2SSESTORAGE uint64x2_t   vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSE_INLINE uint64x2_t   vdupq_n_u64(uint64_t value)
+{
+    _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
+    return LOAD_SI128(val);
+}
+
+//****  Set all lanes to same value  ************************
+//Same functions as above - just aliaces.********************
+//Probably they reflect the fact that 128-bit functions versions use VMOV instruction **********
+_NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
+#define vmov_n_u8 vdup_n_s8
+
+_NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
+#define vmov_n_u16 vdup_n_s16
+
+_NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
+#define vmov_n_u32 vdup_n_u32
+
+_NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
+#define vmov_n_s8 vdup_n_s8
+
+_NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
+#define vmov_n_s16 vdup_n_s16
+
+_NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
+#define vmov_n_s32 vdup_n_s32
+
+_NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
+#define vmov_n_p8 vdup_n_u8
+
+_NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
+#define vmov_n_p16 vdup_n_s16
+
+_NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
+#define vmov_n_f32 vdup_n_f32
+
+_NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
+#define vmovq_n_u8 vdupq_n_u8
+
+_NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
+#define vmovq_n_u16 vdupq_n_s16
+
+_NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
+#define vmovq_n_u32 vdupq_n_u32
+
+_NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
+#define vmovq_n_s8 vdupq_n_s8
+
+_NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
+#define vmovq_n_s16 vdupq_n_s16
+
+_NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
+#define vmovq_n_s32 vdupq_n_s32
+
+_NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
+#define vmovq_n_p8 vdupq_n_u8
+
+_NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
+#define vmovq_n_p16 vdupq_n_s16
+
+_NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
+#define vmovq_n_f32 vdupq_n_f32
+
+_NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
+#define vmov_n_s64 vdup_n_s64
+
+_NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
+#define vmov_n_u64 vdup_n_u64
+
+_NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
+#define vmovq_n_s64 vdupq_n_s64
+
+_NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
+#define vmovq_n_u64 vdupq_n_u64
+
+//**************Set all lanes to the value of one lane of a vector *************
+//****************************************************************************
+//here shuffle is better solution than lane extraction followed by set1 function
+_NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
+{
+    uint8x8_t res;
+    uint8_t valane;
+    int i = 0;
+    valane = vec.m64_u8[lane];
+    for (i = 0; i<8; i++) {
+        res.m64_u8[i] = valane;
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
+{
+    uint16x4_t res;
+    uint16_t valane;
+    valane = vec.m64_u16[lane];
+    res.m64_u16[0] = valane;
+    res.m64_u16[1] = valane;
+    res.m64_u16[2] = valane;
+    res.m64_u16[3] = valane;
+    return res;
+}
+
+_NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = vec.m64_u32[lane];
+    res.m64_u32[1] = res.m64_u32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec,  __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+#define vdup_lane_s8 vdup_lane_u8
+
+_NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec,  __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+#define vdup_lane_s16 vdup_lane_u16
+
+_NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec,  __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+#define vdup_lane_s32 vdup_lane_u32
+
+_NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+#define vdup_lane_p8 vdup_lane_u8
+
+_NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+#define vdup_lane_p16 vdup_lane_s16
+
+_NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
+{
+    float32x2_t res;
+    res.m64_f32[0] = vec.m64_f32[lane];
+    res.m64_f32[1] = res.m64_f32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
+{
+    const int8_t lane8 = (int8_t) lane;
+    _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8};
+    return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
+}
+
+_NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
+{
+    //we could use 8bit shuffle for 16 bit as well
+    const int8_t lane16 = ((int8_t) lane) << 1;
+    const int8_t lane16_1 = lane16 + 1;
+    _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1,
+                                                lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1};
+    return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
+}
+
+_NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
+{
+    //need to use function not macro to make it gcc friendly and meet the immediate const requirement for _mm_shuffle_epi32
+    if (lane == 1)
+        return _mm_shuffle_epi32 (_pM128i(vec), (1 | (1 << 2) | (1 << 4) | (1 << 6)) );
+    else
+        return _mm_shuffle_epi32 (_pM128i(vec), 0);
+}
+
+_NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+#define vdupq_lane_s8 vdupq_lane_u8
+
+_NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+#define vdupq_lane_s16 vdupq_lane_u16
+
+_NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+#define vdupq_lane_s32 vdupq_lane_u32
+
+_NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+#define vdupq_lane_p8 vdupq_lane_u8
+
+_NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+#define vdupq_lane_p16 vdupq_lane_s16
+
+_NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+#define  vdupq_lane_f32(vec, lane)  _mm_load1_ps((vec.m64_f32 + lane))
+
+_NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+#define vdup_lane_s64(vec,lane) vec
+
+_NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+#define vdup_lane_u64(vec,lane) vec
+
+_NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
+{
+    __m128i vec128;
+    vec128 = _pM128i(vec);
+    return _mm_unpacklo_epi64(vec128,vec128);
+}
+
+_NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+#define vdupq_lane_u64 vdupq_lane_s64
+
+// ********************************************************************
+// ********************  Combining vectors *****************************
+// ********************************************************************
+//These intrinsics join two 64 bit vectors into a single 128bit vector.
+_NEON2SSESTORAGE int8x16_t   vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
+_NEON2SSE_INLINE int8x16_t  vcombine_s8(int8x8_t low, int8x8_t high)
+{
+   return _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) );
+}
+
+_NEON2SSESTORAGE int16x8_t   vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
+#define vcombine_s16 vcombine_s8
+
+_NEON2SSESTORAGE int32x4_t   vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
+#define vcombine_s32 vcombine_s8
+
+_NEON2SSESTORAGE int64x2_t   vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
+#define vcombine_s64 vcombine_s8
+
+_NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
+//current IA SIMD doesn't support float16
+
+_NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
+{
+    __m128i res;
+    res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
+    return _M128(res);
+}
+
+_NEON2SSESTORAGE uint8x16_t   vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
+#define vcombine_u8 vcombine_s8
+
+_NEON2SSESTORAGE uint16x8_t   vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
+#define vcombine_u16 vcombine_s16
+
+_NEON2SSESTORAGE uint32x4_t   vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
+#define vcombine_u32 vcombine_s32
+
+_NEON2SSESTORAGE uint64x2_t   vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
+#define vcombine_u64 vcombine_s64
+
+_NEON2SSESTORAGE poly8x16_t   vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
+#define vcombine_p8 vcombine_u8
+
+_NEON2SSESTORAGE poly16x8_t   vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
+#define vcombine_p16 vcombine_u16
+
+//**********************************************************************
+//************************* Splitting vectors **************************
+//**********************************************************************
+//**************** Get high part ******************************************
+//These intrinsics split a 128 bit vector into 2 component 64 bit vectors
+_NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _mm_unpackhi_epi64(a,a); //SSE2
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res =  _mm_unpackhi_epi64(a,a); //SSE2
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res =  _mm_unpackhi_epi64(a,a); //SSE2
+    return64(res);
+}
+
+_NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
+{
+    int64x1_t res64;
+    __m128i res;
+    res =  _mm_unpackhi_epi64(a,a); //SSE2
+    return64(res);
+}
+
+_NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
+// IA32 SIMD doesn't work with 16bit floats currently
+
+_NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
+{
+    __m128i res;
+    __m64_128 res64;
+    res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
+#define vget_high_u8 vget_high_s8
+
+_NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
+#define vget_high_u16 vget_high_s16
+
+_NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
+#define vget_high_u32 vget_high_s32
+
+_NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
+#define vget_high_u64 vget_high_s64
+
+_NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
+#define vget_high_p8 vget_high_u8
+
+_NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
+#define vget_high_p16 vget_high_u16
+
+//********************** Get low part **********************
+//**********************************************************
+_NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
+{
+    int16x4_t res64;
+    return64(a);
+}
+
+_NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
+{
+    int16x4_t res64;
+    return64(a);
+}
+
+_NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
+{
+    int32x2_t res64;
+    return64(a);
+}
+
+_NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
+{
+    int64x1_t res64;
+    return64 (a);
+}
+
+_NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
+// IA32 SIMD doesn't work with 16bit floats currently
+
+_NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
+{
+    float32x2_t res64;
+    _M64f(res64, a);
+    return res64;
+}
+
+_NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
+#define vget_low_u8 vget_low_s8
+
+_NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
+#define vget_low_u16 vget_low_s16
+
+_NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
+#define vget_low_u32 vget_low_s32
+
+_NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
+#define vget_low_u64 vget_low_s64
+
+_NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
+#define vget_low_p8 vget_low_u8
+
+_NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
+#define vget_low_p16 vget_low_s16
+
+//**************************************************************************
+//************************ Converting vectors **********************************
+//**************************************************************************
+//************* Convert from float ***************************************
+// need to set _MM_SET_ROUNDING_MODE ( x) accordingly
+_NEON2SSESTORAGE int32x2_t   vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
+_NEON2SSE_INLINE int32x2_t   vcvt_s32_f32(float32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res =  _mm_cvtps_epi32(_pM128(a)); //use low 64 bits of result only
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
+_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
+{
+    uint32x2_t res64;
+    __m128i res;
+    res = vcvtq_u32_f32(_pM128(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x4_t  vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
+_NEON2SSE_INLINE int32x4_t  vcvtq_s32_f32(float32x4_t a)
+{
+    __m128 dif;
+    __m128i res;
+    //_mm_cvttps_epi32 incorrectly treats the case a > =2.14748364e+009, therefore the special processing is necessary
+    _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
+    dif = _mm_cmpge_ps(a, *(__m128*)fmax);
+    res = _mm_cvttps_epi32(a);
+    return _mm_xor_si128(res, _M128i(dif));
+}
+
+_NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
+_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
+{
+    //No single instruction SSE solution  but we could implement it as following:
+    __m128i res1, res2, zero, mask;
+    __m128  max, min, dif;
+    _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
+    _NEON2SSE_ALIGN_16 static const float32_t fmax_unsigned[] = { 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f };
+    zero = _mm_setzero_si128();
+    mask = _mm_cmpgt_epi32(_M128i(a), zero);
+    min = _mm_and_ps(_M128(mask), a);
+    max = _mm_min_ps(min, *(__m128*)fmax_unsigned); //clamped in 0 - 4.29496729+009
+
+    dif = _mm_sub_ps(max, *(__m128*)fmax);
+    mask = _mm_cmpgt_epi32(_M128i(dif),zero);
+    dif = _mm_and_ps(_M128(mask), dif);
+
+    res1 = _mm_cvttps_epi32(dif);
+    res2 = vcvtq_s32_f32(max);
+    return _mm_add_epi32(res1, res2);
+}
+
+// ***** Convert to the fixed point  with   the number of fraction bits specified by b ***********
+//*************************************************************************************************
+_NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
+_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    return64(vcvtq_n_s32_f32(_pM128(a),b));
+}
+
+_NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
+_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
+{
+    uint32x2_t res;
+    float convconst;
+    convconst = (float)((uint32_t)1 << b);
+    res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
+    res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
+    return res;
+}
+
+_NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
+_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
+{
+    float convconst;
+    _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    __m128 cconst128;
+    __m128i mask, res;
+    convconst = (float)(1 << b);
+    cconst128 = vdupq_n_f32(convconst);
+    res =  _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
+    mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
+    return _mm_xor_si128 (res,  mask); //res saturated for 0x80000000
+}
+
+_NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
+_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
+{
+    float convconst;
+    __m128 cconst128;
+    convconst = (float)(1 << b);
+    cconst128 = vdupq_n_f32(convconst);
+    return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
+}
+
+
+_NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
+_NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a)
+{
+  return _mm_cvtps_epi32(a);
+}
+
+//***************** Convert to float *************************
+//*************************************************************
+_NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
+_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
+{
+    float32x2_t res;
+    res.m64_f32[0] = (float) a.m64_i32[0];
+    res.m64_f32[1] = (float) a.m64_i32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
+_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
+{
+    float32x2_t res;
+    res.m64_f32[0] = (float) a.m64_u32[0];
+    res.m64_f32[1] = (float) a.m64_u32[1];
+    return res;
+}
+
+_NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
+#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
+
+_NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
+_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
+{
+    //solution may be not optimal
+    __m128 two16, fHi, fLo;
+    __m128i hi, lo;
+    two16 = _mm_set1_ps((float)0x10000); //2^16
+    // Avoid double rounding by doing two exact conversions
+    // of high and low 16-bit segments
+    hi = _mm_srli_epi32(a, 16);
+    lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
+    fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
+    fLo = _mm_cvtepi32_ps(lo);
+    // do single rounding according to current rounding mode
+    return _mm_add_ps(fHi, fLo);
+}
+
+// ***** Convert to the float from fixed point  with   the number of fraction bits specified by b ***********
+_NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
+{
+    float32x2_t res;
+    float convconst;
+    convconst = (float)(1. / ((uint32_t)1 << b));
+    res.m64_f32[0] =  a.m64_i32[0] * convconst;
+    res.m64_f32[1] = a.m64_i32[1] * convconst;
+    return res;
+}
+
+_NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
+{
+    float32x2_t res;
+    float convconst;
+    convconst = (float)(1. / ((uint32_t)1 << b));
+    res.m64_f32[0] =  a.m64_u32[0] * convconst;
+    res.m64_f32[1] = a.m64_u32[1] * convconst;
+    return res;
+}
+
+_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
+{
+    float convconst;
+    __m128 cconst128, af;
+    convconst = (float)(1. / ((uint32_t)1 << b));
+    af = _mm_cvtepi32_ps(a);
+    cconst128 = vdupq_n_f32(convconst);
+    return _mm_mul_ps(af,cconst128);
+}
+
+_NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
+{
+    float convconst;
+    __m128 cconst128, af;
+    convconst = (float)(1. / (1 << b));
+    af = vcvtq_f32_u32(a);
+    cconst128 = vdupq_n_f32(convconst);
+    return _mm_mul_ps(af,cconst128);
+}
+
+//**************Convert between floats ***********************
+//************************************************************
+_NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
+//Intel SIMD doesn't support 16bits floats curently
+
+_NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
+//Intel SIMD doesn't support 16bits floats curently, the only solution is to store 16bit floats and load as 32 bits
+
+//************Vector narrow integer conversion (truncation) ******************
+//****************************************************************************
+_NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
+_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
+_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
+{
+    int16x4_t res64;
+    __m128i res;
+    res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
+_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
+{
+    //may be not effective compared with a serial implementation
+    int32x2_t res64;
+    __m128i res;
+    res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
+#define vmovn_u16 vmovn_s16
+
+_NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
+#define vmovn_u32 vmovn_s32
+
+_NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
+#define vmovn_u64 vmovn_s64
+
+//**************** Vector long move   ***********************
+//***********************************************************
+_NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
+_NEON2SSE_INLINE int16x8_t vmovl_s8(int8x8_t a)
+{
+    return _MM_CVTEPI8_EPI16(_pM128i(a)); //SSE4.1
+}
+
+_NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
+_NEON2SSE_INLINE int32x4_t vmovl_s16(int16x4_t a)
+{
+    return _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1
+}
+
+_NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
+_NEON2SSE_INLINE int64x2_t  vmovl_s32(int32x2_t a)
+{
+    return _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1
+}
+
+_NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
+_NEON2SSE_INLINE uint16x8_t vmovl_u8(uint8x8_t a)
+{
+    return _MM_CVTEPU8_EPI16(_pM128i(a)); //SSE4.1
+}
+
+_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
+_NEON2SSE_INLINE uint32x4_t  vmovl_u16(uint16x4_t a)
+{
+    return _MM_CVTEPU16_EPI32(_pM128i(a)); //SSE4.1
+}
+
+_NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
+_NEON2SSE_INLINE uint64x2_t  vmovl_u32(uint32x2_t a)
+{
+    return _MM_CVTEPU32_EPI64(_pM128i(a)); //SSE4.1
+}
+
+//*************Vector saturating narrow integer*****************
+//**************************************************************
+_NEON2SSESTORAGE int8x8_t   vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
+_NEON2SSE_INLINE int8x8_t   vqmovn_s16(int16x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _mm_packs_epi16(a, a);
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t   vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
+_NEON2SSE_INLINE int16x4_t   vqmovn_s32(int32x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = _mm_packs_epi32(a, a);
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
+{
+    int32x2_t res;
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
+    if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
+    if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
+    if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
+    res.m64_i32[0] = (int32_t)atmp[0];
+    res.m64_i32[1] = (int32_t)atmp[1];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
+_NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
+{
+    //no uint16 to uint8 conversion in SSE, need truncate to max signed first. Also trying to avoid _mm_shuffle_epi8 because of its big latency for old Atom CPUs
+    uint8x8_t res64;
+    __m128i c7fff, a_trunc, mask_trunc;
+    c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
+    a_trunc =  _mm_and_si128(a,  c7fff); // a truncated to max signed
+    mask_trunc =  _mm_cmpgt_epi16(a_trunc, a); //if after the shift we have bigger value than before then the 15-th bit had been set initially.
+    mask_trunc =  _mm_and_si128(mask_trunc,  c7fff);  //zero or c7fff if the 15-th bit had been set initially
+    a_trunc = _mm_or_si128(a_trunc,  mask_trunc);
+    a_trunc =  _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
+    return64(a_trunc);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
+_NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
+{
+     #ifdef USE_SSE4
+        //no uint32 to uint16 conversion in SSE, need truncate to max signed first
+        uint16x4_t res64;
+        __m128i c7fffffff, a_trunc, mask_trunc;
+        c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-th bit set to zero
+        a_trunc =  _mm_and_si128(a,  c7fffffff); // a truncated to max signed
+        mask_trunc =  _mm_cmpgt_epi16(a_trunc, a); //if after the shift we have bigger value than before then the 15-th bit had been set initially.
+        mask_trunc =  _mm_and_si128(mask_trunc,  c7fffffff);  //zero or c7fff if the 15-th bit had been set initially
+        a_trunc = _mm_or_si128(a_trunc,  mask_trunc);
+        a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
+        return64(a_trunc);
+    #else
+        uint16x4_t res64;
+       __m128i res_hi, mask;
+        mask = _mm_setzero_si128();
+        res_hi = _mm_srli_epi32(a, 16);
+        res_hi = _mm_cmpeq_epi16(res_hi, mask);
+        mask = _mm_cmpeq_epi16(mask,mask); //all fff
+        mask = _mm_andnot_si128(res_hi,mask); //inverst res_hi to get >16 bits numbers
+        res_hi = _mm_or_si128(a, mask); //saturated res
+        res_hi = _mm_shuffle_epi8 (res_hi, *(__m128i*) mask8_32_even_odd); //go to 16 bits
+        return64(res_hi);
+    #endif
+}
+
+_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
+_NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
+{
+    //serial solution may be faster
+    uint32x2_t res64;
+    __m128i res_hi, mask;
+    mask = _mm_setzero_si128();
+    res_hi = _mm_srli_epi64(a, 32);
+    res_hi = _mm_cmpeq_epi32(res_hi, mask);
+    mask = _mm_cmpeq_epi32(mask,mask); //all fff
+    mask = _mm_andnot_si128(res_hi,mask); //inverst res_hi to get >32 bits numbers
+    res_hi = _mm_or_si128(a, mask);
+    res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(res_hi);
+}
+//************* Vector saturating narrow integer signed->unsigned **************
+//*****************************************************************************
+_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
+_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
+{
+    uint8x8_t res64;
+    __m128i res;
+    res = _mm_packus_epi16(a, a); //use low 64bits only
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
+_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
+{
+    uint16x4_t res64;
+    __m128i res;
+    res = _MM_PACKUS1_EPI32(a); //use low 64bits only
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
+_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
+{
+    uint32x2_t res64;
+    __m128i res_hi,res_lo, zero, cmp;
+    zero = _mm_setzero_si128();
+    res_hi = _mm_srli_epi64(a,  32);
+    cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
+    res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+    cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
+    res_lo =  _mm_or_si128(res_lo, cmp); //if cmp positive we are out of 32bits need to saturaate to 0xffffffff
+    res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(res_lo);
+}
+
+// ********************************************************
+// **************** Table look up **************************
+// ********************************************************
+//VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
+//in a table and generate a new vector. Indexes out of range return 0.
+
+//for Intel SIMD we need to set the MSB to 1 for zero return
+//if b is unsigned ( > max signed)  or negative it has MSB 1 set and doesn't need any special processing
+_NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    __m128i c7, maskgt, bmask, b128;
+    c7 = _mm_set1_epi8 (7);
+    b128 = _pM128i(b);
+    maskgt = _mm_cmpgt_epi8(b128,c7);
+    bmask = _mm_or_si128(b128,maskgt);
+    bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
+    return64(bmask);
+}
+
+_NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a,  int8x8_t b); // VTBL.8 d0, {d0}, d0
+#define vtbl1_s8 vtbl1_u8
+
+_NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+#define vtbl1_p8 vtbl1_u8
+
+_NEON2SSESTORAGE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    __m128i c15, a01, maskgt15, bmask, b128;
+    c15 = _mm_set1_epi8 (15);
+    b128 = _pM128i(b);
+    maskgt15 = _mm_cmpgt_epi8(b128,c15);
+    bmask = _mm_or_si128(b128, maskgt15);
+    a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]), _pM128i(a.val[1]));
+    a01 =  _mm_shuffle_epi8(a01, bmask);
+    return64(a01);
+}
+
+//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+#define vtbl2_s8 vtbl2_u8
+
+//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+#define vtbl2_p8 vtbl2_u8
+
+_NEON2SSESTORAGE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b)
+{
+    //solution may be not optimal
+    uint8x8_t res64;
+    __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
+    c15 = _mm_set1_epi8 (15);
+    c23 = _mm_set1_epi8 (23);
+    b128 = _pM128i(b);
+    maskgt23 = _mm_cmpgt_epi8(b128,c23);
+    bmask = _mm_or_si128(b128, maskgt23);
+    maskgt15 = _mm_cmpgt_epi8(b128,c15);
+    a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
+    sh0 =  _mm_shuffle_epi8(a01, bmask);
+    sh1 =  _mm_shuffle_epi8(_pM128i(a.val[2]), bmask); //for bi>15 bi is wrapped (bi-=15)
+    sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
+    return64(sh0);
+}
+
+_NEON2SSESTORAGE int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+#define vtbl3_s8 vtbl3_u8
+
+_NEON2SSESTORAGE poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+#define vtbl3_p8 vtbl3_u8
+
+_NEON2SSESTORAGE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b)
+{
+    //solution may be not optimal
+    uint8x8_t res64;
+    __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
+    c15 = _mm_set1_epi8 (15);
+    c31 = _mm_set1_epi8 (31);
+    b128 = _pM128i(b);
+    maskgt31 = _mm_cmpgt_epi8(b128,c31);
+    bmask = _mm_or_si128(b128, maskgt31);
+    maskgt15 = _mm_cmpgt_epi8(b128,c15);
+    a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
+    a23 = _mm_unpacklo_epi64(_pM128i(a.val[2]),_pM128i(a.val[3]));
+    sh0 =  _mm_shuffle_epi8(a01, bmask);
+    sh1 =  _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=15)
+    sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
+    return64(sh0);
+}
+
+_NEON2SSESTORAGE int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+#define vtbl4_s8 vtbl4_u8
+
+_NEON2SSESTORAGE poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+#define vtbl4_p8 vtbl4_u8
+
+//****************** Extended table look up intrinsics ***************************
+//**********************************************************************************
+//VTBX (Vector Table Extension) works in the same way as VTBL do,
+// except that indexes out of range leave the destination element unchanged.
+
+_NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+    uint8x8_t res64;
+    __m128i c8, maskgt, sh, c128;
+    c8 = _mm_set1_epi8(8);
+    c128 = _pM128i(c);
+    //need to pre-clamp c values to avoid unsigned comparison
+    c128 = _mm_min_epu8(c128, c8);
+    maskgt = _mm_cmpgt_epi8(c8,c128);
+    sh = _mm_shuffle_epi8(_pM128i(b),c128);
+    sh = _mm_and_si128(maskgt,sh);
+    c8 = _mm_andnot_si128(maskgt,_pM128i(a));
+    sh =  _mm_or_si128(sh,c8);
+    return64(sh);
+}
+
+_NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
+#define vtbx1_s8 vtbx1_u8
+
+_NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+#define vtbx1_p8 vtbx1_u8
+
+_NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c)
+{
+    uint8x8_t res64;
+    __m128i c16, b01, maskgt15, sh, c128;
+    c16 = _mm_set1_epi8(16);
+    c128 = _pM128i(c);
+    //need to pre-clamp c values to avoid unsigned comparison
+    c128 = _mm_min_epu8(c128, c16);
+    maskgt15 = _mm_cmpgt_epi8(c16,c128);
+    b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]), _pM128i(b.val[1]));
+    sh =  _mm_shuffle_epi8(b01, c128);
+    sh = _mm_and_si128(maskgt15, sh);
+    c16 = _mm_andnot_si128(maskgt15, _pM128i(a));
+    sh =  _mm_or_si128(sh,c16);
+    return64(sh);
+}
+
+//int8x8_t vtbx2_s8(int8x8_t a,  int8x8x2_t b, int8x8_t c);  // VTBX.8 d0, {d0, d1}, d0
+#define vtbx2_s8 vtbx2_u8
+
+//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+#define vtbx2_p8 vtbx2_u8
+
+_NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c)
+{
+    //solution may be not optimal
+    uint8x8_t res64;
+    __m128i c15, c24, maskgt15, maskgt23, sh0, sh1, b01, c128;
+    c15 = _mm_set1_epi8 (15);
+    c24 = _mm_set1_epi8 (24);
+    c128 = _pM128i(c);
+    //need to pre-clamp c values to avoid unsigned comparison
+    c128 = _mm_min_epu8(c128, c24);
+    maskgt23 = _mm_cmpgt_epi8(c24,c128);
+    maskgt15 = _mm_cmpgt_epi8(c128,c15);
+    c24 = _mm_andnot_si128(maskgt23, _pM128i(a));
+    b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
+    sh0 =  _mm_shuffle_epi8(b01, c128);
+    sh1 =  _mm_shuffle_epi8(_pM128i(b.val[2]), c128); //for bi>15 bi is wrapped (bi-=15)
+    sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
+    sh0 = _mm_and_si128(maskgt23,sh0);
+    sh0 = _mm_or_si128(sh0,c24);
+    return64(sh0);
+}
+
+_NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+#define vtbx3_s8 vtbx3_u8
+
+_NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+#define vtbx3_p8 vtbx3_u8
+
+_NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c)
+{
+    //solution may be not optimal
+    uint8x8_t res64;
+    __m128i c15, c32, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
+    c15 = _mm_set1_epi8 (15);
+    c32 = _mm_set1_epi8 (32);
+    c128 = _pM128i(c);
+    //need to pre-clamp c values to avoid unsigned comparison
+    c128 = _mm_min_epu8(c128, c32);
+    maskgt15 = _mm_cmpgt_epi8(c128,c15);
+    maskgt31 = _mm_cmpgt_epi8(c32,c128);
+    c32 = _mm_andnot_si128(maskgt31, _pM128i(a));
+
+    b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
+    b23 = _mm_unpacklo_epi64(_pM128i(b.val[2]),_pM128i(b.val[3]));
+    sh0 =  _mm_shuffle_epi8(b01, c128);
+    sh1 =  _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=15)
+    sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
+    sh0 = _mm_and_si128(maskgt31,sh0);
+    sh0 =  _mm_or_si128(sh0,c32);
+    return64(sh0);
+}
+
+_NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+#define vtbx4_s8 vtbx4_u8
+
+_NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+#define vtbx4_p8 vtbx4_u8
+
+//*************************************************************************************************
+// *************************** Operations with a scalar value *********************************
+//*************************************************************************************************
+
+//******* Vector multiply accumulate by scalar *************************************************
+//**********************************************************************************************
+_NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
+{
+    int16_t c;
+    int16x4_t scalar;
+    c = vget_lane_s16(v, l);
+    scalar = vdup_n_s16(c);
+    return vmla_s16(a, b, scalar);
+}
+
+_NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
+{
+    int32_t c;
+    int32x2_t scalar;
+    c = vget_lane_s32(v, l);
+    scalar = vdup_n_s32(c);
+    return vmla_s32(a, b, scalar);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
+#define vmla_lane_u16 vmla_lane_s16
+
+
+_NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
+#define vmla_lane_u32 vmla_lane_s32
+
+_NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
+{
+    float32_t vlane;
+    float32x2_t c;
+    vlane = vget_lane_f32(v, l);
+    c = vdup_n_f32(vlane);
+    return vmla_f32(a,b,c);
+}
+
+_NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
+{
+    int16_t vlane;
+    int16x8_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdupq_n_s16(vlane);
+    return vmlaq_s16(a,b,c);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
+{
+    int32_t vlane;
+    int32x4_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdupq_n_s32(vlane);
+    return vmlaq_s32(a,b,c);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
+#define vmlaq_lane_u16 vmlaq_lane_s16
+
+_NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
+#define vmlaq_lane_u32 vmlaq_lane_s32
+
+_NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
+{
+    float32_t vlane;
+    float32x4_t c;
+    vlane = vget_lane_f32(v, l);
+    c = vdupq_n_f32(vlane);
+    return vmlaq_f32(a,b,c);
+}
+
+//***************** Vector widening multiply accumulate by scalar **********************
+//***************************************************************************************
+_NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vmlal_s16(a, b, c);
+}
+
+_NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
+{
+    int32_t vlane;
+    int32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vmlal_s32(a, b, c);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
+{
+    uint16_t vlane;
+    uint16x4_t c;
+    vlane = vget_lane_u16(v, l);
+    c = vdup_n_u16(vlane);
+    return vmlal_u16(a, b, c);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
+{
+    uint32_t vlane;
+    uint32x2_t c;
+    vlane = vget_lane_u32(v, l);
+    c = vdup_n_u32(vlane);
+    return vmlal_u32(a, b, c);
+}
+
+// ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
+// ************************************************************************************************
+_NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vqdmlal_s16(a, b, c);
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
+{
+    int32_t vlane;
+    uint32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vqdmlal_s32(a, b, c);
+}
+
+// ****** Vector multiply subtract by scalar *****************
+// *************************************************************
+_NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vmls_s16(a, b, c);
+}
+
+_NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
+{
+    int32_t vlane;
+    int32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vmls_s32(a, b, c);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
+{
+    uint16_t vlane;
+    uint16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vmls_s16(a, b, c);
+}
+
+_NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
+{
+    uint32_t vlane;
+    uint32x2_t c;
+    vlane = vget_lane_u32(v, l);
+    c = vdup_n_u32(vlane);
+    return vmls_u32(a, b, c);
+}
+
+_NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
+{
+    float32_t vlane;
+    float32x2_t c;
+    vlane = (float) vget_lane_f32(v, l);
+    c = vdup_n_f32(vlane);
+    return vmls_f32(a,b,c);
+}
+
+_NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
+{
+    int16_t vlane;
+    int16x8_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdupq_n_s16(vlane);
+    return vmlsq_s16(a, b,c);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
+{
+    int32_t vlane;
+    int32x4_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdupq_n_s32(vlane);
+    return vmlsq_s32(a,b,c);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
+{
+    uint16_t vlane;
+    uint16x8_t c;
+    vlane = vget_lane_u16(v, l);
+    c = vdupq_n_u16(vlane);
+    return vmlsq_u16(a,b,c);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
+{
+    uint32_t vlane;
+    uint32x4_t c;
+    vlane = vget_lane_u32(v, l);
+    c = vdupq_n_u32(vlane);
+    return vmlsq_u32(a,b,c);
+}
+
+_NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
+{
+    float32_t vlane;
+    float32x4_t c;
+    vlane = (float) vget_lane_f32(v, l);
+    c = vdupq_n_f32(vlane);
+    return vmlsq_f32(a,b,c);
+}
+
+// **** Vector widening multiply subtract by scalar ****
+// ****************************************************
+_NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vmlsl_s16(a, b, c);
+}
+
+_NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
+{
+    int32_t vlane;
+    int32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vmlsl_s32(a, b, c);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.U16 q0, d0, d0[0]
+{
+    uint16_t vlane;
+    uint16x4_t c;
+    vlane = vget_lane_u16(v, l);
+    c = vdup_n_u16(vlane);
+    return vmlsl_u16(a, b, c);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
+{
+    uint32_t vlane;
+    uint32x2_t c;
+    vlane = vget_lane_u32(v, l);
+    c = vdup_n_u32(vlane);
+    return vmlsl_u32(a, b, c);
+}
+
+//********* Vector widening saturating doubling multiply subtract by scalar **************************
+//******************************************************************************************************
+_NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vqdmlsl_s16(a, b, c);
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32_t vlane;
+    int32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vqdmlsl_s32(a, b, c);
+}
+//********** Vector multiply with scalar *****************************
+_NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
+{
+    int16x4_t b16x4;
+    b16x4 = vdup_n_s16(b);
+    return vmul_s16(a, b16x4);
+}
+
+_NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
+{
+    //serial solution looks faster
+    int32x2_t b32x2;
+    b32x2 = vdup_n_s32(b);
+    return vmul_s32(a, b32x2);
+}
+
+_NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
+_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
+{
+    float32x2_t b32x2;
+    b32x2 = vdup_n_f32(b);
+    return vmul_f32(a, b32x2);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
+{
+    uint16x4_t b16x4;
+    b16x4 = vdup_n_s16(b);
+    return vmul_s16(a, b16x4);
+}
+
+_NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
+{
+    //serial solution looks faster
+    uint32x2_t b32x2;
+    b32x2 = vdup_n_u32(b);
+    return vmul_u32(a, b32x2);
+}
+
+_NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
+{
+    int16x8_t b16x8;
+    b16x8 = vdupq_n_s16(b);
+    return vmulq_s16(a, b16x8);
+}
+
+_NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
+{
+    int32x4_t b32x4;
+    b32x4 = vdupq_n_s32(b);
+    return vmulq_s32(a, b32x4);
+}
+
+_NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
+_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
+{
+    float32x4_t b32x4;
+    b32x4 = vdupq_n_f32(b);
+    return vmulq_f32(a, b32x4);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
+{
+    uint16x8_t b16x8;
+    b16x8 = vdupq_n_s16(b);
+    return vmulq_s16(a, b16x8);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
+{
+    uint32x4_t b32x4;
+    b32x4 = vdupq_n_u32(b);
+    return vmulq_u32(a, b32x4);
+}
+
+//********** Vector multiply lane *****************************
+_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
+{
+    int16x4_t b16x4;
+    int16_t vlane;
+    vlane = vget_lane_s16(b, c);
+    b16x4 = vdup_n_s16(vlane);
+    return vmul_s16(a, b16x4);
+}
+
+_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
+{
+    int32x2_t b32x2;
+    int32_t vlane;
+    vlane = vget_lane_s32(b, c);
+    b32x2 = vdup_n_s32(vlane);
+    return vmul_s32(a, b32x2);
+}
+
+_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
+{
+    float32x2_t b32x2;
+    float32_t vlane;
+    vlane = vget_lane_f32(b, c);
+    b32x2 = vdup_n_f32(vlane);
+    return vmul_f32(a, b32x2);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
+#define vmul_lane_u16 vmul_lane_s16
+
+_NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
+#define vmul_lane_u32 vmul_lane_s32
+
+_NEON2SSESTORAGE int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
+{
+    int16x8_t b16x8;
+    int16_t vlane;
+    vlane = vget_lane_s16(b, c);
+    b16x8 = vdupq_n_s16(vlane);
+    return vmulq_s16(a, b16x8);
+}
+
+_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
+{
+    int32x4_t b32x4;
+    int32_t vlane;
+    vlane = vget_lane_s32(b, c);
+    b32x4 = vdupq_n_s32(vlane);
+    return vmulq_s32(a, b32x4);
+}
+
+_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
+{
+    float32x4_t b32x4;
+    float32_t vlane;
+    vlane = vget_lane_f32(b, c);
+    b32x4 = vdupq_n_f32(vlane);
+    return vmulq_f32(a, b32x4);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
+#define vmulq_lane_u16 vmulq_lane_s16
+
+_NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
+#define vmulq_lane_u32 vmulq_lane_s32
+
+//**** Vector long multiply with scalar ************
+_NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
+{
+    int16x4_t b16x4;
+    b16x4 = vdup_n_s16(val2);
+    return vmull_s16(vec1, b16x4);
+}
+
+_NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
+{
+    int32x2_t b32x2;
+    b32x2 = vdup_n_s32(val2);
+    return vmull_s32(vec1, b32x2);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
+{
+    uint16x4_t b16x4;
+    b16x4 = vdup_n_s16(val2);
+    return vmull_u16(vec1, b16x4);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
+_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
+{
+    uint32x2_t b32x2;
+    b32x2 = vdup_n_u32(val2);
+    return vmull_u32(vec1, b32x2);
+}
+
+//**** Vector long multiply by scalar ****
+_NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
+{
+    int16_t vlane;
+    int16x4_t b;
+    vlane = vget_lane_s16(val2, val3);
+    b = vdup_n_s16(vlane);
+    return vmull_s16(vec1, b);
+}
+
+_NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
+{
+    int32_t vlane;
+    int32x2_t b;
+    vlane = vget_lane_s32(val2, val3);
+    b = vdup_n_s32(vlane);
+    return vmull_s32(vec1, b);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0]
+{
+    uint16_t vlane;
+    uint16x4_t b;
+    vlane = vget_lane_s16(val2, val3);
+    b = vdup_n_s16(vlane);
+    return vmull_u16(vec1, b);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
+_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
+{
+    uint32_t vlane;
+    uint32x2_t b;
+    vlane = vget_lane_u32(val2, val3);
+    b = vdup_n_u32(vlane);
+    return vmull_u32(vec1, b);
+}
+
+//********* Vector saturating doubling long multiply with scalar  *******************
+_NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
+{
+    //the serial soulution may be faster due to saturation
+    int16x4_t b;
+    b = vdup_n_s16(val2);
+    return vqdmull_s16(vec1, b);
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t b;
+    b = vdup_n_s32(val2);
+    return vqdmull_s32(vec1,b); //slow serial function!!!!
+}
+
+//************* Vector saturating doubling long multiply by scalar ***********************************************
+_NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
+{
+    int16_t c;
+    int16x4_t scalar;
+    c = vget_lane_s16(val2, val3);
+    scalar = vdup_n_s16(c);
+    return vqdmull_s16(vec1, scalar);
+}
+
+
+_NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32_t c;
+    int32x2_t scalar;
+    c = vget_lane_s32(val2, val3);
+    scalar = vdup_n_s32(c);
+    return vqdmull_s32(vec1,scalar); //slow serial function!!!!
+}
+
+// *****Vector saturating doubling multiply high with scalar *****
+_NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2); //  VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2)
+{
+    int16x4_t res64;
+    return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
+}
+
+_NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2); //  VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2)
+{
+    int32x2_t res64;
+    return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
+}
+
+_NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); //  VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) //  VQDMULH.S16 q0,q0,d0[0]
+{
+    //solution may be not optimal
+    int16x8_t scalar;
+    scalar = vdupq_n_s16(val2);
+    return vqdmulhq_s16(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); //  VQDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32x4_t scalar;
+    scalar = vdupq_n_s32(val2);
+    return vqdmulhq_s32(vec1, scalar);
+}
+
+//***** Vector saturating doubling multiply high by scalar ****************
+_NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 d0,d0,d0[0]
+{
+    //solution may be not optimal
+    int16_t vlane;
+    int16x4_t scalar;
+    vlane = vget_lane_s16(val2, val3);
+    scalar = vdup_n_s16(vlane);
+    return vqdmulh_s16(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32_t vlane;
+    int32x2_t scalar;
+    vlane = vget_lane_s32(val2, val3);
+    scalar = vdup_n_s32(vlane);
+    return vqdmulh_s32(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 q0,q0,d0[0]
+{
+    //solution may be not optimal
+    int16_t vlane;
+    int16x8_t scalar;
+    vlane = vget_lane_s16(val2, val3);
+    scalar = vdupq_n_s16(vlane );
+    return vqdmulhq_s16(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //solution may be not optimal
+    int32_t vlane;
+    int32x4_t scalar;
+    vlane = vgetq_lane_s32(_pM128i(val2), val3);
+    scalar = vdupq_n_s32(vlane );
+    return vqdmulhq_s32(vec1, scalar);
+}
+
+//******** Vector saturating rounding doubling multiply high with scalar ***
+_NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
+{
+    //solution may be not optimal
+    int16x4_t scalar;
+    scalar = vdup_n_s16(val2);
+    return vqrdmulh_s16(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32x2_t scalar;
+    scalar = vdup_n_s32(val2);
+    return vqrdmulh_s32(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
+{
+    //solution may be not optimal
+    int16x8_t scalar;
+    scalar = vdupq_n_s16(val2);
+    return vqrdmulhq_s16(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32x4_t scalar;
+    scalar = vdupq_n_s32(val2);
+    return vqrdmulhq_s32(vec1, scalar);
+}
+
+//********* Vector rounding saturating doubling multiply high by scalar  ****
+_NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
+{
+    //solution may be not optimal
+    int16_t vlane;
+    int16x4_t scalar;
+    vlane = vget_lane_s16(val2, val3);
+    scalar = vdup_n_s16(vlane);
+    return vqrdmulh_s16(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32_t vlane;
+    int32x2_t scalar;
+    vlane = vget_lane_s32(val2, val3);
+    scalar = vdup_n_s32(vlane);
+    return vqrdmulh_s32(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
+{
+    //solution may be not optimal
+    int16_t vlane;
+    int16x8_t scalar;
+    vlane = vget_lane_s16(val2, val3);
+    scalar = vdupq_n_s16(vlane);
+    return vqrdmulhq_s16(vec1, scalar);
+}
+
+_NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //solution may be not optimal
+    int32_t vlane;
+    int32x4_t scalar;
+    vlane = vgetq_lane_s32(_pM128i(val2), val3);
+    scalar = vdupq_n_s32(vlane );
+    return vqrdmulhq_s32(vec1, scalar);
+}
+
+//**************Vector multiply accumulate with scalar *******************
+_NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
+{
+    int16x4_t scalar;
+    scalar = vdup_n_s16(c);
+    return vmla_s16(a, b, scalar);
+}
+
+_NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
+{
+    int32x2_t scalar;
+    scalar = vdup_n_s32(c);
+    return vmla_s32(a, b, scalar);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a,  uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
+#define vmla_n_u16 vmla_n_s16
+
+
+_NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a,  uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
+#define vmla_n_u32 vmla_n_s32
+
+
+_NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
+{
+    float32x2_t scalar;
+    scalar = vdup_n_f32(c);
+    return vmla_f32(a, b, scalar);
+}
+
+_NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
+{
+    int16x8_t scalar;
+    scalar = vdupq_n_s16(c);
+    return vmlaq_s16(a,b,scalar);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
+{
+    int32x4_t scalar;
+    scalar = vdupq_n_s32(c);
+    return vmlaq_s32(a,b,scalar);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
+#define vmlaq_n_u16 vmlaq_n_s16
+
+_NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
+#define vmlaq_n_u32 vmlaq_n_s32
+
+_NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
+{
+    float32x4_t scalar;
+    scalar = vdupq_n_f32(c);
+    return vmlaq_f32(a,b,scalar);
+}
+
+//************Vector widening multiply accumulate with scalar****************************
+_NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
+{
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vmlal_s16(a, b, vc);
+}
+
+_NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vmlal_s32(a, b, vc);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0]
+{
+    uint16x4_t vc;
+    vc = vdup_n_u16(c);
+    return vmlal_u16(a, b, vc);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
+{
+    uint32x2_t vc;
+    vc = vdup_n_u32(c);
+    return vmlal_u32(a, b, vc);
+}
+
+//************ Vector widening saturating doubling multiply accumulate with scalar **************
+_NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
+{
+    //not optimal SIMD soulution, serial may be faster
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vqdmlal_s16(a, b, vc);
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vqdmlal_s32(a, b, vc);
+}
+
+//******** Vector multiply subtract with scalar **************
+_NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
+{
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vmls_s16(a, b, vc);
+}
+
+_NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vmls_s32(a, b, vc);
+}
+
+_NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
+{
+    uint16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vmls_s16(a, b, vc);
+}
+
+_NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
+{
+    uint32x2_t vc;
+    vc = vdup_n_u32(c);
+    return vmls_u32(a, b, vc);
+}
+
+_NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
+{
+    float32x2_t res;
+    res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
+    res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
+    return res;
+}
+
+_NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
+{
+    int16x8_t vc;
+    vc = vdupq_n_s16(c);
+    return vmlsq_s16(a, b,vc);
+}
+
+_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
+{
+    int32x4_t vc;
+    vc = vdupq_n_s32(c);
+    return vmlsq_s32(a,b,vc);
+}
+
+_NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
+{
+    uint16x8_t vc;
+    vc = vdupq_n_u16(c);
+    return vmlsq_u16(a,b,vc);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
+{
+    uint32x4_t vc;
+    vc = vdupq_n_u32(c);
+    return vmlsq_u32(a,b,vc);
+}
+
+_NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
+{
+    float32x4_t vc;
+    vc = vdupq_n_f32(c);
+    return vmlsq_f32(a,b,vc);
+}
+
+//**** Vector widening multiply subtract with scalar ******
+_NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
+{
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vmlsl_s16(a, b, vc);
+}
+
+_NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vmlsl_s32(a, b, vc);
+}
+
+_NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0]
+{
+    uint16x4_t vc;
+    vc = vdup_n_u16(c);
+    return vmlsl_u16(a, b, vc);
+}
+
+_NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
+{
+    uint32x2_t vc;
+    vc = vdup_n_u32(c);
+    return vmlsl_u32(a, b, vc);
+}
+
+//***** Vector widening saturating doubling multiply subtract with scalar *********
+//**********************************************************************************
+_NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
+{
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vqdmlsl_s16(a, b, vc);
+}
+
+_NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vqdmlsl_s32(a, b, vc);
+}
+
+//*******************  Vector extract ***********************************************
+//*************************************************************************************
+//VEXT (Vector Extract) extracts  elements from the bottom end of the second operand
+//vector and the top end of the first, concatenates them, and places the result in the destination vector
+//c elements from the bottom end of the second operand and (8-c) from the top end of the first
+_NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int8x8_t res;
+    int i;
+    for (i = 0; i<8 - c; i++) {
+        res.m64_i8[i] = a.m64_i8[i + c];
+    }
+    for(i = 0; i<c; i++) {
+        res.m64_i8[8 - c + i] = b.m64_i8[i];
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+#define vext_u8 vext_s8
+//same result tested
+
+_NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+#define vext_p8 vext_u8
+
+_NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+_NEON2SSE_INLINE int16x4_t  _NEON2SSE_PERFORMANCE_WARNING (vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int16x4_t res;
+    int i;
+    for (i = 0; i<4 - c; i++) {
+        res.m64_i16[i] = a.m64_i16[i + c];
+    }
+    for(i = 0; i<c; i++) {
+        res.m64_i16[4 - c + i] = b.m64_i16[i];
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+#define vext_u16 vext_s16
+
+_NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+#define vext_p16 vext_s16
+
+_NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t res;
+    if (c==0) {
+        res.m64_i32[0] = a.m64_i32[0];
+        res.m64_i32[1] = a.m64_i32[1];
+    } else {
+        res.m64_i32[0] = a.m64_i32[1];
+        res.m64_i32[1] = b.m64_i32[0];
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    float32x2_t res;
+    if (c==0) {
+        res.m64_f32[0] = a.m64_f32[0];
+        res.m64_f32[1] = a.m64_f32[1];
+    } else {
+        res.m64_f32[0] = a.m64_f32[1];
+        res.m64_f32[1] = b.m64_f32[0];
+    }
+    return res;
+}
+
+_NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+#define vext_u32 vext_s32
+
+
+_NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+#define vext_s64(a,b,c) a
+
+_NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+#define vext_u64(a,b,c) a
+
+_NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+#define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
+
+_NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+#define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
+
+_NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+#define vextq_p8 vextq_s8
+
+_NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+#define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
+
+_NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+#define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
+
+_NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+#define vextq_p16 vextq_s16
+
+_NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+#define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
+
+_NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+#define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
+
+_NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0
+#define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
+
+_NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+#define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
+
+_NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+#define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
+
+//************ Reverse vector elements (swap endianness)*****************
+//*************************************************************************
+//VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
+_NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
+_NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vrev64q_s8(_pM128i(vec));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
+_NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vrev64q_s16(_pM128i(vec));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
+{
+    int32x2_t res;
+    res.m64_i32[0] = vec.m64_i32[1];
+    res.m64_i32[1] = vec.m64_i32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
+#define vrev64_u8 vrev64_s8
+
+_NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
+#define vrev64_u16 vrev64_s16
+
+_NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
+#define vrev64_u32 vrev64_s32
+
+_NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
+#define vrev64_p8 vrev64_u8
+
+_NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
+#define vrev64_p16 vrev64_u16
+
+_NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
+{
+    float32x2_t res;
+    res.m64_f32[0] = vec.m64_f32[1];
+    res.m64_f32[1] = vec.m64_f32[0];
+    return res;
+}
+
+_NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
+{
+    _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
+}
+
+_NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
+_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
+{
+    //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
+    _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
+}
+
+_NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
+_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
+{
+    return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
+}
+
+_NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
+#define vrev64q_u8 vrev64q_s8
+
+_NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
+#define vrev64q_u16 vrev64q_s16
+
+_NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
+#define vrev64q_u32 vrev64q_s32
+
+_NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
+#define vrev64q_p8 vrev64q_u8
+
+_NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
+#define vrev64q_p16 vrev64q_u16
+
+_NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
+#define vrev64q_f32(vec) _mm_shuffle_ps (vec,  vec, _MM_SHUFFLE(2,3, 0,1))
+
+//********************  32 bit shuffles **********************
+//************************************************************
+_NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
+_NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vrev32q_s8(_pM128i(vec));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
+_NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vrev32q_s16(_pM128i(vec));
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
+#define vrev32_u8 vrev32_s8
+
+_NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
+#define vrev32_u16 vrev32_s16
+
+_NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
+#define vrev32_p8 vrev32_u8
+
+_NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
+#define vrev32_p16 vrev32_u16
+
+_NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
+{
+    _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
+}
+
+_NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
+_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
+{
+    _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
+}
+
+_NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
+#define vrev32q_u8 vrev32q_s8
+
+_NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
+#define vrev32q_u16 vrev32q_s16
+
+_NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
+#define vrev32q_p8 vrev32q_u8
+
+_NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
+#define vrev32q_p16 vrev32q_u16
+
+//*************  16 bit shuffles **********************
+//******************************************************
+_NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
+_NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vrev16q_s8(_pM128i(vec));
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
+#define vrev16_u8 vrev16_s8
+
+_NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
+#define vrev16_p8 vrev16_u8
+
+_NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
+{
+    _NEON2SSE_ALIGN_16 static const int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev8);
+}
+
+_NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
+#define vrev16q_u8 vrev16q_s8
+
+_NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
+#define vrev16q_p8 vrev16q_u8
+
+//*********************************************************************
+//**************** Other single operand arithmetic *******************
+//*********************************************************************
+
+//*********** Absolute: Vd[i] = |Va[i]| **********************************
+//************************************************************************
+_NEON2SSESTORAGE int8x8_t   vabs_s8(int8x8_t a); // VABS.S8 d0,d0
+_NEON2SSE_INLINE int8x8_t   vabs_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _mm_abs_epi8(_pM128i(a));
+    return64(res);
+}
+
+
+_NEON2SSESTORAGE int16x4_t   vabs_s16(int16x4_t a); // VABS.S16 d0,d0
+_NEON2SSE_INLINE int16x4_t   vabs_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = _mm_abs_epi16(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t   vabs_s32(int32x2_t a); // VABS.S32 d0,d0
+_NEON2SSE_INLINE int32x2_t   vabs_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _mm_abs_epi32(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
+{
+    float32x4_t res;
+    __m64_128 res64;
+    _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+    res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE int8x16_t   vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
+#define vabsq_s8 _mm_abs_epi8
+
+_NEON2SSESTORAGE int16x8_t   vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
+#define vabsq_s16 _mm_abs_epi16
+
+_NEON2SSESTORAGE int32x4_t   vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
+#define vabsq_s32 _mm_abs_epi32
+
+_NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
+_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
+{
+    _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+    return _mm_and_ps (a, *(__m128*)c7fffffff);
+}
+
+#ifdef _NEON2SSE_64BIT
+_NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
+_NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0
+{
+    __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31);
+    return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign);
+}
+
+_NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
+_NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0
+{
+    _NEON2SSE_ALIGN_16 static const int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL};
+    return _mm_and_pd (a, *(__m128d*)mask);
+}
+#endif
+
+//****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
+//**********************************************************************
+//For signed-integer data types, the absolute value of the most negative value is not representable by the data type, saturation takes place
+_NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
+_NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vqabsq_s8(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
+_NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vqabsq_s16(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
+_NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vqabsq_s32(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
+_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
+{
+    __m128i c_128, abs, abs_cmp;
+    c_128 = _mm_set1_epi8 (-128); //(int8_t)0x80
+    abs = _mm_abs_epi8 (a);
+    abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
+    return _mm_xor_si128 (abs,  abs_cmp);
+}
+
+_NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
+_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
+{
+    __m128i c_32768, abs, abs_cmp;
+    c_32768 = _mm_set1_epi16 (-32768); //(int16_t)0x8000
+    abs = _mm_abs_epi16 (a);
+    abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
+    return _mm_xor_si128 (abs,  abs_cmp);
+}
+
+_NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
+_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
+{
+    __m128i c80000000, abs, abs_cmp;
+    c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
+    abs = _mm_abs_epi32 (a);
+    abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
+    return _mm_xor_si128 (abs,  abs_cmp);
+}
+
+//*************** Negate: Vd[i] = - Va[i] *************************************
+//*****************************************************************************
+//several Negate implementations possible for SIMD.
+//e.//function _mm_sign function(a, negative numbers vector), but the following one gives good performance:
+_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0
+_NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vnegq_s8(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0
+_NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vnegq_s16(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0
+_NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vnegq_s32(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0
+_NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNE//d0,d0
+{
+    float32x4_t res;
+    __m64_128 res64;
+    _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
+    _M64f(res64, res);
+    return res64;
+}
+
+_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
+_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_sub_epi8 (zero, a);
+} //or _mm_sign_epi8 (a, negative numbers vector)
+
+_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
+_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_sub_epi16 (zero, a);
+} //or _mm_sign_epi16 (a, negative numbers vector)
+
+_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
+_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_sub_epi32 (zero, a);
+} //or _mm_sign_epi32 (a, negative numbers vector)
+
+_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
+_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNE//q0,q0
+{
+    _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    return _mm_xor_ps (a, *(__m128*) c80000000);
+}
+
+//************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
+//***************************************************************************************
+//For signed-integer data types, the negation of the most negative value can't be produced without saturation, while with saturation it is max positive
+_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0
+_NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vqnegq_s8(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0
+_NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vqnegq_s16(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0
+_NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vqnegq_s32(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
+_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_subs_epi8 (zero, a); //saturating substraction
+}
+
+_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
+_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_subs_epi16 (zero, a); //saturating substraction
+}
+
+_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
+_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNE//q0,q0
+{
+    //solution may be not optimal compared with a serial
+    __m128i c80000000, zero, sub, cmp;
+    c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
+    zero = _mm_setzero_si128 ();
+    sub =  _mm_sub_epi32 (zero, a); //substraction
+    cmp = _mm_cmpeq_epi32 (a, c80000000);
+    return _mm_xor_si128 (sub,  cmp);
+}
+
+//****************** Count leading zeros ********************************
+//**************************************************************************
+//no corresponding vector intrinsics in IA32, need to implement it.  While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
+_NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
+_NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vclzq_s8(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
+_NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vclzq_s16(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
+_NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vclzq_s32(_pM128i(a));
+    return64(res);
+}
+
+
+_NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
+#define vclz_u8 vclz_s8
+
+_NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
+#define vclz_u16 vclz_s16
+
+_NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
+#define vclz_u32 vclz_s32
+
+_NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
+_NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
+{
+    _NEON2SSE_ALIGN_16 static const int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
+                                                            /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
+                                                            /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
+                                                            /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0                          };
+    __m128i maskLOW, c4, lowclz, mask, hiclz;
+    maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
+    c4 = _mm_set1_epi8(4);
+    lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
+    mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
+    mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
+    hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
+    mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
+    lowclz = _mm_and_si128(lowclz,mask);
+    return _mm_add_epi8(lowclz, hiclz);
+}
+
+_NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
+_NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
+{
+    __m128i c7, res8x16, res8x16_swap;
+    _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    _NEON2SSE_ALIGN_16 static const uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
+    c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
+    res8x16 = vclzq_s8(a);
+    res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horisontal pairs swap
+    res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
+    res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
+    c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
+    res8x16 = _mm_and_si128(res8x16, c7); //lowclz
+    return _mm_add_epi16(res8x16_swap, res8x16);
+}
+
+_NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
+_NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
+{
+    __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
+    c55555555 = _mm_set1_epi32(0x55555555);
+    c33333333 = _mm_set1_epi32(0x33333333);
+    c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
+    c3f = _mm_set1_epi32(0x3f);
+    c32 = _mm_set1_epi32(32);
+    tmp = _mm_srli_epi32(a, 1);
+    res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
+    tmp = _mm_srli_epi32(res, 2);
+    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
+    tmp = _mm_srli_epi32(res, 4);
+    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
+    tmp = _mm_srli_epi32(res, 8);
+    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
+    tmp = _mm_srli_epi32(res, 16);
+    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
+
+    tmp = _mm_srli_epi32(res, 1);
+    tmp = _mm_and_si128(tmp, c55555555);
+    res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
+
+    tmp = _mm_srli_epi32(res, 2);
+    tmp = _mm_and_si128(tmp, c33333333);
+    tmp1 = _mm_and_si128(res, c33333333);
+    res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
+
+    tmp = _mm_srli_epi32(res, 4);
+    tmp = _mm_add_epi32(tmp, res);
+    res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
+
+    tmp = _mm_srli_epi32(res, 8);
+    res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
+
+    tmp = _mm_srli_epi32(res, 16);
+    res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
+
+    res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
+
+    return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
+}
+
+_NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
+#define vclzq_u8 vclzq_s8
+
+_NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
+#define vclzq_u16 vclzq_s16
+
+_NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
+#define vclzq_u32 vclzq_s32
+
+//************** Count leading sign bits **************************
+//********************************************************************
+//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
+// the topmost bit, that are the same as the topmost bit, in each element in a vector
+//No corresponding vector intrinsics in IA32, need to implement it.
+//While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
+_NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
+_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vclsq_s8(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
+_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vclsq_s16(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
+_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vclsq_s32(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
+_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
+{
+    __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
+    cff = _mm_cmpeq_epi8 (a,a); //0xff
+    c80 = _mm_set1_epi8(-128); //(int8_t)0x80
+    c1 = _mm_set1_epi8(1);
+    a_mask = _mm_and_si128(a, c80);
+    a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
+    a_neg = _mm_xor_si128(a, cff);
+    a_neg = _mm_and_si128(a_mask, a_neg);
+    a_pos = _mm_andnot_si128(a_mask, a);
+    a_comb = _mm_or_si128(a_pos, a_neg);
+    a_comb = vclzq_s8(a_comb);
+    return _mm_sub_epi8(a_comb, c1);
+}
+
+_NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
+_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
+{
+    __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
+    cffff = _mm_cmpeq_epi16(a,a);
+    c8000 =  _mm_slli_epi16(cffff, 15); //0x8000
+    c1 = _mm_srli_epi16(cffff,15); //0x1
+    a_mask = _mm_and_si128(a, c8000);
+    a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
+    a_neg = _mm_xor_si128(a, cffff);
+    a_neg = _mm_and_si128(a_mask, a_neg);
+    a_pos = _mm_andnot_si128(a_mask, a);
+    a_comb = _mm_or_si128(a_pos, a_neg);
+    a_comb = vclzq_s16(a_comb);
+    return _mm_sub_epi16(a_comb, c1);
+}
+
+_NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
+_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
+{
+    __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
+    cffffffff = _mm_cmpeq_epi32(a,a);
+    c80000000 =  _mm_slli_epi32(cffffffff, 31); //0x80000000
+    c1 = _mm_srli_epi32(cffffffff,31); //0x1
+    a_mask = _mm_and_si128(a, c80000000);
+    a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
+    a_neg = _mm_xor_si128(a, cffffffff);
+    a_neg = _mm_and_si128(a_mask, a_neg);
+    a_pos = _mm_andnot_si128(a_mask, a);
+    a_comb = _mm_or_si128(a_pos, a_neg);
+    a_comb = vclzq_s32(a_comb);
+    return _mm_sub_epi32(a_comb, c1);
+}
+
+//************************* Count number of set bits   ********************************
+//*************************************************************************************
+//No corresponding SIMD solution. One option is to get a elements, convert it to 32 bits and then use SSE4.2  _mm_popcnt__u32 (unsigned int v) for each element
+//another option is to do the following algorithm:
+
+_NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
+_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
+{
+    uint8x8_t res64;
+    __m128i res;
+    res = vcntq_u8(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
+#define vcnt_s8 vcnt_u8
+
+_NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
+#define vcnt_p8 vcnt_u8
+
+_NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
+_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
+{
+    _NEON2SSE_ALIGN_16 static const int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
+                                                                 /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
+                                                                 /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
+                                                                 /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
+    __m128i maskLOW, mask, lowpopcnt, hipopcnt;
+    maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
+    mask = _mm_and_si128(a, maskLOW);
+    lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
+    mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
+    mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
+    hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
+    return _mm_add_epi8(lowpopcnt, hipopcnt);
+}
+
+_NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
+#define vcntq_s8 vcntq_u8
+
+_NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
+#define vcntq_p8 vcntq_u8
+
+//**************************************************************************************
+//*********************** Logical operations ****************************************
+//**************************************************************************************
+//************************** Bitwise not ***********************************
+//several Bitwise not implementations possible for SIMD. Eg "xor" with all ones, but the following one gives good performance
+_NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
+_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vmvnq_s8(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
+_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vmvnq_s16(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
+_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vmvnq_s32(_pM128i(a));
+    return64(res);
+}
+
+_NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
+#define vmvn_u8 vmvn_s8
+
+_NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
+#define vmvn_u16 vmvn_s16
+
+_NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
+#define vmvn_u32 vmvn_s32
+
+_NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
+#define vmvn_p8 vmvn_u8
+
+_NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
+_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
+{
+    __m128i c1;
+    c1 = _mm_cmpeq_epi8 (a,a); //0xff
+    return _mm_andnot_si128 (a, c1);
+}
+
+_NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
+_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
+{
+    __m128i c1;
+    c1 = _mm_cmpeq_epi16 (a,a); //0xffff
+    return _mm_andnot_si128 (a, c1);
+}
+
+_NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
+_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
+{
+    __m128i c1;
+    c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+    return _mm_andnot_si128 (a, c1);
+}
+
+_NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
+#define vmvnq_u8 vmvnq_s8
+
+_NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
+#define vmvnq_u16 vmvnq_s16
+
+_NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
+#define vmvnq_u32 vmvnq_s32
+
+_NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
+#define vmvnq_p8 vmvnq_u8
+
+//****************** Bitwise and ***********************
+//******************************************************
+_NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
+}
+
+_NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
+}
+
+_NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a,  int64x1_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
+#define vand_u8 vand_s8
+
+_NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
+#define vand_u16 vand_s16
+
+_NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
+#define vand_u32 vand_s32
+
+_NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a,  uint64x1_t b); // VAND d0,d0,d0
+#define vand_u64 vand_s64
+
+
+_NEON2SSESTORAGE int8x16_t   vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
+#define vandq_s8 _mm_and_si128
+
+_NEON2SSESTORAGE int16x8_t   vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
+#define vandq_s16 _mm_and_si128
+
+_NEON2SSESTORAGE int32x4_t   vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
+#define vandq_s32 _mm_and_si128
+
+_NEON2SSESTORAGE int64x2_t   vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
+#define vandq_s64 _mm_and_si128
+
+_NEON2SSESTORAGE uint8x16_t   vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
+#define vandq_u8 _mm_and_si128
+
+_NEON2SSESTORAGE uint16x8_t   vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
+#define vandq_u16 _mm_and_si128
+
+_NEON2SSESTORAGE uint32x4_t   vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
+#define vandq_u32 _mm_and_si128
+
+_NEON2SSESTORAGE uint64x2_t   vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
+#define vandq_u64 _mm_and_si128
+
+//******************** Bitwise or *********************************
+//******************************************************************
+_NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
+#define vorr_u8 vorr_s8
+
+_NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
+#define vorr_u16 vorr_s16
+
+_NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
+#define vorr_u32 vorr_s32
+
+_NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a,  uint64x1_t b); // VORR d0,d0,d0
+#define vorr_u64 vorr_s64
+
+_NEON2SSESTORAGE int8x16_t   vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
+#define vorrq_s8 _mm_or_si128
+
+_NEON2SSESTORAGE int16x8_t   vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
+#define vorrq_s16 _mm_or_si128
+
+_NEON2SSESTORAGE int32x4_t   vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
+#define vorrq_s32 _mm_or_si128
+
+_NEON2SSESTORAGE int64x2_t   vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
+#define vorrq_s64 _mm_or_si128
+
+_NEON2SSESTORAGE uint8x16_t   vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
+#define vorrq_u8 _mm_or_si128
+
+_NEON2SSESTORAGE uint16x8_t   vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
+#define vorrq_u16 _mm_or_si128
+
+_NEON2SSESTORAGE uint32x4_t   vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
+#define vorrq_u32 _mm_or_si128
+
+_NEON2SSESTORAGE uint64x2_t   vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
+#define vorrq_u64 _mm_or_si128
+
+//************* Bitwise exclusive or (EOR or XOR) ******************
+//*******************************************************************
+_NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
+_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
+}
+
+_NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
+#define veor_s16 veor_s8
+
+_NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
+#define veor_s32 veor_s8
+
+_NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a,  int64x1_t b); // VEOR d0,d0,d0
+_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
+#define veor_u8 veor_s8
+
+_NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
+#define veor_u16 veor_s16
+
+_NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
+#define veor_u32 veor_s32
+
+_NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a,  uint64x1_t b); // VEOR d0,d0,d0
+#define veor_u64 veor_s64
+
+_NEON2SSESTORAGE int8x16_t   veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
+#define veorq_s8 _mm_xor_si128
+
+_NEON2SSESTORAGE int16x8_t   veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
+#define veorq_s16 _mm_xor_si128
+
+_NEON2SSESTORAGE int32x4_t   veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
+#define veorq_s32 _mm_xor_si128
+
+_NEON2SSESTORAGE int64x2_t   veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
+#define veorq_s64 _mm_xor_si128
+
+_NEON2SSESTORAGE uint8x16_t   veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
+#define veorq_u8 _mm_xor_si128
+
+_NEON2SSESTORAGE uint16x8_t   veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
+#define veorq_u16 _mm_xor_si128
+
+_NEON2SSESTORAGE uint32x4_t   veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
+#define veorq_u32 _mm_xor_si128
+
+_NEON2SSESTORAGE uint64x2_t   veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
+#define veorq_u64 _mm_xor_si128
+
+//********************** Bit Clear **********************************
+//*******************************************************************
+//Logical AND complement (AND negation or AND NOT)
+_NEON2SSESTORAGE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
+_NEON2SSE_INLINE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
+}
+
+_NEON2SSESTORAGE int16x4_t   vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
+#define vbic_s16 vbic_s8
+
+_NEON2SSESTORAGE int32x2_t   vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
+#define vbic_s32 vbic_s8
+
+_NEON2SSESTORAGE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
+_NEON2SSE_INLINE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t   vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
+#define vbic_u8 vbic_s8
+
+_NEON2SSESTORAGE uint16x4_t   vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
+#define vbic_u16 vbic_s16
+
+_NEON2SSESTORAGE uint32x2_t   vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
+#define vbic_u32 vbic_s32
+
+_NEON2SSESTORAGE uint64x1_t   vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
+#define vbic_u64 vbic_s64
+
+_NEON2SSESTORAGE int8x16_t   vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
+#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+
+_NEON2SSESTORAGE int16x8_t   vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
+#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+
+_NEON2SSESTORAGE int32x4_t   vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
+#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+
+_NEON2SSESTORAGE int64x2_t   vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
+#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+
+_NEON2SSESTORAGE uint8x16_t   vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
+#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+
+_NEON2SSESTORAGE uint16x8_t   vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
+#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+
+_NEON2SSESTORAGE uint32x4_t   vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
+#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+
+_NEON2SSESTORAGE uint64x2_t   vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
+#define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
+
+//**************** Bitwise OR complement ********************************
+//**************************************** ********************************
+//no exact IA 32 match, need to implement it as following
+_NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vornq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vornq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vornq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+_NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a,  uint8x8_t b); // VORN d0,d0,d0
+#define vorn_u8 vorn_s8
+
+
+_NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a,  uint16x4_t b); // VORN d0,d0,d0
+#define vorn_u16 vorn_s16
+
+_NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a,  uint32x2_t b); // VORN d0,d0,d0
+#define vorn_u32 vorn_s32
+
+_NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
+#define vorn_u64 vorn_s64
+
+
+_NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_s8( b); //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+_NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_s16( b); //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+_NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_s32( b); //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+_NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
+{
+    __m128i c1, b1;
+    c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
+    b1 = _mm_andnot_si128 (b, c1);
+    return _mm_or_si128 (a, b1);
+}
+
+_NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_u8( b); //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+_NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_s16( b); //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+_NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_u32( b); //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+_NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
+#define vornq_u64 vornq_s64
+
+//********************* Bitwise Select *****************************
+//******************************************************************
+//Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????)
+
+//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
+//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
+
+//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
+//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
+
+//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
+//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
+
+//VBSL only is implemented for SIMD
+_NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
+    return64(res);
+}
+
+_NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
+#define vbsl_s16 vbsl_s8
+
+_NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
+#define vbsl_s32 vbsl_s8
+
+_NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
+{
+    int64x1_t res;
+    res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
+    return res;
+}
+
+_NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
+#define vbsl_u8 vbsl_s8
+
+_NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
+#define vbsl_u16 vbsl_s8
+
+_NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
+#define vbsl_u32 vbsl_s8
+
+_NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
+#define vbsl_u64 vbsl_s64
+
+_NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
+{
+    __m128 sel1, sel2;
+    __m64_128 res64;
+    sel1 = _mm_and_ps   (_pM128(a), _pM128(b));
+    sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
+    sel1 = _mm_or_ps (sel1, sel2);
+    _M64f(res64, sel1);
+    return res64;
+}
+
+_NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
+#define  vbsl_p8 vbsl_s8
+
+_NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
+#define  vbsl_p16 vbsl_s8
+
+_NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
+{
+    __m128i sel1, sel2;
+    sel1 = _mm_and_si128   (a, b);
+    sel2 = _mm_andnot_si128 (a, c);
+    return _mm_or_si128 (sel1, sel2);
+}
+
+_NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
+#define vbslq_s16 vbslq_s8
+
+_NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
+#define vbslq_s32 vbslq_s8
+
+_NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
+#define vbslq_s64 vbslq_s8
+
+_NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
+#define vbslq_u8 vbslq_s8
+
+_NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
+#define vbslq_u16 vbslq_s8
+
+_NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
+#define vbslq_u32 vbslq_s8
+
+_NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
+#define vbslq_u64 vbslq_s8
+
+_NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
+{
+    __m128 sel1, sel2;
+    sel1 = _mm_and_ps   (*(__m128*)&a, b);
+    sel2 = _mm_andnot_ps (*(__m128*)&a, c);
+    return _mm_or_ps (sel1, sel2);
+}
+
+_NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
+#define vbslq_p8 vbslq_u8
+
+_NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
+#define vbslq_p16 vbslq_s8
+
+//************************************************************************************
+//**************** Transposition operations ****************************************
+//************************************************************************************
+//*****************  Vector Transpose ************************************************
+//************************************************************************************
+//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
+// making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....)
+_NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
+_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
+{
+    int8x8x2_t val;
+    __m128i tmp, val0;
+    tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
+    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask8_32_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
+    vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
+    return val;
+}
+
+_NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
+_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
+{
+    int16x4x2_t val;
+    __m128i tmp, val0;
+    _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
+    tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
+    val0 =  _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
+    vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
+    return val;
+}
+
+_NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
+_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2x2_t val;
+    __m128i val0;
+    val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
+    vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
+    return val;
+}
+
+_NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
+#define vtrn_u8 vtrn_s8
+
+_NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
+#define vtrn_u16 vtrn_s16
+
+_NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
+#define vtrn_u32 vtrn_s32
+
+_NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
+_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
+{
+    float32x2x2_t val;
+    val.val[0].m64_f32[0] = a.m64_f32[0];
+    val.val[0].m64_f32[1] = b.m64_f32[0];
+    val.val[1].m64_f32[0] = a.m64_f32[1];
+    val.val[1].m64_f32[1] = b.m64_f32[1];
+    return val; //a0,b0,a1,b1
+}
+
+_NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
+#define  vtrn_p8 vtrn_u8
+
+_NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
+#define  vtrn_p16 vtrn_s16
+
+//int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
+{
+    int8x16x2_t r8x16;
+    __m128i a_sh, b_sh;
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
+
+    r8x16.val[0] =  _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
+    r8x16.val[1] =  _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
+    return r8x16;
+}
+
+_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
+{
+    int16x8x2_t v16x8;
+    __m128i a_sh, b_sh;
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
+    v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
+    v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
+    return v16x8;
+}
+
+_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
+{
+    //may be not optimal solution compared with serial
+    int32x4x2_t v32x4;
+    __m128i a_sh, b_sh;
+    a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
+    b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
+
+    v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
+    v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3,  b3
+    return v32x4;
+}
+
+_NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
+#define vtrnq_u8 vtrnq_s8
+
+_NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
+#define vtrnq_u16 vtrnq_s16
+
+_NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
+#define vtrnq_u32 vtrnq_s32
+
+_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
+{
+    //may be not optimal solution compared with serial
+    float32x4x2_t f32x4;
+    __m128 a_sh, b_sh;
+    a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endiness
+    b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endiness
+
+    f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
+    f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3,  b3
+    return f32x4;
+}
+
+_NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
+#define vtrnq_p8 vtrnq_s8
+
+_NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
+#define vtrnq_p16 vtrnq_s16
+
+//***************** Interleave elements ***************************
+//*****************************************************************
+//output has (a0,b0,a1,b1, a2,b2,.....)
+_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
+_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
+{
+    int8x8x2_t val;
+    __m128i val0;
+    val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
+    vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
+
+_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
+_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
+{
+    int16x4x2_t val;
+    __m128i val0;
+    val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
+    vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
+
+_NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
+#define vzip_s32 vtrn_s32
+
+_NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
+#define vzip_u8 vzip_s8
+
+_NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
+#define vzip_u16 vzip_s16
+
+_NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
+#define vzip_u32 vzip_s32
+
+_NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
+#define vzip_f32 vtrn_f32
+
+_NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
+#define vzip_p8 vzip_u8
+
+_NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
+#define vzip_p16 vzip_u16
+
+_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
+{
+    int8x16x2_t r8x16;
+    r8x16.val[0] =  _mm_unpacklo_epi8(a, b);
+    r8x16.val[1] =  _mm_unpackhi_epi8(a, b);
+    return r8x16;
+}
+
+_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
+{
+    int16x8x2_t r16x8;
+    r16x8.val[0] =  _mm_unpacklo_epi16(a, b);
+    r16x8.val[1] =  _mm_unpackhi_epi16(a, b);
+    return r16x8;
+}
+
+_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
+{
+    int32x4x2_t r32x4;
+    r32x4.val[0] =  _mm_unpacklo_epi32(a, b);
+    r32x4.val[1] =  _mm_unpackhi_epi32(a, b);
+    return r32x4;
+}
+
+_NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
+#define vzipq_u8 vzipq_s8
+
+_NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
+#define vzipq_u16 vzipq_s16
+
+_NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
+#define vzipq_u32 vzipq_s32
+
+_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
+{
+    float32x4x2_t f32x4;
+    f32x4.val[0] =   _mm_unpacklo_ps ( a,  b);
+    f32x4.val[1] =   _mm_unpackhi_ps ( a,  b);
+    return f32x4;
+}
+
+_NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
+#define vzipq_p8 vzipq_u8
+
+_NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
+#define vzipq_p16 vzipq_u16
+
+//*********************** De-Interleave elements *************************
+//*************************************************************************
+//As the result of these functions first val  contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...)
+//no such functions in IA32 SIMD, shuffle is required
+_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
+_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
+{
+    int8x8x2_t val;
+    __m128i tmp, val0;
+    _NEON2SSE_ALIGN_16 static const int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13,  2, 6, 10, 14, 3, 7, 11,15};
+    tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
+    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6),  (a1, a3, a5, a7, b1,b3, b5, b7)
+    vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
+
+_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
+_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
+{
+    int16x4x2_t val;
+    __m128i tmp, val0;
+    _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1,  8,9,  2,3, 10,11,  4,5, 12,13, 6,7, 14,15};
+    tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
+    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
+    vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
+
+_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
+_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
+{
+    int32x2x2_t val;
+    __m128i val0;
+    val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
+    vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
+
+_NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
+#define vuzp_u8 vuzp_s8
+
+_NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
+#define vuzp_u16 vuzp_s16
+
+_NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
+#define vuzp_u32 vuzp_s32
+
+_NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
+#define vuzp_f32 vzip_f32
+
+_NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
+#define vuzp_p8 vuzp_u8
+
+_NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
+#define vuzp_p16 vuzp_u16
+
+_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
+{
+    int8x16x2_t v8x16;
+    __m128i a_sh, b_sh;
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
+    //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
+    v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14,  b0, b2, b4, b6, b8, b10, b12, b14,
+    v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15,  b1, b3, b5, b7, b9, b11, b13, b15
+    return v8x16;
+}
+
+_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
+{
+    int16x8x2_t v16x8;
+    __m128i a_sh, b_sh;
+     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_32_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
+    v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
+    v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
+    return v16x8;
+}
+
+_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
+{
+    //may be not optimal solution compared with serial
+    int32x4x2_t v32x4;
+    __m128i a_sh, b_sh;
+    a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
+    b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
+
+    v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
+    v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
+    return v32x4;
+}
+
+_NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
+#define vuzpq_u8 vuzpq_s8
+
+_NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
+#define vuzpq_u16 vuzpq_s16
+
+_NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
+#define vuzpq_u32 vuzpq_s32
+
+_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
+{
+    float32x4x2_t v32x4;
+    v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2 , need to check endianess however
+    v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianess however
+    return v32x4;
+}
+
+_NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
+#define vuzpq_p8 vuzpq_u8
+
+_NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
+#define vuzpq_p16 vuzpq_u16
+
+//##############################################################################################
+//*********************** Reinterpret cast intrinsics.******************************************
+//##############################################################################################
+// Not a part of oficial NEON instruction set but available in gcc compiler *********************
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
+#define vreinterpret_p8_u32
+
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
+#define vreinterpret_p8_u16
+
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
+#define vreinterpret_p8_u8
+
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
+#define vreinterpret_p8_s32
+
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
+#define vreinterpret_p8_s16
+
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
+#define vreinterpret_p8_s8
+
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
+#define vreinterpret_p8_u64
+
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
+#define vreinterpret_p8_s64
+
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
+#define vreinterpret_p8_f32
+
+_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
+#define vreinterpret_p8_p16
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
+#define vreinterpretq_p8_u32
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
+#define vreinterpretq_p8_u16
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
+#define vreinterpretq_p8_u8
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
+#define vreinterpretq_p8_s32
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
+#define vreinterpretq_p8_s16
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
+#define vreinterpretq_p8_s8
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
+#define vreinterpretq_p8_u64
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
+#define vreinterpretq_p8_s64
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
+#define vreinterpretq_p8_f32(t) _M128i(t)
+
+_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
+#define vreinterpretq_p8_p16
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
+#define vreinterpret_p16_u32
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
+#define vreinterpret_p16_u16
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
+#define vreinterpret_p16_u8
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
+#define vreinterpret_p16_s32
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
+#define vreinterpret_p16_s16
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
+#define vreinterpret_p16_s8
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
+#define vreinterpret_p16_u64
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
+#define vreinterpret_p16_s64
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
+#define vreinterpret_p16_f32
+
+_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
+#define vreinterpret_p16_p8
+
+_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
+#define vreinterpretq_p16_u32
+
+_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
+#define vreinterpretq_p16_u16
+
+_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
+#define vreinterpretq_p16_s32
+
+_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
+#define vreinterpretq_p16_s16
+
+_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
+#define vreinterpretq_p16_s8
+
+_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
+#define vreinterpretq_p16_u64
+
+_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
+#define vreinterpretq_p16_s64
+
+_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
+#define vreinterpretq_p16_f32(t) _M128i(t)
+
+_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
+#define vreinterpretq_p16_p8  vreinterpretq_s16_p8
+
+//****  Integer to float  ******
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
+_NEON2SSE_INLINE float32x2_t vreinterpret_f32_u32 (uint32x2_t t)
+{
+    return (*(__m64_128*)&(t));
+}
+
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
+#define vreinterpret_f32_u16 vreinterpret_f32_u32
+
+
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
+#define vreinterpret_f32_u8 vreinterpret_f32_u32
+
+
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_s32 (int32x2_t t);
+#define vreinterpret_f32_s32 vreinterpret_f32_u32
+
+
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_s16 (int16x4_t t);
+#define vreinterpret_f32_s16 vreinterpret_f32_u32
+
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_s8 (int8x8_t t);
+#define vreinterpret_f32_s8 vreinterpret_f32_u32
+
+
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u64(uint64x1_t t);
+#define vreinterpret_f32_u64 vreinterpret_f32_u32
+
+
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_s64 (int64x1_t t);
+#define vreinterpret_f32_s64 vreinterpret_f32_u32
+
+
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
+#define vreinterpret_f32_p16 vreinterpret_f32_u32
+
+_NEON2SSESTORAGE float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
+#define vreinterpret_f32_p8 vreinterpret_f32_u32
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
+#define  vreinterpretq_f32_u32(t) _M128(t)
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
+#define vreinterpretq_f32_u16 vreinterpretq_f32_u32
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
+#define vreinterpretq_f32_u8 vreinterpretq_f32_u32
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
+#define vreinterpretq_f32_s32 vreinterpretq_f32_u32
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
+#define vreinterpretq_f32_s16 vreinterpretq_f32_u32
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
+#define vreinterpretq_f32_s8 vreinterpretq_f32_u32
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
+#define vreinterpretq_f32_u64 vreinterpretq_f32_u32
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
+#define vreinterpretq_f32_s64 vreinterpretq_f32_u32
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
+#define vreinterpretq_f32_p16 vreinterpretq_f32_u32
+
+_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
+#define vreinterpretq_f32_p8 vreinterpretq_f32_u32
+
+//*** Integer type conversions ******************
+//no conversion necessary for the following functions because it is same data type
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
+#define vreinterpret_s64_u32
+
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
+#define vreinterpret_s64_u16
+
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
+#define vreinterpret_s64_u8
+
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_s32 (int32x2_t t);
+#define  vreinterpret_s64_s32
+
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_s16 (int16x4_t t);
+#define vreinterpret_s64_s16
+
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_s8 (int8x8_t t);
+#define  vreinterpret_s64_s8
+
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
+#define  vreinterpret_s64_u64
+
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_f32 (float32x2_t t);
+#define  vreinterpret_s64_f32
+
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
+#define vreinterpret_s64_p16
+
+_NEON2SSESTORAGE int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
+#define vreinterpret_s64_p8
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
+#define vreinterpretq_s64_u32
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s16 (uint16x8_t t);
+#define vreinterpretq_s64_s16
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
+#define vreinterpretq_s64_u8
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
+#define vreinterpretq_s64_s32
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u16 (int16x8_t t);
+#define vreinterpretq_s64_u16
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
+#define vreinterpretq_s64_s8
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
+#define vreinterpretq_s64_u64
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
+#define vreinterpretq_s64_f32(t) _M128i(t)
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
+#define vreinterpretq_s64_p16
+
+_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
+#define vreinterpretq_s64_p8
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
+#define vreinterpret_u64_u32
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
+#define vreinterpret_u64_u16
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
+#define vreinterpret_u64_u8
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
+#define vreinterpret_u64_s32
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
+#define vreinterpret_u64_s16
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
+#define vreinterpret_u64_s8
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
+#define vreinterpret_u64_s64
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
+#define vreinterpret_u64_f32
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
+#define vreinterpret_u64_p16
+
+_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
+#define vreinterpret_u64_p8
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
+#define vreinterpretq_u64_u32
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
+#define vreinterpretq_u64_u16
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
+#define vreinterpretq_u64_u8
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
+#define vreinterpretq_u64_s32
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
+#define vreinterpretq_u64_s16
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
+#define vreinterpretq_u64_s8
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
+#define vreinterpretq_u64_s64
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
+#define vreinterpretq_u64_f32(t) _M128i(t)
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
+#define vreinterpretq_u64_p16
+
+_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
+#define vreinterpretq_u64_p8
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
+#define vreinterpret_s8_u32
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
+#define vreinterpret_s8_u16
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
+#define vreinterpret_s8_u8
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_s32 (int32x2_t t);
+#define vreinterpret_s8_s32
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_s16 (int16x4_t t);
+#define vreinterpret_s8_s16
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
+#define vreinterpret_s8_u64
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_s64 (int64x1_t t);
+#define vreinterpret_s8_s64
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_f32 (float32x2_t t);
+#define vreinterpret_s8_f32
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
+#define vreinterpret_s8_p16
+
+_NEON2SSESTORAGE int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
+#define vreinterpret_s8_p8
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
+#define vreinterpretq_s8_u32
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
+#define vreinterpretq_s8_u16
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
+#define vreinterpretq_s8_u8
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
+#define vreinterpretq_s8_s32
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
+#define vreinterpretq_s8_s16
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
+#define vreinterpretq_s8_u64
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
+#define vreinterpretq_s8_s64
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
+#define vreinterpretq_s8_f32(t) _M128i(t)
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
+#define vreinterpretq_s8_p16
+
+_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
+#define vreinterpretq_s8_p8
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
+#define vreinterpret_s16_u32
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
+#define vreinterpret_s16_u16
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
+#define vreinterpret_s16_u8
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_s32 (int32x2_t t);
+#define vreinterpret_s16_s32
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_s8 (int8x8_t t);
+#define vreinterpret_s16_s8
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
+#define vreinterpret_s16_u64
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_s64 (int64x1_t t);
+#define vreinterpret_s16_s64
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_f32 (float32x2_t t);
+#define vreinterpret_s16_f32
+
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
+#define vreinterpret_s16_p16
+
+_NEON2SSESTORAGE int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
+#define vreinterpret_s16_p8
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
+#define vreinterpretq_s16_u32
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
+#define vreinterpretq_s16_u16
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
+#define vreinterpretq_s16_u8
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
+#define vreinterpretq_s16_s32
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
+#define vreinterpretq_s16_s8
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
+#define vreinterpretq_s16_u64
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
+#define vreinterpretq_s16_s64
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
+#define vreinterpretq_s16_f32(t) _M128i(t)
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
+#define vreinterpretq_s16_p16
+
+_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
+#define vreinterpretq_s16_p8
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
+#define vreinterpret_s32_u32
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
+#define vreinterpret_s32_u16
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
+#define vreinterpret_s32_u8
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_s16 (int16x4_t t);
+#define vreinterpret_s32_s16
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_s8 (int8x8_t t);
+#define vreinterpret_s32_s8
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
+#define vreinterpret_s32_u64
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_s64 (int64x1_t t);
+#define vreinterpret_s32_s64
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_f32 (float32x2_t t);
+#define vreinterpret_s32_f32
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
+#define vreinterpret_s32_p16
+
+_NEON2SSESTORAGE int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
+#define vreinterpret_s32_p8
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
+#define vreinterpretq_s32_u32
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
+#define vreinterpretq_s32_u16
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
+#define vreinterpretq_s32_u8
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
+#define vreinterpretq_s32_s16
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
+#define vreinterpretq_s32_s8
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
+#define vreinterpretq_s32_u64
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
+#define vreinterpretq_s32_s64
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
+#define vreinterpretq_s32_f32(t)  _M128i(t)
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
+#define vreinterpretq_s32_p16
+
+_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
+#define vreinterpretq_s32_p8
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
+#define vreinterpret_u8_u32
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
+#define vreinterpret_u8_u16
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
+#define vreinterpret_u8_s32
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
+#define vreinterpret_u8_s16
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
+#define vreinterpret_u8_s8
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
+#define vreinterpret_u8_u64
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
+#define vreinterpret_u8_s64
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
+#define vreinterpret_u8_f32
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
+#define vreinterpret_u8_p16
+
+_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
+#define vreinterpret_u8_p8
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
+#define vreinterpretq_u8_u32
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
+#define vreinterpretq_u8_u16
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
+#define vreinterpretq_u8_s32
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
+#define vreinterpretq_u8_s16
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
+#define vreinterpretq_u8_s8
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
+#define vreinterpretq_u8_u64
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
+#define vreinterpretq_u8_s64
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
+#define vreinterpretq_u8_f32(t) _M128i(t)
+
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
+#define vreinterpretq_u8_p16
+
+_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
+#define vreinterpretq_u8_p8
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
+#define vreinterpret_u16_u32
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
+#define vreinterpret_u16_u8
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
+#define vreinterpret_u16_s32
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
+#define vreinterpret_u16_s16
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
+#define vreinterpret_u16_s8
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
+#define vreinterpret_u16_u64
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
+#define vreinterpret_u16_s64
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
+#define vreinterpret_u16_f32
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
+#define vreinterpret_u16_p16
+
+_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
+#define vreinterpret_u16_p8
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
+#define vreinterpretq_u16_u32
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
+#define vreinterpretq_u16_u8
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
+#define vreinterpretq_u16_s32
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
+#define vreinterpretq_u16_s16
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
+#define vreinterpretq_u16_s8
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
+#define vreinterpretq_u16_u64
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
+#define vreinterpretq_u16_s64
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
+#define vreinterpretq_u16_f32(t) _M128i(t)
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
+#define vreinterpretq_u16_p16
+
+_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
+#define vreinterpretq_u16_p8
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
+#define vreinterpret_u32_u16
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
+#define vreinterpret_u32_u8
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
+#define vreinterpret_u32_s32
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
+#define vreinterpret_u32_s16
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
+#define vreinterpret_u32_s8
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
+#define vreinterpret_u32_u64
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
+#define vreinterpret_u32_s64
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
+#define vreinterpret_u32_f32
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
+#define vreinterpret_u32_p16
+
+_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
+#define vreinterpret_u32_p8
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
+#define vreinterpretq_u32_u16
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
+#define vreinterpretq_u32_u8
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
+#define vreinterpretq_u32_s32
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
+#define vreinterpretq_u32_s16
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
+#define vreinterpretq_u32_s8
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
+#define vreinterpretq_u32_u64
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
+#define vreinterpretq_u32_s64
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
+#define  vreinterpretq_u32_f32(t) _M128i(t)
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
+#define vreinterpretq_u32_p16
+
+_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
+#define vreinterpretq_u32_p8
+
+//*************  Round ******************
+_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a);
+#ifdef USE_SSE4
+#   define vrndnq_f32(a) _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+#else
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( float32x4_t vrndnq_f32(float32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int i;
+    _NEON2SSE_ALIGN_16 float32_t res[4];
+    _mm_store_ps(res, a);
+     for(i = 0; i<4; i++) {
+       res[i] = nearbyintf(res[i]);
+     }
+    return _mm_load_ps(res);
+}
+#endif
+
+
+_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a);
+#ifdef USE_SSE4
+#   define  vrndnq_f64(a)  _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
+#else
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+     _NEON2SSE_ALIGN_16 float64_t res[2];
+     _mm_store_pd(res, a);
+     res[0] = nearbyint(res[0]);
+     res[1] = nearbyint(res[1]);
+     return _mm_load_pd(res);
+}
+#endif
+
+
+
+//************* Sqrt ******************
+_NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a);
+#define vsqrtq_f32 _mm_sqrt_ps
+
+_NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a);
+#define vsqrtq_f64 _mm_sqrt_pd
+
+
+#endif /* NEON2SSE_H */
diff --git a/onnxruntime/core/mlas/lib/ppc64/emmintrin.h b/onnxruntime/core/mlas/lib/ppc64/emmintrin.h
new file mode 100644
index 0000000000000..ea3512420ff40
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/ppc64/emmintrin.h
@@ -0,0 +1,2324 @@
+/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header file is to help porting code using Intel intrinsics
+   explicitly from x86_64 to powerpc64/powerpc64le.
+
+   Since X86 SSE2 intrinsics mainly handles __m128i and __m128d type,
+   PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
+   However scalar float operations in vector (XMM) registers require
+   the POWER8 VSX ISA (2.07) level. There are differences for data
+   format and placement of float scalars in the vector register, which
+   require extra steps to match SSE2 scalar float semantics on POWER.
+
+   It should be noted that there's much difference between X86_64's
+   MXSCR and PowerISA's FPSCR/VSCR registers. It's recommended to use
+   portable <fenv.h> instead of access MXSCR directly.
+
+   Most SSE2 scalar float intrinsic operations can be performed more
+   efficiently as C language float scalar operations or optimized to
+   use vector SIMD operations. We recommend this for new applications.
+*/
+#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef EMMINTRIN_H_
+#define EMMINTRIN_H_
+
+#if defined(__linux__) && defined(__powerpc64__)
+
+#include <altivec.h>
+
+/* We need definitions from the SSE header files.  */
+#include "xmmintrin.h"
+
+/* SSE2 */
+typedef __vector double __v2df;
+typedef __vector long long __v2di;
+typedef __vector unsigned long long __v2du;
+typedef __vector int __v4si;
+typedef __vector unsigned int __v4su;
+typedef __vector short __v8hi;
+typedef __vector unsigned short __v8hu;
+typedef __vector signed char __v16qi;
+typedef __vector unsigned char __v16qu;
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Unaligned version of the same types.  */
+typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+
+/* Define two value permute mask.  */
+#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
+
+/* Create a vector with element 0 as F and the rest zero.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_sd (double __F)
+{
+  return __extension__ (__m128d){ __F, 0.0 };
+}
+
+/* Create a vector with both elements equal to F.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pd (double __F)
+{
+  return __extension__ (__m128d){ __F, __F };
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pd1 (double __F)
+{
+  return _mm_set1_pd (__F);
+}
+
+/* Create a vector with the lower value X and upper value W.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pd (double __W, double __X)
+{
+  return __extension__ (__m128d){ __X, __W };
+}
+
+/* Create a vector with the lower value W and upper value X.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pd (double __W, double __X)
+{
+  return __extension__ (__m128d){ __W, __X };
+}
+
+/* Create an undefined vector.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_pd (void)
+{
+  __m128d __Y = __Y;
+  return __Y;
+}
+
+/* Create a vector of zeros.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_pd (void)
+{
+  return (__m128d) vec_splats (0);
+}
+
+/* Sets the low DPFP value of A from the low value of B.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_sd (__m128d __A, __m128d __B)
+{
+  __v2df result = (__v2df) __A;
+  result [0] = ((__v2df) __B)[0];
+  return (__m128d) result;
+}
+
+/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pd (double const *__P)
+{
+  return ((__m128d)vec_ld(0, (__v16qu*)__P));
+}
+
+/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_pd (double const *__P)
+{
+  return (vec_vsx_ld(0, __P));
+}
+
+/* Create a vector with all two elements equal to *P.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load1_pd (double const *__P)
+{
+  return (vec_splats (*__P));
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_sd (double const *__P)
+{
+  return _mm_set_sd (*__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pd1 (double const *__P)
+{
+  return _mm_load1_pd (__P);
+}
+
+/* Load two DPFP values in reverse order.  The address must be aligned.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_pd (double const *__P)
+{
+  __v2df __tmp = _mm_load_pd (__P);
+  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
+}
+
+/* Store two DPFP values.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pd (double *__P, __m128d __A)
+{
+  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
+}
+
+/* Store two DPFP values.  The address need not be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_pd (double *__P, __m128d __A)
+{
+  *(__m128d_u *)__P = __A;
+}
+
+/* Stores the lower DPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_sd (double *__P, __m128d __A)
+{
+  *__P = ((__v2df)__A)[0];
+}
+
+extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_f64 (__m128d __A)
+{
+  return ((__v2df)__A)[0];
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_pd (double *__P, __m128d __A)
+{
+  _mm_store_sd (__P, __A);
+}
+
+/* Stores the upper DPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeh_pd (double *__P, __m128d __A)
+{
+  *__P = ((__v2df)__A)[1];
+}
+/* Store the lower DPFP value across two words.
+   The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store1_pd (double *__P, __m128d __A)
+{
+  _mm_store_pd (__P, vec_splat (__A, 0));
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pd1 (double *__P, __m128d __A)
+{
+  _mm_store1_pd (__P, __A);
+}
+
+/* Store two DPFP values in reverse order.  The address must be aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storer_pd (double *__P, __m128d __A)
+{
+  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
+}
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si64 (__m128i __A)
+{
+  return ((__v2di)__A)[0];
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si64x (__m128i __A)
+{
+  return ((__v2di)__A)[0];
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) ((__v2df)__A + (__v2df)__B);
+}
+
+/* Add the lower double-precision (64-bit) floating-point element in
+   a and b, store the result in the lower element of dst, and copy
+   the upper element from a to the upper element of dst. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_sd (__m128d __A, __m128d __B)
+{
+  __A[0] = __A[0] + __B[0];
+  return (__A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) ((__v2df)__A - (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_sd (__m128d __A, __m128d __B)
+{
+  __A[0] = __A[0] - __B[0];
+  return (__A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) ((__v2df)__A * (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_sd (__m128d __A, __m128d __B)
+{
+  __A[0] = __A[0] * __B[0];
+  return (__A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) ((__v2df)__A / (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_sd (__m128d __A, __m128d __B)
+{
+  __A[0] = __A[0] / __B[0];
+  return (__A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_pd (__m128d __A)
+{
+  return (vec_sqrt (__A));
+}
+
+/* Return pair {sqrt (B[0]), A[1]}.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_sd (__m128d __A, __m128d __B)
+{
+  __v2df c;
+  c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pd (__m128d __A, __m128d __B)
+{
+  return (vec_min (__A, __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  c = vec_min (a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pd (__m128d __A, __m128d __B)
+{
+  return (vec_max (__A, __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  c = vec_max (a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pd (__m128d __A, __m128d __B)
+{
+  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_pd (__m128d __A, __m128d __B)
+{
+  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_pd (__m128d __A, __m128d __B)
+{
+  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pd (__m128d __A, __m128d __B)
+{
+  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_pd (__m128d __A, __m128d __B)
+{
+  return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_pd (__m128d __A, __m128d __B)
+{
+  __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
+  return ((__m128d)vec_nor (temp, temp));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_pd (__m128d __A, __m128d __B)
+{
+  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_pd (__m128d __A, __m128d __B)
+{
+  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_pd (__m128d __A, __m128d __B)
+{
+  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_pd (__m128d __A, __m128d __B)
+{
+  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_pd (__m128d __A, __m128d __B)
+{
+#if _ARCH_PWR8
+  __v2du c, d;
+  /* Compare against self will return false (0's) if NAN.  */
+  c = (__v2du)vec_cmpeq (__A, __A);
+  d = (__v2du)vec_cmpeq (__B, __B);
+#else
+  __v2du a, b;
+  __v2du c, d;
+  const __v2du double_exp_mask  = {0x7ff0000000000000, 0x7ff0000000000000};
+  a = (__v2du)vec_abs ((__v2df)__A);
+  b = (__v2du)vec_abs ((__v2df)__B);
+  c = (__v2du)vec_cmpgt (double_exp_mask, a);
+  d = (__v2du)vec_cmpgt (double_exp_mask, b);
+#endif
+  /* A != NAN and B != NAN.  */
+  return ((__m128d)vec_and(c, d));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_pd (__m128d __A, __m128d __B)
+{
+#if _ARCH_PWR8
+  __v2du c, d;
+  /* Compare against self will return false (0's) if NAN.  */
+  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
+  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
+  /* A == NAN OR B == NAN converts too:
+     NOT(A != NAN) OR NOT(B != NAN).  */
+  c = vec_nor (c, c);
+  return ((__m128d)vec_orc(c, d));
+#else
+  __v2du c, d;
+  /* Compare against self will return false (0's) if NAN.  */
+  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
+  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
+  /* Convert the true ('1's) is NAN.  */
+  c = vec_nor (c, c);
+  d = vec_nor (d, d);
+  return ((__m128d)vec_or(c, d));
+#endif
+}
+
+extern __inline  __m128d  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_sd(__m128d  __A, __m128d  __B)
+{
+  __v2df a, b, c;
+  /* PowerISA VSX does not allow partial (for just lower double)
+     results. So to insure we don't generate spurious exceptions
+     (from the upper double values) we splat the lower double
+     before we do the operation. */
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  c = (__v2df) vec_cmpeq(a, b);
+  /* Then we merge the lower double result with the original upper
+     double from __A.  */
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  c = (__v2df) vec_cmplt(a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  c = (__v2df) vec_cmple(a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  c = (__v2df) vec_cmpgt(a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  c = (__v2df) vec_cmpge(a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  c = (__v2df) vec_cmpeq(a, b);
+  c = vec_nor (c, c);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  /* Not less than is just greater than or equal.  */
+  c = (__v2df) vec_cmpge(a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  /* Not less than or equal is just greater than.  */
+  c = (__v2df) vec_cmpge(a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  /* Not greater than is just less than or equal.  */
+  c = (__v2df) vec_cmple(a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_sd (__m128d __A, __m128d __B)
+{
+  __v2df a, b, c;
+  a = vec_splats (__A[0]);
+  b = vec_splats (__B[0]);
+  /* Not greater than or equal is just less than.  */
+  c = (__v2df) vec_cmplt(a, b);
+  return (__m128d) _mm_setr_pd (c[0], __A[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_sd (__m128d __A, __m128d __B)
+{
+  __v2df r;
+  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
+  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_sd (__m128d __A, __m128d __B)
+{
+  __v2df r;
+  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
+  return (__m128d) _mm_setr_pd (r[0], __A[1]);
+}
+
+/* FIXME
+   The __mm_comi??_sd and __mm_ucomi??_sd implementations below are
+   exactly the same because GCC for PowerPC only generates unordered
+   compares (scalar and vector).
+   Technically __mm_comieq_sp et all should be using the ordered
+   compare and signal for QNaNs.  The __mm_ucomieq_sd et all should
+   be OK.   */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_sd (__m128d __A, __m128d __B)
+{
+  return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_sd (__m128d __A, __m128d __B)
+{
+  return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_sd (__m128d __A, __m128d __B)
+{
+  return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_sd (__m128d __A, __m128d __B)
+{
+  return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_sd (__m128d __A, __m128d __B)
+{
+  return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_sd (__m128d __A, __m128d __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_sd (__m128d __A, __m128d __B)
+{
+	return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_sd (__m128d __A, __m128d __B)
+{
+	return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_sd (__m128d __A, __m128d __B)
+{
+	return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_sd (__m128d __A, __m128d __B)
+{
+	return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_sd (__m128d __A, __m128d __B)
+{
+	return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_sd (__m128d __A, __m128d __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+/* Create a vector of Qi, where i is the element number.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi64x (long long __q1, long long __q0)
+{
+  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi64 (__m64 __q1,  __m64 __q0)
+{
+  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
+{
+  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
+	       short __q3, short __q2, short __q1, short __q0)
+{
+  return __extension__ (__m128i)(__v8hi){
+    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
+	      char __q11, char __q10, char __q09, char __q08,
+	      char __q07, char __q06, char __q05, char __q04,
+	      char __q03, char __q02, char __q01, char __q00)
+{
+  return __extension__ (__m128i)(__v16qi){
+    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+  };
+}
+
+/* Set all of the elements of the vector to A.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi64x (long long __A)
+{
+  return _mm_set_epi64x (__A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi64 (__m64 __A)
+{
+  return _mm_set_epi64 (__A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi32 (int __A)
+{
+  return _mm_set_epi32 (__A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi16 (short __A)
+{
+  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi8 (char __A)
+{
+  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+		       __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+/* Create a vector of Qi, where i is the element number.
+   The parameter order is reversed from the _mm_set_epi* functions.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi64 (__m64 __q0, __m64 __q1)
+{
+  return _mm_set_epi64 (__q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
+{
+  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
+	        short __q4, short __q5, short __q6, short __q7)
+{
+  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
+	       char __q04, char __q05, char __q06, char __q07,
+	       char __q08, char __q09, char __q10, char __q11,
+	       char __q12, char __q13, char __q14, char __q15)
+{
+  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
+		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_si128 (__m128i const *__P)
+{
+  return *__P;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si128 (__m128i_u const *__P)
+{
+  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_epi64 (__m128i_u const *__P)
+{
+  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_si128 (__m128i *__P, __m128i __B)
+{
+  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
+{
+  *__P = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
+{
+  *(long long *)__P = ((__v2di)__B)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi64_pi64 (__m128i_u __B)
+{
+  return (__m64) ((__v2di)__B)[0];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movpi64_epi64 (__m64 __A)
+{
+  return _mm_set_epi64 ((__m64)0LL, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_epi64 (__m128i __A)
+{
+  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
+}
+
+/* Create an undefined vector.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_si128 (void)
+{
+  __m128i __Y = __Y;
+  return __Y;
+}
+
+/* Create a vector of zeros.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si128 (void)
+{
+  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_pd (__m128i __A)
+{
+  __v2di val;
+  /* For LE need to generate Vector Unpack Low Signed Word.
+     Which is generated from unpackh.  */
+  val = (__v2di)vec_unpackh ((__v4si)__A);
+
+  return (__m128d)vec_ctf (val, 0);
+}
+#endif
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_ps (__m128i __A)
+{
+  return ((__m128)vec_ctf((__v4si)__A, 0));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epi32 (__m128d __A)
+{
+  __v2df rounded = vec_rint (__A);
+  __v4si result, temp;
+  const __v4si vzero =
+    { 0, 0, 0, 0 };
+
+  /* VSX Vector truncate Double-Precision to integer and Convert to
+   Signed Integer Word format with Saturate.  */
+  __asm__(
+      "xvcvdpsxws %x0,%x1"
+      : "=wa" (temp)
+      : "wa" (rounded)
+      : );
+
+#ifdef _ARCH_PWR8
+  temp = vec_mergeo (temp, temp);
+  result = (__v4si) vec_vpkudum ((__vector long long) temp,
+				 (__vector long long) vzero);
+#else
+  {
+    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
+    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
+  }
+#endif
+  return (__m128i) result;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_pi32 (__m128d __A)
+{
+  __m128i result = _mm_cvtpd_epi32(__A);
+
+  return (__m64) result[0];
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_ps (__m128d __A)
+{
+  __v4sf result;
+  __v4si temp;
+  const __v4si vzero = { 0, 0, 0, 0 };
+
+  __asm__(
+      "xvcvdpsp %x0,%x1"
+      : "=wa" (temp)
+      : "wa" (__A)
+      : );
+
+#ifdef _ARCH_PWR8
+  temp = vec_mergeo (temp, temp);
+  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
+				 (__vector long long) vzero);
+#else
+  {
+    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
+    result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
+  }
+#endif
+  return ((__m128)result);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epi32 (__m128d __A)
+{
+  __v4si result;
+  __v4si temp;
+  const __v4si vzero = { 0, 0, 0, 0 };
+
+  /* VSX Vector truncate Double-Precision to integer and Convert to
+   Signed Integer Word format with Saturate.  */
+  __asm__(
+      "xvcvdpsxws %x0,%x1"
+      : "=wa" (temp)
+      : "wa" (__A)
+      : );
+
+#ifdef _ARCH_PWR8
+  temp = vec_mergeo (temp, temp);
+  result = (__v4si) vec_vpkudum ((__vector long long) temp,
+				 (__vector long long) vzero);
+#else
+  {
+    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
+    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
+  }
+#endif
+
+  return ((__m128i) result);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_pi32 (__m128d __A)
+{
+  __m128i result = _mm_cvttpd_epi32 (__A);
+
+  return (__m64) result[0];
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si32 (__m128i __A)
+{
+  return ((__v4si)__A)[0];
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_pd (__m64 __A)
+{
+  __v4si temp;
+  __v2di tmp2;
+  __v2df result;
+
+  temp = (__v4si)vec_splats (__A);
+  tmp2 = (__v2di)vec_unpackl (temp);
+  result = vec_ctf ((__vector signed long long) tmp2, 0);
+  return (__m128d)result;
+}
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epi32 (__m128 __A)
+{
+  __v4sf rounded;
+  __v4si result;
+
+  rounded = vec_rint((__v4sf) __A);
+  result = vec_cts (rounded, 0);
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epi32 (__m128 __A)
+{
+  __v4si result;
+
+  result = vec_cts ((__v4sf) __A, 0);
+  return (__m128i) result;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pd (__m128 __A)
+{
+  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
+#ifdef vec_doubleh
+  return (__m128d) vec_doubleh ((__v4sf)__A);
+#else
+  /* Otherwise the compiler is not current and so need to generate the
+     equivalent code.  */
+  __v4sf a = (__v4sf)__A;
+  __v4sf temp;
+  __v2df result;
+#ifdef __LITTLE_ENDIAN__
+  /* The input float values are in elements {[0], [1]} but the convert
+     instruction needs them in elements {[1], [3]}, So we use two
+     shift left double vector word immediates to get the elements
+     lined up.  */
+  temp = __builtin_vsx_xxsldwi (a, a, 3);
+  temp = __builtin_vsx_xxsldwi (a, temp, 2);
+#else
+  /* The input float values are in elements {[0], [1]} but the convert
+     instruction needs them in elements {[0], [2]}, So we use two
+     shift left double vector word immediates to get the elements
+     lined up.  */
+  temp = vec_vmrghw (a, a);
+#endif
+  __asm__(
+      " xvcvspdp %x0,%x1"
+      : "=wa" (result)
+      : "wa" (temp)
+      : );
+  return (__m128d) result;
+#endif
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si32 (__m128d __A)
+{
+  __v2df rounded = vec_rint((__v2df) __A);
+  int result = ((__v2df)rounded)[0];
+
+  return result;
+}
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si64 (__m128d __A)
+{
+  __v2df rounded = vec_rint ((__v2df) __A );
+  long long result = ((__v2df) rounded)[0];
+
+  return result;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si64x (__m128d __A)
+{
+  return _mm_cvtsd_si64 ((__v2df)__A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si32 (__m128d __A)
+{
+  int result = ((__v2df)__A)[0];
+
+  return result;
+}
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si64 (__m128d __A)
+{
+  long long result = ((__v2df)__A)[0];
+
+  return result;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si64x (__m128d __A)
+{
+  return _mm_cvttsd_si64 (__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_ss (__m128 __A, __m128d __B)
+{
+  __v4sf result = (__v4sf)__A;
+
+#ifdef __LITTLE_ENDIAN__
+  __v4sf temp_s;
+  /* Copy double element[0] to element [1] for conversion.  */
+  __v2df temp_b = vec_splat((__v2df)__B, 0);
+
+  /* Pre-rotate __A left 3 (logically right 1) elements.  */
+  result = __builtin_vsx_xxsldwi (result, result, 3);
+  /* Convert double to single float scalar in a vector.  */
+  __asm__(
+      "xscvdpsp %x0,%x1"
+      : "=wa" (temp_s)
+      : "wa" (temp_b)
+      : );
+  /* Shift the resulting scalar into vector element [0].  */
+  result = __builtin_vsx_xxsldwi (result, temp_s, 1);
+#else
+  result [0] = ((__v2df)__B)[0];
+#endif
+  return (__m128) result;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_sd (__m128d __A, int __B)
+{
+  __v2df result = (__v2df)__A;
+  double db = __B;
+  result [0] = db;
+  return (__m128d)result;
+}
+
+/* Intel intrinsic.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_sd (__m128d __A, long long __B)
+{
+  __v2df result = (__v2df)__A;
+  double db = __B;
+  result [0] = db;
+  return (__m128d)result;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_sd (__m128d __A, long long __B)
+{
+  return _mm_cvtsi64_sd (__A, __B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_sd (__m128d __A, __m128 __B)
+{
+#ifdef __LITTLE_ENDIAN__
+  /* Use splat to move element [0] into position for the convert. */
+  __v4sf temp = vec_splat ((__v4sf)__B, 0);
+  __v2df res;
+  /* Convert single float scalar to double in a vector.  */
+  __asm__(
+      "xscvspdp %x0,%x1"
+      : "=wa" (res)
+      : "wa" (temp)
+      : );
+  return (__m128d) vec_mergel (res, (__v2df)__A);
+#else
+  __v2df res = (__v2df)__A;
+  res [0] = ((__v4sf)__B) [0];
+  return (__m128d) res;
+#endif
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
+{
+  __vector double result;
+  const int litmsk = __mask & 0x3;
+
+  if (litmsk == 0)
+    result = vec_mergeh (__A, __B);
+#if __GNUC__ < 6
+  else if (litmsk == 1)
+    result = vec_xxpermdi (__B, __A, 2);
+  else if (litmsk == 2)
+    result = vec_xxpermdi (__B, __A, 1);
+#else
+  else if (litmsk == 1)
+    result = vec_xxpermdi (__A, __B, 2);
+  else if (litmsk == 2)
+    result = vec_xxpermdi (__A, __B, 1);
+#endif
+  else
+    result = vec_mergel (__A, __B);
+
+  return result;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadh_pd (__m128d __A, double const *__B)
+{
+  __v2df result = (__v2df)__A;
+  result [1] = *__B;
+  return (__m128d)result;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_pd (__m128d __A, double const *__B)
+{
+  __v2df result = (__v2df)__A;
+  result [0] = *__B;
+  return (__m128d)result;
+}
+
+#ifdef _ARCH_PWR8
+/* Intrinsic functions that require PowerISA 2.07 minimum.  */
+
+/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pd (__m128d  __A)
+{
+  __vector unsigned long long result;
+  static const __vector unsigned int perm_mask =
+    {
+#ifdef __LITTLE_ENDIAN__
+	0x80800040, 0x80808080, 0x80808080, 0x80808080
+#else
+      0x80808080, 0x80808080, 0x80808080, 0x80804000
+#endif
+    };
+
+  result = ((__vector unsigned long long)
+	    vec_vbpermq ((__vector unsigned char) __A,
+			 (__vector unsigned char) perm_mask));
+
+#ifdef __LITTLE_ENDIAN__
+  return result[1];
+#else
+  return result[0];
+#endif
+}
+#endif /* _ARCH_PWR8 */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_mergel ((__vector long long) __A,
+			       (__vector long long) __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_mergeh ((__vector long long) __A,
+			       (__vector long long) __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v4su)__A + (__v4su)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A + (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v4su)__A - (__v4su)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A - (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_epi16 (__m128i __A, __m128i __B)
+{
+  __vector signed int zero = {0, 0, 0, 0};
+
+  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epi16 (__m128i __A, __m128i __B)
+{
+  __vector signed int w0, w1;
+
+  __vector unsigned char xform1 = {
+#ifdef __LITTLE_ENDIAN__
+      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
+      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
+      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
+#endif
+    };
+
+  w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
+  w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
+  return (__m128i) vec_perm (w0, w1, xform1);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi16 (__m128i __A, __m128i __B)
+{
+    return (__m128i) ((__v8hi)__A * (__v8hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_su32 (__m64 __A, __m64 __B)
+{
+  unsigned int a = __A;
+  unsigned int b = __B;
+
+  return ((__m64)a * (__m64)b);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epu32 (__m128i __A, __m128i __B)
+{
+#if __GNUC__ < 8
+  __v2du result;
+
+#ifdef __LITTLE_ENDIAN__
+  /* VMX Vector Multiply Odd Unsigned Word.  */
+  __asm__(
+      "vmulouw %0,%1,%2"
+      : "=v" (result)
+      : "v" (__A), "v" (__B)
+      : );
+#else
+  /* VMX Vector Multiply Even Unsigned Word.  */
+  __asm__(
+      "vmuleuw %0,%1,%2"
+      : "=v" (result)
+      : "v" (__A), "v" (__B)
+      : );
+#endif
+  return (__m128i) result;
+#else
+  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
+#endif
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi16 (__m128i __A, int __B)
+{
+  __v8hu lshift;
+  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+  if (__B >= 0 && __B < 16)
+    {
+      if (__builtin_constant_p(__B))
+	lshift = (__v8hu) vec_splat_s16(__B);
+      else
+	lshift = vec_splats ((unsigned short) __B);
+
+      result = vec_sl ((__v8hi) __A, lshift);
+    }
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi32 (__m128i __A, int __B)
+{
+  __v4su lshift;
+  __v4si result = { 0, 0, 0, 0 };
+
+  if (__B >= 0 && __B < 32)
+    {
+      if (__builtin_constant_p(__B) && __B < 16)
+	lshift = (__v4su) vec_splat_s32(__B);
+      else
+	lshift = vec_splats ((unsigned int) __B);
+
+      result = vec_sl ((__v4si) __A, lshift);
+    }
+
+  return (__m128i) result;
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi64 (__m128i __A, int __B)
+{
+  __v2du lshift;
+  __v2di result = { 0, 0 };
+
+  if (__B >= 0 && __B < 64)
+    {
+      if (__builtin_constant_p(__B) && __B < 16)
+	lshift = (__v2du) vec_splat_s32(__B);
+      else
+	lshift = (__v2du) vec_splats ((unsigned int) __B);
+
+      result = vec_sl ((__v2di) __A, lshift);
+    }
+
+  return (__m128i) result;
+}
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi16 (__m128i __A, int __B)
+{
+  __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
+  __v8hi result;
+
+  if (__B < 16)
+    {
+      if (__builtin_constant_p(__B))
+	rshift = (__v8hu) vec_splat_s16(__B);
+      else
+	rshift = vec_splats ((unsigned short) __B);
+    }
+  result = vec_sra ((__v8hi) __A, rshift);
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi32 (__m128i __A, int __B)
+{
+  __v4su rshift = { 31, 31, 31, 31 };
+  __v4si result;
+
+  if (__B < 32)
+    {
+      if (__builtin_constant_p(__B))
+	{
+	  if (__B < 16)
+	      rshift = (__v4su) vec_splat_s32(__B);
+	    else
+	      rshift = (__v4su) vec_splats((unsigned int)__B);
+	}
+      else
+	rshift = vec_splats ((unsigned int) __B);
+    }
+  result = vec_sra ((__v4si) __A, rshift);
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bslli_si128 (__m128i __A, const int __N)
+{
+  __v16qu result;
+  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+  if (__N < 16)
+    result = vec_sld ((__v16qu) __A, zeros, __N);
+  else
+    result = zeros;
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bsrli_si128 (__m128i __A, const int __N)
+{
+  __v16qu result;
+  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+  if (__N < 16)
+#ifdef __LITTLE_ENDIAN__
+    if (__builtin_constant_p(__N))
+      /* Would like to use Vector Shift Left Double by Octet
+	 Immediate here to use the immediate form and avoid
+	 load of __N * 8 value into a separate VR.  */
+      result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
+    else
+#endif
+      {
+	__v16qu shift = vec_splats((unsigned char)(__N*8));
+#ifdef __LITTLE_ENDIAN__
+	result = vec_sro ((__v16qu)__A, shift);
+#else
+	result = vec_slo ((__v16qu)__A, shift);
+#endif
+      }
+  else
+    result = zeros;
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si128 (__m128i __A, const int __N)
+{
+  return _mm_bsrli_si128 (__A, __N);
+}
+
+extern __inline  __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si128 (__m128i __A, const int _imm5)
+{
+  __v16qu result;
+  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+  if (_imm5 < 16)
+#ifdef __LITTLE_ENDIAN__
+    result = vec_sld ((__v16qu) __A, zeros, _imm5);
+#else
+    result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
+#endif
+  else
+    result = zeros;
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+_mm_srli_epi16 (__m128i  __A, int __B)
+{
+  __v8hu rshift;
+  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+  if (__B < 16)
+    {
+      if (__builtin_constant_p(__B))
+	rshift = (__v8hu) vec_splat_s16(__B);
+      else
+	rshift = vec_splats ((unsigned short) __B);
+
+      result = vec_sr ((__v8hi) __A, rshift);
+    }
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi32 (__m128i __A, int __B)
+{
+  __v4su rshift;
+  __v4si result = { 0, 0, 0, 0 };
+
+  if (__B < 32)
+    {
+      if (__builtin_constant_p(__B))
+	{
+	  if (__B < 16)
+	      rshift = (__v4su) vec_splat_s32(__B);
+	    else
+	      rshift = (__v4su) vec_splats((unsigned int)__B);
+	}
+      else
+	rshift = vec_splats ((unsigned int) __B);
+
+      result = vec_sr ((__v4si) __A, rshift);
+    }
+
+  return (__m128i) result;
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi64 (__m128i __A, int __B)
+{
+  __v2du rshift;
+  __v2di result = { 0, 0 };
+
+  if (__B < 64)
+    {
+      if (__builtin_constant_p(__B))
+	{
+	  if (__B < 16)
+	      rshift = (__v2du) vec_splat_s32(__B);
+	    else
+	      rshift = (__v2du) vec_splats((unsigned long long)__B);
+	}
+      else
+	rshift = (__v2du) vec_splats ((unsigned int) __B);
+
+      result = vec_sr ((__v2di) __A, rshift);
+    }
+
+  return (__m128i) result;
+}
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi16 (__m128i __A, __m128i __B)
+{
+  __v8hu lshift;
+  __vector __bool short shmask;
+  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
+  __v8hu result;
+
+#ifdef __LITTLE_ENDIAN__
+  lshift = vec_splat ((__v8hu) __B, 0);
+#else
+  lshift = vec_splat ((__v8hu) __B, 3);
+#endif
+  shmask = vec_cmple (lshift, shmax);
+  result = vec_sl ((__v8hu) __A, lshift);
+  result = vec_sel ((__v8hu) shmask, result, shmask);
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi32 (__m128i __A, __m128i __B)
+{
+  __v4su lshift;
+  __vector __bool int shmask;
+  const __v4su shmax = { 32, 32, 32, 32 };
+  __v4su result;
+#ifdef __LITTLE_ENDIAN__
+  lshift = vec_splat ((__v4su) __B, 0);
+#else
+  lshift = vec_splat ((__v4su) __B, 1);
+#endif
+  shmask = vec_cmplt (lshift, shmax);
+  result = vec_sl ((__v4su) __A, lshift);
+  result = vec_sel ((__v4su) shmask, result, shmask);
+
+  return (__m128i) result;
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi64 (__m128i __A, __m128i __B)
+{
+  __v2du lshift;
+  __vector __bool long long shmask;
+  const __v2du shmax = { 64, 64 };
+  __v2du result;
+
+  lshift = vec_splat ((__v2du) __B, 0);
+  shmask = vec_cmplt (lshift, shmax);
+  result = vec_sl ((__v2du) __A, lshift);
+  result = vec_sel ((__v2du) shmask, result, shmask);
+
+  return (__m128i) result;
+}
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi16 (__m128i __A, __m128i __B)
+{
+  const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
+  __v8hu rshift;
+  __v8hi result;
+
+#ifdef __LITTLE_ENDIAN__
+  rshift = vec_splat ((__v8hu)__B, 0);
+#else
+  rshift = vec_splat ((__v8hu)__B, 3);
+#endif
+  rshift = vec_min (rshift, rshmax);
+  result = vec_sra ((__v8hi) __A, rshift);
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi32 (__m128i __A, __m128i __B)
+{
+  const __v4su rshmax = { 31, 31, 31, 31 };
+  __v4su rshift;
+  __v4si result;
+
+#ifdef __LITTLE_ENDIAN__
+  rshift = vec_splat ((__v4su)__B, 0);
+#else
+  rshift = vec_splat ((__v4su)__B, 1);
+#endif
+  rshift = vec_min (rshift, rshmax);
+  result = vec_sra ((__v4si) __A, rshift);
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi16 (__m128i __A, __m128i __B)
+{
+  __v8hu rshift;
+  __vector __bool short shmask;
+  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
+  __v8hu result;
+
+#ifdef __LITTLE_ENDIAN__
+  rshift = vec_splat ((__v8hu) __B, 0);
+#else
+  rshift = vec_splat ((__v8hu) __B, 3);
+#endif
+  shmask = vec_cmple (rshift, shmax);
+  result = vec_sr ((__v8hu) __A, rshift);
+  result = vec_sel ((__v8hu) shmask, result, shmask);
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi32 (__m128i __A, __m128i __B)
+{
+  __v4su rshift;
+  __vector __bool int shmask;
+  const __v4su shmax = { 32, 32, 32, 32 };
+  __v4su result;
+
+#ifdef __LITTLE_ENDIAN__
+  rshift = vec_splat ((__v4su) __B, 0);
+#else
+  rshift = vec_splat ((__v4su) __B, 1);
+#endif
+  shmask = vec_cmplt (rshift, shmax);
+  result = vec_sr ((__v4su) __A, rshift);
+  result = vec_sel ((__v4su) shmask, result, shmask);
+
+  return (__m128i) result;
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi64 (__m128i __A, __m128i __B)
+{
+  __v2du rshift;
+  __vector __bool long long shmask;
+  const __v2du shmax = { 64, 64 };
+  __v2du result;
+
+  rshift = vec_splat ((__v2du) __B, 0);
+  shmask = vec_cmplt (rshift, shmax);
+  result = vec_sr ((__v2du) __A, rshift);
+  result = vec_sel ((__v2du) shmask, result, shmask);
+
+  return (__m128i) result;
+}
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_pd (__m128d __A, __m128d __B)
+{
+  return (vec_and ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_pd (__m128d __A, __m128d __B)
+{
+  return (vec_andc ((__v2df) __B, (__v2df) __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_pd (__m128d __A, __m128d __B)
+{
+  return (vec_or ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_pd (__m128d __A, __m128d __B)
+{
+  return (vec_xor ((__v2df) __A, (__v2df) __B));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi16 (__m128i const __A, int const __N)
+{
+  return (unsigned short) ((__v8hi)__A)[__N & 7];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
+{
+  __v8hi result = (__v8hi)__A;
+
+  result [(__N & 7)] = __D;
+
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
+}
+
+
+#ifdef _ARCH_PWR8
+/* Intrinsic functions that require PowerISA 2.07 minimum.  */
+
+/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_epi8 (__m128i __A)
+{
+  __vector unsigned long long result;
+  static const __vector unsigned char perm_mask =
+    {
+	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
+	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
+    };
+
+  result = ((__vector unsigned long long)
+	    vec_vbpermq ((__vector unsigned char) __A,
+			 (__vector unsigned char) perm_mask));
+
+#ifdef __LITTLE_ENDIAN__
+  return result[1];
+#else
+  return result[0];
+#endif
+}
+#endif /* _ARCH_PWR8 */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epu16 (__m128i __A, __m128i __B)
+{
+  __v4su w0, w1;
+  __v16qu xform1 = {
+#ifdef __LITTLE_ENDIAN__
+      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
+      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
+      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
+#endif
+    };
+
+  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
+  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
+  return (__m128i) vec_perm (w0, w1, xform1);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflehi_epi16 (__m128i __A, const int __mask)
+{
+  unsigned long element_selector_98 = __mask & 0x03;
+  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
+  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
+  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
+  static const unsigned short permute_selectors[4] =
+    {
+#ifdef __LITTLE_ENDIAN__
+	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
+#else
+	      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
+#endif
+    };
+  __v2du pmask =
+#ifdef __LITTLE_ENDIAN__
+      { 0x1716151413121110UL,  0UL};
+#else
+      { 0x1011121314151617UL,  0UL};
+#endif
+  __m64_union t;
+  __v2du a, r;
+
+  t.as_short[0] = permute_selectors[element_selector_98];
+  t.as_short[1] = permute_selectors[element_selector_BA];
+  t.as_short[2] = permute_selectors[element_selector_DC];
+  t.as_short[3] = permute_selectors[element_selector_FE];
+  pmask[1] = t.as_m64;
+  a = (__v2du)__A;
+  r = vec_perm (a, a, (__vector unsigned char)pmask);
+  return (__m128i) r;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflelo_epi16 (__m128i __A, const int __mask)
+{
+  unsigned long element_selector_10 = __mask & 0x03;
+  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
+  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
+  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
+  static const unsigned short permute_selectors[4] =
+    {
+#ifdef __LITTLE_ENDIAN__
+	      0x0100, 0x0302, 0x0504, 0x0706
+#else
+	      0x0001, 0x0203, 0x0405, 0x0607
+#endif
+    };
+  __v2du pmask =
+#ifdef __LITTLE_ENDIAN__
+                 { 0UL,  0x1f1e1d1c1b1a1918UL};
+#else
+                 { 0UL,  0x18191a1b1c1d1e1fUL};
+#endif
+  __m64_union t;
+  __v2du a, r;
+  t.as_short[0] = permute_selectors[element_selector_10];
+  t.as_short[1] = permute_selectors[element_selector_32];
+  t.as_short[2] = permute_selectors[element_selector_54];
+  t.as_short[3] = permute_selectors[element_selector_76];
+  pmask[0] = t.as_m64;
+  a = (__v2du)__A;
+  r = vec_perm (a, a, (__vector unsigned char)pmask);
+  return (__m128i) r;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi32 (__m128i __A, const int __mask)
+{
+  unsigned long element_selector_10 = __mask & 0x03;
+  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
+  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
+  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
+  static const unsigned int permute_selectors[4] =
+    {
+#ifdef __LITTLE_ENDIAN__
+	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+#else
+      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
+#endif
+    };
+  __v4su t;
+
+  t[0] = permute_selectors[element_selector_10];
+  t[1] = permute_selectors[element_selector_32];
+  t[2] = permute_selectors[element_selector_54] + 0x10101010;
+  t[3] = permute_selectors[element_selector_76] + 0x10101010;
+  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
+{
+  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
+  __v16qu mask, tmp;
+  __m128i_u *p = (__m128i_u*)__C;
+
+  tmp = (__v16qu)_mm_loadu_si128(p);
+  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
+  tmp = vec_sel (tmp, (__v16qu)__A, mask);
+  _mm_storeu_si128 (p, (__m128i)tmp);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
+}
+
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_epu8 (__m128i __A, __m128i __B)
+{
+  __v16qu a, b;
+  __v16qu vmin, vmax, vabsdiff;
+  __v4si vsum;
+  const __v4su zero = { 0, 0, 0, 0 };
+  __v4si result;
+
+  a = (__v16qu) __A;
+  b = (__v16qu) __B;
+  vmin = vec_min (a, b);
+  vmax = vec_max (a, b);
+  vabsdiff = vec_sub (vmax, vmin);
+  /* Sum four groups of bytes into integers.  */
+  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
+  /* Sum across four integers with two integer results.  */
+  result = vec_sum2s (vsum, (__vector signed int) zero);
+  /* Rotate the sums into the correct position.  */
+#ifdef __LITTLE_ENDIAN__
+  result = vec_sld (result, result, 4);
+#else
+  result = vec_sld (result, result, 6);
+#endif
+  /* Rotate the sums into the correct position.  */
+  return (__m128i) result;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si32 (int *__A, int __B)
+{
+  /* Use the data cache block touch for store transient.  */
+  __asm__ (
+    "dcbtstt 0,%0"
+    :
+    : "b" (__A)
+    : "memory"
+  );
+  *__A = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si64 (long long int *__A, long long int __B)
+{
+  /* Use the data cache block touch for store transient.  */
+  __asm__ (
+    "	dcbtstt	0,%0"
+    :
+    : "b" (__A)
+    : "memory"
+  );
+  *__A = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si128 (__m128i *__A, __m128i __B)
+{
+  /* Use the data cache block touch for store transient.  */
+  __asm__ (
+    "dcbtstt 0,%0"
+    :
+    : "b" (__A)
+    : "memory"
+  );
+  *__A = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pd (double *__A, __m128d __B)
+{
+  /* Use the data cache block touch for store transient.  */
+  __asm__ (
+    "dcbtstt 0,%0"
+    :
+    : "b" (__A)
+    : "memory"
+  );
+  *(__m128d*)__A = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clflush (void const *__A)
+{
+  /* Use the data cache block flush.  */
+  __asm__ (
+    "dcbf 0,%0"
+    :
+    : "b" (__A)
+    : "memory"
+  );
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lfence (void)
+{
+  /* Use light weight sync for load to load ordering.  */
+  __atomic_thread_fence (__ATOMIC_RELEASE);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mfence (void)
+{
+  /* Use heavy weight sync for any to any ordering.  */
+  __atomic_thread_fence (__ATOMIC_SEQ_CST);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si128 (int __A)
+{
+  return _mm_set_epi32 (0, 0, 0, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si128 (long long __A)
+{
+  return __extension__ (__m128i)(__v2di){ __A, 0LL };
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si128 (long long __A)
+{
+  return __extension__ (__m128i)(__v2di){ __A, 0LL };
+}
+
+/* Casts between various SP, DP, INT vector types.  Note that these do no
+   conversion of values, they just change the type.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ps(__m128d __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_si128(__m128d __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pd(__m128 __A)
+{
+  return (__m128d) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_si128(__m128 __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ps(__m128i __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pd(__m128i __A)
+{
+  return (__m128d) __A;
+}
+
+#else
+#include_next <emmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
+#endif /* EMMINTRIN_H_ */
diff --git a/onnxruntime/core/mlas/lib/ppc64/mm_malloc.h b/onnxruntime/core/mlas/lib/ppc64/mm_malloc.h
new file mode 100644
index 0000000000000..b3ad41dfd3faa
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/ppc64/mm_malloc.h
@@ -0,0 +1,50 @@
+/*===---- mm_malloc.h - Implementation of _mm_malloc and _mm_free ----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM_MALLOC_H_INCLUDED
+#define _MM_MALLOC_H_INCLUDED
+
+#if defined(__linux__) && defined(__powerpc64__)
+
+#include <stdlib.h>
+
+/* We can't depend on <stdlib.h> since the prototype of posix_memalign
+   may not be visible.  */
+#ifndef __cplusplus
+extern int posix_memalign (void **, size_t, size_t);
+#else
+extern "C" int posix_memalign (void **, size_t, size_t) throw ();
+#endif
+
+static __inline void *
+_mm_malloc (size_t size, size_t alignment)
+{
+  /* PowerPC64 ELF V2 ABI requires quadword alignment.  */
+  size_t vec_align = sizeof (__vector float);
+  void *ptr;
+
+  if (alignment < vec_align)
+    alignment = vec_align;
+  if (posix_memalign (&ptr, alignment, size) == 0)
+    return ptr;
+  else
+    return NULL;
+}
+
+static __inline void
+_mm_free (void * ptr)
+{
+  free (ptr);
+}
+
+#else
+#include_next <mm_malloc.h>
+#endif
+
+#endif /* _MM_MALLOC_H_INCLUDED */
diff --git a/onnxruntime/core/mlas/lib/ppc64/mmintrin.h b/onnxruntime/core/mlas/lib/ppc64/mmintrin.h
new file mode 100644
index 0000000000000..1722f8ba293de
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/ppc64/mmintrin.h
@@ -0,0 +1,1450 @@
+/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header file is to help porting code using Intel intrinsics
+   explicitly from x86_64 to powerpc64/powerpc64le.
+
+   Since PowerPC target doesn't support native 64-bit vector type, we
+   typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
+   works well for _si64 and some _pi32 operations.
+
+   For _pi16 and _pi8 operations, it's better to transfer __m64 into
+   128-bit PowerPC vector first. Power8 introduced direct register
+   move instructions which helps for more efficient implementation.
+
+   It's user's responsibility to determine if the results of such port
+   are acceptable or further changes are needed. Please note that much
+   code using Intel intrinsics CAN BE REWRITTEN in more portable and
+   efficient standard C or GNU C extensions with 64-bit scalar
+   operations, or 128-bit SSE/Altivec operations, which are more
+   recommended. */
+#error                                                                         \
+    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef _MMINTRIN_H_INCLUDED
+#define _MMINTRIN_H_INCLUDED
+
+#if defined(__linux__) && defined(__powerpc64__)
+
+#include <altivec.h>
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef __attribute__((__aligned__(8))) unsigned long long __m64;
+
+typedef __attribute__((__aligned__(8))) union {
+  __m64 as_m64;
+  char as_char[8];
+  signed char as_signed_char[8];
+  short as_short[4];
+  int as_int[2];
+  long long as_long_long;
+  float as_float[2];
+  double as_double;
+} __m64_union;
+
+/* Empty the multimedia state.  */
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_empty(void) {
+  /* nothing to do on PowerPC.  */
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_empty(void) {
+  /* nothing to do on PowerPC.  */
+}
+
+/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi32_si64(int __i) {
+  return (__m64)(unsigned int)__i;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_from_int(int __i) {
+  return _mm_cvtsi32_si64(__i);
+}
+
+/* Convert the lower 32 bits of the __m64 object into an integer.  */
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64_si32(__m64 __i) {
+  return ((int)__i);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_to_int(__m64 __i) {
+  return _mm_cvtsi64_si32(__i);
+}
+
+/* Convert I to a __m64 object.  */
+
+/* Intel intrinsic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_from_int64(long long __i) {
+  return (__m64)__i;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64_m64(long long __i) {
+  return (__m64)__i;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64x_si64(long long __i) {
+  return (__m64)__i;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set_pi64x(long long __i) {
+  return (__m64)__i;
+}
+
+/* Convert the __m64 object to a 64bit integer.  */
+
+/* Intel intrinsic.  */
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_to_int64(__m64 __i) {
+  return (long long)__i;
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtm64_si64(__m64 __i) {
+  return (long long)__i;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64_si64x(__m64 __i) {
+  return (long long)__i;
+}
+
+#ifdef _ARCH_PWR8
+/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
+   the result, and the four 16-bit values from M2 into the upper four 8-bit
+   values of the result, all with signed saturation.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
+  __vector signed short vm1;
+  __vector signed char vresult;
+
+  vm1 = (__vector signed short)(__vector unsigned long long)
+#ifdef __LITTLE_ENDIAN__
+      {__m1, __m2};
+#else
+      {__m2, __m1};
+#endif
+  vresult = vec_packs(vm1, vm1);
+  return (__m64)((__vector long long)vresult)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_packsswb(__m64 __m1, __m64 __m2) {
+  return _mm_packs_pi16(__m1, __m2);
+}
+
+/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
+   the result, and the two 32-bit values from M2 into the upper two 16-bit
+   values of the result, all with signed saturation.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
+  __vector signed int vm1;
+  __vector signed short vresult;
+
+  vm1 = (__vector signed int)(__vector unsigned long long)
+#ifdef __LITTLE_ENDIAN__
+      {__m1, __m2};
+#else
+      {__m2, __m1};
+#endif
+  vresult = vec_packs(vm1, vm1);
+  return (__m64)((__vector long long)vresult)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_packssdw(__m64 __m1, __m64 __m2) {
+  return _mm_packs_pi32(__m1, __m2);
+}
+
+/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
+   the result, and the four 16-bit values from M2 into the upper four 8-bit
+   values of the result, all with unsigned saturation.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
+  __vector unsigned char r;
+  __vector signed short vm1 = (__vector signed short)(__vector long long)
+#ifdef __LITTLE_ENDIAN__
+      {__m1, __m2};
+#else
+      {__m2, __m1};
+#endif
+  const __vector signed short __zero = {0};
+  __vector __bool short __select = vec_cmplt(vm1, __zero);
+  r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
+  __vector __bool char packsel = vec_pack(__select, __select);
+  r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
+  return (__m64)((__vector long long)r)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_packuswb(__m64 __m1, __m64 __m2) {
+  return _mm_packs_pu16(__m1, __m2);
+}
+#endif /* end ARCH_PWR8 */
+
+/* Interleave the four 8-bit values from the high half of M1 with the four
+   8-bit values from the high half of M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR8
+  __vector unsigned char a, b, c;
+
+  a = (__vector unsigned char)vec_splats(__m1);
+  b = (__vector unsigned char)vec_splats(__m2);
+  c = vec_mergel(a, b);
+  return (__m64)((__vector long long)c)[1];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_char[0] = m1.as_char[4];
+  res.as_char[1] = m2.as_char[4];
+  res.as_char[2] = m1.as_char[5];
+  res.as_char[3] = m2.as_char[5];
+  res.as_char[4] = m1.as_char[6];
+  res.as_char[5] = m2.as_char[6];
+  res.as_char[6] = m1.as_char[7];
+  res.as_char[7] = m2.as_char[7];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_punpckhbw(__m64 __m1, __m64 __m2) {
+  return _mm_unpackhi_pi8(__m1, __m2);
+}
+
+/* Interleave the two 16-bit values from the high half of M1 with the two
+   16-bit values from the high half of M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_short[0] = m1.as_short[2];
+  res.as_short[1] = m2.as_short[2];
+  res.as_short[2] = m1.as_short[3];
+  res.as_short[3] = m2.as_short[3];
+
+  return (__m64)res.as_m64;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_punpckhwd(__m64 __m1, __m64 __m2) {
+  return _mm_unpackhi_pi16(__m1, __m2);
+}
+/* Interleave the 32-bit value from the high half of M1 with the 32-bit
+   value from the high half of M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_int[0] = m1.as_int[1];
+  res.as_int[1] = m2.as_int[1];
+
+  return (__m64)res.as_m64;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_punpckhdq(__m64 __m1, __m64 __m2) {
+  return _mm_unpackhi_pi32(__m1, __m2);
+}
+/* Interleave the four 8-bit values from the low half of M1 with the four
+   8-bit values from the low half of M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR8
+  __vector unsigned char a, b, c;
+
+  a = (__vector unsigned char)vec_splats(__m1);
+  b = (__vector unsigned char)vec_splats(__m2);
+  c = vec_mergel(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_char[0] = m1.as_char[0];
+  res.as_char[1] = m2.as_char[0];
+  res.as_char[2] = m1.as_char[1];
+  res.as_char[3] = m2.as_char[1];
+  res.as_char[4] = m1.as_char[2];
+  res.as_char[5] = m2.as_char[2];
+  res.as_char[6] = m1.as_char[3];
+  res.as_char[7] = m2.as_char[3];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_punpcklbw(__m64 __m1, __m64 __m2) {
+  return _mm_unpacklo_pi8(__m1, __m2);
+}
+/* Interleave the two 16-bit values from the low half of M1 with the two
+   16-bit values from the low half of M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_short[0] = m1.as_short[0];
+  res.as_short[1] = m2.as_short[0];
+  res.as_short[2] = m1.as_short[1];
+  res.as_short[3] = m2.as_short[1];
+
+  return (__m64)res.as_m64;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_punpcklwd(__m64 __m1, __m64 __m2) {
+  return _mm_unpacklo_pi16(__m1, __m2);
+}
+
+/* Interleave the 32-bit value from the low half of M1 with the 32-bit
+   value from the low half of M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_int[0] = m1.as_int[0];
+  res.as_int[1] = m2.as_int[0];
+
+  return (__m64)res.as_m64;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_punpckldq(__m64 __m1, __m64 __m2) {
+  return _mm_unpacklo_pi32(__m1, __m2);
+}
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_add_pi8(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR8
+  __vector signed char a, b, c;
+
+  a = (__vector signed char)vec_splats(__m1);
+  b = (__vector signed char)vec_splats(__m2);
+  c = vec_add(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
+  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
+  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
+  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
+  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
+  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
+  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
+  res.as_char[7] = m1.as_char[7] + m2.as_char[7];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_paddb(__m64 __m1, __m64 __m2) {
+  return _mm_add_pi8(__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_add_pi16(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR8
+  __vector signed short a, b, c;
+
+  a = (__vector signed short)vec_splats(__m1);
+  b = (__vector signed short)vec_splats(__m2);
+  c = vec_add(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
+  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
+  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
+  res.as_short[3] = m1.as_short[3] + m2.as_short[3];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_paddw(__m64 __m1, __m64 __m2) {
+  return _mm_add_pi16(__m1, __m2);
+}
+
+/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_add_pi32(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR9
+  __vector signed int a, b, c;
+
+  a = (__vector signed int)vec_splats(__m1);
+  b = (__vector signed int)vec_splats(__m2);
+  c = vec_add(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
+  res.as_int[1] = m1.as_int[1] + m2.as_int[1];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_paddd(__m64 __m1, __m64 __m2) {
+  return _mm_add_pi32(__m1, __m2);
+}
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR8
+  __vector signed char a, b, c;
+
+  a = (__vector signed char)vec_splats(__m1);
+  b = (__vector signed char)vec_splats(__m2);
+  c = vec_sub(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
+  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
+  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
+  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
+  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
+  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
+  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
+  res.as_char[7] = m1.as_char[7] - m2.as_char[7];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psubb(__m64 __m1, __m64 __m2) {
+  return _mm_sub_pi8(__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR8
+  __vector signed short a, b, c;
+
+  a = (__vector signed short)vec_splats(__m1);
+  b = (__vector signed short)vec_splats(__m2);
+  c = vec_sub(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
+  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
+  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
+  res.as_short[3] = m1.as_short[3] - m2.as_short[3];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psubw(__m64 __m1, __m64 __m2) {
+  return _mm_sub_pi16(__m1, __m2);
+}
+
+/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR9
+  __vector signed int a, b, c;
+
+  a = (__vector signed int)vec_splats(__m1);
+  b = (__vector signed int)vec_splats(__m2);
+  c = vec_sub(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
+  res.as_int[1] = m1.as_int[1] - m2.as_int[1];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psubd(__m64 __m1, __m64 __m2) {
+  return _mm_sub_pi32(__m1, __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_add_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 + __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sub_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 - __m2);
+}
+
+/* Shift the 64-bit value in M left by COUNT.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sll_si64(__m64 __m, __m64 __count) {
+  return (__m << __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psllq(__m64 __m, __m64 __count) {
+  return _mm_sll_si64(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_slli_si64(__m64 __m, const int __count) {
+  return (__m << __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psllqi(__m64 __m, const int __count) {
+  return _mm_slli_si64(__m, __count);
+}
+
+/* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srl_si64(__m64 __m, __m64 __count) {
+  return (__m >> __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrlq(__m64 __m, __m64 __count) {
+  return _mm_srl_si64(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srli_si64(__m64 __m, const int __count) {
+  return (__m >> __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrlqi(__m64 __m, const int __count) {
+  return _mm_srli_si64(__m, __count);
+}
+
+/* Bit-wise AND the 64-bit values in M1 and M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_and_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 & __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pand(__m64 __m1, __m64 __m2) {
+  return _mm_and_si64(__m1, __m2);
+}
+
+/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
+   64-bit value in M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
+  return (~__m1 & __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pandn(__m64 __m1, __m64 __m2) {
+  return _mm_andnot_si64(__m1, __m2);
+}
+
+/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_or_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 | __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_por(__m64 __m1, __m64 __m2) {
+  return _mm_or_si64(__m1, __m2);
+}
+
+/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_xor_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 ^ __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pxor(__m64 __m1, __m64 __m2) {
+  return _mm_xor_si64(__m1, __m2);
+}
+
+/* Creates a 64-bit zero.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_setzero_si64(void) {
+  return (__m64)0;
+}
+
+/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
+   test is true and zero if false.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
+#if defined(_ARCH_PWR6) && defined(__powerpc64__)
+  __m64 res;
+  __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
+  return (res);
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
+  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
+  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
+  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
+  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
+  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
+  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
+  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
+  return _mm_cmpeq_pi8(__m1, __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR8
+  __vector signed char a, b, c;
+
+  a = (__vector signed char)vec_splats(__m1);
+  b = (__vector signed char)vec_splats(__m2);
+  c = (__vector signed char)vec_cmpgt(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
+  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
+  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
+  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
+  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
+  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
+  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
+  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
+  return _mm_cmpgt_pi8(__m1, __m2);
+}
+
+/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
+   the test is true and zero if false.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR8
+  __vector signed short a, b, c;
+
+  a = (__vector signed short)vec_splats(__m1);
+  b = (__vector signed short)vec_splats(__m2);
+  c = (__vector signed short)vec_cmpeq(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
+  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
+  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
+  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
+  return _mm_cmpeq_pi16(__m1, __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR8
+  __vector signed short a, b, c;
+
+  a = (__vector signed short)vec_splats(__m1);
+  b = (__vector signed short)vec_splats(__m2);
+  c = (__vector signed short)vec_cmpgt(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
+  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
+  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
+  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
+  return _mm_cmpgt_pi16(__m1, __m2);
+}
+
+/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
+   the test is true and zero if false.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR9
+  __vector signed int a, b, c;
+
+  a = (__vector signed int)vec_splats(__m1);
+  b = (__vector signed int)vec_splats(__m2);
+  c = (__vector signed int)vec_cmpeq(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
+  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
+  return _mm_cmpeq_pi32(__m1, __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR9
+  __vector signed int a, b, c;
+
+  a = (__vector signed int)vec_splats(__m1);
+  b = (__vector signed int)vec_splats(__m2);
+  c = (__vector signed int)vec_cmpgt(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
+  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
+  return _mm_cmpgt_pi32(__m1, __m2);
+}
+
+#if _ARCH_PWR8
+/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
+   saturated arithmetic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
+  __vector signed char a, b, c;
+
+  a = (__vector signed char)vec_splats(__m1);
+  b = (__vector signed char)vec_splats(__m2);
+  c = vec_adds(a, b);
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_paddsb(__m64 __m1, __m64 __m2) {
+  return _mm_adds_pi8(__m1, __m2);
+}
+/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
+   saturated arithmetic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
+  __vector signed short a, b, c;
+
+  a = (__vector signed short)vec_splats(__m1);
+  b = (__vector signed short)vec_splats(__m2);
+  c = vec_adds(a, b);
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_paddsw(__m64 __m1, __m64 __m2) {
+  return _mm_adds_pi16(__m1, __m2);
+}
+/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
+   saturated arithmetic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
+  __vector unsigned char a, b, c;
+
+  a = (__vector unsigned char)vec_splats(__m1);
+  b = (__vector unsigned char)vec_splats(__m2);
+  c = vec_adds(a, b);
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_paddusb(__m64 __m1, __m64 __m2) {
+  return _mm_adds_pu8(__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
+   saturated arithmetic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
+  __vector unsigned short a, b, c;
+
+  a = (__vector unsigned short)vec_splats(__m1);
+  b = (__vector unsigned short)vec_splats(__m2);
+  c = vec_adds(a, b);
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_paddusw(__m64 __m1, __m64 __m2) {
+  return _mm_adds_pu16(__m1, __m2);
+}
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
+   saturating arithmetic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
+  __vector signed char a, b, c;
+
+  a = (__vector signed char)vec_splats(__m1);
+  b = (__vector signed char)vec_splats(__m2);
+  c = vec_subs(a, b);
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psubsb(__m64 __m1, __m64 __m2) {
+  return _mm_subs_pi8(__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
+   signed saturating arithmetic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
+  __vector signed short a, b, c;
+
+  a = (__vector signed short)vec_splats(__m1);
+  b = (__vector signed short)vec_splats(__m2);
+  c = vec_subs(a, b);
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psubsw(__m64 __m1, __m64 __m2) {
+  return _mm_subs_pi16(__m1, __m2);
+}
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
+   unsigned saturating arithmetic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
+  __vector unsigned char a, b, c;
+
+  a = (__vector unsigned char)vec_splats(__m1);
+  b = (__vector unsigned char)vec_splats(__m2);
+  c = vec_subs(a, b);
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psubusb(__m64 __m1, __m64 __m2) {
+  return _mm_subs_pu8(__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
+   unsigned saturating arithmetic.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
+  __vector unsigned short a, b, c;
+
+  a = (__vector unsigned short)vec_splats(__m1);
+  b = (__vector unsigned short)vec_splats(__m2);
+  c = vec_subs(a, b);
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psubusw(__m64 __m1, __m64 __m2) {
+  return _mm_subs_pu16(__m1, __m2);
+}
+
+/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
+   four 32-bit intermediate results, which are then summed by pairs to
+   produce two 32-bit results.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
+  __vector signed short a, b;
+  __vector signed int c;
+  __vector signed int zero = {0, 0, 0, 0};
+
+  a = (__vector signed short)vec_splats(__m1);
+  b = (__vector signed short)vec_splats(__m2);
+  c = vec_vmsumshm(a, b, zero);
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pmaddwd(__m64 __m1, __m64 __m2) {
+  return _mm_madd_pi16(__m1, __m2);
+}
+/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
+   M2 and produce the high 16 bits of the 32-bit results.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
+  __vector signed short a, b;
+  __vector signed short c;
+  __vector signed int w0, w1;
+  __vector unsigned char xform1 = {
+#ifdef __LITTLE_ENDIAN__
+      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
+      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
+      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
+#endif
+  };
+
+  a = (__vector signed short)vec_splats(__m1);
+  b = (__vector signed short)vec_splats(__m2);
+
+  w0 = vec_vmulesh(a, b);
+  w1 = vec_vmulosh(a, b);
+  c = (__vector signed short)vec_perm(w0, w1, xform1);
+
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pmulhw(__m64 __m1, __m64 __m2) {
+  return _mm_mulhi_pi16(__m1, __m2);
+}
+
+/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
+   the low 16 bits of the results.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
+  __vector signed short a, b, c;
+
+  a = (__vector signed short)vec_splats(__m1);
+  b = (__vector signed short)vec_splats(__m2);
+  c = a * b;
+  return (__m64)((__vector long long)c)[0];
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pmullw(__m64 __m1, __m64 __m2) {
+  return _mm_mullo_pi16(__m1, __m2);
+}
+
+/* Shift four 16-bit values in M left by COUNT.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sll_pi16(__m64 __m, __m64 __count) {
+  __vector signed short m, r;
+  __vector unsigned short c;
+
+  if (__count <= 15) {
+    m = (__vector signed short)vec_splats(__m);
+    c = (__vector unsigned short)vec_splats((unsigned short)__count);
+    r = vec_sl(m, (__vector unsigned short)c);
+    return (__m64)((__vector long long)r)[0];
+  } else
+    return (0);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psllw(__m64 __m, __m64 __count) {
+  return _mm_sll_pi16(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_slli_pi16(__m64 __m, int __count) {
+  /* Promote int to long then invoke mm_sll_pi16.  */
+  return _mm_sll_pi16(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psllwi(__m64 __m, int __count) {
+  return _mm_slli_pi16(__m, __count);
+}
+
+/* Shift two 32-bit values in M left by COUNT.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sll_pi32(__m64 __m, __m64 __count) {
+  __m64_union m, res;
+
+  m.as_m64 = __m;
+
+  res.as_int[0] = m.as_int[0] << __count;
+  res.as_int[1] = m.as_int[1] << __count;
+  return (res.as_m64);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pslld(__m64 __m, __m64 __count) {
+  return _mm_sll_pi32(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_slli_pi32(__m64 __m, int __count) {
+  /* Promote int to long then invoke mm_sll_pi32.  */
+  return _mm_sll_pi32(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pslldi(__m64 __m, int __count) {
+  return _mm_slli_pi32(__m, __count);
+}
+
+/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sra_pi16(__m64 __m, __m64 __count) {
+  __vector signed short m, r;
+  __vector unsigned short c;
+
+  if (__count <= 15) {
+    m = (__vector signed short)vec_splats(__m);
+    c = (__vector unsigned short)vec_splats((unsigned short)__count);
+    r = vec_sra(m, (__vector unsigned short)c);
+    return (__m64)((__vector long long)r)[0];
+  } else
+    return (0);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psraw(__m64 __m, __m64 __count) {
+  return _mm_sra_pi16(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srai_pi16(__m64 __m, int __count) {
+  /* Promote int to long then invoke mm_sra_pi32.  */
+  return _mm_sra_pi16(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrawi(__m64 __m, int __count) {
+  return _mm_srai_pi16(__m, __count);
+}
+
+/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sra_pi32(__m64 __m, __m64 __count) {
+  __m64_union m, res;
+
+  m.as_m64 = __m;
+
+  res.as_int[0] = m.as_int[0] >> __count;
+  res.as_int[1] = m.as_int[1] >> __count;
+  return (res.as_m64);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrad(__m64 __m, __m64 __count) {
+  return _mm_sra_pi32(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srai_pi32(__m64 __m, int __count) {
+  /* Promote int to long then invoke mm_sra_pi32.  */
+  return _mm_sra_pi32(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psradi(__m64 __m, int __count) {
+  return _mm_srai_pi32(__m, __count);
+}
+
+/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srl_pi16(__m64 __m, __m64 __count) {
+  __vector unsigned short m, r;
+  __vector unsigned short c;
+
+  if (__count <= 15) {
+    m = (__vector unsigned short)vec_splats(__m);
+    c = (__vector unsigned short)vec_splats((unsigned short)__count);
+    r = vec_sr(m, (__vector unsigned short)c);
+    return (__m64)((__vector long long)r)[0];
+  } else
+    return (0);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrlw(__m64 __m, __m64 __count) {
+  return _mm_srl_pi16(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srli_pi16(__m64 __m, int __count) {
+  /* Promote int to long then invoke mm_sra_pi32.  */
+  return _mm_srl_pi16(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrlwi(__m64 __m, int __count) {
+  return _mm_srli_pi16(__m, __count);
+}
+
+/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srl_pi32(__m64 __m, __m64 __count) {
+  __m64_union m, res;
+
+  m.as_m64 = __m;
+
+  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
+  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
+  return (res.as_m64);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrld(__m64 __m, __m64 __count) {
+  return _mm_srl_pi32(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srli_pi32(__m64 __m, int __count) {
+  /* Promote int to long then invoke mm_srl_pi32.  */
+  return _mm_srl_pi32(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrldi(__m64 __m, int __count) {
+  return _mm_srli_pi32(__m, __count);
+}
+#endif /* _ARCH_PWR8 */
+
+/* Creates a vector of two 32-bit values; I0 is least significant.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set_pi32(int __i1, int __i0) {
+  __m64_union res;
+
+  res.as_int[0] = __i0;
+  res.as_int[1] = __i1;
+  return (res.as_m64);
+}
+
+/* Creates a vector of four 16-bit values; W0 is least significant.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
+  __m64_union res;
+
+  res.as_short[0] = __w0;
+  res.as_short[1] = __w1;
+  res.as_short[2] = __w2;
+  res.as_short[3] = __w3;
+  return (res.as_m64);
+}
+
+/* Creates a vector of eight 8-bit values; B0 is least significant.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
+                char __b2, char __b1, char __b0) {
+  __m64_union res;
+
+  res.as_char[0] = __b0;
+  res.as_char[1] = __b1;
+  res.as_char[2] = __b2;
+  res.as_char[3] = __b3;
+  res.as_char[4] = __b4;
+  res.as_char[5] = __b5;
+  res.as_char[6] = __b6;
+  res.as_char[7] = __b7;
+  return (res.as_m64);
+}
+
+/* Similar, but with the arguments in reverse order.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_setr_pi32(int __i0, int __i1) {
+  __m64_union res;
+
+  res.as_int[0] = __i0;
+  res.as_int[1] = __i1;
+  return (res.as_m64);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
+  return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
+                 char __b5, char __b6, char __b7) {
+  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+/* Creates a vector of two 32-bit values, both elements containing I.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi32(int __i) {
+  __m64_union res;
+
+  res.as_int[0] = __i;
+  res.as_int[1] = __i;
+  return (res.as_m64);
+}
+
+/* Creates a vector of four 16-bit values, all elements containing W.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi16(short __w) {
+#if _ARCH_PWR9
+  __vector signed short w;
+
+  w = (__vector signed short)vec_splats(__w);
+  return (__m64)((__vector long long)w)[0];
+#else
+  __m64_union res;
+
+  res.as_short[0] = __w;
+  res.as_short[1] = __w;
+  res.as_short[2] = __w;
+  res.as_short[3] = __w;
+  return (res.as_m64);
+#endif
+}
+
+/* Creates a vector of eight 8-bit values, all elements containing B.  */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi8(signed char __b) {
+#if _ARCH_PWR8
+  __vector signed char b;
+
+  b = (__vector signed char)vec_splats(__b);
+  return (__m64)((__vector long long)b)[0];
+#else
+  __m64_union res;
+
+  res.as_char[0] = __b;
+  res.as_char[1] = __b;
+  res.as_char[2] = __b;
+  res.as_char[3] = __b;
+  res.as_char[4] = __b;
+  res.as_char[5] = __b;
+  res.as_char[6] = __b;
+  res.as_char[7] = __b;
+  return (res.as_m64);
+#endif
+}
+
+#else
+#include_next <mmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
+#endif /* _MMINTRIN_H_INCLUDED */
diff --git a/onnxruntime/core/mlas/lib/ppc64/pmmintrin.h b/onnxruntime/core/mlas/lib/ppc64/pmmintrin.h
new file mode 100644
index 0000000000000..f50776541a263
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/ppc64/pmmintrin.h
@@ -0,0 +1,150 @@
+/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.
+
+   In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
+   is a good match for most SIMD operations.  However the Horizontal
+   add/sub requires the data pairs be permuted into a separate
+   registers with vertical even/odd alignment for the operation.
+   And the addsub operation requires the sign of only the even numbered
+   elements be flipped (xored with -0.0).
+   For larger blocks of code using these intrinsic implementations,
+   the compiler be should be able to schedule instructions to avoid
+   additional latency.
+
+   In the specific case of the monitor and mwait instructions there are
+   no direct equivalent in the PowerISA at this time.  So those
+   intrinsics are not implemented.  */
+#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this warning."
+#endif
+
+#ifndef PMMINTRIN_H_
+#define PMMINTRIN_H_
+
+#if defined(__linux__) && defined(__powerpc64__)
+
+/* We need definitions from the SSE2 and SSE header files*/
+#include "emmintrin.h"
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_ps (__m128 __X, __m128 __Y)
+{
+  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
+  __v4sf even_neg_Y = vec_xor(__Y, even_n0);
+  return (__m128) vec_add (__X, even_neg_Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_pd (__m128d __X, __m128d __Y)
+{
+  const __v2df even_n0 = {-0.0, 0.0};
+  __v2df even_neg_Y = vec_xor(__Y, even_n0);
+  return (__m128d) vec_add (__X, even_neg_Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_ps (__m128 __X, __m128 __Y)
+{
+  __vector unsigned char xform2 = {
+      0x00, 0x01, 0x02, 0x03,
+      0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13,
+      0x18, 0x19, 0x1A, 0x1B
+    };
+  __vector unsigned char xform1 = {
+      0x04, 0x05, 0x06, 0x07,
+      0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17,
+      0x1C, 0x1D, 0x1E, 0x1F
+    };
+  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
+			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_ps (__m128 __X, __m128 __Y)
+{
+  __vector unsigned char xform2 = {
+      0x00, 0x01, 0x02, 0x03,
+      0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13,
+      0x18, 0x19, 0x1A, 0x1B
+    };
+  __vector unsigned char xform1 = {
+      0x04, 0x05, 0x06, 0x07,
+      0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17,
+      0x1C, 0x1D, 0x1E, 0x1F
+    };
+  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
+			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
+				  vec_mergel ((__v2df) __X, (__v2df)__Y));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
+			    vec_mergel ((__v2df) __X, (__v2df)__Y));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehdup_ps (__m128 __X)
+{
+  return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_moveldup_ps (__m128 __X)
+{
+  return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loaddup_pd (double const *__P)
+{
+  return (__m128d) vec_splats (*__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movedup_pd (__m128d __X)
+{
+  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lddqu_si128 (__m128i const *__P)
+{
+  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
+}
+
+/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait.  */
+
+#else
+#include_next <pmmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
+#endif /* PMMINTRIN_H_ */
diff --git a/onnxruntime/core/mlas/lib/ppc64/smmintrin.h b/onnxruntime/core/mlas/lib/ppc64/smmintrin.h
new file mode 100644
index 0000000000000..56ef6ba76b06e
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/ppc64/smmintrin.h
@@ -0,0 +1,85 @@
+/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.
+
+   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerp64/powerpc64le.
+
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+#error                                                                         \
+    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef SMMINTRIN_H_
+#define SMMINTRIN_H_
+
+#if defined(__linux__) && defined(__ppc64__)
+
+#include <altivec.h>
+#include <emmintrin.h>
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_extract_epi8(__m128i __X, const int __N) {
+  return (unsigned char)((__v16qi)__X)[__N & 15];
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_extract_epi32(__m128i __X, const int __N) {
+  return ((__v4si)__X)[__N & 3];
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_extract_epi64(__m128i __X, const int __N) {
+  return ((__v2di)__X)[__N & 1];
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_extract_ps(__m128 __X, const int __N) {
+  return ((__v4si)__X)[__N & 3];
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
+  __v16qi __charmask = vec_splats((signed char)__imm8);
+  __charmask = vec_gb(__charmask);
+  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
+#ifdef __BIG_ENDIAN__
+  __shortmask = vec_reve(__shortmask);
+#endif
+  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
+  const __v16qu __seven = vec_splats((unsigned char)0x07);
+  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
+  return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
+}
+
+#else
+#include_next <smmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
+#endif /* _SMMINTRIN_H_ */
diff --git a/onnxruntime/core/mlas/lib/ppc64/tmmintrin.h b/onnxruntime/core/mlas/lib/ppc64/tmmintrin.h
new file mode 100644
index 0000000000000..c361bf098e4f5
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/ppc64/tmmintrin.h
@@ -0,0 +1,495 @@
+/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+#endif
+
+#ifndef TMMINTRIN_H_
+#define TMMINTRIN_H_
+
+#if defined(__linux__) && defined(__powerpc64__)
+
+#include <altivec.h>
+
+/* We need definitions from the SSE header files.  */
+#include "pmmintrin.h"
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi16 (__m128i __A)
+{
+  return (__m128i) vec_abs ((__v8hi) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi32 (__m128i __A)
+{
+  return (__m128i) vec_abs ((__v4si) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi8 (__m128i __A)
+{
+  return (__m128i) vec_abs ((__v16qi) __A);
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi16 (__m64 __A)
+{
+  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
+  return (__m64) ((__v2du) vec_abs (__B))[0];
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi32 (__m64 __A)
+{
+  __v4si __B = (__v4si) (__v2du) { __A, __A };
+  return (__m64) ((__v2du) vec_abs (__B))[0];
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi8 (__m64 __A)
+{
+  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
+  return (__m64) ((__v2du) vec_abs (__B))[0];
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
+{
+  if (__builtin_constant_p (__count) && __count < 16)
+    {
+#ifdef __LITTLE_ENDIAN__
+      __A = (__m128i) vec_reve ((__v16qu) __A);
+      __B = (__m128i) vec_reve ((__v16qu) __B);
+#endif
+      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
+#ifdef __LITTLE_ENDIAN__
+      __A = (__m128i) vec_reve ((__v16qu) __A);
+#endif
+      return __A;
+    }
+
+  if (__count == 0)
+    return __B;
+
+  if (__count >= 16)
+    {
+      if (__count >= 32)
+	{
+	  const __v16qu zero = { 0 };
+	  return (__m128i) zero;
+	}
+      else
+	{
+	  const __v16qu __shift =
+	    vec_splats ((unsigned char) ((__count - 16) * 8));
+#ifdef __LITTLE_ENDIAN__
+	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
+#else
+	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
+#endif
+	}
+    }
+  else
+    {
+      const __v16qu __shiftA =
+	vec_splats ((unsigned char) ((16 - __count) * 8));
+      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
+#ifdef __LITTLE_ENDIAN__
+      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
+      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
+#else
+      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
+      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
+#endif
+      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
+    }
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
+{
+  if (__count < 16)
+    {
+      __v2du __C = { __B, __A };
+#ifdef __LITTLE_ENDIAN__
+      const __v4su __shift = { __count << 3, 0, 0, 0 };
+      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
+#else
+      const __v4su __shift = { 0, 0, 0, __count << 3 };
+      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
+#endif
+      return (__m64) __C[0];
+    }
+  else
+    {
+      const __m64 __zero = { 0 };
+      return __zero;
+    }
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi16 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P =
+    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __Q =
+    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
+  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
+  return (__m128i) vec_add (__C, __D);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi32 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P =
+    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
+  const __v16qu __Q =
+    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
+  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
+  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
+  return (__m128i) vec_add (__C, __D);
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi16 (__m64 __A, __m64 __B)
+{
+  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
+  const __v16qu __P =
+    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
+  const __v16qu __Q =
+    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
+  __v8hi __D = vec_perm (__C, __C, __Q);
+  __C = vec_perm (__C, __C, __P);
+  __C = vec_add (__C, __D);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi32 (__m64 __A, __m64 __B)
+{
+  __v4si __C = (__v4si) (__v2du) { __A, __B };
+  const __v16qu __P =
+    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
+  const __v16qu __Q =
+    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
+  __v4si __D = vec_perm (__C, __C, __Q);
+  __C = vec_perm (__C, __C, __P);
+  __C = vec_add (__C, __D);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_epi16 (__m128i __A, __m128i __B)
+{
+  __v4si __C = { 0 }, __D = { 0 };
+  __C = vec_sum4s ((__v8hi) __A, __C);
+  __D = vec_sum4s ((__v8hi) __B, __D);
+  __C = (__v4si) vec_packs (__C, __D);
+  return (__m128i) __C;
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_pi16 (__m64 __A, __m64 __B)
+{
+  const __v4si __zero = { 0 };
+  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
+  __v4si __D = vec_sum4s (__C, __zero);
+  __C = vec_packs (__D, __D);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi16 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P =
+    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __Q =
+    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
+  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
+  return (__m128i) vec_sub (__C, __D);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi32 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P =
+    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
+  const __v16qu __Q =
+    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
+  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
+  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
+  return (__m128i) vec_sub (__C, __D);
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi16 (__m64 __A, __m64 __B)
+{
+  const __v16qu __P =
+    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
+  const __v16qu __Q =
+    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
+  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
+  __v8hi __D = vec_perm (__C, __C, __Q);
+  __C = vec_perm (__C, __C, __P);
+  __C = vec_sub (__C, __D);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi32 (__m64 __A, __m64 __B)
+{
+  const __v16qu __P =
+    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
+  const __v16qu __Q =
+    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
+  __v4si __C = (__v4si) (__v2du) { __A, __B };
+  __v4si __D = vec_perm (__C, __C, __Q);
+  __C = vec_perm (__C, __C, __P);
+  __C = vec_sub (__C, __D);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_epi16 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P =
+    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __Q =
+    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
+  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
+  return (__m128i) vec_subs (__C, __D);
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_pi16 (__m64 __A, __m64 __B)
+{
+  const __v16qu __P =
+    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
+  const __v16qu __Q =
+    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
+  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
+  __v8hi __D = vec_perm (__C, __C, __P);
+  __v8hi __E = vec_perm (__C, __C, __Q);
+  __C = vec_subs (__D, __E);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi8 (__m128i __A, __m128i __B)
+{
+  const __v16qi __zero = { 0 };
+  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
+  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
+  return (__m128i) vec_sel (__C, __zero, __select);
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi8 (__m64 __A, __m64 __B)
+{
+  const __v16qi __zero = { 0 };
+  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
+  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
+  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
+  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
+  __C = vec_sel (__C, __zero, __select);
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi8 (__m128i __A, __m128i __B)
+{
+  const __v16qi __zero = { 0 };
+  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
+  __v16qi __selectpos =
+    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
+  __v16qi __conv = vec_add (__selectneg, __selectpos);
+  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi16 (__m128i __A, __m128i __B)
+{
+  const __v8hi __zero = { 0 };
+  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
+  __v8hi __selectpos =
+    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
+  __v8hi __conv = vec_add (__selectneg, __selectpos);
+  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi32 (__m128i __A, __m128i __B)
+{
+  const __v4si __zero = { 0 };
+  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
+  __v4si __selectpos =
+    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
+  __v4si __conv = vec_add (__selectneg, __selectpos);
+  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi8 (__m64 __A, __m64 __B)
+{
+  const __v16qi __zero = { 0 };
+  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
+  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
+  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi16 (__m64 __A, __m64 __B)
+{
+  const __v8hi __zero = { 0 };
+  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
+  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
+  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi32 (__m64 __A, __m64 __B)
+{
+  const __v4si __zero = { 0 };
+  __v4si __C = (__v4si) (__v2du) { __A, __A };
+  __v4si __D = (__v4si) (__v2du) { __B, __B };
+  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_epi16 (__m128i __A, __m128i __B)
+{
+  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
+  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
+  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
+  __v8hi __E = vec_unpackh ((__v16qi) __B);
+  __v8hi __F = vec_unpackl ((__v16qi) __B);
+  __C = vec_mul (__C, __E);
+  __D = vec_mul (__D, __F);
+  const __v16qu __odds  =
+    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __evens =
+    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  __E = vec_perm (__C, __D, __odds);
+  __F = vec_perm (__C, __D, __evens);
+  return (__m128i) vec_adds (__E, __F);
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_pi16 (__m64 __A, __m64 __B)
+{
+  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
+  __C = vec_unpackl ((__v16qi) __C);
+  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
+  __C = vec_and (__C, __unsigned);
+  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
+  __D = vec_unpackl ((__v16qi) __D);
+  __D = vec_mul (__C, __D);
+  const __v16qu __odds  =
+    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __evens =
+    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  __C = vec_perm (__D, __D, __odds);
+  __D = vec_perm (__D, __D, __evens);
+  __C = vec_adds (__C, __D);
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
+{
+  __v4si __C = vec_unpackh ((__v8hi) __A);
+  __v4si __D = vec_unpackh ((__v8hi) __B);
+  __C = vec_mul (__C, __D);
+  __D = vec_unpackl ((__v8hi) __A);
+  __v4si __E = vec_unpackl ((__v8hi) __B);
+  __D = vec_mul (__D, __E);
+  const __v4su __shift = vec_splats ((unsigned int) 14);
+  __C = vec_sr (__C, __shift);
+  __D = vec_sr (__D, __shift);
+  const __v4si __ones = vec_splats ((signed int) 1);
+  __C = vec_add (__C, __ones);
+  __C = vec_sr (__C, (__v4su) __ones);
+  __D = vec_add (__D, __ones);
+  __D = vec_sr (__D, (__v4su) __ones);
+  return (__m128i) vec_pack (__C, __D);
+}
+
+extern __inline __m64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
+{
+  __v4si __C = (__v4si) (__v2du) { __A, __A };
+  __C = vec_unpackh ((__v8hi) __C);
+  __v4si __D = (__v4si) (__v2du) { __B, __B };
+  __D = vec_unpackh ((__v8hi) __D);
+  __C = vec_mul (__C, __D);
+  const __v4su __shift = vec_splats ((unsigned int) 14);
+  __C = vec_sr (__C, __shift);
+  const __v4si __ones = vec_splats ((signed int) 1);
+  __C = vec_add (__C, __ones);
+  __C = vec_sr (__C, (__v4su) __ones);
+  __v8hi __E = vec_pack (__C, __D);
+  return (__m64) ((__v2du) (__E))[0];
+}
+
+#else
+#include_next <tmmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
+#endif /* TMMINTRIN_H_ */
diff --git a/onnxruntime/core/mlas/lib/ppc64/xmmintrin.h b/onnxruntime/core/mlas/lib/ppc64/xmmintrin.h
new file mode 100644
index 0000000000000..3f0dbece9baad
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/ppc64/xmmintrin.h
@@ -0,0 +1,1844 @@
+/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header file is to help porting code using Intel intrinsics
+   explicitly from x86_64 to powerpc64/powerpc64le.
+
+   Since X86 SSE intrinsics mainly handles __m128 type, PowerPC
+   VMX/VSX ISA is a good match for vector float SIMD operations.
+   However scalar float operations in vector (XMM) registers require
+   the POWER8 VSX ISA (2.07) level. There are differences for data
+   format and placement of float scalars in the vector register, which
+   require extra steps to match SSE scalar float semantics on POWER.
+
+   It should be noted that there's much difference between X86_64's
+   MXSCR and PowerISA's FPSCR/VSCR registers. It's recommended to use
+   portable <fenv.h> instead of access MXSCR directly.
+
+   Most SSE scalar float intrinsic operations can be performed more
+   efficiently as C language float scalar operations or optimized to
+   use vector SIMD operations. We recommend this for new applications. */
+#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef _XMMINTRIN_H_INCLUDED
+#define _XMMINTRIN_H_INCLUDED
+
+#if defined(__linux__) && defined(__powerpc64__)
+
+/* Define four value permute mask */
+#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
+
+#include <altivec.h>
+
+/* Avoid collisions between altivec.h and strict adherence to C++ and
+   C11 standards.  This should eventually be done inside altivec.h itself,
+   but only after testing a full distro build.  */
+#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
+				 (defined(__STDC_VERSION__) &&	\
+				  __STDC_VERSION__ >= 201112L))
+#undef vector
+#undef pixel
+#undef bool
+#endif
+
+/* We need type definitions from the MMX header file.  */
+#include "mmintrin.h"
+
+/* Get _mm_malloc () and _mm_free ().  */
+#if __STDC_HOSTED__
+#include "mm_malloc.h"
+#endif
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Unaligned version of the same type.  */
+typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
+				       __aligned__ (1)));
+
+/* Internal data types for implementing the intrinsics.  */
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+/* Create an undefined vector.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ps (void)
+{
+  __m128 __Y = __Y;
+  return __Y;
+}
+
+/* Create a vector of zeros.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ps (void)
+{
+  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
+}
+
+/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps (float const *__P)
+{
+  return ((__m128)vec_ld(0, (__v4sf*)__P));
+}
+
+/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ps (float const *__P)
+{
+  return (vec_vsx_ld(0, __P));
+}
+
+/* Load four SPFP values in reverse order.  The address must be aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_ps (float const *__P)
+{
+  __v4sf   __tmp;
+  __m128 result;
+  static const __vector unsigned char permute_vector =
+    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
+	0x17, 0x10, 0x11, 0x12, 0x13 };
+
+  __tmp = vec_ld (0, (__v4sf *) __P);
+  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
+  return result;
+}
+
+/* Create a vector with all four elements equal to F.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ps (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps1 (float __F)
+{
+  return _mm_set1_ps (__F);
+}
+
+/* Create the vector [Z Y X W].  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
+}
+
+/* Create the vector [W X Y Z].  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ps (float __Z, float __Y, float __X, float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
+}
+
+/* Store four SPFP values.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps (float *__P, __m128 __A)
+{
+  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
+}
+
+/* Store four SPFP values.  The address need not be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ps (float *__P, __m128 __A)
+{
+  *(__m128_u *)__P = __A;
+}
+
+/* Store four SPFP values in reverse order.  The address must be aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storer_ps (float *__P, __m128 __A)
+{
+  __v4sf   __tmp;
+  static const __vector unsigned char permute_vector =
+    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
+	0x17, 0x10, 0x11, 0x12, 0x13 };
+
+  __tmp = (__m128) vec_perm (__A, __A, permute_vector);
+
+  _mm_store_ps (__P, __tmp);
+}
+
+/* Store the lower SPFP value across four words.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store1_ps (float *__P, __m128 __A)
+{
+  __v4sf __va = vec_splat((__v4sf)__A, 0);
+  _mm_store_ps (__P, __va);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps1 (float *__P, __m128 __A)
+{
+  _mm_store1_ps (__P, __A);
+}
+
+/* Create a vector with element 0 as F and the rest zero.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ss (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
+}
+
+/* Sets the low SPFP value of A from the low value of B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+
+  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ss (float const *__P)
+{
+  return _mm_set_ss (*__P);
+}
+
+/* Stores the lower SPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ss (float *__P, __m128 __A)
+{
+  *__P = ((__v4sf)__A)[0];
+}
+
+/* Perform the respective operation on the lower SPFP (single-precision
+   floating-point) values of A and B; the upper three SPFP values are
+   passed through from A.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ss (__m128 __A, __m128 __B)
+{
+#ifdef _ARCH_PWR7
+  __m128 a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+     results. So to insure we don't generate spurious exceptions
+     (from the upper double values) we splat the lower double
+     before we to the operation.  */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = a + b;
+  /* Then we merge the lower float result with the original upper
+     float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+#else
+  __A[0] = __A[0] + __B[0];
+  return (__A);
+#endif
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ss (__m128 __A, __m128 __B)
+{
+#ifdef _ARCH_PWR7
+  __m128 a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+     results. So to insure we don't generate spurious exceptions
+     (from the upper double values) we splat the lower double
+     before we to the operation.  */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = a - b;
+  /* Then we merge the lower float result with the original upper
+     float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+#else
+  __A[0] = __A[0] - __B[0];
+  return (__A);
+#endif
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ss (__m128 __A, __m128 __B)
+{
+#ifdef _ARCH_PWR7
+  __m128 a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+     results. So to insure we don't generate spurious exceptions
+     (from the upper double values) we splat the lower double
+     before we to the operation.  */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = a * b;
+  /* Then we merge the lower float result with the original upper
+     float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+#else
+  __A[0] = __A[0] * __B[0];
+  return (__A);
+#endif
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ss (__m128 __A, __m128 __B)
+{
+#ifdef _ARCH_PWR7
+  __m128 a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+     results. So to insure we don't generate spurious exceptions
+     (from the upper double values) we splat the lower double
+     before we to the operation.  */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = a / b;
+  /* Then we merge the lower float result with the original upper
+     float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+#else
+  __A[0] = __A[0] / __B[0];
+  return (__A);
+#endif
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ss (__m128 __A)
+{
+  __m128 a, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper double values) we splat the lower double
+   * before we to the operation. */
+  a = vec_splat (__A, 0);
+  c = vec_sqrt (a);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+}
+
+/* Perform the respective operation on the four SPFP values in A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A + (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A - (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A * (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A / (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ps (__m128 __A)
+{
+  return (vec_sqrt ((__v4sf)__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ps (__m128 __A)
+{
+  return (vec_re ((__v4sf)__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ps (__m128 __A)
+{
+  return (vec_rsqrte (__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ss (__m128 __A)
+{
+  __m128 a, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper double values) we splat the lower double
+   * before we to the operation. */
+  a = vec_splat (__A, 0);
+  c = _mm_rcp_ps (a);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ss (__m128 __A)
+{
+  __m128 a, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper double values) we splat the lower double
+   * before we to the operation. */
+  a = vec_splat (__A, 0);
+  c = vec_rsqrte (a);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ss (__m128 __A, __m128 __B)
+{
+  __v4sf a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower float)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper float values) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf)__A, 0);
+  b = vec_splat ((__v4sf)__B, 0);
+  c = vec_min (a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ss (__m128 __A, __m128 __B)
+{
+  __v4sf a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower float)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper float values) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = vec_max (a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ps (__m128 __A, __m128 __B)
+{
+  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
+  return vec_sel (__B, __A, m);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ps (__m128 __A, __m128 __B)
+{
+  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
+  return vec_sel (__B, __A, m);
+}
+
+/* Perform logical bit-wise operations on 128-bit values.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
+//  return __builtin_ia32_andps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
+}
+
+/* Perform a comparison on the four SPFP values of A and B.  For each
+   element, if the comparison is true, place a mask of all ones in the
+   result, otherwise a mask of zeros.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ps (__m128  __A, __m128  __B)
+{
+  __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
+  return ((__m128)vec_nor (temp, temp));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ps (__m128  __A, __m128  __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
+  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
+  return ((__m128 ) vec_and (c, d));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ps (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
+  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
+  return ((__m128 ) vec_or (c, d));
+}
+
+/* Perform a comparison on the lower SPFP values of A and B.  If the
+   comparison is true, place a mask of all ones in the result, otherwise a
+   mask of zeros.  The upper three SPFP values are passed through from A.  */
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ss (__m128  __A, __m128  __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpeq(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmplt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmple(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpgt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpge(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpeq(a, b);
+  c = vec_nor (c, c);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpge(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpgt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmple(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we do the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmplt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ss (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
+  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
+  c = vec_and (c, d);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ss (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
+  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
+  c = vec_or (c, d);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
+}
+
+/* Compare the lower SPFP values of A and B and return 1 if true
+   and 0 if false.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+/* FIXME
+ * The __mm_ucomi??_ss implementations below are exactly the same as
+ * __mm_comi??_ss because GCC for PowerPC only generates unordered
+ * compares (scalar and vector).
+ * Technically __mm_comieq_ss et al should be using the ordered
+ * compare and signal for QNaNs.
+ * The __mm_ucomieq_sd et all should be OK, as is.
+ */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_f32 (__m128 __A)
+{
+  return ((__v4sf)__A)[0];
+}
+
+/* Convert the lower SPFP value to a 32-bit integer according to the current
+   rounding mode.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si32 (__m128 __A)
+{
+  __m64 res = 0;
+#ifdef _ARCH_PWR8
+  double dtmp;
+  __asm__(
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
+      "fctiw  %2,%2;\n"
+      "mfvsrd  %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
+        "=f" (dtmp)
+      : );
+#else
+  res = __builtin_rint(__A[0]);
+#endif
+  return (res);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ss2si (__m128 __A)
+{
+  return _mm_cvtss_si32 (__A);
+}
+
+/* Convert the lower SPFP value to a 32-bit integer according to the
+   current rounding mode.  */
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64 (__m128 __A)
+{
+  __m64 res = 0;
+#ifdef _ARCH_PWR8
+  double dtmp;
+  __asm__(
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
+      "fctid  %2,%2;\n"
+      "mfvsrd  %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
+        "=f" (dtmp)
+      : );
+#else
+  res = __builtin_llrint(__A[0]);
+#endif
+  return (res);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64x (__m128 __A)
+{
+  return _mm_cvtss_si64 ((__v4sf) __A);
+}
+
+/* Constants for use with _mm_prefetch.  */
+enum _mm_hint
+{
+  /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
+  _MM_HINT_ET0 = 7,
+  _MM_HINT_ET1 = 6,
+  _MM_HINT_T0 = 3,
+  _MM_HINT_T1 = 2,
+  _MM_HINT_T2 = 1,
+  _MM_HINT_NTA = 0
+};
+
+/* Loads one cache line from address P to a location "closer" to the
+   processor.  The selector I specifies the type of prefetch operation.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_prefetch (const void *__P, enum _mm_hint __I)
+{
+  /* Current PowerPC will ignores the hint parameters.  */
+  __builtin_prefetch (__P);
+}
+
+/* Convert the two lower SPFP values to 32-bit integers according to the
+   current rounding mode.  Return the integers in packed form.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi32 (__m128 __A)
+{
+  /* Splat two lower SPFP values to both halves.  */
+  __v4sf temp, rounded;
+  __vector unsigned long long result;
+
+  /* Splat two lower SPFP values to both halves.  */
+  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
+  rounded = vec_rint(temp);
+  result = (__vector unsigned long long) vec_cts (rounded, 0);
+
+  return (__m64) ((__vector long long) result)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ps2pi (__m128 __A)
+{
+  return _mm_cvtps_pi32 (__A);
+}
+
+/* Truncate the lower SPFP value to a 32-bit integer.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si32 (__m128 __A)
+{
+  /* Extract the lower float element.  */
+  float temp = __A[0];
+  /* truncate to 32-bit integer and return.  */
+  return temp;
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ss2si (__m128 __A)
+{
+  return _mm_cvttss_si32 (__A);
+}
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64 (__m128 __A)
+{
+  /* Extract the lower float element.  */
+  float temp = __A[0];
+  /* truncate to 32-bit integer and return.  */
+  return temp;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64x (__m128 __A)
+{
+  /* Extract the lower float element.  */
+  float temp = __A[0];
+  /* truncate to 32-bit integer and return.  */
+  return temp;
+}
+
+/* Truncate the two lower SPFP values to 32-bit integers.  Return the
+   integers in packed form.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_pi32 (__m128 __A)
+{
+  __v4sf temp;
+  __vector unsigned long long result;
+
+  /* Splat two lower SPFP values to both halves.  */
+  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
+  result = (__vector unsigned long long) vec_cts (temp, 0);
+
+  return (__m64) ((__vector long long) result)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ps2pi (__m128 __A)
+{
+  return _mm_cvttps_pi32 (__A);
+}
+
+/* Convert B to a SPFP value and insert it as element zero in A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_ss (__m128 __A, int __B)
+{
+  float temp = __B;
+  __A[0] = temp;
+
+  return __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_si2ss (__m128 __A, int __B)
+{
+  return _mm_cvtsi32_ss (__A, __B);
+}
+
+/* Convert B to a SPFP value and insert it as element zero in A.  */
+/* Intel intrinsic.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_ss (__m128 __A, long long __B)
+{
+  float temp = __B;
+  __A[0] = temp;
+
+  return __A;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_ss (__m128 __A, long long __B)
+{
+  return _mm_cvtsi64_ss (__A, __B);
+}
+
+/* Convert the two 32-bit values in B to SPFP form and insert them
+   as the two lower elements in A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_ps (__m128        __A, __m64        __B)
+{
+  __vector signed int vm1;
+  __vector float vf1;
+
+  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
+  vf1 = (__vector float) vec_ctf (vm1, 0);
+
+  return ((__m128) (__vector unsigned long long)
+    { ((__vector unsigned long long)vf1) [0],
+	((__vector unsigned long long)__A) [1]});
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_pi2ps (__m128 __A, __m64 __B)
+{
+  return _mm_cvtpi32_ps (__A, __B);
+}
+
+/* Convert the four signed 16-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi16_ps (__m64 __A)
+{
+  __vector signed short vs8;
+  __vector signed int vi4;
+  __vector float vf1;
+
+  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
+  vi4 = vec_vupklsh (vs8);
+  vf1 = (__vector float) vec_ctf (vi4, 0);
+
+  return (__m128) vf1;
+}
+
+/* Convert the four unsigned 16-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpu16_ps (__m64 __A)
+{
+  const __vector unsigned short zero =
+    { 0, 0, 0, 0, 0, 0, 0, 0 };
+  __vector unsigned short vs8;
+  __vector unsigned int vi4;
+  __vector float vf1;
+
+  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
+  vi4 = (__vector unsigned int) vec_mergel
+#ifdef __LITTLE_ENDIAN__
+                                           (vs8, zero);
+#else
+                                           (zero, vs8);
+#endif
+  vf1 = (__vector float) vec_ctf (vi4, 0);
+
+  return (__m128) vf1;
+}
+
+/* Convert the low four signed 8-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi8_ps (__m64 __A)
+{
+  __vector signed char vc16;
+  __vector signed short vs8;
+  __vector signed int vi4;
+  __vector float vf1;
+
+  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
+  vs8 = vec_vupkhsb (vc16);
+  vi4 = vec_vupkhsh (vs8);
+  vf1 = (__vector float) vec_ctf (vi4, 0);
+
+  return (__m128) vf1;
+}
+
+/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+_mm_cvtpu8_ps (__m64  __A)
+{
+  const __vector unsigned char zero =
+    { 0, 0, 0, 0, 0, 0, 0, 0 };
+  __vector unsigned char vc16;
+  __vector unsigned short vs8;
+  __vector unsigned int vi4;
+  __vector float vf1;
+
+  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
+#ifdef __LITTLE_ENDIAN__
+  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
+  vi4 = (__vector unsigned int) vec_mergeh (vs8,
+					    (__vector unsigned short) zero);
+#else
+  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
+  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
+                                            vs8);
+#endif
+  vf1 = (__vector float) vec_ctf (vi4, 0);
+
+  return (__m128) vf1;
+}
+
+/* Convert the four signed 32-bit values in A and B to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
+{
+  __vector signed int vi4;
+  __vector float vf4;
+
+  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
+  vf4 = (__vector float) vec_ctf (vi4, 0);
+  return (__m128) vf4;
+}
+
+/* Convert the four SPFP values in A to four signed 16-bit integers.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi16 (__m128 __A)
+{
+  __v4sf rounded;
+  __vector signed int temp;
+  __vector unsigned long long result;
+
+  rounded = vec_rint(__A);
+  temp = vec_cts (rounded, 0);
+  result = (__vector unsigned long long) vec_pack (temp, temp);
+
+  return (__m64) ((__vector long long) result)[0];
+}
+
+/* Convert the four SPFP values in A to four signed 8-bit integers.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi8 (__m128 __A)
+{
+  __v4sf rounded;
+  __vector signed int tmp_i;
+  static const __vector signed int zero = {0, 0, 0, 0};
+  __vector signed short tmp_s;
+  __vector signed char res_v;
+
+  rounded = vec_rint(__A);
+  tmp_i = vec_cts (rounded, 0);
+  tmp_s = vec_pack (tmp_i, zero);
+  res_v = vec_pack (tmp_s, tmp_s);
+  return (__m64) ((__vector long long) res_v)[0];
+}
+
+/* Selects four specific SPFP values from A and B based on MASK.  */
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+_mm_shuffle_ps (__m128  __A, __m128  __B, int const __mask)
+{
+  unsigned long element_selector_10 = __mask & 0x03;
+  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
+  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
+  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
+  static const unsigned int permute_selectors[4] =
+    {
+#ifdef __LITTLE_ENDIAN__
+      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+#else
+      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
+#endif
+    };
+  __vector unsigned int t;
+
+  t[0] = permute_selectors[element_selector_10];
+  t[1] = permute_selectors[element_selector_32];
+  t[2] = permute_selectors[element_selector_54] + 0x10101010;
+  t[3] = permute_selectors[element_selector_76] + 0x10101010;
+  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
+}
+
+/* Selects and interleaves the upper two SPFP values from A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
+}
+
+/* Selects and interleaves the lower two SPFP values from A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
+}
+
+/* Sets the upper two SPFP values with 64-bits of data loaded from P;
+   the lower two values are passed through from A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadh_pi (__m128 __A, __m64 const *__P)
+{
+  __vector unsigned long long __a = (__vector unsigned long long)__A;
+  __vector unsigned long long __p = vec_splats(*__P);
+  __a [1] = __p [1];
+
+  return (__m128)__a;
+}
+
+/* Stores the upper two SPFP values of A into P.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeh_pi (__m64 *__P, __m128 __A)
+{
+  __vector unsigned long long __a = (__vector unsigned long long) __A;
+
+  *__P = __a[1];
+}
+
+/* Moves the upper two values of B into the lower two values of A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehl_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) vec_mergel ((__vector unsigned long long)__B,
+			      (__vector unsigned long long)__A);
+}
+
+/* Moves the lower two values of B into the upper two values of A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movelh_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
+			      (__vector unsigned long long)__B);
+}
+
+/* Sets the lower two SPFP values with 64-bits of data loaded from P;
+   the upper two values are passed through from A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_pi (__m128 __A, __m64 const *__P)
+{
+  __vector unsigned long long __a = (__vector unsigned long long)__A;
+  __vector unsigned long long __p = vec_splats(*__P);
+  __a [0] = __p [0];
+
+  return (__m128)__a;
+}
+
+/* Stores the lower two SPFP values of A into P.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_pi (__m64 *__P, __m128 __A)
+{
+  __vector unsigned long long __a = (__vector unsigned long long) __A;
+
+  *__P = __a[0];
+}
+
+#ifdef _ARCH_PWR8
+/* Intrinsic functions that require PowerISA 2.07 minimum.  */
+
+/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_ps (__m128  __A)
+{
+  __vector unsigned long long result;
+  static const __vector unsigned int perm_mask =
+    {
+#ifdef __LITTLE_ENDIAN__
+	0x00204060, 0x80808080, 0x80808080, 0x80808080
+#else
+      0x80808080, 0x80808080, 0x80808080, 0x00204060
+#endif
+    };
+
+  result = ((__vector unsigned long long)
+	    vec_vbpermq ((__vector unsigned char) __A,
+			 (__vector unsigned char) perm_mask));
+
+#ifdef __LITTLE_ENDIAN__
+  return result[1];
+#else
+  return result[0];
+#endif
+}
+#endif /* _ARCH_PWR8 */
+
+/* Create a vector with all four elements equal to *P.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load1_ps (float const *__P)
+{
+  return _mm_set1_ps (*__P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps1 (float const *__P)
+{
+  return _mm_load1_ps (__P);
+}
+
+/* Extracts one of the four words of A.  The selector N must be immediate.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_pi16 (__m64 const __A, int const __N)
+{
+  unsigned int shiftr = __N & 3;
+#ifdef __BIG_ENDIAN__
+  shiftr = 3 - shiftr;
+#endif
+
+  return ((__A >> (shiftr * 16)) & 0xffff);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pextrw (__m64 const __A, int const __N)
+{
+  return _mm_extract_pi16 (__A, __N);
+}
+
+/* Inserts word D into one of four words of A.  The selector N must be
+   immediate.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
+{
+  const int shiftl = (__N & 3) * 16;
+  const __m64 shiftD = (const __m64) __D << shiftl;
+  const __m64 mask = 0xffffUL << shiftl;
+  __m64 result = (__A & (~mask)) | (shiftD & mask);
+
+  return (result);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pinsrw (__m64 const __A, int const __D, int const __N)
+{
+  return _mm_insert_pi16 (__A, __D, __N);
+}
+
+/* Compute the element-wise maximum of signed 16-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+_mm_max_pi16 (__m64 __A, __m64 __B)
+{
+#if _ARCH_PWR8
+  __vector signed short a, b, r;
+  __vector __bool short c;
+
+  a = (__vector signed short)vec_splats (__A);
+  b = (__vector signed short)vec_splats (__B);
+  c = (__vector __bool short)vec_cmpgt (a, b);
+  r = vec_sel (b, a, c);
+  return (__m64) ((__vector long long) r)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __A;
+  m2.as_m64 = __B;
+
+  res.as_short[0] =
+      (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
+  res.as_short[1] =
+      (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
+  res.as_short[2] =
+      (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
+  res.as_short[3] =
+      (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
+
+  return (__m64) res.as_m64;
+#endif
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxsw (__m64 __A, __m64 __B)
+{
+  return _mm_max_pi16 (__A, __B);
+}
+
+/* Compute the element-wise maximum of unsigned 8-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pu8 (__m64 __A, __m64 __B)
+{
+#if _ARCH_PWR8
+  __vector unsigned char a, b, r;
+  __vector __bool char c;
+
+  a = (__vector unsigned char)vec_splats (__A);
+  b = (__vector unsigned char)vec_splats (__B);
+  c = (__vector __bool char)vec_cmpgt (a, b);
+  r = vec_sel (b, a, c);
+  return (__m64) ((__vector long long) r)[0];
+#else
+  __m64_union m1, m2, res;
+  long i;
+
+  m1.as_m64 = __A;
+  m2.as_m64 = __B;
+
+
+  for (i = 0; i < 8; i++)
+  res.as_char[i] =
+      ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
+	  m1.as_char[i] : m2.as_char[i];
+
+  return (__m64) res.as_m64;
+#endif
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxub (__m64 __A, __m64 __B)
+{
+  return _mm_max_pu8 (__A, __B);
+}
+
+/* Compute the element-wise minimum of signed 16-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pi16 (__m64 __A, __m64 __B)
+{
+#if _ARCH_PWR8
+  __vector signed short a, b, r;
+  __vector __bool short c;
+
+  a = (__vector signed short)vec_splats (__A);
+  b = (__vector signed short)vec_splats (__B);
+  c = (__vector __bool short)vec_cmplt (a, b);
+  r = vec_sel (b, a, c);
+  return (__m64) ((__vector long long) r)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __A;
+  m2.as_m64 = __B;
+
+  res.as_short[0] =
+      (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
+  res.as_short[1] =
+      (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
+  res.as_short[2] =
+      (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
+  res.as_short[3] =
+      (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
+
+  return (__m64) res.as_m64;
+#endif
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminsw (__m64 __A, __m64 __B)
+{
+  return _mm_min_pi16 (__A, __B);
+}
+
+/* Compute the element-wise minimum of unsigned 8-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pu8 (__m64 __A, __m64 __B)
+{
+#if _ARCH_PWR8
+  __vector unsigned char a, b, r;
+  __vector __bool char c;
+
+  a = (__vector unsigned char)vec_splats (__A);
+  b = (__vector unsigned char)vec_splats (__B);
+  c = (__vector __bool char)vec_cmplt (a, b);
+  r = vec_sel (b, a, c);
+  return (__m64) ((__vector long long) r)[0];
+#else
+  __m64_union m1, m2, res;
+  long i;
+
+  m1.as_m64 = __A;
+  m2.as_m64 = __B;
+
+
+  for (i = 0; i < 8; i++)
+  res.as_char[i] =
+      ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
+	  m1.as_char[i] : m2.as_char[i];
+
+  return (__m64) res.as_m64;
+#endif
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminub (__m64 __A, __m64 __B)
+{
+  return _mm_min_pu8 (__A, __B);
+}
+
+/* Create an 8-bit mask of the signs of 8-bit values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pi8 (__m64 __A)
+{
+  unsigned long long p =
+#ifdef __LITTLE_ENDIAN__
+                         0x0008101820283038UL; // permute control for sign bits
+#else
+                         0x3830282018100800UL; // permute control for sign bits
+#endif
+  return __builtin_bpermd (p, __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmovmskb (__m64 __A)
+{
+  return _mm_movemask_pi8 (__A);
+}
+
+/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
+   in B and produce the high 16 bits of the 32-bit results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __A, __m64 __B)
+{
+  __vector unsigned short a, b;
+  __vector unsigned short c;
+  __vector unsigned int w0, w1;
+  __vector unsigned char xform1 = {
+#ifdef __LITTLE_ENDIAN__
+      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
+      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
+#endif
+    };
+
+  a = (__vector unsigned short)vec_splats (__A);
+  b = (__vector unsigned short)vec_splats (__B);
+
+  w0 = vec_vmuleuh (a, b);
+  w1 = vec_vmulouh (a, b);
+  c = (__vector unsigned short)vec_perm (w0, w1, xform1);
+
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhuw (__m64 __A, __m64 __B)
+{
+  return _mm_mulhi_pu16 (__A, __B);
+}
+
+/* Return a combination of the four 16-bit values in A.  The selector
+   must be an immediate.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __A, int const __N)
+{
+  unsigned long element_selector_10 = __N & 0x03;
+  unsigned long element_selector_32 = (__N >> 2) & 0x03;
+  unsigned long element_selector_54 = (__N >> 4) & 0x03;
+  unsigned long element_selector_76 = (__N >> 6) & 0x03;
+  static const unsigned short permute_selectors[4] =
+    {
+#ifdef __LITTLE_ENDIAN__
+	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
+#else
+	      0x0607, 0x0405, 0x0203, 0x0001
+#endif
+    };
+  __m64_union t;
+  __vector unsigned long long a, p, r;
+
+#ifdef __LITTLE_ENDIAN__
+  t.as_short[0] = permute_selectors[element_selector_10];
+  t.as_short[1] = permute_selectors[element_selector_32];
+  t.as_short[2] = permute_selectors[element_selector_54];
+  t.as_short[3] = permute_selectors[element_selector_76];
+#else
+  t.as_short[3] = permute_selectors[element_selector_10];
+  t.as_short[2] = permute_selectors[element_selector_32];
+  t.as_short[1] = permute_selectors[element_selector_54];
+  t.as_short[0] = permute_selectors[element_selector_76];
+#endif
+  p = vec_splats (t.as_m64);
+  a = vec_splats (__A);
+  r = vec_perm (a, a, (__vector unsigned char)p);
+  return (__m64) ((__vector long long) r)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pshufw (__m64 __A, int const __N)
+{
+  return _mm_shuffle_pi16 (__A, __N);
+}
+
+/* Conditionally store byte elements of A into P.  The high bit of each
+   byte in the selector N determines whether the corresponding byte from
+   A is stored.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
+{
+  __m64 hibit = 0x8080808080808080UL;
+  __m64 mask, tmp;
+  __m64 *p = (__m64*)__P;
+
+  tmp = *p;
+  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
+  tmp = (tmp & (~mask)) | (__A & mask);
+  *p = tmp;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_maskmovq (__m64 __A, __m64 __N, char *__P)
+{
+  _mm_maskmove_si64 (__A, __N, __P);
+}
+
+/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu8 (__m64 __A, __m64 __B)
+{
+  __vector unsigned char a, b, c;
+
+  a = (__vector unsigned char)vec_splats (__A);
+  b = (__vector unsigned char)vec_splats (__B);
+  c = vec_avg (a, b);
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgb (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu8 (__A, __B);
+}
+
+/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu16 (__m64 __A, __m64 __B)
+{
+  __vector unsigned short a, b, c;
+
+  a = (__vector unsigned short)vec_splats (__A);
+  b = (__vector unsigned short)vec_splats (__B);
+  c = vec_avg (a, b);
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgw (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu16 (__A, __B);
+}
+
+/* Compute the sum of the absolute differences of the unsigned 8-bit
+   values in A and B.  Return the value in the lower 16-bit word; the
+   upper words are cleared.  */
+extern __inline    __m64    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_pu8 (__m64  __A, __m64  __B)
+{
+  __vector unsigned char a, b;
+  __vector unsigned char vmin, vmax, vabsdiff;
+  __vector signed int vsum;
+  const __vector unsigned int zero =
+    { 0, 0, 0, 0 };
+  __m64_union result = {0};
+
+  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
+  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
+  vmin = vec_min (a, b);
+  vmax = vec_max (a, b);
+  vabsdiff = vec_sub (vmax, vmin);
+  /* Sum four groups of bytes into integers.  */
+  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
+  /* Sum across four integers with integer result.  */
+  vsum = vec_sums (vsum, (__vector signed int) zero);
+  /* The sum is in the right most 32-bits of the vector result.
+     Transfer to a GPR and truncate to 16 bits.  */
+  result.as_short[0] = vsum[3];
+  return result.as_m64;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psadbw (__m64 __A, __m64 __B)
+{
+  return _mm_sad_pu8 (__A, __B);
+}
+
+/* Stores the data in A to the address P without polluting the caches.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pi (__m64 *__P, __m64 __A)
+{
+  /* Use the data cache block touch for store transient.  */
+  __asm__ (
+    "	dcbtstt	0,%0"
+    :
+    : "b" (__P)
+    : "memory"
+  );
+  *__P = __A;
+}
+
+/* Likewise.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ps (float *__P, __m128 __A)
+{
+  /* Use the data cache block touch for store transient.  */
+  __asm__ (
+    "	dcbtstt	0,%0"
+    :
+    : "b" (__P)
+    : "memory"
+  );
+  _mm_store_ps (__P, __A);
+}
+
+/* Guarantees that every preceding store is globally visible before
+   any subsequent store.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sfence (void)
+{
+  /* Generate a light weight sync.  */
+  __atomic_thread_fence (__ATOMIC_RELEASE);
+}
+
+/* The execution of the next instruction is delayed by an implementation
+   specific amount of time.  The instruction does not modify the
+   architectural state.  This is after the pop_options pragma because
+   it does not require SSE support in the processor--the encoding is a
+   nop on processors that do not support it.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_pause (void)
+{
+  /* There is no exact match with this construct, but the following is
+     close to the desired effect.  */
+#if _ARCH_PWR8
+  /* On power8 and later processors we can depend on Program Priority
+     (PRI) and associated "very low" PPI setting.  Since we don't know
+     what PPI this thread is running at we: 1) save the current PRI
+     from the PPR SPR into a local GRP, 2) set the PRI to "very low*
+     via the special or 31,31,31 encoding. 3) issue an "isync" to
+     insure the PRI change takes effect before we execute any more
+     instructions.
+     Now we can execute a lwsync (release barrier) while we execute
+     this thread at "very low" PRI.  Finally we restore the original
+     PRI and continue execution.  */
+  unsigned long __PPR;
+
+  __asm__ volatile (
+    "	mfppr	%0;"
+    "   or 31,31,31;"
+    "   isync;"
+    "   lwsync;"
+    "   isync;"
+    "   mtppr	%0;"
+    : "=r" (__PPR)
+    :
+    : "memory"
+  );
+#else
+  /* For older processor where we may not even have Program Priority
+     controls we can only depend on Heavy Weight Sync.  */
+  __atomic_thread_fence (__ATOMIC_SEQ_CST);
+#endif
+}
+
+/* Transpose the 4x4 matrix composed of row[0-3].  */
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
+do {									\
+  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
+  __v4sf __t0 = vec_vmrghw (__r0, __r1);			\
+  __v4sf __t1 = vec_vmrghw (__r2, __r3);			\
+  __v4sf __t2 = vec_vmrglw (__r0, __r1);			\
+  __v4sf __t3 = vec_vmrglw (__r2, __r3);			\
+  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, 	\
+			       (__vector long long)__t1);	\
+  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,	\
+			       (__vector long long)__t1);	\
+  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,	\
+			       (__vector long long)__t3);	\
+  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,	\
+			       (__vector long long)__t3);	\
+} while (0)
+
+/* For backward source compatibility.  */
+//# include <emmintrin.h>
+
+#else
+#include_next <xmmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
+#endif /* _XMMINTRIN_H_INCLUDED */
diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp
index 92ad6d641657b..61a4eaa09eea9 100644
--- a/onnxruntime/core/mlas/lib/sgemm.cpp
+++ b/onnxruntime/core/mlas/lib/sgemm.cpp
@@ -31,6 +31,524 @@ Module Name:
 // threads.
 //
 
+#if defined(__powerpc64__)
+#pragma message "Compiling KernelAdd and Zero for powerpc"
+
+template<bool ZeroMode, bool ProcessTwoRows>
+size_t
+MlasSgemmKernel(
+    const float* A,
+    const float* B,
+    float* C,
+    size_t CountK,
+    size_t CountN,
+    size_t lda,
+    size_t ldc,
+    float alpha
+    )
+/*++
+
+Routine Description:
+
+    This routine is an inner kernel to compute matrix multiplication for a
+    set of rows.
+
+Arguments:
+
+    A - Supplies the address of matrix A.
+
+    B - Supplies the address of matrix B. The matrix data has been packed using
+        MlasSgemmCopyPackB or MlasSgemmTransposePackB.
+
+    C - Supplies the address of matrix C.
+
+    CountK - Supplies the number of columns from matrix A and the number of rows
+        from matrix B to iterate over.
+
+    CountN - Supplies the number of columns from matrix B and matrix C to
+        iterate over.
+
+    lda - Supplies the first dimension of matrix A.
+
+    ldc - Supplies the first dimension of matrix C.
+
+    alpha - Supplies the scaler multiplier (see SGEMM definition).
+
+Return Value:
+
+    Returns the number of rows handled.
+
+--*/
+{
+    float32x4_t Row0Block0;
+    float32x4_t Row0Block1;
+    float32x4_t Row0Block2;
+    float32x4_t Row0Block3;
+
+    float32x4_t Row1Block0;
+    float32x4_t Row1Block1;
+    float32x4_t Row1Block2;
+    float32x4_t Row1Block3;
+
+#if defined(_WIN32)
+
+    if (!ProcessTwoRows) {
+        UNREFERENCED_PARAMETER(lda);
+        UNREFERENCED_PARAMETER(ldc);
+    }
+
+#endif
+
+    do {
+
+        float32x4_t BElements0;
+        float32x4_t BElements1;
+        float32x4_t BElements2;
+        float32x4_t BElements3;
+
+        float32x2_t Row0AElements;
+        float32x2_t Row1AElements;
+
+        //
+        // Clear the block accumulators.
+        //
+
+        Row0Block0 = vdupq_n_f32(0.0f);
+        Row0Block1 = vdupq_n_f32(0.0f);
+        Row0Block2 = vdupq_n_f32(0.0f);
+        Row0Block3 = vdupq_n_f32(0.0f);
+
+        if (ProcessTwoRows) {
+            Row1Block0 = vdupq_n_f32(0.0f);
+            Row1Block1 = vdupq_n_f32(0.0f);
+            Row1Block2 = vdupq_n_f32(0.0f);
+            Row1Block3 = vdupq_n_f32(0.0f);
+        }
+
+        //
+        // Compute the 16x1 or 16x2 output block.
+        //
+
+        const float* a = A;
+        size_t k = CountK;
+
+        while (k >= 2) {
+
+            Row0AElements = vld1_f32(a);
+
+            if (ProcessTwoRows) {
+                Row1AElements = vld1_f32(a + lda);
+            }
+
+            BElements0 = vld1q_f32(B + 0);
+            BElements1 = vld1q_f32(B + 4);
+            BElements2 = vld1q_f32(B + 8);
+            BElements3 = vld1q_f32(B + 12);
+
+            Row0Block0 = vmlaq_lane_f32(Row0Block0, BElements0, Row0AElements, 0);
+            Row0Block1 = vmlaq_lane_f32(Row0Block1, BElements1, Row0AElements, 0);
+            Row0Block2 = vmlaq_lane_f32(Row0Block2, BElements2, Row0AElements, 0);
+            Row0Block3 = vmlaq_lane_f32(Row0Block3, BElements3, Row0AElements, 0);
+
+            if (ProcessTwoRows) {
+                Row1Block0 = vmlaq_lane_f32(Row1Block0, BElements0, Row1AElements, 0);
+                Row1Block1 = vmlaq_lane_f32(Row1Block1, BElements1, Row1AElements, 0);
+                Row1Block2 = vmlaq_lane_f32(Row1Block2, BElements2, Row1AElements, 0);
+                Row1Block3 = vmlaq_lane_f32(Row1Block3, BElements3, Row1AElements, 0);
+            }
+
+            BElements0 = vld1q_f32(B + 16);
+            BElements1 = vld1q_f32(B + 20);
+            BElements2 = vld1q_f32(B + 24);
+            BElements3 = vld1q_f32(B + 28);
+
+            Row0Block0 = vmlaq_lane_f32(Row0Block0, BElements0, Row0AElements, 1);
+            Row0Block1 = vmlaq_lane_f32(Row0Block1, BElements1, Row0AElements, 1);
+            Row0Block2 = vmlaq_lane_f32(Row0Block2, BElements2, Row0AElements, 1);
+            Row0Block3 = vmlaq_lane_f32(Row0Block3, BElements3, Row0AElements, 1);
+
+            if (ProcessTwoRows) {
+                Row1Block0 = vmlaq_lane_f32(Row1Block0, BElements0, Row1AElements, 1);
+                Row1Block1 = vmlaq_lane_f32(Row1Block1, BElements1, Row1AElements, 1);
+                Row1Block2 = vmlaq_lane_f32(Row1Block2, BElements2, Row1AElements, 1);
+                Row1Block3 = vmlaq_lane_f32(Row1Block3, BElements3, Row1AElements, 1);
+            }
+
+            a += 2;
+            B += 32;
+            k -= 2;
+        }
+
+        if (k > 0) {
+
+            Row0AElements = vld1_dup_f32(a);
+
+            if (ProcessTwoRows) {
+                Row1AElements = vld1_dup_f32(a + lda);
+            }
+
+            BElements0 = vld1q_f32(B + 0);
+            BElements1 = vld1q_f32(B + 4);
+            BElements2 = vld1q_f32(B + 8);
+            BElements3 = vld1q_f32(B + 12);
+
+            Row0Block0 = vmlaq_lane_f32(Row0Block0, BElements0, Row0AElements, 0);
+            Row0Block1 = vmlaq_lane_f32(Row0Block1, BElements1, Row0AElements, 0);
+            Row0Block2 = vmlaq_lane_f32(Row0Block2, BElements2, Row0AElements, 0);
+            Row0Block3 = vmlaq_lane_f32(Row0Block3, BElements3, Row0AElements, 0);
+
+            if (ProcessTwoRows) {
+                Row1Block0 = vmlaq_lane_f32(Row1Block0, BElements0, Row1AElements, 0);
+                Row1Block1 = vmlaq_lane_f32(Row1Block1, BElements1, Row1AElements, 0);
+                Row1Block2 = vmlaq_lane_f32(Row1Block2, BElements2, Row1AElements, 0);
+                Row1Block3 = vmlaq_lane_f32(Row1Block3, BElements3, Row1AElements, 0);
+            }
+
+            B += 16;
+        }
+
+        //
+        // Multiply by the alpha value.
+        //
+
+        Row0Block0 = vmulq_n_f32(Row0Block0, alpha);
+        Row0Block1 = vmulq_n_f32(Row0Block1, alpha);
+        Row0Block2 = vmulq_n_f32(Row0Block2, alpha);
+        Row0Block3 = vmulq_n_f32(Row0Block3, alpha);
+
+        if (ProcessTwoRows) {
+            Row1Block0 = vmulq_n_f32(Row1Block0, alpha);
+            Row1Block1 = vmulq_n_f32(Row1Block1, alpha);
+            Row1Block2 = vmulq_n_f32(Row1Block2, alpha);
+            Row1Block3 = vmulq_n_f32(Row1Block3, alpha);
+        }
+
+        if (CountN >= 16) {
+
+            //
+            // Store the entire output block.
+            //
+
+            if (!ZeroMode) {
+                Row0Block0 = vaddq_f32(Row0Block0, vld1q_f32(C));
+                Row0Block1 = vaddq_f32(Row0Block1, vld1q_f32(C + 4));
+                Row0Block2 = vaddq_f32(Row0Block2, vld1q_f32(C + 8));
+                Row0Block3 = vaddq_f32(Row0Block3, vld1q_f32(C + 12));
+            }
+
+            vst1q_f32(C, Row0Block0);
+            vst1q_f32(C + 4, Row0Block1);
+            vst1q_f32(C + 8, Row0Block2);
+            vst1q_f32(C + 12, Row0Block3);
+
+            if (ProcessTwoRows) {
+
+                if (!ZeroMode) {
+                    Row1Block0 = vaddq_f32(Row1Block0, vld1q_f32(C + ldc));
+                    Row1Block1 = vaddq_f32(Row1Block1, vld1q_f32(C + ldc + 4));
+                    Row1Block2 = vaddq_f32(Row1Block2, vld1q_f32(C + ldc + 8));
+                    Row1Block3 = vaddq_f32(Row1Block3, vld1q_f32(C + ldc + 12));
+                }
+
+                vst1q_f32(C + ldc, Row1Block0);
+                vst1q_f32(C + ldc + 4, Row1Block1);
+                vst1q_f32(C + ldc + 8, Row1Block2);
+                vst1q_f32(C + ldc + 12, Row1Block3);
+            }
+
+        } else {
+
+            //
+            // Store the partial output block.
+            //
+
+            if ((CountN & 8) != 0) {
+
+                if (!ZeroMode) {
+                    Row0Block0 = vaddq_f32(Row0Block0, vld1q_f32(C));
+                    Row0Block1 = vaddq_f32(Row0Block1, vld1q_f32(C + 4));
+                }
+
+                vst1q_f32(C, Row0Block0);
+                vst1q_f32(C + 4, Row0Block1);
+                Row0Block0 = Row0Block2;
+                Row0Block1 = Row0Block3;
+
+                if (ProcessTwoRows) {
+
+                    if (!ZeroMode) {
+                        Row1Block0 = vaddq_f32(Row1Block0, vld1q_f32(C + ldc));
+                        Row1Block1 = vaddq_f32(Row1Block1, vld1q_f32(C + ldc + 4));
+                    }
+
+                    vst1q_f32(C + ldc, Row1Block0);
+                    vst1q_f32(C + ldc + 4, Row1Block1);
+                    Row1Block0 = Row1Block2;
+                    Row1Block1 = Row1Block3;
+                }
+
+                C += 8;
+            }
+
+            if ((CountN & 4) != 0) {
+
+                if (!ZeroMode) {
+                    Row0Block0 = vaddq_f32(Row0Block0, vld1q_f32(C));
+                }
+
+                vst1q_f32(C, Row0Block0);
+                Row0Block0 = Row0Block1;
+
+                if (ProcessTwoRows) {
+
+                    if (!ZeroMode) {
+                        Row1Block0 = vaddq_f32(Row1Block0, vld1q_f32(C + ldc));
+                    }
+
+                    vst1q_f32(C + ldc, Row1Block0);
+                    Row1Block0 = Row1Block1;
+                }
+
+                C += 4;
+            }
+
+            float32x2_t Row0Block0High;
+            float32x2_t Row0Block0Low;
+
+            float32x2_t Row1Block0High;
+            float32x2_t Row1Block0Low;
+
+            Row0Block0High = vget_high_f32(Row0Block0);
+            Row0Block0Low = vget_low_f32(Row0Block0);
+
+            if (ProcessTwoRows) {
+                Row1Block0High = vget_high_f32(Row1Block0);
+                Row1Block0Low = vget_low_f32(Row1Block0);
+            }
+
+            if ((CountN & 2) != 0) {
+
+                if (!ZeroMode) {
+                    Row0Block0Low = vadd_f32(Row0Block0Low, vld1_f32(C));
+                }
+
+                vst1_f32(C, Row0Block0Low);
+                Row0Block0Low = Row0Block0High;
+
+                if (ProcessTwoRows) {
+
+                    if (!ZeroMode) {
+                        Row1Block0Low = vadd_f32(Row1Block0Low, vld1_f32(C + ldc));
+                    }
+
+                    vst1_f32(C + ldc, Row1Block0Low);
+                    Row1Block0Low = Row1Block0High;
+                }
+
+                C += 2;
+            }
+
+            if ((CountN & 1) != 0) {
+
+                if (!ZeroMode) {
+                    Row0Block0Low = vadd_f32(Row0Block0Low, vld1_dup_f32(C));
+                }
+
+                vst1_lane_f32(C, Row0Block0Low, 0);
+
+                if (ProcessTwoRows) {
+
+                    if (!ZeroMode) {
+                        Row1Block0Low = vadd_f32(Row1Block0Low, vld1_dup_f32(C + ldc));
+                    }
+
+                    vst1_lane_f32(C + ldc, Row1Block0Low, 0);
+                }
+            }
+
+            break;
+        }
+
+        C += 16;
+        CountN -= 16;
+
+    } while (CountN > 0);
+
+    return ProcessTwoRows ? 2 : 1;
+}
+
+template<bool ZeroMode>
+size_t
+MlasSgemmKernel(
+    const float* A,
+    const float* B,
+    float* C,
+    size_t CountK,
+    size_t CountM,
+    size_t CountN,
+    size_t lda,
+    size_t ldc,
+    float alpha
+    )
+/*++
+
+Routine Description:
+
+    This routine is an inner kernel to compute matrix multiplication for a
+    set of rows.
+
+Arguments:
+
+    A - Supplies the address of matrix A.
+
+    B - Supplies the address of matrix B. The matrix data has been packed using
+        MlasSgemmCopyPackB or MlasSgemmTransposePackB.
+
+    C - Supplies the address of matrix C.
+
+    CountK - Supplies the number of columns from matrix A and the number of rows
+        from matrix B to iterate over.
+
+    CountM - Supplies the maximum number of rows that can be processed for
+        matrix A and matrix C. The actual number of rows handled for this
+        invocation depends on the kernel implementation.
+
+    CountN - Supplies the number of columns from matrix B and matrix C to
+        iterate over.
+
+    lda - Supplies the first dimension of matrix A.
+
+    ldc - Supplies the first dimension of matrix C.
+
+    alpha - Supplies the scaler multiplier (see SGEMM definition).
+
+Return Value:
+
+    Returns the number of rows handled.
+
+--*/
+{
+    size_t RowsHandled;
+
+    if (CountM >= 2) {
+        RowsHandled = MlasSgemmKernel<ZeroMode, true>(A, B, C, CountK, CountN, lda, ldc, alpha);
+    } else {
+        RowsHandled = MlasSgemmKernel<ZeroMode, false>(A, B, C, CountK, CountN, lda, ldc, alpha);
+    }
+
+    return RowsHandled;
+}
+
+size_t
+MLASCALL
+MlasSgemmKernelZero(
+    const float* A,
+    const float* B,
+    float* C,
+    size_t CountK,
+    size_t CountM,
+    size_t CountN,
+    size_t lda,
+    size_t ldc,
+    float alpha
+    )
+/*++
+
+Routine Description:
+
+    This routine is an inner kernel to compute matrix multiplication for a
+    set of rows.
+
+Arguments:
+
+    A - Supplies the address of matrix A.
+
+    B - Supplies the address of matrix B. The matrix data has been packed using
+        MlasSgemmCopyPackB or MlasSgemmTransposePackB.
+
+    C - Supplies the address of matrix C.
+
+    CountK - Supplies the number of columns from matrix A and the number of rows
+        from matrix B to iterate over.
+
+    CountM - Supplies the maximum number of rows that can be processed for
+        matrix A and matrix C. The actual number of rows handled for this
+        invocation depends on the kernel implementation.
+
+    CountN - Supplies the number of columns from matrix B and matrix C to
+        iterate over.
+
+    lda - Supplies the first dimension of matrix A.
+
+    ldc - Supplies the first dimension of matrix C.
+
+    alpha - Supplies the scaler multiplier (see SGEMM definition).
+
+Return Value:
+
+    Returns the number of rows handled.
+
+--*/
+{
+    return MlasSgemmKernel<true>(A, B, C, CountK, CountM, CountN, lda, ldc, alpha);
+}
+
+size_t
+MLASCALL
+MlasSgemmKernelAdd(
+    const float* A,
+    const float* B,
+    float* C,
+    size_t CountK,
+    size_t CountM,
+    size_t CountN,
+    size_t lda,
+    size_t ldc,
+    float alpha
+    )
+/*++
+
+Routine Description:
+
+    This routine is an inner kernel to compute matrix multiplication for a
+    set of rows.
+
+Arguments:
+
+    A - Supplies the address of matrix A.
+
+    B - Supplies the address of matrix B. The matrix data has been packed using
+        MlasSgemmCopyPackB or MlasSgemmTransposePackB.
+
+    C - Supplies the address of matrix C.
+
+    CountK - Supplies the number of columns from matrix A and the number of rows
+        from matrix B to iterate over.
+
+    CountM - Supplies the maximum number of rows that can be processed for
+        matrix A and matrix C. The actual number of rows handled for this
+        invocation depends on the kernel implementation.
+
+    CountN - Supplies the number of columns from matrix B and matrix C to
+        iterate over.
+
+    lda - Supplies the first dimension of matrix A.
+
+    ldc - Supplies the first dimension of matrix C.
+
+    alpha - Supplies the scaler multiplier (see SGEMM definition).
+
+Return Value:
+
+    Returns the number of rows handled.
+
+--*/
+{
+    return MlasSgemmKernel<false>(A, B, C, CountK, CountM, CountN, lda, ldc, alpha);
+}
+
+#endif
+
 struct MLAS_SGEMM_WORK_BLOCK {
     CBLAS_TRANSPOSE TransA;
     CBLAS_TRANSPOSE TransB;

From e750412683dffe83d0d126543567d34713e79af8 Mon Sep 17 00:00:00 2001
From: Cms Build <cmsbuild@cern.ch>
Date: Thu, 16 Jan 2020 18:22:37 +0100
Subject: [PATCH 2/2] Changes for mlasi.h

---
 onnxruntime/core/mlas/lib/mlasi.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 3b839af458383..bdc0fa282ca4d 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -34,6 +34,12 @@ Module Name:
 #include <immintrin.h>
 #endif
 #endif
+#if defined(__powerpc64__)
+#define NO_WARN_X86_INTRINSICS
+#include "ppc64/NEON_2_SSE.h" 
+//#error Im definitely in that section
+#endif
+
 
 //
 // Macro to place variables at a specified alignment.
@@ -696,7 +702,7 @@ MlasGetMaximumThreadCount(
 #elif defined(MLAS_TARGET_ARM64)
 #define MLAS_NEON_INTRINSICS
 #define MLAS_NEON64_INTRINSICS
-#elif defined(MLAS_TARGET_AMD64_IX86)
+#elif defined(MLAS_TARGET_AMD64_IX86) || defined(__ALTIVEC__)
 #define MLAS_SSE2_INTRINSICS
 #if defined(__SSE4_1__) || (defined(_MSC_VER) && defined(__AVX__))
 #define MLAS_SSE41_INTRINSICS