#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H

/*
 * SSE2-only fallbacks for a handful of SSSE3/SSE4.1 integer intrinsics
 * (_mm_shuffle_epi8, _mm_extract_epi32, _mm_min/max_epi32, _mm_min/max_epu32).
 * Everything below is built from the SSE2 intrinsics in <emmintrin.h>.
 */
#include <emmintrin.h>

//---------------------------------------
// Type Definitions
//---------------------------------------
typedef signed char        ssp_s8;
typedef unsigned char      ssp_u8;

typedef signed short       ssp_s16;
typedef unsigned short     ssp_u16;

typedef signed int         ssp_s32;
typedef unsigned int       ssp_u32;

typedef float              ssp_f32;
typedef double             ssp_f64;

typedef signed long long   ssp_s64;
typedef unsigned long long ssp_u64;

// 128-bit register viewed either as a vector type or as an array of lanes.
typedef union
{
    __m128  f;
    __m128d d;
    __m128i i;
    ssp_u64 u64[ 2];
    ssp_s64 s64[ 2];
    ssp_f64 f64[ 2];
    ssp_u32 u32[ 4];
    ssp_s32 s32[ 4];
    ssp_f32 f32[ 4];
    ssp_u16 u16[ 8];
    ssp_s16 s16[ 8];
    ssp_u8  u8 [16];
    ssp_s8  s8 [16];
} ssp_m128;

// 64-bit value viewed as an array of lanes.
typedef union
{
    ssp_u64 u64;
    ssp_s64 s64;
    ssp_u32 u32[ 2];
    ssp_s32 s32[ 2];
    ssp_u16 u16[ 4];
    ssp_s16 s16[ 4];
    ssp_u8  u8 [ 8];
    ssp_s8  s8 [ 8];
} ssp_m64;

//---------------------------------------
// Rounding mode macros
//---------------------------------------
#define SSP_FROUND_TO_NEAREST_INT   0x00
#define SSP_FROUND_TO_NEG_INF       0x01
#define SSP_FROUND_TO_POS_INF       0x02
#define SSP_FROUND_TO_ZERO          0x03
#define SSP_FROUND_CUR_DIRECTION    0x04

#define SSP_FROUND_RAISE_EXC        0x00
#define SSP_FROUND_NO_EXC           0x08

//---------------------------------------
// Floating point precision requirements
//---------------------------------------
static const float  SSP_F32_ALLOWANCE = 0.0001f;
static const double SSP_F64_ALLOWANCE = 0.0001;

/*
 * Bitwise select: for every bit, returns the bit of `a` where `mask` is 1
 * and the bit of `b` where `mask` is 0, i.e. (mask ? a : b).
 */
static __inline__ __m128i ssp_logical_bitwise_select_SSE2( __m128i a, __m128i b, __m128i mask )
{
    a = _mm_and_si128   ( a,    mask );     // clear a where mask = 0
    b = _mm_andnot_si128( mask, b    );     // clear b where mask = 1
    a = _mm_or_si128    ( a,    b    );     // merge the two halves
    return a;
}

/*
 * Unsigned 32-bit "a < b" per lane; each result lane is all ones or all zeros.
 *
 * SSE2 only has a signed compare. The signed result equals the unsigned one
 * when both operands have the same sign bit, and is its opposite when the
 * sign bits differ (the operands cannot be equal in that case), so the signed
 * result is inverted in exactly those lanes.
 */
static __inline__ __m128i ssp_comlt_epu32_SSE2( __m128i a, __m128i b )
{
    __m128i signMask, mask;

    mask     = _mm_cmplt_epi32( a, b );         // all ones where a < b (signed)
    signMask = _mm_xor_si128  ( a, b );         // sign bit is 1 where signs differ
    signMask = _mm_srai_epi32 ( signMask, 31 ); // broadcast the sign bit over the lane
    mask     = _mm_xor_si128  ( mask, signMask ); // invert result where signs differed
    return mask;
}

/*
 * Unsigned 32-bit "a > b" per lane; each result lane is all ones or all zeros.
 * Same sign-correction technique as ssp_comlt_epu32_SSE2.
 */
static __inline__ __m128i ssp_comgt_epu32_SSE2( __m128i a, __m128i b )
{
    __m128i signMask, mask;

    mask     = _mm_cmpgt_epi32( a, b );         // all ones where a > b (signed)
    signMask = _mm_xor_si128  ( a, b );         // sign bit is 1 where signs differ
    signMask = _mm_srai_epi32 ( signMask, 31 ); // broadcast the sign bit over the lane
    mask     = _mm_xor_si128  ( mask, signMask ); // invert result where signs differed
    return mask;
}

/*
 * Signed 8-bit "a >= b" per lane, composed from (a > b) OR (a == b).
 */
static __inline__ __m128i ssp_comge_epi8_SSE2( __m128i a, __m128i b )
{
    __m128i c;

    c = _mm_cmpgt_epi8( a, b );
    a = _mm_cmpeq_epi8( a, b );
    a = _mm_or_si128  ( a, c );
    return a;
}

/*
 * Byte shuffle. For each of the 16 result bytes, the low four bits of the
 * matching `mask` byte pick a byte of `a`; if the top bit of the `mask` byte
 * is set the result byte is zero instead.
 */
static __inline__ __m128i _mm_shuffle_epi8( __m128i a, __m128i mask )
{
    ssp_m128 src, idx, out;
    __m128i  keep;
    int      i;

    src.i = a;

    // 0xFF for control bytes that are non-negative (top bit clear), else 0x00.
    keep  = ssp_comge_epi8_SSE2( mask, _mm_setzero_si128() );

    // Only the low nibble selects a source byte, so every index is in 0..15.
    idx.i = _mm_and_si128( mask, _mm_set1_epi8( (char)0x0F ) );

    for( i = 0; i < 16; i++ )
    {
        out.u8[i] = src.u8[ idx.u8[i] ];
    }

    // Zero the bytes whose control byte had its top bit set.
    return _mm_and_si128( out.i, keep );
}

/*
 * Returns 32-bit lane (imm & 3) of `a` as a signed int.
 */
static __inline__ int _mm_extract_epi32( __m128i a, const int imm )
{
    // _mm_srli_si128 needs a compile-time byte count, hence one case per lane.
    switch( imm & 0x3 )
    {
    case 3:  a = _mm_srli_si128( a, 12 ); break;
    case 2:  a = _mm_srli_si128( a,  8 ); break;
    case 1:  a = _mm_srli_si128( a,  4 ); break;
    default: break;                         // lane 0 is already in place
    }
    return _mm_cvtsi128_si32( a );          // low 32 bits of the register
}

/*
 * Per-lane minimum of signed 32-bit integers (SSE2 stand-in for pminsd).
 */
static __inline__ __m128i _mm_min_epi32( __m128i a, __m128i b )
{
    __m128i mask = _mm_cmplt_epi32( a, b );             // all ones where a < b
    a = ssp_logical_bitwise_select_SSE2( a, b, mask );  // a where a < b, else b
    return a;
}

/*
 * Per-lane maximum of signed 32-bit integers (SSE2 stand-in for pmaxsd).
 */
static __inline__ __m128i _mm_max_epi32( __m128i a, __m128i b )
{
    __m128i mask = _mm_cmpgt_epi32( a, b );             // all ones where a > b
    a = ssp_logical_bitwise_select_SSE2( a, b, mask );  // a where a > b, else b
    return a;
}

/*
 * Per-lane minimum of unsigned 32-bit integers (SSE2 stand-in for pminud).
 */
static __inline__ __m128i _mm_min_epu32( __m128i a, __m128i b )
{
    __m128i mask = ssp_comlt_epu32_SSE2( a, b );        // all ones where a < b (unsigned)
    a = ssp_logical_bitwise_select_SSE2( a, b, mask );
    return a;
}

/*
 * Per-lane maximum of unsigned 32-bit integers (SSE2 stand-in for pmaxud).
 */
static __inline__ __m128i _mm_max_epu32( __m128i a, __m128i b )
{
    __m128i mask = ssp_comgt_epu32_SSE2( a, b );        // all ones where a > b (unsigned)
    a = ssp_logical_bitwise_select_SSE2( a, b, mask );
    return a;
}

#endif /* _SMMINTRIN_H */