From 647dc121216f12ddedea82ce5da5662f158c856b Mon Sep 17 00:00:00 2001 From: Atharva Nimbalkar Date: Sun, 15 Aug 2021 16:46:20 +0530 Subject: [PATCH] neon/cge: Improve some of the SSE2 fallbacks Fixes #900 --- simde/arm/neon/cge.h | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/simde/arm/neon/cge.h b/simde/arm/neon/cge.h index 38425cec0..2ed6655a4 100644 --- a/simde/arm/neon/cge.h +++ b/simde/arm/neon/cge.h @@ -290,8 +290,11 @@ simde_vcgeq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { b_ = simde_uint8x16_to_private(b); #if defined(SIMDE_X86_SSE2_NATIVE) - __m128i sign_bits = _mm_set1_epi8(INT8_MIN); - r_.m128i = _mm_or_si128(_mm_cmpgt_epi8(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), _mm_cmpeq_epi8(a_.m128i, b_.m128i)); + r_.m128i = + _mm_cmpeq_epi8( + _mm_min_epu8(b_.m128i, a_.m128i), + b_.m128i + ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_u8x16_ge(a_.v128, b_.v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -324,7 +327,13 @@ simde_vcgeq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE4_1_NATIVE) + r_.m128i = + _mm_cmpeq_epi16( + _mm_min_epu16(b_.m128i, a_.m128i), + b_.m128i + ); + #elif defined(SIMDE_X86_SSE2_NATIVE) __m128i sign_bits = _mm_set1_epi16(INT16_MIN); r_.m128i = _mm_or_si128(_mm_cmpgt_epi16(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), _mm_cmpeq_epi16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) @@ -359,7 +368,13 @@ simde_vcgeq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b); - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE4_1_NATIVE) + r_.m128i = + _mm_cmpeq_epi32( + _mm_min_epu32(b_.m128i, a_.m128i), + b_.m128i + ); + #elif defined(SIMDE_X86_SSE2_NATIVE) __m128i sign_bits = _mm_set1_epi32(INT32_MIN); r_.m128i = _mm_or_si128(_mm_cmpgt_epi32(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), _mm_cmpeq_epi32(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) @@ -394,7 +409,13 @@ simde_vcgeq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b); - #if defined(SIMDE_X86_SSE4_2_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) + r_.m128i = + _mm_cmpeq_epi64( + _mm_min_epu64(b_.m128i, a_.m128i), + b_.m128i + ); + #elif defined(SIMDE_X86_SSE4_2_NATIVE) __m128i sign_bits = _mm_set1_epi64x(INT64_MIN); r_.m128i = _mm_or_si128(_mm_cmpgt_epi64(_mm_xor_si128(a_.m128i, sign_bits), _mm_xor_si128(b_.m128i, sign_bits)), _mm_cmpeq_epi64(a_.m128i, b_.m128i)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)