From be7dad345e46a8ed8fdb036594b9298f13af7ffa Mon Sep 17 00:00:00 2001
From: Jean Pierre Cimalando
Date: Sat, 1 Aug 2020 12:08:07 +0200
Subject: [PATCH] Some speedup with SSE 4.1

---
 cmake/SfizzConfig.cmake     |  2 +-
 src/sfizz/Interpolators.hpp | 12 ++----------
 src/sfizz/MathHelpers.h     | 29 ++++++++++-------------------
 3 files changed, 13 insertions(+), 30 deletions(-)

diff --git a/cmake/SfizzConfig.cmake b/cmake/SfizzConfig.cmake
index 16c805bdc..65d5a4208 100644
--- a/cmake/SfizzConfig.cmake
+++ b/cmake/SfizzConfig.cmake
@@ -35,7 +35,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
     add_compile_options(-ffast-math)
     add_compile_options(-fno-omit-frame-pointer) # For debugging purposes
     if (SFIZZ_SYSTEM_PROCESSOR MATCHES "^(i.86|x86_64)$")
-        add_compile_options(-msse2)
+        add_compile_options(-msse4.1)
     endif()
 elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
     set(CMAKE_CXX_STANDARD 17)
diff --git a/src/sfizz/Interpolators.hpp b/src/sfizz/Interpolators.hpp
index 5c3029008..4c76dcbc5 100644
--- a/src/sfizz/Interpolators.hpp
+++ b/src/sfizz/Interpolators.hpp
@@ -101,16 +101,8 @@ class Interpolator
     {
         __m128 x = _mm_sub_ps(_mm_setr_ps(-1, 0, 1, 2), _mm_set1_ps(coeff));
         __m128 h = bspline3x4(x);
-        __m128 y = _mm_mul_ps(h, _mm_loadu_ps(values - 1));
-        // sum 4 to 1
-        __m128 xmm0 = y;
-        __m128 xmm1 = _mm_shuffle_ps(xmm0, xmm0, 0xe5);
-        __m128 xmm2 = _mm_movehl_ps(xmm0, xmm0);
-        xmm1 = _mm_add_ss(xmm1, xmm0);
-        xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0xe7);
-        xmm2 = _mm_add_ss(xmm2, xmm1);
-        xmm0 = _mm_add_ss(xmm0, xmm2);
-        return _mm_cvtss_f32(xmm0);
+        __m128 y = _mm_dp_ps(h, _mm_loadu_ps(values - 1), 0xf1);
+        return _mm_cvtss_f32(y);
     }
 };
 #endif
diff --git a/src/sfizz/MathHelpers.h b/src/sfizz/MathHelpers.h
index f0dbebc44..1db12fe2e 100644
--- a/src/sfizz/MathHelpers.h
+++ b/src/sfizz/MathHelpers.h
@@ -23,6 +23,7 @@
 #include
 #if SFIZZ_HAVE_SSE
 #include
+#include
 #endif
 
 template
@@ -162,15 +163,13 @@ inline CXX14_CONSTEXPR void incrementAll(T& first, Args&... rest)
 template
 R hermite3(R x)
 {
-    x = std::abs(x);
     R x2 = x * x;
+    x = std::abs(x);
     R x3 = x2 * x;
-    R y = 0;
     R q = R(5./2.) * x2; // a reoccurring term
     R p1 = R(1) - q + R(3./2.) * x3;
     R p2 = R(2) - R(4) * x + q - R(1./2.) * x3;
-    y = (x < R(2)) ? p2 : y;
-    y = (x < R(1)) ? p1 : y;
+    R y = (x < R(1)) ? p1 : p2;
     return y;
 }
 
@@ -183,17 +182,14 @@ R hermite3(R x)
  */
 inline __m128 hermite3x4(__m128 x)
 {
-    x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
     __m128 x2 = _mm_mul_ps(x, x);
+    x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
     __m128 x3 = _mm_mul_ps(x2, x);
-    __m128 y = _mm_set1_ps(0.0f);
     __m128 q = _mm_mul_ps(_mm_set1_ps(5./2.), x2);
     __m128 p1 = _mm_add_ps(_mm_sub_ps(_mm_set1_ps(1), q), _mm_mul_ps(_mm_set1_ps(3./2.), x3));
     __m128 p2 = _mm_sub_ps(_mm_add_ps(_mm_sub_ps(_mm_set1_ps(2), _mm_mul_ps(_mm_set1_ps(4), x)), q), _mm_mul_ps(_mm_set1_ps(1./2.), x3));
-    __m128 m2 = _mm_cmple_ps(x, _mm_set1_ps(2));
-    y = _mm_or_ps(_mm_and_ps(m2, p2), _mm_andnot_ps(m2, y));
     __m128 m1 = _mm_cmple_ps(x, _mm_set1_ps(1));
-    y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, y));
+    __m128 y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, p2));
     return y;
 }
 #endif
@@ -208,14 +204,12 @@ inline __m128 hermite3x4(__m128 x)
 template
 R bspline3(R x)
 {
-    x = std::abs(x);
     R x2 = x * x;
+    x = std::abs(x);
     R x3 = x2 * x;
-    R y = 0;
     R p1 = R(2./3.) - x2 + R(1./2.) * x3;
     R p2 = R(4./3.) - R(2) * x + x2 - R(1./6.) * x3;
-    y = (x < R(2)) ? p2 : y;
-    y = (x < R(1)) ? p1 : y;
+    R y = (x < R(1)) ? p1 : p2;
     return y;
 }
 
@@ -228,16 +222,13 @@ R bspline3(R x)
  */
 inline __m128 bspline3x4(__m128 x)
 {
-    x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
     __m128 x2 = _mm_mul_ps(x, x);
+    x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
     __m128 x3 = _mm_mul_ps(x2, x);
-    __m128 y = _mm_set1_ps(0.0f);
     __m128 p1 = _mm_add_ps(_mm_sub_ps(_mm_set1_ps(2./3.), x2), _mm_mul_ps(_mm_set1_ps(1./2.), x3));
     __m128 p2 = _mm_sub_ps(_mm_add_ps(_mm_sub_ps(_mm_set1_ps(4./3.), _mm_mul_ps(_mm_set1_ps(2), x)), x2), _mm_mul_ps(_mm_set1_ps(1./6.), x3));
-    __m128 m2 = _mm_cmple_ps(x, _mm_set1_ps(2));
-    y = _mm_or_ps(_mm_and_ps(m2, p2), _mm_andnot_ps(m2, y));
-    __m128 m1 = _mm_cmple_ps(x, _mm_set1_ps(1));
-    y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, y));
+    __m128 m1 = _mm_cmplt_ps(x, _mm_set1_ps(1));
+    __m128 y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, p2));
     return y;
 }
 #endif