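Some speedup with SSE 4.1: the B-spline interpolator's 4-lane sum becomes a single `_mm_dp_ps`, and the hermite3/bspline3 kernels (scalar and x4) drop their separate |x| < 2 selection step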

sfztools · Aug 1, 2020
commit be7dad3 (1 parent: 45583b6)

Showing 3 changed files with 13 additions and 30 deletions.
diff --git a/cmake/SfizzConfig.cmake b/cmake/SfizzConfig.cmake
@@ -35,7 +35,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
     add_compile_options(-ffast-math)
     add_compile_options(-fno-omit-frame-pointer) # For debugging purposes
     if (SFIZZ_SYSTEM_PROCESSOR MATCHES "^(i.86|x86_64)$")
-        add_compile_options(-msse2)
+        add_compile_options(-msse4.1)
     endif()
 elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
     set(CMAKE_CXX_STANDARD 17)

diff --git a/src/sfizz/Interpolators.hpp b/src/sfizz/Interpolators.hpp
@@ -101,16 +101,8 @@ class Interpolator<kInterpolatorBspline3, float>
     {
         __m128 x = _mm_sub_ps(_mm_setr_ps(-1, 0, 1, 2), _mm_set1_ps(coeff));
         __m128 h = bspline3x4(x);
-        __m128 y = _mm_mul_ps(h, _mm_loadu_ps(values - 1));
-        // sum 4 to 1
-        __m128 xmm0 = y;
-        __m128 xmm1 = _mm_shuffle_ps(xmm0, xmm0, 0xe5);
-        __m128 xmm2 = _mm_movehl_ps(xmm0, xmm0);
-        xmm1 = _mm_add_ss(xmm1, xmm0);
-        xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0xe7);
-        xmm2 = _mm_add_ss(xmm2, xmm1);
-        xmm0 = _mm_add_ss(xmm0, xmm2);
-        return _mm_cvtss_f32(xmm0);
+        __m128 y = _mm_dp_ps(h, _mm_loadu_ps(values - 1), 0xf1);
+        return _mm_cvtss_f32(y);
     }
 };
 #endif

diff --git a/src/sfizz/MathHelpers.h b/src/sfizz/MathHelpers.h
@@ -23,6 +23,7 @@
 #include <cfenv>
 #if SFIZZ_HAVE_SSE
 #include <xmmintrin.h>
+#include <immintrin.h>
 #endif
 
 template <class T>
@@ -162,15 +163,13 @@ inline CXX14_CONSTEXPR void incrementAll(T& first, Args&... rest)
 template <class R>
 R hermite3(R x)
 {
-    x = std::abs(x);
     R x2 = x * x;
+    x = std::abs(x);
     R x3 = x2 * x;
-    R y = 0;
     R q = R(5./2.) * x2; // a reoccurring term
     R p1 = R(1) - q + R(3./2.) * x3;
     R p2 = R(2) - R(4) * x + q - R(1./2.) * x3;
-    y = (x < R(2)) ? p2 : y;
-    y = (x < R(1)) ? p1 : y;
+    R y = (x < R(1)) ? p1 : p2;
     return y;
 }
 
@@ -183,17 +182,14 @@ R hermite3(R x)
  */
 inline __m128 hermite3x4(__m128 x)
 {
-    x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
     __m128 x2 = _mm_mul_ps(x, x);
+    x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
     __m128 x3 = _mm_mul_ps(x2, x);
-    __m128 y = _mm_set1_ps(0.0f);
     __m128 q = _mm_mul_ps(_mm_set1_ps(5./2.), x2);
     __m128 p1 = _mm_add_ps(_mm_sub_ps(_mm_set1_ps(1), q), _mm_mul_ps(_mm_set1_ps(3./2.), x3));
     __m128 p2 = _mm_sub_ps(_mm_add_ps(_mm_sub_ps(_mm_set1_ps(2), _mm_mul_ps(_mm_set1_ps(4), x)), q), _mm_mul_ps(_mm_set1_ps(1./2.), x3));
-    __m128 m2 = _mm_cmple_ps(x, _mm_set1_ps(2));
-    y = _mm_or_ps(_mm_and_ps(m2, p2), _mm_andnot_ps(m2, y));
     __m128 m1 = _mm_cmple_ps(x, _mm_set1_ps(1));
-    y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, y));
+    __m128 y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, p2));
     return y;
 }
 #endif
@@ -208,14 +204,12 @@ inline __m128 hermite3x4(__m128 x)
 template <class R>
 R bspline3(R x)
 {
-    x = std::abs(x);
     R x2 = x * x;
+    x = std::abs(x);
     R x3 = x2 * x;
-    R y = 0;
     R p1 = R(2./3.) - x2 + R(1./2.) * x3;
     R p2 = R(4./3.) - R(2) * x + x2 - R(1./6.) * x3;
-    y = (x < R(2)) ? p2 : y;
-    y = (x < R(1)) ? p1 : y;
+    R y = (x < R(1)) ? p1 : p2;
     return y;
 }
 
@@ -228,16 +222,13 @@ R bspline3(R x)
  */
 inline __m128 bspline3x4(__m128 x)
 {
-    x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
     __m128 x2 = _mm_mul_ps(x, x);
+    x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
     __m128 x3 = _mm_mul_ps(x2, x);
-    __m128 y = _mm_set1_ps(0.0f);
     __m128 p1 = _mm_add_ps(_mm_sub_ps(_mm_set1_ps(2./3.), x2), _mm_mul_ps(_mm_set1_ps(1./2.), x3));
     __m128 p2 = _mm_sub_ps(_mm_add_ps(_mm_sub_ps(_mm_set1_ps(4./3.), _mm_mul_ps(_mm_set1_ps(2), x)), x2), _mm_mul_ps(_mm_set1_ps(1./6.), x3));
-    __m128 m2 = _mm_cmple_ps(x, _mm_set1_ps(2));
-    y = _mm_or_ps(_mm_and_ps(m2, p2), _mm_andnot_ps(m2, y));
-    __m128 m1 = _mm_cmple_ps(x, _mm_set1_ps(1));
-    y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, y));
+    __m128 m1 = _mm_cmplt_ps(x, _mm_set1_ps(1));
+    __m128 y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, p2));
     return y;
 }
 #endif