Skip to content

Commit

Permalink
Some speedup with SSE 4.1
Browse files Browse the repository at this point in the history
  • Loading branch information
jpcima committed Aug 1, 2020
1 parent 45583b6 commit be7dad3
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 30 deletions.
2 changes: 1 addition & 1 deletion cmake/SfizzConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
add_compile_options(-ffast-math)
add_compile_options(-fno-omit-frame-pointer) # For debugging purposes
if (SFIZZ_SYSTEM_PROCESSOR MATCHES "^(i.86|x86_64)$")
add_compile_options(-msse2)
add_compile_options(-msse4.1)
endif()
elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
set(CMAKE_CXX_STANDARD 17)
Expand Down
12 changes: 2 additions & 10 deletions src/sfizz/Interpolators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,16 +101,8 @@ class Interpolator<kInterpolatorBspline3, float>
{
__m128 x = _mm_sub_ps(_mm_setr_ps(-1, 0, 1, 2), _mm_set1_ps(coeff));
__m128 h = bspline3x4(x);
__m128 y = _mm_mul_ps(h, _mm_loadu_ps(values - 1));
// sum 4 to 1
__m128 xmm0 = y;
__m128 xmm1 = _mm_shuffle_ps(xmm0, xmm0, 0xe5);
__m128 xmm2 = _mm_movehl_ps(xmm0, xmm0);
xmm1 = _mm_add_ss(xmm1, xmm0);
xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0xe7);
xmm2 = _mm_add_ss(xmm2, xmm1);
xmm0 = _mm_add_ss(xmm0, xmm2);
return _mm_cvtss_f32(xmm0);
__m128 y = _mm_dp_ps(h, _mm_loadu_ps(values - 1), 0xf1);
return _mm_cvtss_f32(y);
}
};
#endif
Expand Down
29 changes: 10 additions & 19 deletions src/sfizz/MathHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cfenv>
#if SFIZZ_HAVE_SSE
#include <xmmintrin.h>
#include <immintrin.h>
#endif

template <class T>
Expand Down Expand Up @@ -162,15 +163,13 @@ inline CXX14_CONSTEXPR void incrementAll(T& first, Args&... rest)
template <class R>
R hermite3(R x)
{
// NOTE(review): this listing appears to come from a diff view whose +/- markers
// were lost, so lines of two variants of the body are interleaved (`std::abs`
// is applied twice and `y` is declared twice). It does not compile as shown;
// confirm against the repository which lines belong to the current version.
//
// The polynomials below are written in terms of |x|, so the sign is dropped.
x = std::abs(x);
// x * x is the same for x and |x|, so this line gives the same value whether
// it runs before or after the abs.
R x2 = x * x;
x = std::abs(x);
// x is non-negative here, so x3 = |x|^3.
R x3 = x2 * x;
// Variant with a support check (this line plus the two `y = ...` assignments
// further down): start from 0 and overwrite per interval.
R y = 0;
R q = R(5./2.) * x2; // a reoccurring term
// p1 = 1 - 5/2 |x|^2 + 3/2 |x|^3, selected for |x| < 1.
R p1 = R(1) - q + R(3./2.) * x3;
// p2 = 2 - 4 |x| + 5/2 |x|^2 - 1/2 |x|^3, selected for the outer interval.
R p2 = R(2) - R(4) * x + q - R(1./2.) * x3;
// Two-step selection: p2 for |x| < 2, then p1 for |x| < 1; the result stays 0
// when |x| >= 2.
y = (x < R(2)) ? p2 : y;
y = (x < R(1)) ? p1 : y;
// Single-step selection: p1 for |x| < 1, p2 for every other input. Unlike the
// two-step form there is no |x| >= 2 case, so p2 is returned there as well;
// presumably callers keep |x| within 2 -- TODO confirm.
R y = (x < R(1)) ? p1 : p2;
return y;
}

Expand All @@ -183,17 +182,14 @@ R hermite3(R x)
*/
inline __m128 hermite3x4(__m128 x)
{
// NOTE(review): this listing appears to come from a diff view whose +/- markers
// were lost, so lines of two variants of the body are interleaved (the sign
// clear is done twice and `y` is declared twice). It does not compile as shown;
// confirm against the repository which lines belong to the current version.
//
// Clear the sign bit of each of the four lanes: -0.0f has only the sign bit
// set, and andnot computes (~mask) & x, giving |x| per lane.
x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
// Lane-wise x * x; identical for x and |x|, so its position relative to the
// sign clear does not change the value.
__m128 x2 = _mm_mul_ps(x, x);
x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
// Lane-wise |x|^3 (x is already non-negative here).
__m128 x3 = _mm_mul_ps(x2, x);
// Variant with a support check: start from 0 in every lane and blend the
// polynomials in with the m2 / m1 masks below.
__m128 y = _mm_set1_ps(0.0f);
// q = 5/2 |x|^2, used by both polynomials.
__m128 q = _mm_mul_ps(_mm_set1_ps(5./2.), x2);
// p1 = 1 - q + 3/2 |x|^3
__m128 p1 = _mm_add_ps(_mm_sub_ps(_mm_set1_ps(1), q), _mm_mul_ps(_mm_set1_ps(3./2.), x3));
// p2 = 2 - 4 |x| + q - 1/2 |x|^3
__m128 p2 = _mm_sub_ps(_mm_add_ps(_mm_sub_ps(_mm_set1_ps(2), _mm_mul_ps(_mm_set1_ps(4), x)), q), _mm_mul_ps(_mm_set1_ps(1./2.), x3));
// m2: all-ones in lanes where |x| <= 2; take p2 there, keep y (0) elsewhere.
__m128 m2 = _mm_cmple_ps(x, _mm_set1_ps(2));
y = _mm_or_ps(_mm_and_ps(m2, p2), _mm_andnot_ps(m2, y));
// m1: all-ones in lanes where |x| <= 1. This is `<=`, not `<`; the two
// polynomials above agree mathematically at |x| = 1 (both evaluate to 0).
__m128 m1 = _mm_cmple_ps(x, _mm_set1_ps(1));
// Two-step form: overwrite with p1 where m1 is set, keep the earlier y elsewhere.
y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, y));
// Single-step form: p1 where m1 is set, p2 in all other lanes. There is no
// zeroing of lanes with |x| > 2 in this form; presumably callers keep |x|
// within 2 -- TODO confirm.
__m128 y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, p2));
return y;
}
#endif
Expand All @@ -208,14 +204,12 @@ inline __m128 hermite3x4(__m128 x)
template <class R>
R bspline3(R x)
{
// NOTE(review): this listing appears to come from a diff view whose +/- markers
// were lost, so lines of two variants of the body are interleaved (`std::abs`
// is applied twice and `y` is declared twice). It does not compile as shown;
// confirm against the repository which lines belong to the current version.
//
// The polynomials below are written in terms of |x|, so the sign is dropped.
x = std::abs(x);
// x * x is the same for x and |x|, so this line gives the same value whether
// it runs before or after the abs.
R x2 = x * x;
x = std::abs(x);
// x is non-negative here, so x3 = |x|^3.
R x3 = x2 * x;
// Variant with a support check (this line plus the two `y = ...` assignments
// further down): start from 0 and overwrite per interval.
R y = 0;
// p1 = 2/3 - |x|^2 + 1/2 |x|^3, selected for |x| < 1.
R p1 = R(2./3.) - x2 + R(1./2.) * x3;
// p2 = 4/3 - 2 |x| + |x|^2 - 1/6 |x|^3, selected for the outer interval.
R p2 = R(4./3.) - R(2) * x + x2 - R(1./6.) * x3;
// Two-step selection: p2 for |x| < 2, then p1 for |x| < 1; the result stays 0
// when |x| >= 2.
y = (x < R(2)) ? p2 : y;
y = (x < R(1)) ? p1 : y;
// Single-step selection: p1 for |x| < 1, p2 for every other input. Unlike the
// two-step form there is no |x| >= 2 case, so p2 is returned there as well;
// presumably callers keep |x| within 2 -- TODO confirm.
R y = (x < R(1)) ? p1 : p2;
return y;
}

Expand All @@ -228,16 +222,13 @@ R bspline3(R x)
*/
inline __m128 bspline3x4(__m128 x)
{
// NOTE(review): this listing appears to come from a diff view whose +/- markers
// were lost, so lines of two variants of the body are interleaved (the sign
// clear is done twice, and both `m1` and `y` are declared twice). It does not
// compile as shown; confirm against the repository which lines are current.
//
// Clear the sign bit of each of the four lanes: -0.0f has only the sign bit
// set, and andnot computes (~mask) & x, giving |x| per lane.
x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
// Lane-wise x * x; identical for x and |x|, so its position relative to the
// sign clear does not change the value.
__m128 x2 = _mm_mul_ps(x, x);
x = _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
// Lane-wise |x|^3 (x is already non-negative here).
__m128 x3 = _mm_mul_ps(x2, x);
// Variant with a support check: start from 0 in every lane and blend the
// polynomials in with the m2 / m1 masks below.
__m128 y = _mm_set1_ps(0.0f);
// p1 = 2/3 - |x|^2 + 1/2 |x|^3
__m128 p1 = _mm_add_ps(_mm_sub_ps(_mm_set1_ps(2./3.), x2), _mm_mul_ps(_mm_set1_ps(1./2.), x3));
// p2 = 4/3 - 2 |x| + |x|^2 - 1/6 |x|^3
__m128 p2 = _mm_sub_ps(_mm_add_ps(_mm_sub_ps(_mm_set1_ps(4./3.), _mm_mul_ps(_mm_set1_ps(2), x)), x2), _mm_mul_ps(_mm_set1_ps(1./6.), x3));
// m2: all-ones in lanes where |x| <= 2; take p2 there, keep y (0) elsewhere.
__m128 m2 = _mm_cmple_ps(x, _mm_set1_ps(2));
y = _mm_or_ps(_mm_and_ps(m2, p2), _mm_andnot_ps(m2, y));
// Two-step form, inner mask with `<=`: all-ones in lanes where |x| <= 1, then
// overwrite with p1 there and keep the earlier y elsewhere.
__m128 m1 = _mm_cmple_ps(x, _mm_set1_ps(1));
y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, y));
// Single-step form, inner mask with strict `<`: all-ones in lanes where
// |x| < 1. The two polynomials agree mathematically at |x| = 1 (both 1/6),
// so `<=` vs `<` only matters up to rounding.
__m128 m1 = _mm_cmplt_ps(x, _mm_set1_ps(1));
// p1 where m1 is set, p2 in all other lanes. There is no zeroing of lanes
// with |x| > 2 in this form; presumably callers keep |x| within 2 -- TODO confirm.
__m128 y = _mm_or_ps(_mm_and_ps(m1, p1), _mm_andnot_ps(m1, p2));
return y;
}
#endif
Expand Down

0 comments on commit be7dad3

Please sign in to comment.