Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized the performance of float object #218

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 30 additions & 25 deletions includes/rtm/impl/matrix_affine_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,21 @@ namespace rtm
{
RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized");

const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input);
const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input);
const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input);
const float_type xx = quat_get_x(quat_input) * x2;
const float_type xy = quat_get_x(quat_input) * y2;
const float_type xz = quat_get_x(quat_input) * z2;
const float_type yy = quat_get_y(quat_input) * y2;
const float_type yz = quat_get_y(quat_input) * z2;
const float_type zz = quat_get_z(quat_input) * z2;
const float_type wx = quat_get_w(quat_input) * x2;
const float_type wy = quat_get_w(quat_input) * y2;
const float_type wz = quat_get_w(quat_input) * z2;
float_type quatval[4];
quat_store(quat_input, quatval);

const float_type x2 = quatval[0] + quatval[0];
const float_type y2 = quatval[1] + quatval[1];
const float_type z2 = quatval[2] + quatval[2];
const float_type xx = quatval[0] * x2;
const float_type xy = quatval[0] * y2;
const float_type xz = quatval[0] * z2;
const float_type yy = quatval[1] * y2;
const float_type yz = quatval[1] * z2;
const float_type zz = quatval[2] * z2;
const float_type wx = quatval[3] * x2;
const float_type wy = quatval[3] * y2;
const float_type wz = quatval[3] * z2;

const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0));
const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0));
Expand All @@ -80,19 +83,21 @@ namespace rtm
RTM_DISABLE_SECURITY_COOKIE_CHECK inline RTM_SIMD_CALL operator matrix3x4() const RTM_NO_EXCEPT
{
RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized");

const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input);
const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input);
const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input);
const float_type xx = quat_get_x(quat_input) * x2;
const float_type xy = quat_get_x(quat_input) * y2;
const float_type xz = quat_get_x(quat_input) * z2;
const float_type yy = quat_get_y(quat_input) * y2;
const float_type yz = quat_get_y(quat_input) * z2;
const float_type zz = quat_get_z(quat_input) * z2;
const float_type wx = quat_get_w(quat_input) * x2;
const float_type wy = quat_get_w(quat_input) * y2;
const float_type wz = quat_get_w(quat_input) * z2;
float_type quatval[4];
quat_store(quat_input, quatval);

const float_type x2 = quatval[0] + quatval[0];
const float_type y2 = quatval[1] + quatval[1];
const float_type z2 = quatval[2] + quatval[2];
const float_type xx = quatval[0] * x2;
const float_type xy = quatval[0] * y2;
const float_type xz = quatval[0] * z2;
const float_type yy = quatval[1] * y2;
const float_type yz = quatval[1] * z2;
const float_type zz = quatval[2] * z2;
const float_type wx = quatval[3] * x2;
const float_type wy = quatval[3] * y2;
const float_type wz = quatval[3] * z2;

const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0));
const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0));
Expand Down
193 changes: 193 additions & 0 deletions includes/rtm/impl/vector4f_swizzle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
#pragma once
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea to move this into its own header.
As a convention, all similar headers have the suffix .impl.h
The header is also missing the MIT license information, see other headers as example.


#include <cstdint>
#include <cstring>
#include <type_traits>

#include "rtm/math.h"
#include "rtm/types.h"
#include "rtm/impl/compiler_utils.h"
#include "rtm/scalarf.h"
#include "rtm/scalard.h"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this header required?

#include "rtm/version.h"

RTM_IMPL_FILE_PRAGMA_PUSH

#if !defined(RTM_NO_INTRINSICS)
namespace rtm
{
RTM_IMPL_VERSION_NAMESPACE_BEGIN

#if defined(RTM_SSE2_INTRINSICS) || defined(RTM_AVX_INTRINSICS)

#define SHUFFLE_MASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) )
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is already a standardized macro for this, see _MM_SHUFFLE in Intel intrinsic documentation.


/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Float swizzle
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Rearranges the four lanes of `vec` into [vec[index0], vec[index1], vec[index2], vec[index3]].
// Indices are compile-time constants so the specializations below can select cheaper
// single-instruction forms; this generic version lowers to one shufps.
template<int index0, int index1, int index2, int index3>
RTM_FORCE_INLINE vector4f vector_swizzle_impl(const vector4f& vec)
{
	// Bounds check for consistency with vector_shuffle_impl; rejects negative indices too.
	static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
	return _mm_shuffle_ps(vec, vec, SHUFFLE_MASK(index0, index1, index2, index3));
}
// Identity swizzle [x, y, z, w]: no instruction needed, return the input unchanged.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 2, 3>(const vector4f& vec)
{
	return vec;
}
// [x, y, x, y]: movelhps duplicates the low half in a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 0, 1>(const vector4f& vec)
{
	return _mm_movelh_ps(vec, vec);
}
// [z, w, z, w]: movehlps duplicates the high half in a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 3, 2, 3>(const vector4f& vec)
{
return _mm_movehl_ps(vec, vec);
}
// [x, x, y, y]: unpacklo interleaves the low lanes of `vec` with itself.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 1, 1>(const vector4f& vec)
{
return _mm_unpacklo_ps(vec, vec);
}
// [z, z, w, w]: unpackhi interleaves the high lanes of `vec` with itself.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 2, 3, 3>(const vector4f& vec)
{
return _mm_unpackhi_ps(vec, vec);
}

#if defined(RTM_SSE4_INTRINSICS)
// [x, x, z, z]: single-instruction duplicate of the even lanes.
// NOTE(review): movsldup/movshdup are SSE3 instructions, so gating them behind the
// SSE4 macro is conservative — confirm whether RTM exposes a dedicated SSE3 define.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 2, 2>(const vector4f& vec)
{
return _mm_moveldup_ps(vec);
}
// [y, y, w, w]: single-instruction duplicate of the odd lanes.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<1, 1, 3, 3>(const vector4f& vec)
{
return _mm_movehdup_ps(vec);
}
#endif

#if defined(RTM_AVX2_INTRINSICS)
// [x, x, x, x]: AVX2 lane-0 broadcast in a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 0, 0>(const vector4f& vec)
{
	return _mm_broadcastss_ps(vec);
}
#endif

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Float replicate
// Broadcasts a single lane of `vec` to all four lanes by delegating to the matching
// swizzle, which may resolve to a specialized single-instruction form.
template<int index>
RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec)
{
static_assert(index >= 0 && index <= 3, "Invalid Index");
return vector_swizzle_impl<index, index, index, index>(vec);
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Float shuffle
// Builds [vec1[index0], vec1[index1], vec2[index2], vec2[index3]]: the low two output
// lanes always come from vec1 and the high two from vec2, matching the shufps contract.
template<int index0, int index1, int index2, int index3>
RTM_FORCE_INLINE vector4f vector_shuffle_impl(const vector4f& vec1, const vector4f& vec2)
{
static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
return _mm_shuffle_ps(vec1, vec2, SHUFFLE_MASK(index0, index1, index2, index3));
}

// Float Shuffle specializations
// [vec1.x, vec1.y, vec2.x, vec2.y]: movelhps is a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2)
{
return _mm_movelh_ps(vec1, vec2);

}
// [vec1.z, vec1.w, vec2.z, vec2.w]: movehlps is a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2)
{
// Note: movehl copies first from the 2nd argument
return _mm_movehl_ps(vec2, vec1);
}

#elif defined(RTM_NEON_INTRINSICS)
// Broadcasts one lane of `vec` to all four lanes with a NEON dup.
template <int element_index>
RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec)
{
	// Bounds check for consistency with the SSE path; vgetq_lane_f32 requires an
	// immediate lane index in [0, 3].
	static_assert(element_index >= 0 && element_index <= 3, "Invalid Index");
	return vdupq_n_f32(vgetq_lane_f32(vec, element_index));
}

#if defined(__clang__)
// Clang maps __builtin_shufflevector directly onto the best NEON sequence.
template <int x, int y, int z, int w>
RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec)
{
	// Bounds check for consistency with the other code paths.
	static_assert(x >= 0 && x <= 3 && y >= 0 && y <= 3 && z >= 0 && z <= 3 && w >= 0 && w <= 3, "Invalid Index");
	return __builtin_shufflevector(vec, vec, x, y, z, w);
}

// Two-vector shuffle: low output lanes come from vec1 (builtin indices 0-3) and
// high output lanes from vec2 (builtin indices 4-7, hence the + 4 offsets).
template <int x, int y, int z, int w>
RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2)
{
	static_assert(x >= 0 && x <= 3 && y >= 0 && y <= 3 && z >= 0 && z <= 3 && w >= 0 && w <= 3, "Invalid Index");
	return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4);
}
#else

// Generic two-vector shuffle fallback for non-clang NEON toolchains (e.g. MSVC ARM).
// Performs a byte-wise table lookup (vtbl4) across both inputs — the same approach
// as DirectXMath's XMVectorPermute.
template<int index0, int index1, int index2, int index3>
RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2)
{
	// Also reject negative indices, matching the SSE path.
	static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");

	// Byte patterns selecting one 32-bit lane out of the 32 bytes spanned by [vec1, vec2].
	static constexpr uint32_t control_element[8] =
	{
		0x03020100, // vec1.x
		0x07060504, // vec1.y
		0x0B0A0908, // vec1.z
		0x0F0E0D0C, // vec1.w
		0x13121110, // vec2.x
		0x17161514, // vec2.y
		0x1B1A1918, // vec2.z
		0x1F1E1D1C, // vec2.w
	};

	// vtbl operates on u8 vectors: the float halves and the u32 index vectors must be
	// reinterpreted explicitly — implicit conversion between NEON vector types does
	// not compile on strict toolchains (GCC rejects it).
	uint8x8x4_t tbl;
	tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(vec1));
	tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(vec1));
	tbl.val[2] = vreinterpret_u8_f32(vget_low_f32(vec2));
	tbl.val[3] = vreinterpret_u8_f32(vget_high_f32(vec2));

	uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(control_element[index0]) | (static_cast<uint64_t>(control_element[index1]) << 32));
	const uint8x8_t low_result = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));

	idx = vcreate_u32(static_cast<uint64_t>(control_element[index2 + 4]) | (static_cast<uint64_t>(control_element[index3 + 4]) << 32));
	const uint8x8_t high_result = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));

	return vcombine_f32(vreinterpret_f32_u8(low_result), vreinterpret_f32_u8(high_result));
}

// Generic one-vector swizzle fallback for non-clang NEON toolchains, using a
// byte-wise table lookup (vtbl2) — same approach as DirectXMath's XMVectorSwizzle.
template<int index0, int index1, int index2, int index3>
RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec)
{
	// Also reject negative indices, matching the SSE path.
	static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");

	// Byte patterns selecting one 32-bit lane out of the 16 bytes of `vec`.
	static constexpr uint32_t control_element[4] =
	{
		0x03020100, // x
		0x07060504, // y
		0x0B0A0908, // z
		0x0F0E0D0C, // w
	};

	// Explicit reinterprets between f32/u32/u8 vector types are required for the
	// vtbl intrinsics; implicit conversion does not compile on strict toolchains.
	uint8x8x2_t tbl;
	tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(vec));
	tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(vec));

	uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(control_element[index0]) | (static_cast<uint64_t>(control_element[index1]) << 32));
	const uint8x8_t low_result = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));

	idx = vcreate_u32(static_cast<uint64_t>(control_element[index2]) | (static_cast<uint64_t>(control_element[index3]) << 32));
	const uint8x8_t high_result = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));

	return vcombine_f32(vreinterpret_f32_u8(low_result), vreinterpret_f32_u8(high_result));
}
#endif
#else
#pragma error("vector swizzle not implement here!");
#endif

#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl<element_index>(vec)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RTM implementation detail macros are prefixed with RTM_IMPL_

#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl<x,y,z,w>( vec )
#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) vector_shuffle_impl<x,y,z,w>( vec1, vec2 )

RTM_IMPL_VERSION_NAMESPACE_END
}
#endif
RTM_IMPL_FILE_PRAGMA_POP

2 changes: 1 addition & 1 deletion includes/rtm/matrix3x3f.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ namespace rtm
// is to multiply the normal with the cofactor matrix.
// See: https://github.com/graphitemaster/normals_revisited
//////////////////////////////////////////////////////////////////////////
RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_arg0 mtx) RTM_NO_EXCEPT
RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_argn mtx) RTM_NO_EXCEPT
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't had the chance to profile this particular change, I'll do that soon.

{
vector4f tmp;

Expand Down
Loading
Loading