diff --git a/includes/rtm/impl/matrix_affine_common.h b/includes/rtm/impl/matrix_affine_common.h index 5bef3a43..df914e5c 100644 --- a/includes/rtm/impl/matrix_affine_common.h +++ b/includes/rtm/impl/matrix_affine_common.h @@ -58,18 +58,21 @@ namespace rtm { RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized"); - const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input); - const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input); - const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input); - const float_type xx = quat_get_x(quat_input) * x2; - const float_type xy = quat_get_x(quat_input) * y2; - const float_type xz = quat_get_x(quat_input) * z2; - const float_type yy = quat_get_y(quat_input) * y2; - const float_type yz = quat_get_y(quat_input) * z2; - const float_type zz = quat_get_z(quat_input) * z2; - const float_type wx = quat_get_w(quat_input) * x2; - const float_type wy = quat_get_w(quat_input) * y2; - const float_type wz = quat_get_w(quat_input) * z2; + float_type quatval[4]; + quat_store(quat_input, quatval); + + const float_type x2 = quatval[0] + quatval[0]; + const float_type y2 = quatval[1] + quatval[1]; + const float_type z2 = quatval[2] + quatval[2]; + const float_type xx = quatval[0] * x2; + const float_type xy = quatval[0] * y2; + const float_type xz = quatval[0] * z2; + const float_type yy = quatval[1] * y2; + const float_type yz = quatval[1] * z2; + const float_type zz = quatval[2] * z2; + const float_type wx = quatval[3] * x2; + const float_type wy = quatval[3] * y2; + const float_type wz = quatval[3] * z2; const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0)); const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0)); @@ -80,19 +83,21 @@ namespace rtm RTM_DISABLE_SECURITY_COOKIE_CHECK inline RTM_SIMD_CALL operator matrix3x4() const RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is 
not normalized"); - - const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input); - const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input); - const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input); - const float_type xx = quat_get_x(quat_input) * x2; - const float_type xy = quat_get_x(quat_input) * y2; - const float_type xz = quat_get_x(quat_input) * z2; - const float_type yy = quat_get_y(quat_input) * y2; - const float_type yz = quat_get_y(quat_input) * z2; - const float_type zz = quat_get_z(quat_input) * z2; - const float_type wx = quat_get_w(quat_input) * x2; - const float_type wy = quat_get_w(quat_input) * y2; - const float_type wz = quat_get_w(quat_input) * z2; + float_type quatval[4]; + quat_store(quat_input, quatval); + + const float_type x2 = quatval[0] + quatval[0]; + const float_type y2 = quatval[1] + quatval[1]; + const float_type z2 = quatval[2] + quatval[2]; + const float_type xx = quatval[0] * x2; + const float_type xy = quatval[0] * y2; + const float_type xz = quatval[0] * z2; + const float_type yy = quatval[1] * y2; + const float_type yz = quatval[1] * z2; + const float_type zz = quatval[2] * z2; + const float_type wx = quatval[3] * x2; + const float_type wy = quatval[3] * y2; + const float_type wz = quatval[3] * z2; const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0)); const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0)); diff --git a/includes/rtm/impl/vector4f_swizzle.h b/includes/rtm/impl/vector4f_swizzle.h new file mode 100644 index 00000000..9846b32d --- /dev/null +++ b/includes/rtm/impl/vector4f_swizzle.h @@ -0,0 +1,193 @@ +#pragma once + +#include +#include +#include + +#include "rtm/math.h" +#include "rtm/types.h" +#include "rtm/impl/compiler_utils.h" +#include "rtm/scalarf.h" +#include "rtm/scalard.h" +#include "rtm/version.h" + +RTM_IMPL_FILE_PRAGMA_PUSH + +#if !defined(RTM_NO_INTRINSICS) +namespace rtm +{ + 
RTM_IMPL_VERSION_NAMESPACE_BEGIN + +#if defined(RTM_SSE2_INTRINSICS) || defined(RTM_AVX_INTRINSICS) + +#define SHUFFLE_MASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) ) + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float swizzle + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + template<int index0, int index1, int index2, int index3> + RTM_FORCE_INLINE vector4f vector_swizzle_impl(const vector4f& vec) + { + return _mm_shuffle_ps(vec, vec, SHUFFLE_MASK(index0, index1, index2, index3)); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 2, 3>(const vector4f& vec) + { + return vec; + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 0, 1>(const vector4f& vec) + { + return _mm_movelh_ps(vec, vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 3, 2, 3>(const vector4f& vec) + { + return _mm_movehl_ps(vec, vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 1, 1>(const vector4f& vec) + { + return _mm_unpacklo_ps(vec, vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 2, 3, 3>(const vector4f& vec) + { + return _mm_unpackhi_ps(vec, vec); + } + +#if defined(RTM_SSE4_INTRINSICS) + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 2, 2>(const vector4f& vec) + { + return _mm_moveldup_ps(vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<1, 1, 3, 3>(const vector4f& vec) + { + return _mm_movehdup_ps(vec); + } +#endif + +#if defined(RTM_AVX2_INTRINSICS) + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 0, 0>(const vector4f& vec) + { + return _mm_broadcastss_ps(vec); + } +#endif + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float replicate + template<int index> + RTM_FORCE_INLINE vector4f vector_replicate_impl(const 
vector4f& vec) + { + static_assert(index >= 0 && index <= 3, "Invalid Index"); + return vector_swizzle_impl<index, index, index, index>(vec); + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float shuffle + template<int index0, int index1, int index2, int index3> + RTM_FORCE_INLINE vector4f vector_shuffle_impl(const vector4f& vec1, const vector4f& vec2) + { + static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index"); + return _mm_shuffle_ps(vec1, vec2, SHUFFLE_MASK(index0, index1, index2, index3)); + } + + // Float Shuffle specializations + template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2) + { + return _mm_movelh_ps(vec1, vec2); + + } + template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2) + { + // Note: movehl copies first from the 2nd argument + return _mm_movehl_ps(vec2, vec1); + } + +#elif defined(RTM_NEON_INTRINSICS) + template<int element_index> + RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec) + { + return vdupq_n_f32(vgetq_lane_f32(vec, element_index)); + } + +#if defined(__clang__) + template<int x, int y, int z, int w> + RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec) + { + return __builtin_shufflevector(vec, vec, x, y, z, w); + } + + template<int x, int y, int z, int w> + RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2) + { + return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4); + } +#else + + template<int index0, int index1, int index2, int index3> + RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2) + { + static_assert(index0 <= 3 && index1 <= 3 && index2 <= 3 && index3 <= 3, "Invalid Index"); + + static constexpr uint32_t control_element[8] = + { + 0x03020100, // XM_PERMUTE_0X + 0x07060504, // XM_PERMUTE_0Y + 0x0B0A0908, // XM_PERMUTE_0Z + 0x0F0E0D0C, // XM_PERMUTE_0W + 0x13121110, // XM_PERMUTE_1X + 0x17161514, // XM_PERMUTE_1Y + 0x1B1A1918, // 
XM_PERMUTE_1Z + 0x1F1E1D1C, // XM_PERMUTE_1W + }; + + uint8x8x4_t tbl; + tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(vec1)); + tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(vec1)); + tbl.val[2] = vreinterpret_u8_f32(vget_low_f32(vec2)); + tbl.val[3] = vreinterpret_u8_f32(vget_high_f32(vec2)); + + uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(control_element[index0]) | (static_cast<uint64_t>(control_element[index1]) << 32)); + const uint8x8_t rL = vtbl4_u8(tbl, vreinterpret_u8_u32(idx)); + + idx = vcreate_u32(static_cast<uint64_t>(control_element[index2 + 4]) | (static_cast<uint64_t>(control_element[index3 + 4]) << 32)); + const uint8x8_t rH = vtbl4_u8(tbl, vreinterpret_u8_u32(idx)); + + return vreinterpretq_f32_u8(vcombine_u8(rL, rH)); + } + + template<int index0, int index1, int index2, int index3> + RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec) + { + static_assert(index0 <= 3 && index1 <= 3 && index2 <= 3 && index3 <= 3, "Invalid Index"); + + static constexpr uint32_t control_element[4] = + { + 0x03020100, // XM_SWIZZLE_X + 0x07060504, // XM_SWIZZLE_Y + 0x0B0A0908, // XM_SWIZZLE_Z + 0x0F0E0D0C, // XM_SWIZZLE_W + }; + + uint8x8x2_t tbl; + tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(vec)); + tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(vec)); + + uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(control_element[index0]) | (static_cast<uint64_t>(control_element[index1]) << 32)); + const uint8x8_t rL = vtbl2_u8(tbl, vreinterpret_u8_u32(idx)); + + idx = vcreate_u32(static_cast<uint64_t>(control_element[index2]) | (static_cast<uint64_t>(control_element[index3]) << 32)); + const uint8x8_t rH = vtbl2_u8(tbl, vreinterpret_u8_u32(idx)); + + return vreinterpretq_f32_u8(vcombine_u8(rL, rH)); + } +#endif +#else +#error "vector swizzle not implemented here!" +#endif + +#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl<element_index>(vec) +#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl<x, y, z, w>( vec ) +#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) vector_shuffle_impl<x, y, z, w>( vec1, vec2 ) + +RTM_IMPL_VERSION_NAMESPACE_END +} +#endif +RTM_IMPL_FILE_PRAGMA_POP + diff --git a/includes/rtm/matrix3x3f.h b/includes/rtm/matrix3x3f.h index 40a90f9c..0fe31a7e 100644 --- a/includes/rtm/matrix3x3f.h +++ b/includes/rtm/matrix3x3f.h @@ -181,7 +181,7 @@ namespace rtm // is to multiply the normal with the cofactor matrix. 
// See: https://github.com/graphitemaster/normals_revisited ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_arg0 mtx) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_argn mtx) RTM_NO_EXCEPT { vector4f tmp; diff --git a/includes/rtm/matrix3x4f.h b/includes/rtm/matrix3x4f.h index 71f7f1b0..e7fa5963 100644 --- a/includes/rtm/matrix3x4f.h +++ b/includes/rtm/matrix3x4f.h @@ -61,22 +61,25 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Sets a 3x4 affine matrix from a rotation quaternion and translation. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized"); - const float x2 = quat_get_x(quat) + quat_get_x(quat); - const float y2 = quat_get_y(quat) + quat_get_y(quat); - const float z2 = quat_get_z(quat) + quat_get_z(quat); - const float xx = quat_get_x(quat) * x2; - const float xy = quat_get_x(quat) * y2; - const float xz = quat_get_x(quat) * z2; - const float yy = quat_get_y(quat) * y2; - const float yz = quat_get_y(quat) * z2; - const float zz = quat_get_z(quat) * z2; - const float wx = quat_get_w(quat) * x2; - const float wy = quat_get_w(quat) * y2; - const float wz = quat_get_w(quat) * z2; + float quatval[4]; + quat_store(quat, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = 
quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; const vector4f x_axis = vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F); const vector4f y_axis = vector_set(xy - wz, 1.0F - (xx + zz), yz + wx, 0.0F); @@ -87,7 +90,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Converts a QV transform into a 3x4 affine matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_arg0 transform) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_argn transform) RTM_NO_EXCEPT { return matrix_from_qv(transform.rotation, transform.translation); } @@ -95,23 +98,25 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Sets a 3x4 affine matrix from a rotation quaternion, translation, and scalar scale. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized"); - const float x2 = quat_get_x(quat) + quat_get_x(quat); - const float y2 = quat_get_y(quat) + quat_get_y(quat); - const float z2 = quat_get_z(quat) + quat_get_z(quat); - const float xx = quat_get_x(quat) * x2; - const float xy = quat_get_x(quat) * y2; - const float xz = quat_get_x(quat) * z2; - const float yy = quat_get_y(quat) * y2; - const float yz = quat_get_y(quat) * z2; - const float zz = quat_get_z(quat) * z2; - const float wx = quat_get_w(quat) * x2; - const float wy = quat_get_w(quat) * y2; - const float wz = quat_get_w(quat) * z2; - + float quatval[4]; + quat_store(quat, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; const scalarf scale_s = scalar_set(scale); const vector4f x_axis = vector_mul(vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F), scale_s); @@ -123,7 +128,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Converts a QVS transform into a 3x4 affine matrix. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_arg0 transform) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_argn transform) RTM_NO_EXCEPT { return matrix_from_qvs(transform.rotation, transform.translation_scale, qvs_get_scale(transform)); } @@ -131,22 +136,25 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Sets a 3x4 affine matrix from a rotation quaternion, translation, and 3D scale. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized"); - const float x2 = quat_get_x(quat) + quat_get_x(quat); - const float y2 = quat_get_y(quat) + quat_get_y(quat); - const float z2 = quat_get_z(quat) + quat_get_z(quat); - const float xx = quat_get_x(quat) * x2; - const float xy = quat_get_x(quat) * y2; - const float xz = quat_get_x(quat) * z2; - const float yy = quat_get_y(quat) * y2; - const float yz = quat_get_y(quat) * z2; - const float zz = quat_get_z(quat) * z2; - const float wx = quat_get_w(quat) * x2; - const float wy = quat_get_w(quat) * y2; - const float wz = quat_get_w(quat) * z2; + float quatval[4]; + quat_store(quat, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = 
quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; const scalarf scale_x = vector_get_x_as_scalar(scale); const scalarf scale_y = vector_get_y_as_scalar(scale); @@ -161,7 +169,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Converts a QVV transform into a 3x4 affine matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_arg0 transform) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_argn transform) RTM_NO_EXCEPT { return matrix_from_qvv(transform.rotation, transform.translation, transform.scale); } @@ -209,7 +217,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 3x4 matrix where the specified axis has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT { switch (axis) { diff --git a/includes/rtm/matrix4x4f.h b/includes/rtm/matrix4x4f.h index 29494ea9..6116f8b4 100644 --- a/includes/rtm/matrix4x4f.h +++ b/includes/rtm/matrix4x4f.h @@ -41,7 +41,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis pointing in the forward direction of the default coordinate system (Z+). 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_forward(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_forward(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.z_axis; } @@ -49,7 +49,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis pointing in the up direction of the default coordinate system (Y+). ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_up(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_up(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.y_axis; } @@ -57,7 +57,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis pointing in the cross direction of the default coordinate system (X+). ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_cross(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_cross(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.x_axis; } @@ -65,7 +65,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis holding the position of the default coordinate system (W+). 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_position(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_position(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.w_axis; } @@ -73,7 +73,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the desired 4x4 matrix axis. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_axis(matrix4x4f_arg0 input, axis4 axis) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_axis(matrix4x4f_argn input, axis4 axis) RTM_NO_EXCEPT { return axis == axis4::x ? input.x_axis : (axis == axis4::y ? input.y_axis : (axis == axis4::z ? input.z_axis : input.w_axis)); } @@ -81,7 +81,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 4x4 matrix where the specified axis has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_axis(matrix4x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_axis(matrix4x4f_argn input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT { switch (axis) { @@ -96,7 +96,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the desired 4x4 matrix component from the specified axis. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline rtm_impl::vector4f_vector_get_component RTM_SIMD_CALL matrix_get_component(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE rtm_impl::vector4f_vector_get_component RTM_SIMD_CALL matrix_get_component(matrix4x4f_argn input, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -111,7 +111,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the desired 4x4 matrix component from the specified axis. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_get_component_as_scalar(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_get_component_as_scalar(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -126,7 +126,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 4x4 matrix where the specified axis/component has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, float component_value, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, float component_value, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -142,7 +142,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 4x4 matrix where the specified axis/component has been replaced on the input matrix. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, scalarf_arg4 component_value, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, scalarf_arg4 component_value, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -159,7 +159,7 @@ namespace rtm // Multiplies two 4x4 matrices. // Multiplication order is as follow: local_to_world = matrix_mul(local_to_object, object_to_world) ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_mul(matrix4x4f_arg0 lhs, matrix4x4f_arg1 rhs) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_mul(matrix4x4f_argn lhs, matrix4x4f_argn rhs) RTM_NO_EXCEPT { vector4f tmp = vector_mul(vector_dup_x(lhs.x_axis), rhs.x_axis); tmp = vector_mul_add(vector_dup_y(lhs.x_axis), rhs.y_axis, tmp); @@ -192,7 +192,7 @@ namespace rtm // Multiplies a 4x4 matrix and a 4D vector. // Multiplication order is as follow: world_position = matrix_mul(local_position, local_to_world) ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector(vector4f_arg0 vec4, matrix4x4f_arg0 mtx) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector(vector4f_arg0 vec4, matrix4x4f_argn mtx) RTM_NO_EXCEPT { vector4f tmp; @@ -207,7 +207,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Transposes a 4x4 matrix. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_transpose(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_transpose(matrix4x4f_argn input) RTM_NO_EXCEPT { vector4f x_axis; vector4f y_axis; @@ -222,7 +222,7 @@ namespace rtm // If the input matrix is not invertible, the result is undefined. // For a safe alternative, supply a fallback value and a threshold. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_argn input) RTM_NO_EXCEPT { matrix4x4f input_transposed = matrix_transpose(input); @@ -316,7 +316,7 @@ namespace rtm // If the input matrix has a determinant whose absolute value is below the supplied threshold, the // fall back value is returned instead. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_arg0 input, matrix4x4f_arg1 fallback, float threshold = 1.0E-8F) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_argn input, matrix4x4f_argn fallback, float threshold = 1.0E-8F) RTM_NO_EXCEPT { matrix4x4f input_transposed = matrix_transpose(input); @@ -411,7 +411,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the determinant of the input 4x4 matrix. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_determinant(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_determinant(matrix4x4f_argn input) RTM_NO_EXCEPT { matrix4x4f input_transposed = matrix_transpose(input); @@ -458,7 +458,7 @@ namespace rtm // The minor is the determinant of the sub-matrix input when the specified // row and column are removed. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_minor(matrix4x4f_arg0 input, axis4 row, axis4 column) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_minor(matrix4x4f_argn input, axis4 row, axis4 column) RTM_NO_EXCEPT { vector4f row0; vector4f row1; @@ -522,7 +522,7 @@ namespace rtm // Returns the cofactor matrix of the input 4x4 matrix. // See: https://en.wikipedia.org/wiki/Minor_(linear_algebra)#Cofactor_expansion_of_the_determinant ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_cofactor(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_cofactor(matrix4x4f_argn input) RTM_NO_EXCEPT { const scalarf minor_xx = matrix_minor(input, axis4::x, axis4::x); const scalarf minor_xy = matrix_minor(input, axis4::x, axis4::y); @@ -558,7 +558,7 @@ namespace rtm // Returns the adjugate of the input matrix. 
// See: https://en.wikipedia.org/wiki/Adjugate_matrix ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_adjugate(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_adjugate(matrix4x4f_arg0 input) RTM_NO_EXCEPT { return matrix_transpose(matrix_cofactor(input)); } diff --git a/includes/rtm/quatf.h b/includes/rtm/quatf.h index 1dcf25a7..98fa9fbc 100644 --- a/includes/rtm/quatf.h +++ b/includes/rtm/quatf.h @@ -733,19 +733,7 @@ namespace rtm { RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE RTM_SIMD_CALL operator float() const RTM_NO_EXCEPT { -#if defined(RTM_SSE4_INTRINSICS) && 0 - // SSE4 dot product instruction appears slower on Zen2, is it the case elsewhere as well? - return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xFF)); -#elif defined(RTM_SSE2_INTRINSICS) - __m128 x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs); - __m128 z2_w2_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, _MM_SHUFFLE(0, 0, 3, 2)); - __m128 x2z2_y2w2_0_0 = _mm_add_ps(x2_y2_z2_w2, z2_w2_0_0); - __m128 y2w2_0_0_0 = _mm_shuffle_ps(x2z2_y2w2_0_0, x2z2_y2w2_0_0, _MM_SHUFFLE(0, 0, 0, 1)); - __m128 x2y2z2w2_0_0_0 = _mm_add_ps(x2z2_y2w2_0_0, y2w2_0_0_0); - return _mm_cvtss_f32(x2y2z2w2_0_0_0); -#else - return (quat_get_x(lhs) * quat_get_x(rhs)) + (quat_get_y(lhs) * quat_get_y(rhs)) + (quat_get_z(lhs) * quat_get_z(rhs)) + (quat_get_w(lhs) * quat_get_w(rhs)); -#endif + return vector_dot(quat_to_vector(lhs), quat_to_vector(rhs)); } #if defined(RTM_SSE2_INTRINSICS) diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h index 7cda1544..25bfbcc0 100644 --- a/includes/rtm/vector4f.h +++ b/includes/rtm/vector4f.h @@ -34,6 +34,7 @@ #include "rtm/impl/macros.mask4.impl.h" #include "rtm/impl/memory_utils.h" #include "rtm/impl/vector_common.h" +#include "rtm/impl/vector4f_swizzle.h" #include #include @@ -63,6 +64,20 @@ namespace rtm #endif } + 
////////////////////////////////////////////////////////////////////////// + // Loads an aligned vector4 from memory. + ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_load_aligned(const float* input) RTM_NO_EXCEPT + { +#if defined(RTM_SSE2_INTRINSICS) + return _mm_load_ps(input); +#elif defined(RTM_NEON_INTRINSICS) + return vld1q_f32(input); +#else + return vector_set(input[0], input[1], input[2], input[3]); +#endif + } + ////////////////////////////////////////////////////////////////////////// // Loads an input scalar from memory into the [x] component and sets the [yzw] components to zero. ////////////////////////////////////////////////////////////////////////// @@ -1005,6 +1020,25 @@ namespace rtm { #if defined(RTM_SSE2_INTRINSICS) _mm_storeu_ps(output, input); +#elif defined(RTM_NEON_INTRINSICS) + vst1q_f32(output, input); +#else + output[0] = vector_get_x(input); + output[1] = vector_get_y(input); + output[2] = vector_get_z(input); + output[3] = vector_get_w(input); +#endif + } + + ////////////////////////////////////////////////////////////////////////// + // Writes a vector4 to aligned memory. 
+ ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store_aligned(vector4f_arg0 input, float* output) RTM_NO_EXCEPT + { +#if defined(RTM_SSE2_INTRINSICS) + _mm_store_ps(output, input); +#elif defined(RTM_NEON_INTRINSICS) + vst1q_f32(output, input); #else output[0] = vector_get_x(input); output[1] = vector_get_y(input); @@ -1026,8 +1060,15 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store2(vector4f_arg0 input, float* output) RTM_NO_EXCEPT { +#if defined(RTM_SSE2_INTRINSICS) output[0] = vector_get_x(input); output[1] = vector_get_y(input); +#elif defined(RTM_NEON_INTRINSICS) + vst1_f32(output, *(float32x2_t*)&input); +#else + output[0] = vector_get_x(input); + output[1] = vector_get_y(input); +#endif } ////////////////////////////////////////////////////////////////////////// @@ -1035,11 +1076,22 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store3(vector4f_arg0 input, float* output) RTM_NO_EXCEPT { +#if defined(RTM_SSE2_INTRINSICS) output[0] = vector_get_x(input); output[1] = vector_get_y(input); output[2] = vector_get_z(input); +#elif defined(RTM_NEON_INTRINSICS) + vst1_f32(output, *(float32x2_t*)&input); + vst1q_lane_f32(((float32_t*)output) + 2, input, 2); +#else + output[0] = vector_get_x(input); + output[1] = vector_get_y(input); + output[2] = vector_get_z(input); +#endif } + + ////////////////////////////////////////////////////////////////////////// // Writes a vector4 to unaligned memory. 
////////////////////////////////////////////////////////////////////////// @@ -1510,47 +1562,6 @@ namespace rtm #endif } - ////////////////////////////////////////////////////////////////////////// - // 3D cross product: lhs x rhs - ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_cross3(vector4f_arg0 lhs, vector4f_arg1 rhs) RTM_NO_EXCEPT - { -#if defined(RTM_SSE2_INTRINSICS) - // cross(a, b).zxy = (a * b.yzx) - (a.yzx * b) - __m128 lhs_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 0, 2, 1)); - __m128 rhs_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 0, 2, 1)); - __m128 tmp_zxy = _mm_sub_ps(_mm_mul_ps(lhs, rhs_yzx), _mm_mul_ps(lhs_yzx, rhs)); - - // cross(a, b) = ((a * b.yzx) - (a.yzx * b)).yzx - return _mm_shuffle_ps(tmp_zxy, tmp_zxy, _MM_SHUFFLE(3, 0, 2, 1)); -#elif defined(RTM_NEON_INTRINSICS) - // cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx) - float32x4_t lhs_yzwx = vextq_f32(lhs, lhs, 1); - float32x4_t rhs_wxyz = vextq_f32(rhs, rhs, 3); - - float32x4_t lhs_yzx = vsetq_lane_f32(vgetq_lane_f32(lhs, 0), lhs_yzwx, 2); - float32x4_t rhs_zxy = vsetq_lane_f32(vgetq_lane_f32(rhs, 2), rhs_wxyz, 0); - - // part_a = (a.yzx * b.zxy) - float32x4_t part_a = vmulq_f32(lhs_yzx, rhs_zxy); - - float32x4_t lhs_wxyz = vextq_f32(lhs, lhs, 3); - float32x4_t rhs_yzwx = vextq_f32(rhs, rhs, 1); - float32x4_t lhs_zxy = vsetq_lane_f32(vgetq_lane_f32(lhs, 2), lhs_wxyz, 0); - float32x4_t rhs_yzx = vsetq_lane_f32(vgetq_lane_f32(rhs, 0), rhs_yzwx, 2); - - return vmlsq_f32(part_a, lhs_zxy, rhs_yzx); -#else - // cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx) - const float lhs_x = vector_get_x(lhs); - const float lhs_y = vector_get_y(lhs); - const float lhs_z = vector_get_z(lhs); - const float rhs_x = vector_get_x(rhs); - const float rhs_y = vector_get_y(rhs); - const float rhs_z = vector_get_z(rhs); - return vector_set((lhs_y * rhs_z) - (lhs_z * rhs_y), (lhs_z * rhs_x) - (lhs_x * 
rhs_z), (lhs_x * rhs_y) - (lhs_y * rhs_x)); -#endif - } namespace rtm_impl { @@ -2424,6 +2435,48 @@ namespace rtm } #endif + // Negate-multiply-add: returns v2 - (v0 * v1), i.e. -(v0 * v1) + v2 + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_neg_mul_add(vector4f_arg0 v0, vector4f_arg1 v1, vector4f_arg2 v2) RTM_NO_EXCEPT + { + return vector_neg_mul_sub(v0, v1, v2); + } + + + ////////////////////////////////////////////////////////////////////////// + // 3D cross product: lhs x rhs + ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_cross3(vector4f_arg0 lhs, vector4f_arg1 rhs) RTM_NO_EXCEPT + { +#if defined(RTM_SSE2_INTRINSICS) + // cross(a, b).zxy = (a * b.yzx) - (a.yzx * b) + __m128 lhs_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 rhs_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 tmp_zxy = _mm_sub_ps(_mm_mul_ps(lhs, rhs_yzx), _mm_mul_ps(lhs_yzx, rhs)); + + // cross(a, b) = ((a * b.yzx) - (a.yzx * b)).yzx + return _mm_shuffle_ps(tmp_zxy, tmp_zxy, _MM_SHUFFLE(3, 0, 2, 1)); +#elif defined(RTM_NEON_INTRINSICS) + // Swizzle both operands into YZXW order (the w lane is a don't-care) + vector4f A = VECTOR_SWIZZLE(rhs, 1, 2, 0, 3); + vector4f B = VECTOR_SWIZZLE(lhs, 1, 2, 0, 3); + // Lane-wise products: (lhs.x*rhs.y, lhs.y*rhs.z, lhs.z*rhs.x) + A = vector_mul(A, lhs); + // Subtract the mirrored products: (XY-YX, YZ-ZY, ZX-XZ) where XY = lhs.x*rhs.y + A = vector_neg_mul_add(B, rhs, A); + // Rotate into final lane order: (YZ-ZY, ZX-XZ, XY-YX) = lhs x rhs + return VECTOR_SWIZZLE(A, 1, 2, 0, 3); +#else + // cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx) + const float lhs_x = vector_get_x(lhs); + const float lhs_y = vector_get_y(lhs); + const float lhs_z = vector_get_z(lhs); + const float rhs_x = vector_get_x(rhs); + const float rhs_y = vector_get_y(rhs); + const float rhs_z = vector_get_z(rhs); + return vector_set((lhs_y * rhs_z) - (lhs_z * rhs_y), (lhs_z * rhs_x) - (lhs_x * rhs_z), (lhs_x * rhs_y) - (lhs_y * rhs_x)); +#endif + } + ////////////////////////////////////////////////////////////////////////// + // Per component linear interpolation of the two
inputs at the specified alpha. // The formula used is: ((1.0 - alpha) * start) + (alpha * end). @@ -3369,248 +3422,143 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Mixes two inputs and returns the desired components. // [xyzw] indexes into the first input while [abcd] indexes in the second. + // Slow code path, not yet optimized or not using intrinsics ////////////////////////////////////////////////////////////////////////// - template - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT - { -#if defined(RTM_SSE4_INTRINSICS) - // Each component comes from the respective position of input 0 or input 1 - if (rtm_impl::static_condition<(comp0 == mix4::a || comp0 == mix4::x) && (comp1 == mix4::b || comp1 == mix4::y) && - (comp2 == mix4::c || comp2 == mix4::z) && (comp3 == mix4::d || comp3 == mix4::w)>::test()) - { - constexpr int mask = (comp0 == mix4::a ? 1 : 0) | (comp1 == mix4::b ? 2 : 0) | - (comp2 == mix4::c ? 4 : 0) | (comp3 == mix4::d ? 
8 : 0); - return _mm_blend_ps(input0, input1, mask); - } - - // First component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, (int(comp0) % 4) << 6); - - // Second component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, ((int(comp1) % 4) << 6) | (1 << 4)); - - // Third component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, ((int(comp2) % 4) << 6) | (2 << 4)); - - // Fourth component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, ((int(comp3) % 4) << 6) | (3 << 4)); - - // First component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, (int(comp0) % 4) << 6); - - // Second component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, ((int(comp1) % 4) << 6) | (1 << 4)); - - // Third component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, ((int(comp2) % 4) << 6) | (2 << 4)); - - // Fourth component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, ((int(comp3) % 4) << 6) | (3 << 4)); -#endif // defined(RTM_SSE4_INTRINSICS) - -#if defined(RTM_SSE2_INTRINSICS) - // All four components come from input 0 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input0, input0, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, 
int(comp1) % 4, int(comp0) % 4)); - - // All four components come from input 1 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input1, input1, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); + template + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix_slow(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { + float combine_arr[8]; + vector_store(input0, combine_arr); + vector_store(input1, combine_arr + 4); + return vector_set(combine_arr[index0], combine_arr[index1], combine_arr[index2], combine_arr[index3]); + } - // First two components come from input 0, second two come from input 1 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input0, input1, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); - - // First two components come from input 1, second two come from input 0 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input1, input0, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); - - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpacklo_ps(input0, input1); - - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpacklo_ps(input1, input0); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpackhi_ps(input0, input1); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpackhi_ps(input1, input0); -#endif // defined(RTM_SSE2_INTRINSICS) - -#if defined(RTM_NEON64_INTRINSICS) - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip1q_f32(input0, input1); - - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip1q_f32(input1, input0); - - // High words from both inputs are 
interleaved - if (rtm_impl::static_condition::test()) - return vzip2q_f32(input0, input1); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip2q_f32(input1, input0); - - // Even-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp1q_f32(input0, input1); - - // Even-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp1q_f32(input1, input0); - - // Odd-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp2q_f32(input0, input1); - - // Odd-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp2q_f32(input1, input0); - - // Even-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn1q_f32(input0, input1); + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. + // Use compilation time to speed up branch judgment + ////////////////////////////////////////////////////////////////////////// + template = 4 && index3 >= 4), int>::type = 0> + vector4f vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix_slow(input0, input1); +#else + return VECTOR_SHUFFLE(input0, input1, index0, index1, index2 - 4, index3 - 4); +#endif + } - // Even-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn1q_f32(input1, input0); + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. 
+ // Use compilation time to speed up branch judgment + ////////////////////////////////////////////////////////////////////////// + template ::type = 0> + vector4f vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix_slow(input0, input1); +#else + (void)input1; + return VECTOR_SWIZZLE(input0, index0, index1, index2, index3); +#endif + } - // Odd-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn2q_f32(input0, input1); + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. + // Use compilation time to speed up branch judgment + ////////////////////////////////////////////////////////////////////////// + template = 4 && index1 >= 4 && index2 >= 4 && index3 >= 4), int>::type = 0> + vector4f vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix_slow(input0, input1); +#else + (void)input0; + return VECTOR_SWIZZLE(input1, index0 - 4, index1 - 4, index2 - 4, index3 - 4); +#endif + } - // Odd-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn2q_f32(input1, input0); -#endif // defined(RTM_NEON64_INTRINSICS) + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. 
+ // If no matching specialized template function is found, fall back to the slow version + ////////////////////////////////////////////////////////////////////////// + template = 4 && index3 >= 4) && + !(index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4) && + !(index0 >= 4 && index1 >= 4 && index2 >= 4 && index3 >= 4), int>::type = 0> + vector4f vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) + { + return vector_mix_slow(input0, input1); + } -#if defined(RTM_NEON_INTRINSICS) - // The highest vector elements from input 0 and the lowest vector elements from input 1, consecutively - if (rtm_impl::static_condition::test()) - return vextq_f32(input0, input1, int(comp0) % 4); - - // The highest vector elements from input 1 and the lowest vector elements from input 0, consecutively - if (rtm_impl::static_condition::test()) - return vextq_f32(input1, input0, int(comp0) % 4); - - // All four components come from input 0, reversed order in each doubleword - if (rtm_impl::static_condition::test()) - return vrev64q_f32(input0); - - // All four components come from input 1, reversed order in each doubleword - if (rtm_impl::static_condition::test()) - return vrev64q_f32(input1); - - // First component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp0) % 4), input0, 0); - - // Second component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp1) % 4), input0, 1); - - // Third component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp2) % 4), input0, 2); - - // Fourth component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - 
return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp3) % 4), input0, 3); - - // First component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp0) % 4), input1, 0); - - // Second component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp1) % 4), input1, 1); - - // Third component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp2) % 4), input1, 2); - - // Fourth component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp3) % 4), input1, 3); - - - // All comes from the same position - if (rtm_impl::static_condition::test()) { - // All comes from the same position of input0 - if (rtm_impl::static_condition::test()) - return vmovq_n_f32(vgetq_lane_f32(input0, int(comp0) % 4)); - // All comes from the same position of input1 - if (rtm_impl::static_condition::test()) - return vmovq_n_f32(vgetq_lane_f32(input1, int(comp0) % 4)); - } -#endif // defined(RTM_NEON_INTRINSICS) - - // Slow code path, not yet optimized or not using intrinsics - constexpr component4 component0 = rtm_impl::mix_to_component(comp0); - constexpr component4 component1 = rtm_impl::mix_to_component(comp1); - constexpr component4 component2 = rtm_impl::mix_to_component(comp2); - constexpr component4 component3 = rtm_impl::mix_to_component(comp3); - - const float x0 = vector_get_component(input0, component0); - const float x1 = vector_get_component(input1, component0); - const float x = rtm_impl::is_mix_xyzw(comp0) ? 
x0 : x1; - - const float y0 = vector_get_component(input0, component1); - const float y1 = vector_get_component(input1, component1); - const float y = rtm_impl::is_mix_xyzw(comp1) ? y0 : y1; - - const float z0 = vector_get_component(input0, component2); - const float z1 = vector_get_component(input1, component2); - const float z = rtm_impl::is_mix_xyzw(comp2) ? z0 : z1; - - const float w0 = vector_get_component(input0, component3); - const float w1 = vector_get_component(input1, component3); - const float w = rtm_impl::is_mix_xyzw(comp3) ? w0 : w1; + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. + ////////////////////////////////////////////////////////////////////////// + template + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { + constexpr int index0 = (int)comp0; + constexpr int index1 = (int)comp1; + constexpr int index2 = (int)comp2; + constexpr int index3 = (int)comp3; - return vector_set(x, y, z, w); + return vector_swizzle_with_index(input0, input1); } ////////////////////////////////////////////////////////////////////////// // Replicates the [x] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix(input, input); +#else + return VECTOR_REPLICATE(input, 0); +#endif + } ////////////////////////////////////////////////////////////////////////// // Replicates the [y] component in all components. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix(input, input); +#else + return VECTOR_REPLICATE(input, 1); +#endif + } ////////////////////////////////////////////////////////////////////////// // Replicates the [z] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix(input, input); +#else + return VECTOR_REPLICATE(input, 2); +#endif + } ////////////////////////////////////////////////////////////////////////// // Replicates the [w] component in all components. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix(input, input); +#else + return VECTOR_REPLICATE(input, 3); +#endif + } ////////////////////////////////////////////////////////////////////////// // Logical diff --git a/tools/bench/sources/bench_matrix4x4_arg_passing.cpp b/tools/bench/sources/bench_matrix4x4_arg_passing.cpp new file mode 100644 index 00000000..af0b9258 --- /dev/null +++ b/tools/bench/sources/bench_matrix4x4_arg_passing.cpp @@ -0,0 +1,395 @@ +//////////////////////////////////////////////////////////////////////////////// +// The MIT License (MIT) +// +// Copyright (c) 2024 Nicholas Frechette & Realtime Math contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +//////////////////////////////////////////////////////////////////////////////// + +#include + +#include +#include + +using namespace rtm; + +namespace { + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL make_tansform(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + float quatval[4]; + quat_store(orientation, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; + + const scalarf scale_x = vector_get_x_as_scalar(scale); + const scalarf scale_y = vector_get_y_as_scalar(scale); + const scalarf scale_z = vector_get_z_as_scalar(scale); + + + vector4f x_axis = vector_mul(vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F), scale_x); + vector4f y_axis = vector_mul(vector_set(xy - wz, 1.0F - (xx + zz), yz + wx, 0.0F), scale_y); + vector4f z_axis = vector_mul(vector_set(xz + wy, yz - wx, 1.0F - (xx + yy), 0.0F), scale_z); + vector4f w_axis = translation; + + return matrix4x4f{x_axis, y_axis, z_axis, w_axis}; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL multi_pass_by_ref(const matrix4x4f& lhs, const matrix4x4f& rhs, matrix4x4f& result) RTM_NO_EXCEPT + { + vector4f tmp = vector_mul(vector_dup_x(lhs.x_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.x_axis), rhs.y_axis, tmp); + tmp = 
vector_mul_add(vector_dup_z(lhs.x_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.x_axis), rhs.w_axis, tmp); + result.x_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.y_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.y_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.y_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.y_axis), rhs.w_axis, tmp); + result.y_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.z_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.z_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.z_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.z_axis), rhs.w_axis, tmp); + result.z_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.w_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.w_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.w_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.w_axis), rhs.w_axis, tmp); + result.w_axis = tmp; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL inverse_pass_by_ref(const matrix4x4f& input, matrix4x4f& output) { + vector4f v00 = vector_mix(input.z_axis, input.z_axis); + vector4f v01 = vector_mix(input.x_axis, input.x_axis); + vector4f v02 = vector_mix(input.z_axis, input.x_axis); + vector4f v10 = vector_mix(input.w_axis, input.w_axis); + vector4f v11 = vector_mix(input.y_axis, input.y_axis); + vector4f v12 = vector_mix(input.w_axis, input.y_axis); + + vector4f d0 = vector_mul(v00, v10); + vector4f d1 = vector_mul(v01, v11); + vector4f d2 = vector_mul(v02, v12); + + v00 = vector_mix(input.z_axis, input.z_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.z_axis, input.x_axis); + v10 = vector_mix(input.w_axis, input.w_axis); + v11 = vector_mix(input.y_axis, input.y_axis); + v12 = vector_mix(input.w_axis, input.y_axis); + + d0 = vector_neg_mul_sub(v00, v10, d0); + d1 = vector_neg_mul_sub(v01, v11, d1); + d2 = 
vector_neg_mul_sub(v02, v12, d2); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + vector4f v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + vector4f v13 = vector_mix(d1, d2); + + vector4f c0 = vector_mul(v00, v10); + vector4f c2 = vector_mul(v01, v11); + vector4f c4 = vector_mul(v02, v12); + vector4f c6 = vector_mul(v03, v13); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + c0 = vector_neg_mul_sub(v00, v10, c0); + c2 = vector_neg_mul_sub(v01, v11, c2); + c4 = vector_neg_mul_sub(v02, v12, c4); + c6 = vector_neg_mul_sub(v03, v13, c6); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + vector4f c1 = vector_neg_mul_sub(v00, v10, c0); + c0 = vector_mul_add(v00, v10, c0); + vector4f c3 = vector_mul_add(v01, v11, c2); + c2 = vector_neg_mul_sub(v01, v11, c2); + vector4f c5 = vector_neg_mul_sub(v02, v12, c4); + c4 = vector_mul_add(v02, v12, c4); + vector4f c7 = vector_mul_add(v03, v13, c6); + c6 = vector_neg_mul_sub(v03, v13, c6); + + vector4f x_axis = vector_mix(c0, c1); + vector4f y_axis = vector_mix(c2, c3); + vector4f z_axis = vector_mix(c4, c5); + vector4f w_axis = vector_mix(c6, c7); + + const scalarf det = vector_dot_as_scalar(x_axis, input.x_axis); + const scalarf inv_det_s = scalar_reciprocal(det); + const vector4f inv_det = vector_set(inv_det_s); + + output.x_axis = 
vector_mul(x_axis, inv_det); + output.y_axis = vector_mul(y_axis, inv_det); + output.z_axis = vector_mul(z_axis, inv_det); + output.w_axis = vector_mul(w_axis, inv_det); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL multi_pass_by_value(matrix4x4f_arg0 lhs, matrix4x4f_arg1 rhs) RTM_NO_EXCEPT + { + vector4f tmp = vector_mul(vector_dup_x(lhs.x_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.x_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.x_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.x_axis), rhs.w_axis, tmp); + vector4f x_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.y_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.y_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.y_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.y_axis), rhs.w_axis, tmp); + vector4f y_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.z_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.z_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.z_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.z_axis), rhs.w_axis, tmp); + vector4f z_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.w_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.w_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.w_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.w_axis), rhs.w_axis, tmp); + vector4f w_axis = tmp; + + return matrix4x4f{ x_axis, y_axis, z_axis, w_axis }; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL inverse_pass_by_value(matrix4x4f_arg0 input) { + matrix4x4f output{}; + + vector4f v00 = vector_mix(input.z_axis, input.z_axis); + vector4f v01 = vector_mix(input.x_axis, input.x_axis); + vector4f v02 = vector_mix(input.z_axis, input.x_axis); + vector4f v10 = vector_mix(input.w_axis, input.w_axis); + vector4f v11 = vector_mix(input.y_axis, input.y_axis); + vector4f v12 = 
vector_mix(input.w_axis, input.y_axis); + + vector4f d0 = vector_mul(v00, v10); + vector4f d1 = vector_mul(v01, v11); + vector4f d2 = vector_mul(v02, v12); + + v00 = vector_mix(input.z_axis, input.z_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.z_axis, input.x_axis); + v10 = vector_mix(input.w_axis, input.w_axis); + v11 = vector_mix(input.y_axis, input.y_axis); + v12 = vector_mix(input.w_axis, input.y_axis); + + d0 = vector_neg_mul_sub(v00, v10, d0); + d1 = vector_neg_mul_sub(v01, v11, d1); + d2 = vector_neg_mul_sub(v02, v12, d2); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + vector4f v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + vector4f v13 = vector_mix(d1, d2); + + vector4f c0 = vector_mul(v00, v10); + vector4f c2 = vector_mul(v01, v11); + vector4f c4 = vector_mul(v02, v12); + vector4f c6 = vector_mul(v03, v13); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + c0 = vector_neg_mul_sub(v00, v10, c0); + c2 = vector_neg_mul_sub(v01, v11, c2); + c4 = vector_neg_mul_sub(v02, v12, c4); + c6 = vector_neg_mul_sub(v03, v13, c6); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + vector4f c1 = vector_neg_mul_sub(v00, v10, c0); + c0 = vector_mul_add(v00, v10, c0); + vector4f c3 = vector_mul_add(v01, v11, c2); + c2 = vector_neg_mul_sub(v01, v11, 
c2); + vector4f c5 = vector_neg_mul_sub(v02, v12, c4); + c4 = vector_mul_add(v02, v12, c4); + vector4f c7 = vector_mul_add(v03, v13, c6); + c6 = vector_neg_mul_sub(v03, v13, c6); + + vector4f x_axis = vector_mix(c0, c1); + vector4f y_axis = vector_mix(c2, c3); + vector4f z_axis = vector_mix(c4, c5); + vector4f w_axis = vector_mix(c6, c7); + + const scalarf det = vector_dot_as_scalar(x_axis, input.x_axis); + const scalarf inv_det_s = scalar_reciprocal(det); + const vector4f inv_det = vector_set(inv_det_s); + + + output.x_axis = vector_mul(x_axis, inv_det); + output.y_axis = vector_mul(y_axis, inv_det); + output.z_axis = vector_mul(z_axis, inv_det); + output.w_axis = vector_mul(w_axis, inv_det); + + return output; + } +} + +namespace { + vector4f g_translation1 = vector_set(4.1f, 4.2f, 4.3f, 1.0f); + vector4f g_translation2 = vector_set(3.1f, 3.2f, 3.3f, 1.0f); + vector4f g_scale = vector_set(1.0f, 1.0f, 1.0f, 1.0f); + vector4f g_orientation = vector_set(5.1f, 5.2f, 5.3f, 1.0f); +} + +class matrix4x4f_ref { +public: + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_ref() = default; + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_ref(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + m_value = make_tansform(translation, scale, orientation); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() { + return m_value; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() const { + return const_cast(this)->ref(); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_ref RTM_SIMD_CALL inverse() const { + matrix4x4f_ref result{}; + inverse_pass_by_ref(m_value, result.ref()); + return result; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_ref RTM_SIMD_CALL multi(const matrix4x4f_ref& other) const { + matrix4x4f_ref result{}; + multi_pass_by_ref(other.ref(), m_value, result.ref()); + return result; + } + 
+private: + matrix4x4f m_value{}; +}; + +class matrix4x4f_value { +public: + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_value() = default; + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_value(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + m_value = make_tansform(translation, scale, orientation); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() { + return m_value; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() const { + return const_cast(this)->ref(); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_value RTM_SIMD_CALL inverse() const { + matrix4x4f_value result{}; + result.ref() = inverse_pass_by_value(m_value); + return result; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_value RTM_SIMD_CALL multi(matrix4x4f other) const { + matrix4x4f_value result{}; + result.ref() = multi_pass_by_value(other, m_value); + return result; + } + +private: + matrix4x4f m_value{}; +}; + +static void bm_matrix_pass_by_ref(benchmark::State& state) { + auto vec = std::vector{}; + vec.resize(1000000); + + int index = 0; + for (auto _ : state) { + auto matrix1 = matrix4x4f_ref(g_translation1, g_scale, g_orientation); + auto matrix2 = matrix4x4f_ref(g_translation2, g_scale, g_orientation); + auto matrix3 = matrix1.multi(matrix2); + + vec[index++ % vec.size()] = matrix3.inverse(); + } + benchmark::DoNotOptimize(vec); +} + +BENCHMARK(bm_matrix_pass_by_ref); + +static void bm_matrix_pass_by_value(benchmark::State& state) { + auto vec = std::vector{}; + vec.resize(1000000); + + int index = 0; + for (auto _ : state) { + auto matrix1 = matrix4x4f_value(g_translation1, g_scale, g_orientation); + auto matrix2 = matrix4x4f_value(g_translation2, g_scale, g_orientation); + auto matrix3 = matrix1.multi(matrix2.ref()); + + vec[index++ % vec.size()] = matrix3.inverse(); + } + + benchmark::DoNotOptimize(vec); +} 
+ +BENCHMARK(bm_matrix_pass_by_value); \ No newline at end of file diff --git a/tools/bench/sources/bench_vector_mix.cpp b/tools/bench/sources/bench_vector_mix.cpp new file mode 100644 index 00000000..17c8ea93 --- /dev/null +++ b/tools/bench/sources/bench_vector_mix.cpp @@ -0,0 +1,320 @@ +//////////////////////////////////////////////////////////////////////////////// +// The MIT License (MIT) +// +// Copyright (c) 2024 Nicholas Frechette & Realtime Math contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+//////////////////////////////////////////////////////////////////////////////// + + +#include +#include +#include + +using namespace rtm; + +namespace { + template + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix_optimize(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { +#if defined(__clang__) + constexpr int index0 = (int)comp0; + constexpr int index1 = (int)comp1; + constexpr int index2 = (int)comp2; + constexpr int index3 = (int)comp3; + return __builtin_shufflevector(input0, input1, index0, index1, index2, index3); +#endif + + return vector_mix(input0, input1); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL make_tansform(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + float quatval[4]; + quat_store(orientation, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; + + const scalarf scale_x = vector_get_x_as_scalar(scale); + const scalarf scale_y = vector_get_y_as_scalar(scale); + const scalarf scale_z = vector_get_z_as_scalar(scale); + + + vector4f x_axis = vector_mul(vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F), scale_x); + vector4f y_axis = vector_mul(vector_set(xy - wz, 1.0F - (xx + zz), yz + wx, 0.0F), scale_y); + vector4f z_axis = vector_mul(vector_set(xz + wy, yz - wx, 1.0F - (xx + yy), 0.0F), scale_z); + vector4f w_axis = translation; + + return matrix4x4f{x_axis, y_axis, z_axis, w_axis}; + } + + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL inverse_normal(matrix4x4f_arg0 input) { + matrix4x4f 
output{}; + + vector4f v00 = vector_mix(input.z_axis, input.z_axis); + vector4f v01 = vector_mix(input.x_axis, input.x_axis); + vector4f v02 = vector_mix(input.z_axis, input.x_axis); + vector4f v10 = vector_mix(input.w_axis, input.w_axis); + vector4f v11 = vector_mix(input.y_axis, input.y_axis); + vector4f v12 = vector_mix(input.w_axis, input.y_axis); + + vector4f d0 = vector_mul(v00, v10); + vector4f d1 = vector_mul(v01, v11); + vector4f d2 = vector_mul(v02, v12); + + v00 = vector_mix(input.z_axis, input.z_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.z_axis, input.x_axis); + v10 = vector_mix(input.w_axis, input.w_axis); + v11 = vector_mix(input.y_axis, input.y_axis); + v12 = vector_mix(input.w_axis, input.y_axis); + + d0 = vector_neg_mul_sub(v00, v10, d0); + d1 = vector_neg_mul_sub(v01, v11, d1); + d2 = vector_neg_mul_sub(v02, v12, d2); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + vector4f v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + vector4f v13 = vector_mix(d1, d2); + + vector4f c0 = vector_mul(v00, v10); + vector4f c2 = vector_mul(v01, v11); + vector4f c4 = vector_mul(v02, v12); + vector4f c6 = vector_mul(v03, v13); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + c0 = vector_neg_mul_sub(v00, v10, c0); + c2 = vector_neg_mul_sub(v01, v11, c2); + c4 = vector_neg_mul_sub(v02, v12, c4); + c6 = vector_neg_mul_sub(v03, v13, c6); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = 
vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + vector4f c1 = vector_neg_mul_sub(v00, v10, c0); + c0 = vector_mul_add(v00, v10, c0); + vector4f c3 = vector_mul_add(v01, v11, c2); + c2 = vector_neg_mul_sub(v01, v11, c2); + vector4f c5 = vector_neg_mul_sub(v02, v12, c4); + c4 = vector_mul_add(v02, v12, c4); + vector4f c7 = vector_mul_add(v03, v13, c6); + c6 = vector_neg_mul_sub(v03, v13, c6); + + vector4f x_axis = vector_mix(c0, c1); + vector4f y_axis = vector_mix(c2, c3); + vector4f z_axis = vector_mix(c4, c5); + vector4f w_axis = vector_mix(c6, c7); + + const scalarf det = vector_dot_as_scalar(x_axis, input.x_axis); + const scalarf inv_det_s = scalar_reciprocal(det); + const vector4f inv_det = vector_set(inv_det_s); + + + output.x_axis = vector_mul(x_axis, inv_det); + output.y_axis = vector_mul(y_axis, inv_det); + output.z_axis = vector_mul(z_axis, inv_det); + output.w_axis = vector_mul(w_axis, inv_det); + + return output; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL inverse_optimize(matrix4x4f_arg0 input) { + matrix4x4f output{}; + + vector4f v00 = vector_mix_optimize(input.z_axis, input.z_axis); + vector4f v01 = vector_mix_optimize(input.x_axis, input.x_axis); + vector4f v02 = vector_mix_optimize(input.z_axis, input.x_axis); + vector4f v10 = vector_mix_optimize(input.w_axis, input.w_axis); + vector4f v11 = vector_mix_optimize(input.y_axis, input.y_axis); + vector4f v12 = vector_mix_optimize(input.w_axis, input.y_axis); + + vector4f d0 = vector_mul(v00, v10); + vector4f d1 = vector_mul(v01, v11); + vector4f d2 = vector_mul(v02, v12); + + v00 = vector_mix_optimize(input.z_axis, input.z_axis); + v01 = vector_mix_optimize(input.x_axis, input.x_axis); + v02 = vector_mix_optimize(input.z_axis, input.x_axis); + v10 = vector_mix_optimize(input.w_axis, input.w_axis); + v11 = vector_mix_optimize(input.y_axis, input.y_axis); + 
v12 = vector_mix_optimize(input.w_axis, input.y_axis); + + d0 = vector_neg_mul_sub(v00, v10, d0); + d1 = vector_neg_mul_sub(v01, v11, d1); + d2 = vector_neg_mul_sub(v02, v12, d2); + + v00 = vector_mix_optimize(input.y_axis, input.y_axis); + v01 = vector_mix_optimize(input.x_axis, input.x_axis); + v02 = vector_mix_optimize(input.w_axis, input.w_axis); + vector4f v03 = vector_mix_optimize(input.z_axis, input.z_axis); + v10 = vector_mix_optimize(d0, d2); + v11 = vector_mix_optimize(d0, d2); + v12 = vector_mix_optimize(d1, d2); + vector4f v13 = vector_mix_optimize(d1, d2); + + vector4f c0 = vector_mul(v00, v10); + vector4f c2 = vector_mul(v01, v11); + vector4f c4 = vector_mul(v02, v12); + vector4f c6 = vector_mul(v03, v13); + + v00 = vector_mix_optimize(input.y_axis, input.y_axis); + v01 = vector_mix_optimize(input.x_axis, input.x_axis); + v02 = vector_mix_optimize(input.w_axis, input.w_axis); + v03 = vector_mix_optimize(input.z_axis, input.z_axis); + v10 = vector_mix_optimize(d0, d2); + v11 = vector_mix_optimize(d0, d2); + v12 = vector_mix_optimize(d1, d2); + v13 = vector_mix_optimize(d1, d2); + + c0 = vector_neg_mul_sub(v00, v10, c0); + c2 = vector_neg_mul_sub(v01, v11, c2); + c4 = vector_neg_mul_sub(v02, v12, c4); + c6 = vector_neg_mul_sub(v03, v13, c6); + + v00 = vector_mix_optimize(input.y_axis, input.y_axis); + v01 = vector_mix_optimize(input.x_axis, input.x_axis); + v02 = vector_mix_optimize(input.w_axis, input.w_axis); + v03 = vector_mix_optimize(input.z_axis, input.z_axis); + v10 = vector_mix_optimize(d0, d2); + v11 = vector_mix_optimize(d0, d2); + v12 = vector_mix_optimize(d1, d2); + v13 = vector_mix_optimize(d1, d2); + + vector4f c1 = vector_neg_mul_sub(v00, v10, c0); + c0 = vector_mul_add(v00, v10, c0); + vector4f c3 = vector_mul_add(v01, v11, c2); + c2 = vector_neg_mul_sub(v01, v11, c2); + vector4f c5 = vector_neg_mul_sub(v02, v12, c4); + c4 = vector_mul_add(v02, v12, c4); + vector4f c7 = vector_mul_add(v03, v13, c6); + c6 = vector_neg_mul_sub(v03, v13, 
c6); + + vector4f x_axis = vector_mix_optimize(c0, c1); + vector4f y_axis = vector_mix_optimize(c2, c3); + vector4f z_axis = vector_mix_optimize(c4, c5); + vector4f w_axis = vector_mix_optimize(c6, c7); + + const scalarf det = vector_dot_as_scalar(x_axis, input.x_axis); + const scalarf inv_det_s = scalar_reciprocal(det); + const vector4f inv_det = vector_set(inv_det_s); + + + output.x_axis = vector_mul(x_axis, inv_det); + output.y_axis = vector_mul(y_axis, inv_det); + output.z_axis = vector_mul(z_axis, inv_det); + output.w_axis = vector_mul(w_axis, inv_det); + + return output; + } +} + +namespace { + vector4f g_translation1 = vector_set(4.1f, 4.2f, 4.3f, 1.0f); + vector4f g_scale = vector_set(1.0f, 1.0f, 1.0f, 1.0f); + vector4f g_orientation = vector_set(5.1f, 5.2f, 5.3f, 1.0f); +} + +class matrix4x4f_agent { +public: + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_agent() = default; + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_agent(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + m_value = make_tansform(translation, scale, orientation); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() { + return m_value; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() const { + return const_cast(this)->ref(); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_agent RTM_SIMD_CALL inverse_this_normal() const { + matrix4x4f_agent result{}; + result.ref() = inverse_normal(m_value); + return result; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_agent RTM_SIMD_CALL inverse_this_optimize() const { + matrix4x4f_agent result{}; + result.ref() = inverse_optimize(m_value); + return result; + } + +private: + matrix4x4f m_value{}; +}; + +static void bm_vector_mix_normal(benchmark::State& state) { + auto vec = std::vector{}; + vec.resize(1000000); + + int index = 0; + for (auto _ : state) { + vec[index++ % 
vec.size()] = matrix4x4f_agent(g_translation1, g_scale, g_orientation).inverse_this_normal(); + } + + benchmark::DoNotOptimize(vec); +} + +BENCHMARK(bm_vector_mix_normal); + +static void bm_vector_mix_optimize(benchmark::State& state) { + auto vec = std::vector{}; + vec.resize(1000000); + + int index = 0; + for (auto _ : state) { + vec[index++ % vec.size()] = matrix4x4f_agent(g_translation1, g_scale, g_orientation).inverse_this_optimize(); + } + + benchmark::DoNotOptimize(vec); +} + +BENCHMARK(bm_vector_mix_optimize); \ No newline at end of file