Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized the performance of float object #218

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 30 additions & 25 deletions includes/rtm/impl/matrix_affine_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,21 @@ namespace rtm
{
RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized");

const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input);
const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input);
const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input);
const float_type xx = quat_get_x(quat_input) * x2;
const float_type xy = quat_get_x(quat_input) * y2;
const float_type xz = quat_get_x(quat_input) * z2;
const float_type yy = quat_get_y(quat_input) * y2;
const float_type yz = quat_get_y(quat_input) * z2;
const float_type zz = quat_get_z(quat_input) * z2;
const float_type wx = quat_get_w(quat_input) * x2;
const float_type wy = quat_get_w(quat_input) * y2;
const float_type wz = quat_get_w(quat_input) * z2;
float_type quatval[4];
quat_store(quat_input, quatval);

const float_type x2 = quatval[0] + quatval[0];
const float_type y2 = quatval[1] + quatval[1];
const float_type z2 = quatval[2] + quatval[2];
const float_type xx = quatval[0] * x2;
const float_type xy = quatval[0] * y2;
const float_type xz = quatval[0] * z2;
const float_type yy = quatval[1] * y2;
const float_type yz = quatval[1] * z2;
const float_type zz = quatval[2] * z2;
const float_type wx = quatval[3] * x2;
const float_type wy = quatval[3] * y2;
const float_type wz = quatval[3] * z2;

const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0));
const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0));
Expand All @@ -80,19 +83,21 @@ namespace rtm
RTM_DISABLE_SECURITY_COOKIE_CHECK inline RTM_SIMD_CALL operator matrix3x4() const RTM_NO_EXCEPT
{
RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized");

const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input);
const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input);
const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input);
const float_type xx = quat_get_x(quat_input) * x2;
const float_type xy = quat_get_x(quat_input) * y2;
const float_type xz = quat_get_x(quat_input) * z2;
const float_type yy = quat_get_y(quat_input) * y2;
const float_type yz = quat_get_y(quat_input) * z2;
const float_type zz = quat_get_z(quat_input) * z2;
const float_type wx = quat_get_w(quat_input) * x2;
const float_type wy = quat_get_w(quat_input) * y2;
const float_type wz = quat_get_w(quat_input) * z2;
float_type quatval[4];
quat_store(quat_input, quatval);

const float_type x2 = quatval[0] + quatval[0];
const float_type y2 = quatval[1] + quatval[1];
const float_type z2 = quatval[2] + quatval[2];
const float_type xx = quatval[0] * x2;
const float_type xy = quatval[0] * y2;
const float_type xz = quatval[0] * z2;
const float_type yy = quatval[1] * y2;
const float_type yz = quatval[1] * z2;
const float_type zz = quatval[2] * z2;
const float_type wx = quatval[3] * x2;
const float_type wy = quatval[3] * y2;
const float_type wz = quatval[3] * z2;

const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0));
const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0));
Expand Down
193 changes: 193 additions & 0 deletions includes/rtm/impl/vector4f_swizzle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
#pragma once
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea to move this into its own header.
As a convention, all similar headers have the suffix .impl.h
The header is also missing the MIT license information, see other headers as example.


#include <cstdint>
#include <cstring>
#include <type_traits>

#include "rtm/math.h"
#include "rtm/types.h"
#include "rtm/impl/compiler_utils.h"
#include "rtm/scalarf.h"
#include "rtm/scalard.h"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this header required?

#include "rtm/version.h"

RTM_IMPL_FILE_PRAGMA_PUSH

#if !defined(RTM_NO_INTRINSICS)
namespace rtm
{
RTM_IMPL_VERSION_NAMESPACE_BEGIN

#if defined(RTM_SSE2_INTRINSICS) || defined(RTM_AVX_INTRINSICS)

#define SHUFFLE_MASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) )
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is already a standardized macro for this, see _MM_SHUFFLE in Intel intrinsic documentation.


/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Float swizzle
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Rearranges the four lanes of `vec` into [vec[index0], vec[index1], vec[index2], vec[index3]].
// Indices are compile-time constants so the specializations below can select cheaper
// single-instruction forms; this generic version lowers to one shufps.
template<int index0, int index1, int index2, int index3>
RTM_FORCE_INLINE vector4f vector_swizzle_impl(const vector4f& vec)
{
	// Bounds check for consistency with vector_shuffle_impl; rejects negative indices too.
	static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
	return _mm_shuffle_ps(vec, vec, SHUFFLE_MASK(index0, index1, index2, index3));
}
// Identity swizzle [x, y, z, w]: no instruction needed, return the input unchanged.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 2, 3>(const vector4f& vec)
{
	return vec;
}
// [x, y, x, y]: movelhps duplicates the low half in a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 0, 1>(const vector4f& vec)
{
	return _mm_movelh_ps(vec, vec);
}
// [z, w, z, w]: movehlps duplicates the high half in a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 3, 2, 3>(const vector4f& vec)
{
return _mm_movehl_ps(vec, vec);
}
// [x, x, y, y]: unpacklo interleaves the low lanes of `vec` with itself.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 1, 1>(const vector4f& vec)
{
return _mm_unpacklo_ps(vec, vec);
}
// [z, z, w, w]: unpackhi interleaves the high lanes of `vec` with itself.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 2, 3, 3>(const vector4f& vec)
{
return _mm_unpackhi_ps(vec, vec);
}

#if defined(RTM_SSE4_INTRINSICS)
// [x, x, z, z]: single-instruction duplicate of the even lanes.
// NOTE(review): movsldup/movshdup are SSE3 instructions, so gating them behind the
// SSE4 macro is conservative — confirm whether RTM exposes a dedicated SSE3 define.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 2, 2>(const vector4f& vec)
{
return _mm_moveldup_ps(vec);
}
// [y, y, w, w]: single-instruction duplicate of the odd lanes.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<1, 1, 3, 3>(const vector4f& vec)
{
return _mm_movehdup_ps(vec);
}
#endif

#if defined(RTM_AVX2_INTRINSICS)
// [x, x, x, x]: AVX2 lane-0 broadcast in a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 0, 0>(const vector4f& vec)
{
	return _mm_broadcastss_ps(vec);
}
#endif

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Float replicate
// Broadcasts a single lane of `vec` to all four lanes by delegating to the matching
// swizzle, which may resolve to a specialized single-instruction form.
template<int index>
RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec)
{
static_assert(index >= 0 && index <= 3, "Invalid Index");
return vector_swizzle_impl<index, index, index, index>(vec);
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Float shuffle
// Builds [vec1[index0], vec1[index1], vec2[index2], vec2[index3]]: the low two output
// lanes always come from vec1 and the high two from vec2, matching the shufps contract.
template<int index0, int index1, int index2, int index3>
RTM_FORCE_INLINE vector4f vector_shuffle_impl(const vector4f& vec1, const vector4f& vec2)
{
static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");
return _mm_shuffle_ps(vec1, vec2, SHUFFLE_MASK(index0, index1, index2, index3));
}

// Float Shuffle specializations
// [vec1.x, vec1.y, vec2.x, vec2.y]: movelhps is a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2)
{
return _mm_movelh_ps(vec1, vec2);

}
// [vec1.z, vec1.w, vec2.z, vec2.w]: movehlps is a single instruction.
template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2)
{
// Note: movehl copies first from the 2nd argument
return _mm_movehl_ps(vec2, vec1);
}

#elif defined(RTM_NEON_INTRINSICS)
// Broadcasts one lane of `vec` to all four lanes with a NEON dup.
template <int element_index>
RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec)
{
	// Bounds check for consistency with the SSE path; vgetq_lane_f32 requires an
	// immediate lane index in [0, 3].
	static_assert(element_index >= 0 && element_index <= 3, "Invalid Index");
	return vdupq_n_f32(vgetq_lane_f32(vec, element_index));
}

#if defined(__clang__)
// Clang maps __builtin_shufflevector directly onto the best NEON sequence.
template <int x, int y, int z, int w>
RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec)
{
	// Bounds check for consistency with the other code paths.
	static_assert(x >= 0 && x <= 3 && y >= 0 && y <= 3 && z >= 0 && z <= 3 && w >= 0 && w <= 3, "Invalid Index");
	return __builtin_shufflevector(vec, vec, x, y, z, w);
}

// Two-vector shuffle: low output lanes come from vec1 (builtin indices 0-3) and
// high output lanes from vec2 (builtin indices 4-7, hence the + 4 offsets).
template <int x, int y, int z, int w>
RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2)
{
	static_assert(x >= 0 && x <= 3 && y >= 0 && y <= 3 && z >= 0 && z <= 3 && w >= 0 && w <= 3, "Invalid Index");
	return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4);
}
#else

// Generic two-vector shuffle fallback for non-clang NEON toolchains (e.g. MSVC ARM).
// Performs a byte-wise table lookup (vtbl4) across both inputs — the same approach
// as DirectXMath's XMVectorPermute.
template<int index0, int index1, int index2, int index3>
RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2)
{
	// Also reject negative indices, matching the SSE path.
	static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");

	// Byte patterns selecting one 32-bit lane out of the 32 bytes spanned by [vec1, vec2].
	static constexpr uint32_t control_element[8] =
	{
		0x03020100, // vec1.x
		0x07060504, // vec1.y
		0x0B0A0908, // vec1.z
		0x0F0E0D0C, // vec1.w
		0x13121110, // vec2.x
		0x17161514, // vec2.y
		0x1B1A1918, // vec2.z
		0x1F1E1D1C, // vec2.w
	};

	// vtbl operates on u8 vectors: the float halves and the u32 index vectors must be
	// reinterpreted explicitly — implicit conversion between NEON vector types does
	// not compile on strict toolchains (GCC rejects it).
	uint8x8x4_t tbl;
	tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(vec1));
	tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(vec1));
	tbl.val[2] = vreinterpret_u8_f32(vget_low_f32(vec2));
	tbl.val[3] = vreinterpret_u8_f32(vget_high_f32(vec2));

	uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(control_element[index0]) | (static_cast<uint64_t>(control_element[index1]) << 32));
	const uint8x8_t low_result = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));

	idx = vcreate_u32(static_cast<uint64_t>(control_element[index2 + 4]) | (static_cast<uint64_t>(control_element[index3 + 4]) << 32));
	const uint8x8_t high_result = vtbl4_u8(tbl, vreinterpret_u8_u32(idx));

	return vcombine_f32(vreinterpret_f32_u8(low_result), vreinterpret_f32_u8(high_result));
}

// Generic one-vector swizzle fallback for non-clang NEON toolchains, using a
// byte-wise table lookup (vtbl2) — same approach as DirectXMath's XMVectorSwizzle.
template<int index0, int index1, int index2, int index3>
RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec)
{
	// Also reject negative indices, matching the SSE path.
	static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index");

	// Byte patterns selecting one 32-bit lane out of the 16 bytes of `vec`.
	static constexpr uint32_t control_element[4] =
	{
		0x03020100, // x
		0x07060504, // y
		0x0B0A0908, // z
		0x0F0E0D0C, // w
	};

	// Explicit reinterprets between f32/u32/u8 vector types are required for the
	// vtbl intrinsics; implicit conversion does not compile on strict toolchains.
	uint8x8x2_t tbl;
	tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(vec));
	tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(vec));

	uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(control_element[index0]) | (static_cast<uint64_t>(control_element[index1]) << 32));
	const uint8x8_t low_result = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));

	idx = vcreate_u32(static_cast<uint64_t>(control_element[index2]) | (static_cast<uint64_t>(control_element[index3]) << 32));
	const uint8x8_t high_result = vtbl2_u8(tbl, vreinterpret_u8_u32(idx));

	return vcombine_f32(vreinterpret_f32_u8(low_result), vreinterpret_f32_u8(high_result));
}
#endif
#else
#pragma error("vector swizzle not implement here!");
#endif

#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl<element_index>(vec)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RTM implementation detail macros are prefixed with RTM_IMPL_

#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl<x,y,z,w>( vec )
#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) vector_shuffle_impl<x,y,z,w>( vec1, vec2 )

RTM_IMPL_VERSION_NAMESPACE_END
}
#endif
RTM_IMPL_FILE_PRAGMA_POP

2 changes: 1 addition & 1 deletion includes/rtm/matrix3x3f.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ namespace rtm
// is to multiply the normal with the cofactor matrix.
// See: https://github.com/graphitemaster/normals_revisited
//////////////////////////////////////////////////////////////////////////
RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_arg0 mtx) RTM_NO_EXCEPT
RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_argn mtx) RTM_NO_EXCEPT
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't had the chance to profile this particular change, I'll do that soon.

{
vector4f tmp;

Expand Down
Loading
Loading