diff --git a/includes/rtm/impl/matrix_affine_common.h b/includes/rtm/impl/matrix_affine_common.h index 5bef3a43..df914e5c 100644 --- a/includes/rtm/impl/matrix_affine_common.h +++ b/includes/rtm/impl/matrix_affine_common.h @@ -58,18 +58,21 @@ namespace rtm { RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is not normalized"); - const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input); - const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input); - const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input); - const float_type xx = quat_get_x(quat_input) * x2; - const float_type xy = quat_get_x(quat_input) * y2; - const float_type xz = quat_get_x(quat_input) * z2; - const float_type yy = quat_get_y(quat_input) * y2; - const float_type yz = quat_get_y(quat_input) * z2; - const float_type zz = quat_get_z(quat_input) * z2; - const float_type wx = quat_get_w(quat_input) * x2; - const float_type wy = quat_get_w(quat_input) * y2; - const float_type wz = quat_get_w(quat_input) * z2; + float_type quatval[4]; + quat_store(quat_input, quatval); + + const float_type x2 = quatval[0] + quatval[0]; + const float_type y2 = quatval[1] + quatval[1]; + const float_type z2 = quatval[2] + quatval[2]; + const float_type xx = quatval[0] * x2; + const float_type xy = quatval[0] * y2; + const float_type xz = quatval[0] * z2; + const float_type yy = quatval[1] * y2; + const float_type yz = quatval[1] * z2; + const float_type zz = quatval[2] * z2; + const float_type wx = quatval[3] * x2; + const float_type wy = quatval[3] * y2; + const float_type wz = quatval[3] * z2; const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0)); const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0)); @@ -80,19 +83,21 @@ namespace rtm RTM_DISABLE_SECURITY_COOKIE_CHECK inline RTM_SIMD_CALL operator matrix3x4() const RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat_input), "Quaternion is 
not normalized"); - - const float_type x2 = quat_get_x(quat_input) + quat_get_x(quat_input); - const float_type y2 = quat_get_y(quat_input) + quat_get_y(quat_input); - const float_type z2 = quat_get_z(quat_input) + quat_get_z(quat_input); - const float_type xx = quat_get_x(quat_input) * x2; - const float_type xy = quat_get_x(quat_input) * y2; - const float_type xz = quat_get_x(quat_input) * z2; - const float_type yy = quat_get_y(quat_input) * y2; - const float_type yz = quat_get_y(quat_input) * z2; - const float_type zz = quat_get_z(quat_input) * z2; - const float_type wx = quat_get_w(quat_input) * x2; - const float_type wy = quat_get_w(quat_input) * y2; - const float_type wz = quat_get_w(quat_input) * z2; + float_type quatval[4]; + quat_store(quat_input, quatval); + + const float_type x2 = quatval[0] + quatval[0]; + const float_type y2 = quatval[1] + quatval[1]; + const float_type z2 = quatval[2] + quatval[2]; + const float_type xx = quatval[0] * x2; + const float_type xy = quatval[0] * y2; + const float_type xz = quatval[0] * z2; + const float_type yy = quatval[1] * y2; + const float_type yz = quatval[1] * z2; + const float_type zz = quatval[2] * z2; + const float_type wx = quatval[3] * x2; + const float_type wy = quatval[3] * y2; + const float_type wz = quatval[3] * z2; const vector4 x_axis = vector_set(float_type(1.0) - (yy + zz), xy + wz, xz - wy, float_type(0.0)); const vector4 y_axis = vector_set(xy - wz, float_type(1.0) - (xx + zz), yz + wx, float_type(0.0)); diff --git a/includes/rtm/impl/vector4f_swizzle.h b/includes/rtm/impl/vector4f_swizzle.h new file mode 100644 index 00000000..9846b32d --- /dev/null +++ b/includes/rtm/impl/vector4f_swizzle.h @@ -0,0 +1,193 @@ +#pragma once + +#include +#include +#include + +#include "rtm/math.h" +#include "rtm/types.h" +#include "rtm/impl/compiler_utils.h" +#include "rtm/scalarf.h" +#include "rtm/scalard.h" +#include "rtm/version.h" + +RTM_IMPL_FILE_PRAGMA_PUSH + +#if !defined(RTM_NO_INTRINSICS) +namespace rtm +{ + 
RTM_IMPL_VERSION_NAMESPACE_BEGIN + +#if defined(RTM_SSE2_INTRINSICS) || defined(RTM_AVX_INTRINSICS) + +#define SHUFFLE_MASK(a0,a1,b2,b3) ( (a0) | ((a1)<<2) | ((b2)<<4) | ((b3)<<6) ) + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float swizzle + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + template<int index0, int index1, int index2, int index3> + RTM_FORCE_INLINE vector4f vector_swizzle_impl(const vector4f& vec) + { + return _mm_shuffle_ps(vec, vec, SHUFFLE_MASK(index0, index1, index2, index3)); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 2, 3>(const vector4f& vec) + { + return vec; + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 1, 0, 1>(const vector4f& vec) + { + return _mm_movelh_ps(vec, vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 3, 2, 3>(const vector4f& vec) + { + return _mm_movehl_ps(vec, vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 1, 1>(const vector4f& vec) + { + return _mm_unpacklo_ps(vec, vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<2, 2, 3, 3>(const vector4f& vec) + { + return _mm_unpackhi_ps(vec, vec); + } + +#if defined(RTM_SSE4_INTRINSICS) + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 2, 2>(const vector4f& vec) + { + return _mm_moveldup_ps(vec); + } + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<1, 1, 3, 3>(const vector4f& vec) + { + return _mm_movehdup_ps(vec); + } +#endif + +#if defined(RTM_AVX2_INTRINSICS) + template<> RTM_FORCE_INLINE vector4f vector_swizzle_impl<0, 0, 0, 0>(const vector4f& vec) + { + return _mm_broadcastss_ps(vec); + } +#endif + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float replicate + template<int index> + RTM_FORCE_INLINE vector4f vector_replicate_impl(const 
vector4f& vec) + { + static_assert(index >= 0 && index <= 3, "Invalid Index"); + return vector_swizzle_impl<index, index, index, index>(vec); + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Float shuffle + template<int index0, int index1, int index2, int index3> + RTM_FORCE_INLINE vector4f vector_shuffle_impl(const vector4f& vec1, const vector4f& vec2) + { + static_assert(index0 >= 0 && index0 <= 3 && index1 >= 0 && index1 <= 3 && index2 >= 0 && index2 <= 3 && index3 >= 0 && index3 <= 3, "Invalid Index"); + return _mm_shuffle_ps(vec1, vec2, SHUFFLE_MASK(index0, index1, index2, index3)); + } + + // Float Shuffle specializations + template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<0, 1, 0, 1>(const vector4f& vec1, const vector4f& vec2) + { + return _mm_movelh_ps(vec1, vec2); + + } + template<> RTM_FORCE_INLINE vector4f vector_shuffle_impl<2, 3, 2, 3>(const vector4f& vec1, const vector4f& vec2) + { + // Note: movehl copies first from the 2nd argument + return _mm_movehl_ps(vec2, vec1); + } + +#elif defined(RTM_NEON_INTRINSICS) + template<int element_index> + RTM_FORCE_INLINE vector4f vector_replicate_impl(const vector4f& vec) + { + return vdupq_n_f32(vgetq_lane_f32(vec, element_index)); + } + +#if defined(__clang__) + template<int x, int y, int z, int w> + RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec) + { + return __builtin_shufflevector(vec, vec, x, y, z, w); + } + + template<int x, int y, int z, int w> + RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2) + { + return __builtin_shufflevector(vec1, vec2, x, y, z + 4, w + 4); + } +#else + + template<int index0, int index1, int index2, int index3> + RTM_FORCE_INLINE vector4f vector_shuffle_impl(vector4f vec1, vector4f vec2) + { + static_assert(index0 <= 3 && index1 <= 3 && index2 <= 3 && index3 <= 3, "Invalid Index"); + + static constexpr uint32_t control_element[8] = + { + 0x03020100, // XM_PERMUTE_0X + 0x07060504, // XM_PERMUTE_0Y + 0x0B0A0908, // XM_PERMUTE_0Z + 0x0F0E0D0C, // XM_PERMUTE_0W + 0x13121110, // XM_PERMUTE_1X + 0x17161514, // XM_PERMUTE_1Y + 0x1B1A1918, // 
XM_PERMUTE_1Z + 0x1F1E1D1C, // XM_PERMUTE_1W + }; + + uint8x8x4_t tbl; + tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(vec1)); + tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(vec1)); + tbl.val[2] = vreinterpret_u8_f32(vget_low_f32(vec2)); + tbl.val[3] = vreinterpret_u8_f32(vget_high_f32(vec2)); + + uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(control_element[index0]) | (static_cast<uint64_t>(control_element[index1]) << 32)); + const uint8x8_t rL = vtbl4_u8(tbl, vreinterpret_u8_u32(idx)); + + idx = vcreate_u32(static_cast<uint64_t>(control_element[index2 + 4]) | (static_cast<uint64_t>(control_element[index3 + 4]) << 32)); + const uint8x8_t rH = vtbl4_u8(tbl, vreinterpret_u8_u32(idx)); + + return vreinterpretq_f32_u8(vcombine_u8(rL, rH)); + } + + template<int index0, int index1, int index2, int index3> + RTM_FORCE_INLINE vector4f vector_swizzle_impl(vector4f vec) + { + static_assert(index0 <= 3 && index1 <= 3 && index2 <= 3 && index3 <= 3, "Invalid Index"); + + static constexpr uint32_t control_element[4] = + { + 0x03020100, // XM_SWIZZLE_X + 0x07060504, // XM_SWIZZLE_Y + 0x0B0A0908, // XM_SWIZZLE_Z + 0x0F0E0D0C, // XM_SWIZZLE_W + }; + + uint8x8x2_t tbl; + tbl.val[0] = vreinterpret_u8_f32(vget_low_f32(vec)); + tbl.val[1] = vreinterpret_u8_f32(vget_high_f32(vec)); + + uint32x2_t idx = vcreate_u32(static_cast<uint64_t>(control_element[index0]) | (static_cast<uint64_t>(control_element[index1]) << 32)); + const uint8x8_t rL = vtbl2_u8(tbl, vreinterpret_u8_u32(idx)); + + idx = vcreate_u32(static_cast<uint64_t>(control_element[index2]) | (static_cast<uint64_t>(control_element[index3]) << 32)); + const uint8x8_t rH = vtbl2_u8(tbl, vreinterpret_u8_u32(idx)); + + return vreinterpretq_f32_u8(vcombine_u8(rL, rH)); + } +#endif +#else +#error "vector swizzle not implemented here!" +#endif + +#define VECTOR_REPLICATE( vec, element_index ) vector_replicate_impl<element_index>(vec) +#define VECTOR_SWIZZLE( vec, x, y, z, w ) vector_swizzle_impl<x, y, z, w>( vec ) +#define VECTOR_SHUFFLE( vec1, vec2, x, y, z, w ) vector_shuffle_impl<x, y, z, w>( vec1, vec2 ) + +RTM_IMPL_VERSION_NAMESPACE_END +} +#endif +RTM_IMPL_FILE_PRAGMA_POP + diff --git a/includes/rtm/matrix3x3f.h b/includes/rtm/matrix3x3f.h index 40a90f9c..0fe31a7e 100644 --- a/includes/rtm/matrix3x3f.h +++ b/includes/rtm/matrix3x3f.h @@ -181,7 +181,7 @@ namespace rtm // is to multiply the normal with the cofactor matrix. 
// See: https://github.com/graphitemaster/normals_revisited ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_arg0 mtx) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector3(vector4f_arg0 vec3, matrix3x3f_argn mtx) RTM_NO_EXCEPT { vector4f tmp; diff --git a/includes/rtm/matrix3x4f.h b/includes/rtm/matrix3x4f.h index 71f7f1b0..e7fa5963 100644 --- a/includes/rtm/matrix3x4f.h +++ b/includes/rtm/matrix3x4f.h @@ -61,22 +61,25 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Sets a 3x4 affine matrix from a rotation quaternion and translation. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(quatf_arg0 quat, vector4f_arg1 translation) RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized"); - const float x2 = quat_get_x(quat) + quat_get_x(quat); - const float y2 = quat_get_y(quat) + quat_get_y(quat); - const float z2 = quat_get_z(quat) + quat_get_z(quat); - const float xx = quat_get_x(quat) * x2; - const float xy = quat_get_x(quat) * y2; - const float xz = quat_get_x(quat) * z2; - const float yy = quat_get_y(quat) * y2; - const float yz = quat_get_y(quat) * z2; - const float zz = quat_get_z(quat) * z2; - const float wx = quat_get_w(quat) * x2; - const float wy = quat_get_w(quat) * y2; - const float wz = quat_get_w(quat) * z2; + float quatval[4]; + quat_store(quat, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = 
quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; const vector4f x_axis = vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F); const vector4f y_axis = vector_set(xy - wz, 1.0F - (xx + zz), yz + wx, 0.0F); @@ -87,7 +90,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Converts a QV transform into a 3x4 affine matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_arg0 transform) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qv(qvf_argn transform) RTM_NO_EXCEPT { return matrix_from_qv(transform.rotation, transform.translation); } @@ -95,23 +98,25 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Sets a 3x4 affine matrix from a rotation quaternion, translation, and scalar scale. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(quatf_arg0 quat, vector4f_arg1 translation, float scale) RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized"); - const float x2 = quat_get_x(quat) + quat_get_x(quat); - const float y2 = quat_get_y(quat) + quat_get_y(quat); - const float z2 = quat_get_z(quat) + quat_get_z(quat); - const float xx = quat_get_x(quat) * x2; - const float xy = quat_get_x(quat) * y2; - const float xz = quat_get_x(quat) * z2; - const float yy = quat_get_y(quat) * y2; - const float yz = quat_get_y(quat) * z2; - const float zz = quat_get_z(quat) * z2; - const float wx = quat_get_w(quat) * x2; - const float wy = quat_get_w(quat) * y2; - const float wz = quat_get_w(quat) * z2; - + float quatval[4]; + quat_store(quat, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; const scalarf scale_s = scalar_set(scale); const vector4f x_axis = vector_mul(vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F), scale_s); @@ -123,7 +128,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Converts a QVS transform into a 3x4 affine matrix. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_arg0 transform) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvs(qvsf_argn transform) RTM_NO_EXCEPT { return matrix_from_qvs(transform.rotation, transform.translation_scale, qvs_get_scale(transform)); } @@ -131,22 +136,25 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Sets a 3x4 affine matrix from a rotation quaternion, translation, and 3D scale. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(quatf_arg0 quat, vector4f_arg1 translation, vector4f_arg2 scale) RTM_NO_EXCEPT { RTM_ASSERT(quat_is_normalized(quat), "Quaternion is not normalized"); - const float x2 = quat_get_x(quat) + quat_get_x(quat); - const float y2 = quat_get_y(quat) + quat_get_y(quat); - const float z2 = quat_get_z(quat) + quat_get_z(quat); - const float xx = quat_get_x(quat) * x2; - const float xy = quat_get_x(quat) * y2; - const float xz = quat_get_x(quat) * z2; - const float yy = quat_get_y(quat) * y2; - const float yz = quat_get_y(quat) * z2; - const float zz = quat_get_z(quat) * z2; - const float wx = quat_get_w(quat) * x2; - const float wy = quat_get_w(quat) * y2; - const float wz = quat_get_w(quat) * z2; + float quatval[4]; + quat_store(quat, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = 
quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; const scalarf scale_x = vector_get_x_as_scalar(scale); const scalarf scale_y = vector_get_y_as_scalar(scale); @@ -161,7 +169,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Converts a QVV transform into a 3x4 affine matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_arg0 transform) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_from_qvv(qvvf_argn transform) RTM_NO_EXCEPT { return matrix_from_qvv(transform.rotation, transform.translation, transform.scale); } @@ -209,7 +217,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 3x4 matrix where the specified axis has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix3x4f RTM_SIMD_CALL matrix_set_axis(matrix3x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT { switch (axis) { diff --git a/includes/rtm/matrix4x4f.h b/includes/rtm/matrix4x4f.h index 29494ea9..6116f8b4 100644 --- a/includes/rtm/matrix4x4f.h +++ b/includes/rtm/matrix4x4f.h @@ -41,7 +41,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis pointing in the forward direction of the default coordinate system (Z+). 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_forward(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_forward(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.z_axis; } @@ -49,7 +49,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis pointing in the up direction of the default coordinate system (Y+). ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_up(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_up(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.y_axis; } @@ -57,7 +57,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis pointing in the cross direction of the default coordinate system (X+). ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_cross(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_cross(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.x_axis; } @@ -65,7 +65,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the axis holding the position of the default coordinate system (W+). 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_position(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_coord_position(matrix4x4f_argn input) RTM_NO_EXCEPT { return input.w_axis; } @@ -73,7 +73,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the desired 4x4 matrix axis. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_axis(matrix4x4f_arg0 input, axis4 axis) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE constexpr vector4f RTM_SIMD_CALL matrix_get_axis(matrix4x4f_argn input, axis4 axis) RTM_NO_EXCEPT { return axis == axis4::x ? input.x_axis : (axis == axis4::y ? input.y_axis : (axis == axis4::z ? input.z_axis : input.w_axis)); } @@ -81,7 +81,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 4x4 matrix where the specified axis has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_axis(matrix4x4f_arg0 input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_axis(matrix4x4f_argn input, vector4f_arg5 axis_value, axis4 axis) RTM_NO_EXCEPT { switch (axis) { @@ -96,7 +96,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the desired 4x4 matrix component from the specified axis. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline rtm_impl::vector4f_vector_get_component RTM_SIMD_CALL matrix_get_component(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE rtm_impl::vector4f_vector_get_component RTM_SIMD_CALL matrix_get_component(matrix4x4f_argn input, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -111,7 +111,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the desired 4x4 matrix component from the specified axis. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_get_component_as_scalar(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_get_component_as_scalar(matrix4x4f_arg0 input, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -126,7 +126,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 4x4 matrix where the specified axis/component has been replaced on the input matrix. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, float component_value, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, float component_value, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -142,7 +142,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns a new 4x4 matrix where the specified axis/component has been replaced on the input matrix. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, scalarf_arg4 component_value, axis4 axis, component4 component) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_set_component(matrix4x4f_arg0 input, scalarf_arg4 component_value, axis4 axis, component4 component) RTM_NO_EXCEPT { switch (axis) { @@ -159,7 +159,7 @@ namespace rtm // Multiplies two 4x4 matrices. // Multiplication order is as follow: local_to_world = matrix_mul(local_to_object, object_to_world) ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_mul(matrix4x4f_arg0 lhs, matrix4x4f_arg1 rhs) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_mul(matrix4x4f_argn lhs, matrix4x4f_argn rhs) RTM_NO_EXCEPT { vector4f tmp = vector_mul(vector_dup_x(lhs.x_axis), rhs.x_axis); tmp = vector_mul_add(vector_dup_y(lhs.x_axis), rhs.y_axis, tmp); @@ -192,7 +192,7 @@ namespace rtm // Multiplies a 4x4 matrix and a 4D vector. // Multiplication order is as follow: world_position = matrix_mul(local_position, local_to_world) ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector(vector4f_arg0 vec4, matrix4x4f_arg0 mtx) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL matrix_mul_vector(vector4f_arg0 vec4, matrix4x4f_argn mtx) RTM_NO_EXCEPT { vector4f tmp; @@ -207,7 +207,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Transposes a 4x4 matrix. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_transpose(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_transpose(matrix4x4f_argn input) RTM_NO_EXCEPT { vector4f x_axis; vector4f y_axis; @@ -222,7 +222,7 @@ namespace rtm // If the input matrix is not invertible, the result is undefined. // For a safe alternative, supply a fallback value and a threshold. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_argn input) RTM_NO_EXCEPT { matrix4x4f input_transposed = matrix_transpose(input); @@ -316,7 +316,7 @@ namespace rtm // If the input matrix has a determinant whose absolute value is below the supplied threshold, the // fall back value is returned instead. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_arg0 input, matrix4x4f_arg1 fallback, float threshold = 1.0E-8F) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_inverse(matrix4x4f_argn input, matrix4x4f_argn fallback, float threshold = 1.0E-8F) RTM_NO_EXCEPT { matrix4x4f input_transposed = matrix_transpose(input); @@ -411,7 +411,7 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the determinant of the input 4x4 matrix. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_determinant(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_determinant(matrix4x4f_argn input) RTM_NO_EXCEPT { matrix4x4f input_transposed = matrix_transpose(input); @@ -458,7 +458,7 @@ namespace rtm // The minor is the determinant of the sub-matrix input when the specified // row and column are removed. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline scalarf RTM_SIMD_CALL matrix_minor(matrix4x4f_arg0 input, axis4 row, axis4 column) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL matrix_minor(matrix4x4f_argn input, axis4 row, axis4 column) RTM_NO_EXCEPT { vector4f row0; vector4f row1; @@ -522,7 +522,7 @@ namespace rtm // Returns the cofactor matrix of the input 4x4 matrix. // See: https://en.wikipedia.org/wiki/Minor_(linear_algebra)#Cofactor_expansion_of_the_determinant ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_cofactor(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_cofactor(matrix4x4f_argn input) RTM_NO_EXCEPT { const scalarf minor_xx = matrix_minor(input, axis4::x, axis4::x); const scalarf minor_xy = matrix_minor(input, axis4::x, axis4::y); @@ -558,7 +558,7 @@ namespace rtm // Returns the adjugate of the input matrix. 
// See: https://en.wikipedia.org/wiki/Adjugate_matrix ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK inline matrix4x4f RTM_SIMD_CALL matrix_adjugate(matrix4x4f_arg0 input) RTM_NO_EXCEPT + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL matrix_adjugate(matrix4x4f_arg0 input) RTM_NO_EXCEPT { return matrix_transpose(matrix_cofactor(input)); } diff --git a/includes/rtm/quatf.h b/includes/rtm/quatf.h index 1dcf25a7..98fa9fbc 100644 --- a/includes/rtm/quatf.h +++ b/includes/rtm/quatf.h @@ -733,19 +733,7 @@ namespace rtm { RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE RTM_SIMD_CALL operator float() const RTM_NO_EXCEPT { -#if defined(RTM_SSE4_INTRINSICS) && 0 - // SSE4 dot product instruction appears slower on Zen2, is it the case elsewhere as well? - return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xFF)); -#elif defined(RTM_SSE2_INTRINSICS) - __m128 x2_y2_z2_w2 = _mm_mul_ps(lhs, rhs); - __m128 z2_w2_0_0 = _mm_shuffle_ps(x2_y2_z2_w2, x2_y2_z2_w2, _MM_SHUFFLE(0, 0, 3, 2)); - __m128 x2z2_y2w2_0_0 = _mm_add_ps(x2_y2_z2_w2, z2_w2_0_0); - __m128 y2w2_0_0_0 = _mm_shuffle_ps(x2z2_y2w2_0_0, x2z2_y2w2_0_0, _MM_SHUFFLE(0, 0, 0, 1)); - __m128 x2y2z2w2_0_0_0 = _mm_add_ps(x2z2_y2w2_0_0, y2w2_0_0_0); - return _mm_cvtss_f32(x2y2z2w2_0_0_0); -#else - return (quat_get_x(lhs) * quat_get_x(rhs)) + (quat_get_y(lhs) * quat_get_y(rhs)) + (quat_get_z(lhs) * quat_get_z(rhs)) + (quat_get_w(lhs) * quat_get_w(rhs)); -#endif + return vector_dot(quat_to_vector(lhs), quat_to_vector(rhs)); } #if defined(RTM_SSE2_INTRINSICS) diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h index 7cda1544..25bfbcc0 100644 --- a/includes/rtm/vector4f.h +++ b/includes/rtm/vector4f.h @@ -34,6 +34,7 @@ #include "rtm/impl/macros.mask4.impl.h" #include "rtm/impl/memory_utils.h" #include "rtm/impl/vector_common.h" +#include "rtm/impl/vector4f_swizzle.h" #include #include @@ -63,6 +64,20 @@ namespace rtm #endif } + 
////////////////////////////////////////////////////////////////////////// + // Loads an aligned vector4 from memory. + ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_load_aligned(const float* input) RTM_NO_EXCEPT + { +#if defined(RTM_SSE2_INTRINSICS) + return _mm_load_ps(input); +#elif defined(RTM_NEON_INTRINSICS) + return vld1q_f32(input); +#else + return vector_set(input[0], input[1], input[2], input[3]); +#endif + } + ////////////////////////////////////////////////////////////////////////// // Loads an input scalar from memory into the [x] component and sets the [yzw] components to zero. ////////////////////////////////////////////////////////////////////////// @@ -1005,6 +1020,25 @@ namespace rtm { #if defined(RTM_SSE2_INTRINSICS) _mm_storeu_ps(output, input); +#elif defined(RTM_NEON_INTRINSICS) + vst1q_f32(output, input); +#else + output[0] = vector_get_x(input); + output[1] = vector_get_y(input); + output[2] = vector_get_z(input); + output[3] = vector_get_w(input); +#endif + } + + ////////////////////////////////////////////////////////////////////////// + // Writes a vector4 to aligned memory. 
+ ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store_aligned(vector4f_arg0 input, float* output) RTM_NO_EXCEPT + { +#if defined(RTM_SSE2_INTRINSICS) + _mm_store_ps(output, input); +#elif defined(RTM_NEON_INTRINSICS) + vst1q_f32(output, input); #else output[0] = vector_get_x(input); output[1] = vector_get_y(input); @@ -1026,8 +1060,15 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store2(vector4f_arg0 input, float* output) RTM_NO_EXCEPT { +#if defined(RTM_SSE2_INTRINSICS) output[0] = vector_get_x(input); output[1] = vector_get_y(input); +#elif defined(RTM_NEON_INTRINSICS) + vst1_f32(output, *(float32x2_t*)&input); +#else + output[0] = vector_get_x(input); + output[1] = vector_get_y(input); +#endif } ////////////////////////////////////////////////////////////////////////// @@ -1035,11 +1076,22 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL vector_store3(vector4f_arg0 input, float* output) RTM_NO_EXCEPT { +#if defined(RTM_SSE2_INTRINSICS) output[0] = vector_get_x(input); output[1] = vector_get_y(input); output[2] = vector_get_z(input); +#elif defined(RTM_NEON_INTRINSICS) + vst1_f32(output, *(float32x2_t*)&input); + vst1q_lane_f32(((float32_t*)output) + 2, input, 2); +#else + output[0] = vector_get_x(input); + output[1] = vector_get_y(input); + output[2] = vector_get_z(input); +#endif } + + ////////////////////////////////////////////////////////////////////////// // Writes a vector4 to unaligned memory. 
////////////////////////////////////////////////////////////////////////// @@ -1510,47 +1562,6 @@ namespace rtm #endif } - ////////////////////////////////////////////////////////////////////////// - // 3D cross product: lhs x rhs - ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_cross3(vector4f_arg0 lhs, vector4f_arg1 rhs) RTM_NO_EXCEPT - { -#if defined(RTM_SSE2_INTRINSICS) - // cross(a, b).zxy = (a * b.yzx) - (a.yzx * b) - __m128 lhs_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 0, 2, 1)); - __m128 rhs_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 0, 2, 1)); - __m128 tmp_zxy = _mm_sub_ps(_mm_mul_ps(lhs, rhs_yzx), _mm_mul_ps(lhs_yzx, rhs)); - - // cross(a, b) = ((a * b.yzx) - (a.yzx * b)).yzx - return _mm_shuffle_ps(tmp_zxy, tmp_zxy, _MM_SHUFFLE(3, 0, 2, 1)); -#elif defined(RTM_NEON_INTRINSICS) - // cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx) - float32x4_t lhs_yzwx = vextq_f32(lhs, lhs, 1); - float32x4_t rhs_wxyz = vextq_f32(rhs, rhs, 3); - - float32x4_t lhs_yzx = vsetq_lane_f32(vgetq_lane_f32(lhs, 0), lhs_yzwx, 2); - float32x4_t rhs_zxy = vsetq_lane_f32(vgetq_lane_f32(rhs, 2), rhs_wxyz, 0); - - // part_a = (a.yzx * b.zxy) - float32x4_t part_a = vmulq_f32(lhs_yzx, rhs_zxy); - - float32x4_t lhs_wxyz = vextq_f32(lhs, lhs, 3); - float32x4_t rhs_yzwx = vextq_f32(rhs, rhs, 1); - float32x4_t lhs_zxy = vsetq_lane_f32(vgetq_lane_f32(lhs, 2), lhs_wxyz, 0); - float32x4_t rhs_yzx = vsetq_lane_f32(vgetq_lane_f32(rhs, 0), rhs_yzwx, 2); - - return vmlsq_f32(part_a, lhs_zxy, rhs_yzx); -#else - // cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx) - const float lhs_x = vector_get_x(lhs); - const float lhs_y = vector_get_y(lhs); - const float lhs_z = vector_get_z(lhs); - const float rhs_x = vector_get_x(rhs); - const float rhs_y = vector_get_y(rhs); - const float rhs_z = vector_get_z(rhs); - return vector_set((lhs_y * rhs_z) - (lhs_z * rhs_y), (lhs_z * rhs_x) - (lhs_x * 
rhs_z), (lhs_x * rhs_y) - (lhs_y * rhs_x)); -#endif - } namespace rtm_impl { @@ -2424,6 +2435,48 @@ namespace rtm } #endif + // Negate-multiply-add: returns v2 - (v0 * v1), i.e. -(v0 * v1) + v2 + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_neg_mul_add(vector4f_arg0 v0, vector4f_arg1 v1, vector4f_arg2 v2) RTM_NO_EXCEPT + { + return vector_neg_mul_sub(v0, v1, v2); + } + + + ////////////////////////////////////////////////////////////////////////// + // 3D cross product: lhs x rhs + ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_cross3(vector4f_arg0 lhs, vector4f_arg1 rhs) RTM_NO_EXCEPT + { +#if defined(RTM_SSE2_INTRINSICS) + // cross(a, b).zxy = (a * b.yzx) - (a.yzx * b) + __m128 lhs_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 rhs_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 tmp_zxy = _mm_sub_ps(_mm_mul_ps(lhs, rhs_yzx), _mm_mul_ps(lhs_yzx, rhs)); + + // cross(a, b) = ((a * b.yzx) - (a.yzx * b)).yzx + return _mm_shuffle_ps(tmp_zxy, tmp_zxy, _MM_SHUFFLE(3, 0, 2, 1)); +#elif defined(RTM_NEON_INTRINSICS) + // Swizzle both operands into YZXW order (the w lane is a don't-care) + vector4f A = VECTOR_SWIZZLE(rhs, 1, 2, 0, 3); + vector4f B = VECTOR_SWIZZLE(lhs, 1, 2, 0, 3); + // Lane-wise products: (lhs.x*rhs.y, lhs.y*rhs.z, lhs.z*rhs.x) + A = vector_mul(A, lhs); + // Subtract the mirrored products: (XY-YX, YZ-ZY, ZX-XZ) where XY = lhs.x*rhs.y + A = vector_neg_mul_add(B, rhs, A); + // Rotate into final lane order: (YZ-ZY, ZX-XZ, XY-YX) = lhs x rhs + return VECTOR_SWIZZLE(A, 1, 2, 0, 3); +#else + // cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx) + const float lhs_x = vector_get_x(lhs); + const float lhs_y = vector_get_y(lhs); + const float lhs_z = vector_get_z(lhs); + const float rhs_x = vector_get_x(rhs); + const float rhs_y = vector_get_y(rhs); + const float rhs_z = vector_get_z(rhs); + return vector_set((lhs_y * rhs_z) - (lhs_z * rhs_y), (lhs_z * rhs_x) - (lhs_x * rhs_z), (lhs_x * rhs_y) - (lhs_y * rhs_x)); +#endif + } + ////////////////////////////////////////////////////////////////////////// + // Per component linear interpolation of the two
inputs at the specified alpha. // The formula used is: ((1.0 - alpha) * start) + (alpha * end). @@ -3369,248 +3422,143 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Mixes two inputs and returns the desired components. // [xyzw] indexes into the first input while [abcd] indexes in the second. + // Slow code path, not yet optimized or not using intrinsics ////////////////////////////////////////////////////////////////////////// - template - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT - { -#if defined(RTM_SSE4_INTRINSICS) - // Each component comes from the respective position of input 0 or input 1 - if (rtm_impl::static_condition<(comp0 == mix4::a || comp0 == mix4::x) && (comp1 == mix4::b || comp1 == mix4::y) && - (comp2 == mix4::c || comp2 == mix4::z) && (comp3 == mix4::d || comp3 == mix4::w)>::test()) - { - constexpr int mask = (comp0 == mix4::a ? 1 : 0) | (comp1 == mix4::b ? 2 : 0) | - (comp2 == mix4::c ? 4 : 0) | (comp3 == mix4::d ? 
8 : 0); - return _mm_blend_ps(input0, input1, mask); - } - - // First component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, (int(comp0) % 4) << 6); - - // Second component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, ((int(comp1) % 4) << 6) | (1 << 4)); - - // Third component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, ((int(comp2) % 4) << 6) | (2 << 4)); - - // Fourth component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input0, input1, ((int(comp3) % 4) << 6) | (3 << 4)); - - // First component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, (int(comp0) % 4) << 6); - - // Second component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, ((int(comp1) % 4) << 6) | (1 << 4)); - - // Third component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, ((int(comp2) % 4) << 6) | (2 << 4)); - - // Fourth component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return _mm_insert_ps(input1, input0, ((int(comp3) % 4) << 6) | (3 << 4)); -#endif // defined(RTM_SSE4_INTRINSICS) - -#if defined(RTM_SSE2_INTRINSICS) - // All four components come from input 0 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input0, input0, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, 
int(comp1) % 4, int(comp0) % 4)); - - // All four components come from input 1 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input1, input1, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); + template + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix_slow(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { + float combine_arr[8]; + vector_store(input0, combine_arr); + vector_store(input1, combine_arr + 4); + return vector_set(combine_arr[index0], combine_arr[index1], combine_arr[index2], combine_arr[index3]); + } - // First two components come from input 0, second two come from input 1 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input0, input1, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); - - // First two components come from input 1, second two come from input 0 - if (rtm_impl::static_condition::test()) - return _mm_shuffle_ps(input1, input0, _MM_SHUFFLE(int(comp3) % 4, int(comp2) % 4, int(comp1) % 4, int(comp0) % 4)); - - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpacklo_ps(input0, input1); - - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpacklo_ps(input1, input0); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpackhi_ps(input0, input1); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return _mm_unpackhi_ps(input1, input0); -#endif // defined(RTM_SSE2_INTRINSICS) - -#if defined(RTM_NEON64_INTRINSICS) - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip1q_f32(input0, input1); - - // Low words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip1q_f32(input1, input0); - - // High words from both inputs are 
interleaved - if (rtm_impl::static_condition::test()) - return vzip2q_f32(input0, input1); - - // High words from both inputs are interleaved - if (rtm_impl::static_condition::test()) - return vzip2q_f32(input1, input0); - - // Even-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp1q_f32(input0, input1); - - // Even-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp1q_f32(input1, input0); - - // Odd-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp2q_f32(input0, input1); - - // Odd-numbered vector elements, consecutively - if (rtm_impl::static_condition::test()) - return vuzp2q_f32(input1, input0); - - // Even-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn1q_f32(input0, input1); + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. + // Use compilation time to speed up branch judgment + ////////////////////////////////////////////////////////////////////////// + template = 4 && index3 >= 4), int>::type = 0> + vector4f vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix_slow(input0, input1); +#else + return VECTOR_SHUFFLE(input0, input1, index0, index1, index2 - 4, index3 - 4); +#endif + } - // Even-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn1q_f32(input1, input0); + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. 
+ // Use compilation time to speed up branch judgment + ////////////////////////////////////////////////////////////////////////// + template ::type = 0> + vector4f vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix_slow(input0, input1); +#else + (void)input1; + return VECTOR_SWIZZLE(input0, index0, index1, index2, index3); +#endif + } - // Odd-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn2q_f32(input0, input1); + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. + // Use compilation time to speed up branch judgment + ////////////////////////////////////////////////////////////////////////// + template = 4 && index1 >= 4 && index2 >= 4 && index3 >= 4), int>::type = 0> + vector4f vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix_slow(input0, input1); +#else + (void)input0; + return VECTOR_SWIZZLE(input1, index0 - 4, index1 - 4, index2 - 4, index3 - 4); +#endif + } - // Odd-numbered vector elements, interleaved - if (rtm_impl::static_condition::test()) - return vtrn2q_f32(input1, input0); -#endif // defined(RTM_NEON64_INTRINSICS) + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. 
+ // If no matching specialized template function is found, fall back to the slow version + ////////////////////////////////////////////////////////////////////////// + template = 4 && index3 >= 4) && + !(index0 < 4 && index1 < 4 && index2 < 4 && index3 < 4) && + !(index0 >= 4 && index1 >= 4 && index2 >= 4 && index3 >= 4), int>::type = 0> + vector4f vector_swizzle_with_index(vector4f_arg0 input0, vector4f_arg1 input1) + { + return vector_mix_slow(input0, input1); + } -#if defined(RTM_NEON_INTRINSICS) - // The highest vector elements from input 0 and the lowest vector elements from input 1, consecutively - if (rtm_impl::static_condition::test()) - return vextq_f32(input0, input1, int(comp0) % 4); - - // The highest vector elements from input 1 and the lowest vector elements from input 0, consecutively - if (rtm_impl::static_condition::test()) - return vextq_f32(input1, input0, int(comp0) % 4); - - // All four components come from input 0, reversed order in each doubleword - if (rtm_impl::static_condition::test()) - return vrev64q_f32(input0); - - // All four components come from input 1, reversed order in each doubleword - if (rtm_impl::static_condition::test()) - return vrev64q_f32(input1); - - // First component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp0) % 4), input0, 0); - - // Second component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp1) % 4), input0, 1); - - // Third component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp2) % 4), input0, 2); - - // Fourth component comes from input 1, others come from the respective positions of input 0 - if (rtm_impl::static_condition::test()) - 
return vsetq_lane_f32(vgetq_lane_f32(input1, int(comp3) % 4), input0, 3); - - // First component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp0) % 4), input1, 0); - - // Second component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp1) % 4), input1, 1); - - // Third component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp2) % 4), input1, 2); - - // Fourth component comes from input 0, others come from the respective positions of input 1 - if (rtm_impl::static_condition::test()) - return vsetq_lane_f32(vgetq_lane_f32(input0, int(comp3) % 4), input1, 3); - - - // All comes from the same position - if (rtm_impl::static_condition::test()) { - // All comes from the same position of input0 - if (rtm_impl::static_condition::test()) - return vmovq_n_f32(vgetq_lane_f32(input0, int(comp0) % 4)); - // All comes from the same position of input1 - if (rtm_impl::static_condition::test()) - return vmovq_n_f32(vgetq_lane_f32(input1, int(comp0) % 4)); - } -#endif // defined(RTM_NEON_INTRINSICS) - - // Slow code path, not yet optimized or not using intrinsics - constexpr component4 component0 = rtm_impl::mix_to_component(comp0); - constexpr component4 component1 = rtm_impl::mix_to_component(comp1); - constexpr component4 component2 = rtm_impl::mix_to_component(comp2); - constexpr component4 component3 = rtm_impl::mix_to_component(comp3); - - const float x0 = vector_get_component(input0, component0); - const float x1 = vector_get_component(input1, component0); - const float x = rtm_impl::is_mix_xyzw(comp0) ? 
x0 : x1; - - const float y0 = vector_get_component(input0, component1); - const float y1 = vector_get_component(input1, component1); - const float y = rtm_impl::is_mix_xyzw(comp1) ? y0 : y1; - - const float z0 = vector_get_component(input0, component2); - const float z1 = vector_get_component(input1, component2); - const float z = rtm_impl::is_mix_xyzw(comp2) ? z0 : z1; - - const float w0 = vector_get_component(input0, component3); - const float w1 = vector_get_component(input1, component3); - const float w = rtm_impl::is_mix_xyzw(comp3) ? w0 : w1; + ////////////////////////////////////////////////////////////////////////// + // Mixes two inputs and returns the desired components. + // [xyzw] indexes into the first input while [abcd] indexes in the second. + ////////////////////////////////////////////////////////////////////////// + template + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { + constexpr int index0 = (int)comp0; + constexpr int index1 = (int)comp1; + constexpr int index2 = (int)comp2; + constexpr int index3 = (int)comp3; - return vector_set(x, y, z, w); + return vector_swizzle_with_index(input0, input1); } ////////////////////////////////////////////////////////////////////////// // Replicates the [x] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_x(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix(input, input); +#else + return VECTOR_REPLICATE(input, 0); +#endif + } ////////////////////////////////////////////////////////////////////////// // Replicates the [y] component in all components. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_y(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix(input, input); +#else + return VECTOR_REPLICATE(input, 1); +#endif + } ////////////////////////////////////////////////////////////////////////// // Replicates the [z] component in all components. ////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_z(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix(input, input); +#else + return VECTOR_REPLICATE(input, 2); +#endif + } ////////////////////////////////////////////////////////////////////////// // Replicates the [w] component in all components. 
////////////////////////////////////////////////////////////////////////// - RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT { return vector_mix(input, input); } + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_dup_w(vector4f_arg0 input) RTM_NO_EXCEPT + { +#if defined(RTM_NO_INTRINSICS) + return vector_mix(input, input); +#else + return VECTOR_REPLICATE(input, 3); +#endif + } ////////////////////////////////////////////////////////////////////////// // Logical diff --git a/tools/bench/sources/bench_matrix4x4_arg_passing.cpp b/tools/bench/sources/bench_matrix4x4_arg_passing.cpp new file mode 100644 index 00000000..af0b9258 --- /dev/null +++ b/tools/bench/sources/bench_matrix4x4_arg_passing.cpp @@ -0,0 +1,395 @@ +//////////////////////////////////////////////////////////////////////////////// +// The MIT License (MIT) +// +// Copyright (c) 2024 Nicholas Frechette & Realtime Math contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +//////////////////////////////////////////////////////////////////////////////// + +#include + +#include +#include + +using namespace rtm; + +namespace { + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL make_tansform(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + float quatval[4]; + quat_store(orientation, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; + + const scalarf scale_x = vector_get_x_as_scalar(scale); + const scalarf scale_y = vector_get_y_as_scalar(scale); + const scalarf scale_z = vector_get_z_as_scalar(scale); + + + vector4f x_axis = vector_mul(vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F), scale_x); + vector4f y_axis = vector_mul(vector_set(xy - wz, 1.0F - (xx + zz), yz + wx, 0.0F), scale_y); + vector4f z_axis = vector_mul(vector_set(xz + wy, yz - wx, 1.0F - (xx + yy), 0.0F), scale_z); + vector4f w_axis = translation; + + return matrix4x4f{x_axis, y_axis, z_axis, w_axis}; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL multi_pass_by_ref(const matrix4x4f& lhs, const matrix4x4f& rhs, matrix4x4f& result) RTM_NO_EXCEPT + { + vector4f tmp = vector_mul(vector_dup_x(lhs.x_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.x_axis), rhs.y_axis, tmp); + tmp = 
vector_mul_add(vector_dup_z(lhs.x_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.x_axis), rhs.w_axis, tmp); + result.x_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.y_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.y_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.y_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.y_axis), rhs.w_axis, tmp); + result.y_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.z_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.z_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.z_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.z_axis), rhs.w_axis, tmp); + result.z_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.w_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.w_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.w_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.w_axis), rhs.w_axis, tmp); + result.w_axis = tmp; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE void RTM_SIMD_CALL inverse_pass_by_ref(const matrix4x4f& input, matrix4x4f& output) { + vector4f v00 = vector_mix(input.z_axis, input.z_axis); + vector4f v01 = vector_mix(input.x_axis, input.x_axis); + vector4f v02 = vector_mix(input.z_axis, input.x_axis); + vector4f v10 = vector_mix(input.w_axis, input.w_axis); + vector4f v11 = vector_mix(input.y_axis, input.y_axis); + vector4f v12 = vector_mix(input.w_axis, input.y_axis); + + vector4f d0 = vector_mul(v00, v10); + vector4f d1 = vector_mul(v01, v11); + vector4f d2 = vector_mul(v02, v12); + + v00 = vector_mix(input.z_axis, input.z_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.z_axis, input.x_axis); + v10 = vector_mix(input.w_axis, input.w_axis); + v11 = vector_mix(input.y_axis, input.y_axis); + v12 = vector_mix(input.w_axis, input.y_axis); + + d0 = vector_neg_mul_sub(v00, v10, d0); + d1 = vector_neg_mul_sub(v01, v11, d1); + d2 = 
vector_neg_mul_sub(v02, v12, d2); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + vector4f v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + vector4f v13 = vector_mix(d1, d2); + + vector4f c0 = vector_mul(v00, v10); + vector4f c2 = vector_mul(v01, v11); + vector4f c4 = vector_mul(v02, v12); + vector4f c6 = vector_mul(v03, v13); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + c0 = vector_neg_mul_sub(v00, v10, c0); + c2 = vector_neg_mul_sub(v01, v11, c2); + c4 = vector_neg_mul_sub(v02, v12, c4); + c6 = vector_neg_mul_sub(v03, v13, c6); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + vector4f c1 = vector_neg_mul_sub(v00, v10, c0); + c0 = vector_mul_add(v00, v10, c0); + vector4f c3 = vector_mul_add(v01, v11, c2); + c2 = vector_neg_mul_sub(v01, v11, c2); + vector4f c5 = vector_neg_mul_sub(v02, v12, c4); + c4 = vector_mul_add(v02, v12, c4); + vector4f c7 = vector_mul_add(v03, v13, c6); + c6 = vector_neg_mul_sub(v03, v13, c6); + + vector4f x_axis = vector_mix(c0, c1); + vector4f y_axis = vector_mix(c2, c3); + vector4f z_axis = vector_mix(c4, c5); + vector4f w_axis = vector_mix(c6, c7); + + const scalarf det = vector_dot_as_scalar(x_axis, input.x_axis); + const scalarf inv_det_s = scalar_reciprocal(det); + const vector4f inv_det = vector_set(inv_det_s); + + output.x_axis = 
vector_mul(x_axis, inv_det); + output.y_axis = vector_mul(y_axis, inv_det); + output.z_axis = vector_mul(z_axis, inv_det); + output.w_axis = vector_mul(w_axis, inv_det); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL multi_pass_by_value(matrix4x4f_arg0 lhs, matrix4x4f_arg1 rhs) RTM_NO_EXCEPT + { + vector4f tmp = vector_mul(vector_dup_x(lhs.x_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.x_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.x_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.x_axis), rhs.w_axis, tmp); + vector4f x_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.y_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.y_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.y_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.y_axis), rhs.w_axis, tmp); + vector4f y_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.z_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.z_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.z_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.z_axis), rhs.w_axis, tmp); + vector4f z_axis = tmp; + + tmp = vector_mul(vector_dup_x(lhs.w_axis), rhs.x_axis); + tmp = vector_mul_add(vector_dup_y(lhs.w_axis), rhs.y_axis, tmp); + tmp = vector_mul_add(vector_dup_z(lhs.w_axis), rhs.z_axis, tmp); + tmp = vector_mul_add(vector_dup_w(lhs.w_axis), rhs.w_axis, tmp); + vector4f w_axis = tmp; + + return matrix4x4f{ x_axis, y_axis, z_axis, w_axis }; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL inverse_pass_by_value(matrix4x4f_arg0 input) { + matrix4x4f output{}; + + vector4f v00 = vector_mix(input.z_axis, input.z_axis); + vector4f v01 = vector_mix(input.x_axis, input.x_axis); + vector4f v02 = vector_mix(input.z_axis, input.x_axis); + vector4f v10 = vector_mix(input.w_axis, input.w_axis); + vector4f v11 = vector_mix(input.y_axis, input.y_axis); + vector4f v12 = 
vector_mix(input.w_axis, input.y_axis); + + vector4f d0 = vector_mul(v00, v10); + vector4f d1 = vector_mul(v01, v11); + vector4f d2 = vector_mul(v02, v12); + + v00 = vector_mix(input.z_axis, input.z_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.z_axis, input.x_axis); + v10 = vector_mix(input.w_axis, input.w_axis); + v11 = vector_mix(input.y_axis, input.y_axis); + v12 = vector_mix(input.w_axis, input.y_axis); + + d0 = vector_neg_mul_sub(v00, v10, d0); + d1 = vector_neg_mul_sub(v01, v11, d1); + d2 = vector_neg_mul_sub(v02, v12, d2); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + vector4f v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + vector4f v13 = vector_mix(d1, d2); + + vector4f c0 = vector_mul(v00, v10); + vector4f c2 = vector_mul(v01, v11); + vector4f c4 = vector_mul(v02, v12); + vector4f c6 = vector_mul(v03, v13); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + c0 = vector_neg_mul_sub(v00, v10, c0); + c2 = vector_neg_mul_sub(v01, v11, c2); + c4 = vector_neg_mul_sub(v02, v12, c4); + c6 = vector_neg_mul_sub(v03, v13, c6); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + vector4f c1 = vector_neg_mul_sub(v00, v10, c0); + c0 = vector_mul_add(v00, v10, c0); + vector4f c3 = vector_mul_add(v01, v11, c2); + c2 = vector_neg_mul_sub(v01, v11, 
c2); + vector4f c5 = vector_neg_mul_sub(v02, v12, c4); + c4 = vector_mul_add(v02, v12, c4); + vector4f c7 = vector_mul_add(v03, v13, c6); + c6 = vector_neg_mul_sub(v03, v13, c6); + + vector4f x_axis = vector_mix(c0, c1); + vector4f y_axis = vector_mix(c2, c3); + vector4f z_axis = vector_mix(c4, c5); + vector4f w_axis = vector_mix(c6, c7); + + const scalarf det = vector_dot_as_scalar(x_axis, input.x_axis); + const scalarf inv_det_s = scalar_reciprocal(det); + const vector4f inv_det = vector_set(inv_det_s); + + + output.x_axis = vector_mul(x_axis, inv_det); + output.y_axis = vector_mul(y_axis, inv_det); + output.z_axis = vector_mul(z_axis, inv_det); + output.w_axis = vector_mul(w_axis, inv_det); + + return output; + } +} + +namespace { + vector4f g_translation1 = vector_set(4.1f, 4.2f, 4.3f, 1.0f); + vector4f g_translation2 = vector_set(3.1f, 3.2f, 3.3f, 1.0f); + vector4f g_scale = vector_set(1.0f, 1.0f, 1.0f, 1.0f); + vector4f g_orientation = vector_set(5.1f, 5.2f, 5.3f, 1.0f); +} + +class matrix4x4f_ref { +public: + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_ref() = default; + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_ref(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + m_value = make_tansform(translation, scale, orientation); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() { + return m_value; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() const { + return const_cast(this)->ref(); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_ref RTM_SIMD_CALL inverse() const { + matrix4x4f_ref result{}; + inverse_pass_by_ref(m_value, result.ref()); + return result; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_ref RTM_SIMD_CALL multi(const matrix4x4f_ref& other) const { + matrix4x4f_ref result{}; + multi_pass_by_ref(other.ref(), m_value, result.ref()); + return result; + } + 
+private: + matrix4x4f m_value{}; +}; + +class matrix4x4f_value { +public: + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_value() = default; + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_value(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + m_value = make_tansform(translation, scale, orientation); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() { + return m_value; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() const { + return const_cast(this)->ref(); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_value RTM_SIMD_CALL inverse() const { + matrix4x4f_value result{}; + result.ref() = inverse_pass_by_value(m_value); + return result; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_value RTM_SIMD_CALL multi(matrix4x4f other) const { + matrix4x4f_value result{}; + result.ref() = multi_pass_by_value(other, m_value); + return result; + } + +private: + matrix4x4f m_value{}; +}; + +static void bm_matrix_pass_by_ref(benchmark::State& state) { + auto vec = std::vector{}; + vec.resize(1000000); + + int index = 0; + for (auto _ : state) { + auto matrix1 = matrix4x4f_ref(g_translation1, g_scale, g_orientation); + auto matrix2 = matrix4x4f_ref(g_translation2, g_scale, g_orientation); + auto matrix3 = matrix1.multi(matrix2); + + vec[index++ % vec.size()] = matrix3.inverse(); + } + benchmark::DoNotOptimize(vec); +} + +BENCHMARK(bm_matrix_pass_by_ref); + +static void bm_matrix_pass_by_value(benchmark::State& state) { + auto vec = std::vector{}; + vec.resize(1000000); + + int index = 0; + for (auto _ : state) { + auto matrix1 = matrix4x4f_value(g_translation1, g_scale, g_orientation); + auto matrix2 = matrix4x4f_value(g_translation2, g_scale, g_orientation); + auto matrix3 = matrix1.multi(matrix2.ref()); + + vec[index++ % vec.size()] = matrix3.inverse(); + } + + benchmark::DoNotOptimize(vec); +} 
+ +BENCHMARK(bm_matrix_pass_by_value); \ No newline at end of file diff --git a/tools/bench/sources/bench_vector_mix.cpp b/tools/bench/sources/bench_vector_mix.cpp new file mode 100644 index 00000000..17c8ea93 --- /dev/null +++ b/tools/bench/sources/bench_vector_mix.cpp @@ -0,0 +1,320 @@ +//////////////////////////////////////////////////////////////////////////////// +// The MIT License (MIT) +// +// Copyright (c) 2024 Nicholas Frechette & Realtime Math contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+//////////////////////////////////////////////////////////////////////////////// + + +#include +#include +#include + +using namespace rtm; + +namespace { + template + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_mix_optimize(vector4f_arg0 input0, vector4f_arg1 input1) RTM_NO_EXCEPT + { +#if defined(__clang__) + constexpr int index0 = (int)comp0; + constexpr int index1 = (int)comp1; + constexpr int index2 = (int)comp2; + constexpr int index3 = (int)comp3; + return __builtin_shufflevector(input0, input1, index0, index1, index2, index3); +#endif + + return vector_mix(input0, input1); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL make_tansform(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + float quatval[4]; + quat_store(orientation, quatval); + + const float x2 = quatval[0] + quatval[0]; + const float y2 = quatval[1] + quatval[1]; + const float z2 = quatval[2] + quatval[2]; + const float xx = quatval[0] * x2; + const float xy = quatval[0] * y2; + const float xz = quatval[0] * z2; + const float yy = quatval[1] * y2; + const float yz = quatval[1] * z2; + const float zz = quatval[2] * z2; + const float wx = quatval[3] * x2; + const float wy = quatval[3] * y2; + const float wz = quatval[3] * z2; + + const scalarf scale_x = vector_get_x_as_scalar(scale); + const scalarf scale_y = vector_get_y_as_scalar(scale); + const scalarf scale_z = vector_get_z_as_scalar(scale); + + + vector4f x_axis = vector_mul(vector_set(1.0F - (yy + zz), xy + wz, xz - wy, 0.0F), scale_x); + vector4f y_axis = vector_mul(vector_set(xy - wz, 1.0F - (xx + zz), yz + wx, 0.0F), scale_y); + vector4f z_axis = vector_mul(vector_set(xz + wy, yz - wx, 1.0F - (xx + yy), 0.0F), scale_z); + vector4f w_axis = translation; + + return matrix4x4f{x_axis, y_axis, z_axis, w_axis}; + } + + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL inverse_normal(matrix4x4f_arg0 input) { + matrix4x4f 
output{}; + + vector4f v00 = vector_mix(input.z_axis, input.z_axis); + vector4f v01 = vector_mix(input.x_axis, input.x_axis); + vector4f v02 = vector_mix(input.z_axis, input.x_axis); + vector4f v10 = vector_mix(input.w_axis, input.w_axis); + vector4f v11 = vector_mix(input.y_axis, input.y_axis); + vector4f v12 = vector_mix(input.w_axis, input.y_axis); + + vector4f d0 = vector_mul(v00, v10); + vector4f d1 = vector_mul(v01, v11); + vector4f d2 = vector_mul(v02, v12); + + v00 = vector_mix(input.z_axis, input.z_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.z_axis, input.x_axis); + v10 = vector_mix(input.w_axis, input.w_axis); + v11 = vector_mix(input.y_axis, input.y_axis); + v12 = vector_mix(input.w_axis, input.y_axis); + + d0 = vector_neg_mul_sub(v00, v10, d0); + d1 = vector_neg_mul_sub(v01, v11, d1); + d2 = vector_neg_mul_sub(v02, v12, d2); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + vector4f v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + vector4f v13 = vector_mix(d1, d2); + + vector4f c0 = vector_mul(v00, v10); + vector4f c2 = vector_mul(v01, v11); + vector4f c4 = vector_mul(v02, v12); + vector4f c6 = vector_mul(v03, v13); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + c0 = vector_neg_mul_sub(v00, v10, c0); + c2 = vector_neg_mul_sub(v01, v11, c2); + c4 = vector_neg_mul_sub(v02, v12, c4); + c6 = vector_neg_mul_sub(v03, v13, c6); + + v00 = vector_mix(input.y_axis, input.y_axis); + v01 = vector_mix(input.x_axis, input.x_axis); + v02 = vector_mix(input.w_axis, input.w_axis); + v03 = 
vector_mix(input.z_axis, input.z_axis); + v10 = vector_mix(d0, d2); + v11 = vector_mix(d0, d2); + v12 = vector_mix(d1, d2); + v13 = vector_mix(d1, d2); + + vector4f c1 = vector_neg_mul_sub(v00, v10, c0); + c0 = vector_mul_add(v00, v10, c0); + vector4f c3 = vector_mul_add(v01, v11, c2); + c2 = vector_neg_mul_sub(v01, v11, c2); + vector4f c5 = vector_neg_mul_sub(v02, v12, c4); + c4 = vector_mul_add(v02, v12, c4); + vector4f c7 = vector_mul_add(v03, v13, c6); + c6 = vector_neg_mul_sub(v03, v13, c6); + + vector4f x_axis = vector_mix(c0, c1); + vector4f y_axis = vector_mix(c2, c3); + vector4f z_axis = vector_mix(c4, c5); + vector4f w_axis = vector_mix(c6, c7); + + const scalarf det = vector_dot_as_scalar(x_axis, input.x_axis); + const scalarf inv_det_s = scalar_reciprocal(det); + const vector4f inv_det = vector_set(inv_det_s); + + + output.x_axis = vector_mul(x_axis, inv_det); + output.y_axis = vector_mul(y_axis, inv_det); + output.z_axis = vector_mul(z_axis, inv_det); + output.w_axis = vector_mul(w_axis, inv_det); + + return output; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f RTM_SIMD_CALL inverse_optimize(matrix4x4f_arg0 input) { + matrix4x4f output{}; + + vector4f v00 = vector_mix_optimize(input.z_axis, input.z_axis); + vector4f v01 = vector_mix_optimize(input.x_axis, input.x_axis); + vector4f v02 = vector_mix_optimize(input.z_axis, input.x_axis); + vector4f v10 = vector_mix_optimize(input.w_axis, input.w_axis); + vector4f v11 = vector_mix_optimize(input.y_axis, input.y_axis); + vector4f v12 = vector_mix_optimize(input.w_axis, input.y_axis); + + vector4f d0 = vector_mul(v00, v10); + vector4f d1 = vector_mul(v01, v11); + vector4f d2 = vector_mul(v02, v12); + + v00 = vector_mix_optimize(input.z_axis, input.z_axis); + v01 = vector_mix_optimize(input.x_axis, input.x_axis); + v02 = vector_mix_optimize(input.z_axis, input.x_axis); + v10 = vector_mix_optimize(input.w_axis, input.w_axis); + v11 = vector_mix_optimize(input.y_axis, input.y_axis); + 
v12 = vector_mix_optimize(input.w_axis, input.y_axis); + + d0 = vector_neg_mul_sub(v00, v10, d0); + d1 = vector_neg_mul_sub(v01, v11, d1); + d2 = vector_neg_mul_sub(v02, v12, d2); + + v00 = vector_mix_optimize(input.y_axis, input.y_axis); + v01 = vector_mix_optimize(input.x_axis, input.x_axis); + v02 = vector_mix_optimize(input.w_axis, input.w_axis); + vector4f v03 = vector_mix_optimize(input.z_axis, input.z_axis); + v10 = vector_mix_optimize(d0, d2); + v11 = vector_mix_optimize(d0, d2); + v12 = vector_mix_optimize(d1, d2); + vector4f v13 = vector_mix_optimize(d1, d2); + + vector4f c0 = vector_mul(v00, v10); + vector4f c2 = vector_mul(v01, v11); + vector4f c4 = vector_mul(v02, v12); + vector4f c6 = vector_mul(v03, v13); + + v00 = vector_mix_optimize(input.y_axis, input.y_axis); + v01 = vector_mix_optimize(input.x_axis, input.x_axis); + v02 = vector_mix_optimize(input.w_axis, input.w_axis); + v03 = vector_mix_optimize(input.z_axis, input.z_axis); + v10 = vector_mix_optimize(d0, d2); + v11 = vector_mix_optimize(d0, d2); + v12 = vector_mix_optimize(d1, d2); + v13 = vector_mix_optimize(d1, d2); + + c0 = vector_neg_mul_sub(v00, v10, c0); + c2 = vector_neg_mul_sub(v01, v11, c2); + c4 = vector_neg_mul_sub(v02, v12, c4); + c6 = vector_neg_mul_sub(v03, v13, c6); + + v00 = vector_mix_optimize(input.y_axis, input.y_axis); + v01 = vector_mix_optimize(input.x_axis, input.x_axis); + v02 = vector_mix_optimize(input.w_axis, input.w_axis); + v03 = vector_mix_optimize(input.z_axis, input.z_axis); + v10 = vector_mix_optimize(d0, d2); + v11 = vector_mix_optimize(d0, d2); + v12 = vector_mix_optimize(d1, d2); + v13 = vector_mix_optimize(d1, d2); + + vector4f c1 = vector_neg_mul_sub(v00, v10, c0); + c0 = vector_mul_add(v00, v10, c0); + vector4f c3 = vector_mul_add(v01, v11, c2); + c2 = vector_neg_mul_sub(v01, v11, c2); + vector4f c5 = vector_neg_mul_sub(v02, v12, c4); + c4 = vector_mul_add(v02, v12, c4); + vector4f c7 = vector_mul_add(v03, v13, c6); + c6 = vector_neg_mul_sub(v03, v13, 
c6); + + vector4f x_axis = vector_mix_optimize(c0, c1); + vector4f y_axis = vector_mix_optimize(c2, c3); + vector4f z_axis = vector_mix_optimize(c4, c5); + vector4f w_axis = vector_mix_optimize(c6, c7); + + const scalarf det = vector_dot_as_scalar(x_axis, input.x_axis); + const scalarf inv_det_s = scalar_reciprocal(det); + const vector4f inv_det = vector_set(inv_det_s); + + + output.x_axis = vector_mul(x_axis, inv_det); + output.y_axis = vector_mul(y_axis, inv_det); + output.z_axis = vector_mul(z_axis, inv_det); + output.w_axis = vector_mul(w_axis, inv_det); + + return output; + } +} + +namespace { + vector4f g_translation1 = vector_set(4.1f, 4.2f, 4.3f, 1.0f); + vector4f g_scale = vector_set(1.0f, 1.0f, 1.0f, 1.0f); + vector4f g_orientation = vector_set(5.1f, 5.2f, 5.3f, 1.0f); +} + +class matrix4x4f_agent { +public: + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_agent() = default; + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_agent(vector4f_arg0 translation, vector4f_arg1 scale, quatf_arg2 orientation) { + m_value = make_tansform(translation, scale, orientation); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() { + return m_value; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f& RTM_SIMD_CALL ref() const { + return const_cast(this)->ref(); + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_agent RTM_SIMD_CALL inverse_this_normal() const { + matrix4x4f_agent result{}; + result.ref() = inverse_normal(m_value); + return result; + } + + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE matrix4x4f_agent RTM_SIMD_CALL inverse_this_optimize() const { + matrix4x4f_agent result{}; + result.ref() = inverse_optimize(m_value); + return result; + } + +private: + matrix4x4f m_value{}; +}; + +static void bm_vector_mix_normal(benchmark::State& state) { + auto vec = std::vector{}; + vec.resize(1000000); + + int index = 0; + for (auto _ : state) { + vec[index++ % 
vec.size()] = matrix4x4f_agent(g_translation1, g_scale, g_orientation).inverse_this_normal(); + } + + benchmark::DoNotOptimize(vec); +} + +BENCHMARK(bm_vector_mix_normal); + +static void bm_vector_mix_optimize(benchmark::State& state) { + auto vec = std::vector{}; + vec.resize(1000000); + + int index = 0; + for (auto _ : state) { + vec[index++ % vec.size()] = matrix4x4f_agent(g_translation1, g_scale, g_orientation).inverse_this_optimize(); + } + + benchmark::DoNotOptimize(vec); +} + +BENCHMARK(bm_vector_mix_optimize); \ No newline at end of file