CrossSIMD: make the transpose function compatible with ARM32

hrydgard · Dec 21, 2024 · 0dd55d9 · 0dd55d9
1 parent 5e2d097
commit 0dd55d9
Showing 1 changed file with 10 additions and 0 deletions.
diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
@@ -274,6 +274,8 @@ struct Vec4F32 {
 
 	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+#if PPSSPP_ARCH(ARM64_NEON)
+		// vzip1q_f32/vzip2q_f32 are only available on ARM64 (AArch64); 32-bit ARM uses the vtrnq_f32 path in the #else branch.
 		float32x4_t temp0 = vzip1q_f32(col0.v, col2.v);
 		float32x4_t temp1 = vzip2q_f32(col0.v, col2.v);
 		float32x4_t temp2 = vzip1q_f32(col1.v, col3.v);
@@ -282,6 +284,14 @@ struct Vec4F32 {
 		col1.v = vzip2q_f32(temp0, temp2);
 		col2.v = vzip1q_f32(temp1, temp3);
 		col3.v = vzip2q_f32(temp1, temp3);
+#else
+   		float32x4x2_t col01 = vtrnq_f32(col0.v, col1.v);
+        float32x4x2_t col23 = vtrnq_f32(col2.v, col3.v);
+        col0.v = vcombine_f32(vget_low_f32(col01.val[0]), vget_low_f32(col23.val[0]));
+        col1.v = vcombine_f32(vget_low_f32(col01.val[1]), vget_low_f32(col23.val[1]));
+        col2.v = vcombine_f32(vget_high_f32(col01.val[0]), vget_high_f32(col23.val[0]));
+        col3.v = vcombine_f32(vget_high_f32(col01.val[1]), vget_high_f32(col23.val[1]));
+#endif
 	}
 
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {