diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 6309801c4e49..fe64f8c5bdac 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -1116,11 +1116,6 @@ inline void Transpose4x4(float out[16], const float in[16]) {
 	}
 }
 
-inline float Vec3Dot(const float v1[3], const float v2[3])
-{
-	return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
-}
-
 namespace Math3D {
 
 template<typename T>
diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp
index 1709db488952..58b7e0bd3e24 100644
--- a/GPU/Software/Lighting.cpp
+++ b/GPU/Software/Lighting.cpp
@@ -219,9 +219,10 @@ static inline __m128i LightColorScaleBy512SSE4(__m128i factor, __m128i color, __
 }
 #endif
 
+template <bool useSSE4>
 static inline int LightCeil(float f) {
 #if defined(_M_SSE)
-	if (cpu_info.bSSE4_1)
+	if (useSSE4)
 		return LightCeilSSE4(f);
 #elif PPSSPP_ARCH(ARM64_NEON)
 	return vcvtps_s32_f32(f);
@@ -229,12 +230,13 @@ static inline int LightCeil(float f) {
 	return (int)ceilf(f);
 }
 
+template <bool useSSE4>
 static Vec4<int> LightColorScaleBy512(const Vec4<int> &factor, const Vec4<int> &color, int scale) {
 	// We multiply s9 * s9 * s9, resulting in s27, then shift off 19 to get 8-bit.
 	// The reason all factors are s9 is to account for rounding.
 	// Also note that all values are positive, so can be treated as unsigned.
 #if defined(_M_SSE) && !PPSSPP_ARCH(X86)
-	if (cpu_info.bSSE4_1)
+	if (useSSE4)
 		return LightColorScaleBy512SSE4(factor.ivec, color.ivec, _mm_set1_epi32(scale));
 #elif PPSSPP_ARCH(ARM64_NEON)
 	int32x4_t multiplied = vmulq_n_s32(vmulq_s32(factor.ivec, color.ivec), scale);
@@ -253,7 +255,34 @@ static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {
 #endif
 }
 
-void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) {
+#if defined(_M_SSE)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]  // Allows SSE4.1 instructions inside this one function on GCC-compatible compilers.
+#endif
+static inline __m128 Dot33SSE4(__m128 a, __m128 b) {  // Lane 0 of the result holds a0*b0 + a1*b1 + a2*b2.
+	__m128 multiplied = _mm_insert_ps(_mm_mul_ps(a, b), _mm_setzero_ps(), 0x30);  // Per-lane products; imm 0x30 copies lane 0 of the zero vector into lane 3, so m3 = 0.
+	__m128 lanes3311 = _mm_movehdup_ps(multiplied);  // (m1, m1, m3, m3): the odd lanes duplicated downwards.
+	__m128 partial = _mm_add_ps(multiplied, lanes3311);  // Lane 0 = m0 + m1, lane 2 = m2 + m3.
+	return _mm_add_ss(partial, _mm_movehl_ps(lanes3311, partial));  // movehl moves partial's lane 2 into lane 0; add_ss adds lane 0 only, lanes 1-3 stay as in partial.
+}
+#endif
+
+// Three-component dot product of a and b; the SIMD paths zero lane 3 before summing.
+template <bool useSSE4>
+static inline float Dot33(const Vec3f &a, const Vec3f &b) {
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+	if (useSSE4) return _mm_cvtss_f32(Dot33SSE4(a.vec, b.vec));  // Lane 0 carries the sum.
+#elif PPSSPP_ARCH(ARM64_NEON)
+	// Clear lane 3 of the products, then pairwise-add down to a single value.
+	const float32x4_t products = vsetq_lane_f32(0.0f, vmulq_f32(a.vec, b.vec), 3);
+	const float32x2_t halves = vget_low_f32(vpaddq_f32(products, products));
+	return vget_lane_f32(vpadd_f32(halves, halves), 0);
+#endif
+	return Dot(a, b);  // Generic fallback when no SIMD path was taken.
+}
+
+template <bool useSSE4>
+static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) {
 	// Lighting blending rounds using the half offset method (like alpha blend.)
 	const Vec4<int> ones = Vec4<int>::AssignToAll(1);
 	Vec4<int> colorFactor;
@@ -282,7 +311,7 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 			// TODO: Should this normalize (0, 0, 0) to (0, 0, 1)?
 			float d = L.NormalizeOr001();
 
-			att = 1.0f / Dot(lstate.att, Vec3f(1.0f, d, d * d));
+			att = 1.0f / Dot33<useSSE4>(lstate.att, Vec3f(1.0f, d, d * d));
 			if (!(att > 0.0f))
 				att = 0.0f;
 			else if (att > 1.0f)
@@ -291,7 +320,7 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 
 		float spot = 1.0f;
 		if (lstate.spot) {
-			float rawSpot = Dot(lstate.spotDir, L);
+			float rawSpot = Dot33<useSSE4>(lstate.spotDir, L);
 			if (std::isnan(rawSpot))
 				rawSpot = std::signbit(rawSpot) ? 0.0f : 1.0f;
 
@@ -306,44 +335,44 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 
 		// ambient lighting
 		if (lstate.ambient) {
-			int attspot = (int)LightCeil(256 * 2 * att * spot + 1);
+			int attspot = (int)LightCeil<useSSE4>(256 * 2 * att * spot + 1);
 			if (attspot > 512)
 				attspot = 512;
-			Vec4<int> lambient = LightColorScaleBy512(lstate.ambientColorFactor, mac, attspot);
+			Vec4<int> lambient = LightColorScaleBy512<useSSE4>(lstate.ambientColorFactor, mac, attspot);
 			LightColorSum(final_color, lambient);
 		}
 
 		// diffuse lighting
 		float diffuse_factor;
 		if (lstate.diffuse || lstate.specular) {
-			diffuse_factor = Dot(L, worldnormal);
+			diffuse_factor = Dot33<useSSE4>(L, worldnormal);
 			if (lstate.poweredDiffuse) {
 				diffuse_factor = pspLightPow(diffuse_factor, state.specularExp);
 			}
 		}
 
 		if (lstate.diffuse && diffuse_factor > 0.0f) {
-			int diffuse_attspot = (int)LightCeil(256 * 2 * att * spot * diffuse_factor + 1);
+			int diffuse_attspot = (int)LightCeil<useSSE4>(256 * 2 * att * spot * diffuse_factor + 1);
 			if (diffuse_attspot > 512)
 				diffuse_attspot = 512;
 			Vec4<int> mdc = state.colorForDiffuse ? colorFactor : state.material.diffuseColorFactor;
-			Vec4<int> ldiffuse = LightColorScaleBy512(lstate.diffuseColorFactor, mdc, diffuse_attspot);
+			Vec4<int> ldiffuse = LightColorScaleBy512<useSSE4>(lstate.diffuseColorFactor, mdc, diffuse_attspot);
 			LightColorSum(final_color, ldiffuse);
 		}
 
 		if (lstate.specular && diffuse_factor >= 0.0f) {
 			Vec3<float> H = L + Vec3<float>(0.f, 0.f, 1.f);
 
-			float specular_factor = Dot(H.NormalizedOr001(cpu_info.bSSE4_1), worldnormal);
+			float specular_factor = Dot33<useSSE4>(H.NormalizedOr001(useSSE4), worldnormal);
 			specular_factor = pspLightPow(specular_factor, state.specularExp);
 
 			if (specular_factor > 0.0f) {
-				int specular_attspot = (int)LightCeil(256 * 2 * att * spot * specular_factor + 1);
+				int specular_attspot = (int)LightCeil<useSSE4>(256 * 2 * att * spot * specular_factor + 1);
 				if (specular_attspot > 512)
 					specular_attspot = 512;
 
 				Vec4<int> msc = state.colorForSpecular ? colorFactor : state.material.specularColorFactor;
-				Vec4<int> lspecular = LightColorScaleBy512(lstate.specularColorFactor, msc, specular_attspot);
+				Vec4<int> lspecular = LightColorScaleBy512<useSSE4>(lstate.specularColorFactor, msc, specular_attspot);
 				LightColorSum(specular_color, lspecular);
 			}
 		}
@@ -360,4 +389,14 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords
 	}
 }
 
+// Entry point: forwards to the ProcessSIMD instantiation selected by the SSE4.1 check below.
+void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) {
+#ifdef _M_SSE
+	// The CPU flag is read once here; ProcessSIMD receives the outcome as its template argument.
+	if (cpu_info.bSSE4_1)
+		return ProcessSIMD<true>(vertex, worldpos, worldnormal, state);
+#endif
+	ProcessSIMD<false>(vertex, worldpos, worldnormal, state);
+}
+
 } // namespace
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index b7afe1851ad6..8fb0ea66ddf9 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -332,6 +332,31 @@ void ComputeTransformState(TransformState *state, const VertexReader &vreader) {
 		state->roundToScreen = &ClipToScreenInternal<false, false>;
 }
 
+#if defined(_M_SSE)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]  // Allows SSE4.1 instructions inside this one function on GCC-compatible compilers.
+#endif
+static inline __m128 Dot43SSE4(__m128 a, __m128 b) {  // Lane 0 of the result holds a0*b0 + a1*b1 + a2*b2 + a3.
+	__m128 multiplied = _mm_mul_ps(a, _mm_insert_ps(b, _mm_set1_ps(1.0f), 0x30));  // imm 0x30 copies lane 0 of the 1.0f splat into lane 3 of b, so m3 = a3 * 1.
+	__m128 lanes3311 = _mm_movehdup_ps(multiplied);  // (m1, m1, m3, m3): the odd lanes duplicated downwards.
+	__m128 partial = _mm_add_ps(multiplied, lanes3311);  // Lane 0 = m0 + m1, lane 2 = m2 + m3.
+	return _mm_add_ss(partial, _mm_movehl_ps(lanes3311, partial));  // movehl moves partial's lane 2 into lane 0; add_ss adds lane 0 only, lanes 1-3 stay as in partial.
+}
+#endif
+
+// Dot product of the 4-vector a with b extended by a fourth component of 1.0.
+static inline float Dot43(const Vec4f &a, const Vec3f &b) {
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+	if (cpu_info.bSSE4_1) return _mm_cvtss_f32(Dot43SSE4(a.vec, b.vec));  // Lane 0 carries the sum.
+#elif PPSSPP_ARCH(ARM64_NEON)
+	// Force lane 3 of b to 1.0 before multiplying, then pairwise-add down to a single value.
+	const float32x4_t products = vmulq_f32(a.vec, vsetq_lane_f32(1.0f, b.vec, 3));
+	const float32x2_t halves = vget_low_f32(vpaddq_f32(products, products));
+	return vget_lane_f32(vpadd_f32(halves, halves), 0);
+#endif
+	return Dot(a, Vec4f(b, 1.0f));
+}
+
 ClipVertexData TransformUnit::ReadVertex(const VertexReader &vreader, const TransformState &state) {
 	PROFILE_THIS_SCOPE("read_vert");
 	// If we ever thread this, we'll have to change this.
@@ -396,7 +421,7 @@ ClipVertexData TransformUnit::ReadVertex(const VertexReader &vreader, const Tran
 		}
 
 		if (state.enableFog) {
-			vertex.v.fogdepth = Dot(state.posToFog, Vec4f(pos, 1.0f));
+			vertex.v.fogdepth = Dot43(state.posToFog, pos);
 		} else {
 			vertex.v.fogdepth = 1.0f;
 		}