diff --git a/GPU/Math3D.h b/GPU/Math3D.h index 6309801c4e49..fe64f8c5bdac 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -1116,11 +1116,6 @@ inline void Transpose4x4(float out[16], const float in[16]) { } } -inline float Vec3Dot(const float v1[3], const float v2[3]) -{ - return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]; -} - namespace Math3D { template diff --git a/GPU/Software/Lighting.cpp b/GPU/Software/Lighting.cpp index 1709db488952..58b7e0bd3e24 100644 --- a/GPU/Software/Lighting.cpp +++ b/GPU/Software/Lighting.cpp @@ -219,9 +219,10 @@ static inline __m128i LightColorScaleBy512SSE4(__m128i factor, __m128i color, __ } #endif +template static inline int LightCeil(float f) { #if defined(_M_SSE) - if (cpu_info.bSSE4_1) + if (useSSE4) return LightCeilSSE4(f); #elif PPSSPP_ARCH(ARM64_NEON) return vcvtps_s32_f32(f); @@ -229,12 +230,13 @@ static inline int LightCeil(float f) { return (int)ceilf(f); } +template static Vec4 LightColorScaleBy512(const Vec4 &factor, const Vec4 &color, int scale) { // We multiply s9 * s9 * s9, resulting in s27, then shift off 19 to get 8-bit. // The reason all factors are s9 is to account for rounding. // Also note that all values are positive, so can be treated as unsigned. #if defined(_M_SSE) && !PPSSPP_ARCH(X86) - if (cpu_info.bSSE4_1) + if (useSSE4) return LightColorScaleBy512SSE4(factor.ivec, color.ivec, _mm_set1_epi32(scale)); #elif PPSSPP_ARCH(ARM64_NEON) int32x4_t multiplied = vmulq_n_s32(vmulq_s32(factor.ivec, color.ivec), scale); @@ -253,7 +255,34 @@ static inline void LightColorSum(Vec4 &sum, const Vec4 &src) { #endif } -void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) { +#if defined(_M_SSE) +#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) +[[gnu::target("sse4.1")]] +#endif +static inline __m128 Dot33SSE4(__m128 a, __m128 b) { + __m128 multiplied = _mm_insert_ps(_mm_mul_ps(a, b), _mm_setzero_ps(), 0x30); + __m128 lanes3311 = _mm_movehdup_ps(multiplied); + __m128 partial = _mm_add_ps(multiplied, lanes3311); + return _mm_add_ss(partial, _mm_movehl_ps(lanes3311, partial)); +} +#endif + +template +static inline float Dot33(const Vec3f &a, const Vec3f &b) { +#if defined(_M_SSE) && !PPSSPP_ARCH(X86) + if (useSSE4) + return _mm_cvtss_f32(Dot33SSE4(a.vec, b.vec)); +#elif PPSSPP_ARCH(ARM64_NEON) + float32x4_t multipled = vsetq_lane_f32(0.0f, vmulq_f32(a.vec, b.vec), 3); + float32x2_t add1 = vget_low_f32(vpaddq_f32(multipled, multipled)); + float32x2_t add2 = vpadd_f32(add1, add1); + return vget_lane_f32(add2, 0); +#endif + return Dot(a, b); +} + +template +static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) { // Lighting blending rounds using the half offset method (like alpha blend.) const Vec4 ones = Vec4::AssignToAll(1); Vec4 colorFactor; @@ -282,7 +311,7 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords // TODO: Should this normalize (0, 0, 0) to (0, 0, 1)? float d = L.NormalizeOr001(); - att = 1.0f / Dot(lstate.att, Vec3f(1.0f, d, d * d)); + att = 1.0f / Dot33(lstate.att, Vec3f(1.0f, d, d * d)); if (!(att > 0.0f)) att = 0.0f; else if (att > 1.0f) @@ -291,7 +320,7 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords float spot = 1.0f; if (lstate.spot) { - float rawSpot = Dot(lstate.spotDir, L); + float rawSpot = Dot33(lstate.spotDir, L); if (std::isnan(rawSpot)) rawSpot = std::signbit(rawSpot) ? 0.0f : 1.0f; @@ -306,44 +335,44 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords // ambient lighting if (lstate.ambient) { - int attspot = (int)LightCeil(256 * 2 * att * spot + 1); + int attspot = (int)LightCeil(256 * 2 * att * spot + 1); if (attspot > 512) attspot = 512; - Vec4 lambient = LightColorScaleBy512(lstate.ambientColorFactor, mac, attspot); + Vec4 lambient = LightColorScaleBy512(lstate.ambientColorFactor, mac, attspot); LightColorSum(final_color, lambient); } // diffuse lighting float diffuse_factor; if (lstate.diffuse || lstate.specular) { - diffuse_factor = Dot(L, worldnormal); + diffuse_factor = Dot33(L, worldnormal); if (lstate.poweredDiffuse) { diffuse_factor = pspLightPow(diffuse_factor, state.specularExp); } } if (lstate.diffuse && diffuse_factor > 0.0f) { - int diffuse_attspot = (int)LightCeil(256 * 2 * att * spot * diffuse_factor + 1); + int diffuse_attspot = (int)LightCeil(256 * 2 * att * spot * diffuse_factor + 1); if (diffuse_attspot > 512) diffuse_attspot = 512; Vec4 mdc = state.colorForDiffuse ? colorFactor : state.material.diffuseColorFactor; - Vec4 ldiffuse = LightColorScaleBy512(lstate.diffuseColorFactor, mdc, diffuse_attspot); + Vec4 ldiffuse = LightColorScaleBy512(lstate.diffuseColorFactor, mdc, diffuse_attspot); LightColorSum(final_color, ldiffuse); } if (lstate.specular && diffuse_factor >= 0.0f) { Vec3 H = L + Vec3(0.f, 0.f, 1.f); - float specular_factor = Dot(H.NormalizedOr001(cpu_info.bSSE4_1), worldnormal); + float specular_factor = Dot33(H.NormalizedOr001(useSSE4), worldnormal); specular_factor = pspLightPow(specular_factor, state.specularExp); if (specular_factor > 0.0f) { - int specular_attspot = (int)LightCeil(256 * 2 * att * spot * specular_factor + 1); + int specular_attspot = (int)LightCeil(256 * 2 * att * spot * specular_factor + 1); if (specular_attspot > 512) specular_attspot = 512; Vec4 msc = state.colorForSpecular ? colorFactor : state.material.specularColorFactor; - Vec4 lspecular = LightColorScaleBy512(lstate.specularColorFactor, msc, specular_attspot); + Vec4 lspecular = LightColorScaleBy512(lstate.specularColorFactor, msc, specular_attspot); LightColorSum(specular_color, lspecular); } } @@ -360,4 +389,14 @@ void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords } } +void Process(VertexData &vertex, const WorldCoords &worldpos, const WorldCoords &worldnormal, const State &state) { +#ifdef _M_SSE + if (cpu_info.bSSE4_1) { + ProcessSIMD(vertex, worldpos, worldnormal, state); + return; + } +#endif + ProcessSIMD(vertex, worldpos, worldnormal, state); +} + } // namespace diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index b7afe1851ad6..8fb0ea66ddf9 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -332,6 +332,31 @@ void ComputeTransformState(TransformState *state, const VertexReader &vreader) { state->roundToScreen = &ClipToScreenInternal; } +#if defined(_M_SSE) +#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) +[[gnu::target("sse4.1")]] +#endif +static inline __m128 Dot43SSE4(__m128 a, __m128 b) { + __m128 multiplied = _mm_mul_ps(a, _mm_insert_ps(b, _mm_set1_ps(1.0f), 0x30)); + __m128 lanes3311 = _mm_movehdup_ps(multiplied); + __m128 partial = _mm_add_ps(multiplied, lanes3311); + return _mm_add_ss(partial, _mm_movehl_ps(lanes3311, partial)); +} +#endif + +static inline float Dot43(const Vec4f &a, const Vec3f &b) { +#if defined(_M_SSE) && !PPSSPP_ARCH(X86) + if (cpu_info.bSSE4_1) + return _mm_cvtss_f32(Dot43SSE4(a.vec, b.vec)); +#elif PPSSPP_ARCH(ARM64_NEON) + float32x4_t multipled = vmulq_f32(a.vec, vsetq_lane_f32(1.0f, b.vec, 3)); + float32x2_t add1 = vget_low_f32(vpaddq_f32(multipled, multipled)); + float32x2_t add2 = vpadd_f32(add1, add1); + return vget_lane_f32(add2, 0); +#endif + return Dot(a, Vec4f(b, 1.0f)); +} + ClipVertexData TransformUnit::ReadVertex(const VertexReader &vreader, const TransformState &state) { PROFILE_THIS_SCOPE("read_vert"); // If we ever thread this, we'll have to change this. @@ -396,7 +421,7 @@ ClipVertexData TransformUnit::ReadVertex(const VertexReader &vreader, const Tran } if (state.enableFog) { - vertex.v.fogdepth = Dot(state.posToFog, Vec4f(pos, 1.0f)); + vertex.v.fogdepth = Dot43(state.posToFog, pos); } else { vertex.v.fogdepth = 1.0f; }