From a164f77f47ba58567ecfe242a9761203d1a156df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 5 Jun 2023 10:28:19 +0200 Subject: [PATCH 1/4] VertexDecoderX86 (64-bit only): Avoid a memory access per loop iteration for alpha --- GPU/Common/VertexDecoderX86.cpp | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index 3340961f8ea6..004185dc593f 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -60,6 +60,7 @@ static const X64Reg tempReg3 = R10; static const X64Reg srcReg = RCX; static const X64Reg dstReg = RDX; static const X64Reg counterReg = R8; +static const X64Reg alphaReg = R11; #else static const X64Reg tempReg1 = RAX; static const X64Reg tempReg2 = R9; @@ -67,6 +68,7 @@ static const X64Reg tempReg3 = R10; static const X64Reg srcReg = RDI; static const X64Reg dstReg = RSI; static const X64Reg counterReg = RDX; +static const X64Reg alphaReg = R11; #endif #else static const X64Reg tempReg1 = EAX; @@ -201,6 +203,16 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int MOVUPS(MDisp(ESP, 80), XMM9); #endif + // Initialize alpha reg if possible. TODO: Only do if color values with alpha are used. +#if PPSSPP_ARCH(AMD64) + if (RipAccessible(&gstate_c.vertexFullAlpha)) { + MOV(8, R(alphaReg), M(&gstate_c.vertexFullAlpha)); // rip accessible + } else { + MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha)); + MOV(8, R(alphaReg), MatR(tempReg1)); + } +#endif + bool prescaleStep = false; // Look for prescaled texcoord steps for (int i = 0; i < dec.numSteps_; i++) { @@ -243,6 +255,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int } // Keep the scale/offset in a few fp registers if we need it. + // TODO: Read it from an argument pointer instead of gstate_c.uv. 
if (prescaleStep) { MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.uv)); MOVUPS(fpScaleOffsetReg, MatR(tempReg1)); @@ -271,6 +284,16 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int SUB(32, R(counterReg), Imm8(1)); J_CC(CC_NZ, loopStart, true); + // Writeback alpha reg +#if PPSSPP_ARCH(AMD64) + if (RipAccessible(&gstate_c.vertexFullAlpha)) { + MOV(8, M(&gstate_c.vertexFullAlpha), R(alphaReg)); // rip accessible + } else { + MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha)); + MOV(8, MatR(tempReg1), R(alphaReg)); + } +#endif + MOVUPS(XMM4, MDisp(ESP, 0)); MOVUPS(XMM5, MDisp(ESP, 16)); MOVUPS(XMM6, MDisp(ESP, 32)); @@ -930,12 +953,16 @@ void VertexDecoderJitCache::Jit_Color8888() { CMP(32, R(tempReg1), Imm32(0xFF000000)); FixupBranch skip = J_CC(CC_AE, false); +#if PPSSPP_ARCH(AMD64) + XOR(32, R(alphaReg), R(alphaReg)); +#else if (RipAccessible(&gstate_c.vertexFullAlpha)) { MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0)); // rip accessible } else { MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha)); MOV(8, MatR(tempReg1), Imm8(0)); } +#endif SetJumpTarget(skip); } @@ -965,12 +992,16 @@ void VertexDecoderJitCache::Jit_Color4444() { CMP(32, R(tempReg1), Imm32(0xFF000000)); FixupBranch skip = J_CC(CC_AE, false); +#if PPSSPP_ARCH(AMD64) + XOR(32, R(alphaReg), R(alphaReg)); +#else if (RipAccessible(&gstate_c.vertexFullAlpha)) { MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0)); // rip accessible } else { MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha)); MOV(8, MatR(tempReg1), Imm8(0)); } +#endif SetJumpTarget(skip); } @@ -1044,12 +1075,16 @@ void VertexDecoderJitCache::Jit_Color5551() { // Let's AND to avoid a branch, tempReg1 has alpha only in the top 8 bits. 
SHR(32, R(tempReg1), Imm8(24)); +#if PPSSPP_ARCH(AMD64) + AND(8, R(alphaReg), R(tempReg1)); +#else if (RipAccessible(&gstate_c.vertexFullAlpha)) { AND(8, M(&gstate_c.vertexFullAlpha), R(tempReg1)); // rip accessible } else { MOV(PTRBITS, R(tempReg3), ImmPtr(&gstate_c.vertexFullAlpha)); AND(8, MatR(tempReg3), R(tempReg1)); } +#endif } void VertexDecoderJitCache::Jit_Color8888Morph() { @@ -1258,12 +1293,16 @@ void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) { if (checkAlpha) { CMP(32, R(tempReg1), Imm32(0xFF000000)); FixupBranch skip = J_CC(CC_AE, false); +#if PPSSPP_ARCH(AMD64) + XOR(32, R(alphaReg), R(alphaReg)); +#else if (RipAccessible(&gstate_c.vertexFullAlpha)) { MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0)); // rip accessible } else { MOV(PTRBITS, R(tempReg2), ImmPtr(&gstate_c.vertexFullAlpha)); MOV(8, MatR(tempReg2), Imm8(0)); } +#endif SetJumpTarget(skip); } else { // Force alpha to full if we're not checking it. From c9aa3479a484ee91e911e63b9e73e69b99d9efb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 11 Jun 2023 17:13:31 +0200 Subject: [PATCH 2/4] Make vertexFullAlpha-in-register work the same as on ARM. --- GPU/Common/VertexDecoderX86.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index 004185dc593f..b40a1b071dbb 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -203,15 +203,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int MOVUPS(MDisp(ESP, 80), XMM9); #endif - // Initialize alpha reg if possible. TODO: Only do if color values with alpha are used. 
-#if PPSSPP_ARCH(AMD64) - if (RipAccessible(&gstate_c.vertexFullAlpha)) { - MOV(8, R(alphaReg), M(&gstate_c.vertexFullAlpha)); // rip accessible - } else { - MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha)); - MOV(8, R(alphaReg), MatR(tempReg1)); + // Initialize alpha reg. + if (dec.col) { + MOV(32, R(alphaReg), Imm32(1)); } -#endif bool prescaleStep = false; // Look for prescaled texcoord steps @@ -286,11 +281,16 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int // Writeback alpha reg #if PPSSPP_ARCH(AMD64) - if (RipAccessible(&gstate_c.vertexFullAlpha)) { - MOV(8, M(&gstate_c.vertexFullAlpha), R(alphaReg)); // rip accessible - } else { - MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha)); - MOV(8, MatR(tempReg1), R(alphaReg)); + if (dec.col) { + CMP(32, R(alphaReg), Imm32(1)); + FixupBranch alphaJump = J_CC(CC_NE, false); + if (RipAccessible(&gstate_c.vertexFullAlpha)) { + MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0)); // rip accessible + } else { + MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.vertexFullAlpha)); + MOV(8, MatR(tempReg1), Imm8(0)); // not rip accessible + } + SetJumpTarget(alphaJump); } #endif @@ -954,6 +954,7 @@ void VertexDecoderJitCache::Jit_Color8888() { CMP(32, R(tempReg1), Imm32(0xFF000000)); FixupBranch skip = J_CC(CC_AE, false); #if PPSSPP_ARCH(AMD64) + // Would like to use CMOV or SetCC but CMOV doesn't take immediates and SetCC isn't right. So...
XOR(32, R(alphaReg), R(alphaReg)); #else if (RipAccessible(&gstate_c.vertexFullAlpha)) { From 1a1462ecb052cf63aa64b9b7efe3885a5b53c5c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 12 Jun 2023 17:44:49 +0200 Subject: [PATCH 3/4] x86 buildfix, warning fix --- GPU/Common/ReplacedTexture.h | 2 +- GPU/Common/VertexDecoderX86.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/GPU/Common/ReplacedTexture.h b/GPU/Common/ReplacedTexture.h index 4273ffead53a..7474bae11523 100644 --- a/GPU/Common/ReplacedTexture.h +++ b/GPU/Common/ReplacedTexture.h @@ -128,7 +128,7 @@ class ReplacedTexture { void GetSize(int level, int *w, int *h) const { _dbg_assert_(State() == ReplacementState::ACTIVE); - _dbg_assert_(level < levels_.size()); + _dbg_assert_((size_t)level < levels_.size()); *w = levels_[level].fullW; *h = levels_[level].fullH; } diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index b40a1b071dbb..536816c2e5d4 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -204,9 +204,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int #endif // Initialize alpha reg. 
+#if PPSSPP_ARCH(AMD64) if (dec.col) { MOV(32, R(alphaReg), Imm32(1)); } +#endif bool prescaleStep = false; // Look for prescaled texcoord steps From d957f6b0be85ebb361efcd349442fdfef10b3832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 12 Jun 2023 19:45:34 +0200 Subject: [PATCH 4/4] Of course got the check backwards --- GPU/Common/VertexDecoderX86.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index 536816c2e5d4..5828689ea3a6 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -285,7 +285,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int #if PPSSPP_ARCH(AMD64) if (dec.col) { CMP(32, R(alphaReg), Imm32(1)); - FixupBranch alphaJump = J_CC(CC_NE, false); + FixupBranch alphaJump = J_CC(CC_E, false); if (RipAccessible(&gstate_c.vertexFullAlpha)) { MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0)); // rip accessible } else { @@ -1076,7 +1076,7 @@ void VertexDecoderJitCache::Jit_Color5551() { MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); - // Let's AND to avoid a branch, tempReg1 has alpha only in the top 8 bits. + // Let's AND to avoid a branch, tempReg1 has alpha only in the top 8 bits, and they're all equal. SHR(32, R(tempReg1), Imm8(24)); #if PPSSPP_ARCH(AMD64) AND(8, R(alphaReg), R(tempReg1));