ARM64: Optimize saved registers in vertex decoder. #17569

Merged · 1 commit · Jun 13, 2023
Common/Arm64Emitter.h (2 changes: 1 addition & 1 deletion)

@@ -94,7 +94,7 @@ enum ARM64Reg

 // R19-R28. R29 (FP), R30 (LR) are always saved and FP updated appropriately.
 const u32 ALL_CALLEE_SAVED = 0x1FF80000;
-const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // d8-d15
+const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // q8-q15
 
 inline bool Is64Bit(ARM64Reg reg) { return (reg & 0x20) != 0; }
 inline bool IsSingle(ARM64Reg reg) { return (reg & 0xC0) == 0x40; }
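A note on the mask convention, for reading this change: bit N of a register mask selects register number N, so 0x0000FF00 is bits 8-15, i.e. v8-v15, the vector registers the AArch64 ABI makes (partially) callee-saved. The comment fix suggests ABI_PushRegisters spills these as full 128-bit q registers rather than just their d halves. A minimal sketch of walking such a mask, with PushQuad() as a hypothetical stand-in for the emitter's actual store logic:

	// Sketch only: enumerate the registers named by a bitmask such as
	// ALL_CALLEE_SAVED_FP. Bit N set => vector register QN.
	void PushFromMask(uint64_t mask) {
		for (int i = 0; i < 32; i++) {
			if (mask & (1ULL << i))
				PushQuad(i);  // hypothetical: spill the full 128-bit QN
		}
	}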
GPU/Common/VertexDecoderArm64.cpp (43 changes: 23 additions & 20 deletions)

@@ -64,9 +64,9 @@ static const ARM64Reg neonScratchRegQ = Q2;
 static const ARM64Reg neonUVScaleReg = D0;
 static const ARM64Reg neonUVOffsetReg = D1;
 
-static const ARM64Reg src[3] = {S2, S3, S8};
-static const ARM64Reg srcD[3] = {D2, D3, D8};
-static const ARM64Reg srcQ[3] = {Q2, Q3, Q8};
+static const ARM64Reg src[2] = {S2, S3};
+static const ARM64Reg srcD = D2;
+static const ARM64Reg srcQ = Q2;
 
 static const ARM64Reg srcNEON = Q8;
 static const ARM64Reg accNEON = Q9;
@@ -169,8 +169,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int

 	// if (skinning) log = true;
 
-	uint64_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
-	uint64_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
+	// GPRs 0-15 do not need to be saved.
+	// We don't use any GPRs higher than 16. So:
+	uint64_t regs_to_save = 1 << 16; // Arm64Gen::ALL_CALLEE_SAVED;
+	// We only need to save Q8-Q15 if skinning is used.
+	uint64_t regs_to_save_fp = dec.skinInDecode ? Arm64Gen::ALL_CALLEE_SAVED_FP : 0;
 	fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
 
 	// Keep the scale/offset in a few fp registers if we need it.
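The effect is on per-call stack traffic: previously every compiled decoder pushed x19-x28 and q8-q15 regardless of what it used; now a non-skinned decoder pushes only x16 (fp and lr are always saved, per the comment in Arm64Emitter.h). A back-of-the-envelope sketch, assuming 8 bytes per GPR and 16 bytes per full vector register:

	// Sketch: rough bytes pushed for a given pair of masks, ignoring the
	// always-saved fp/lr pair and any alignment padding the emitter adds.
	#include <cstdint>
	int PushedBytes(uint64_t gprMask, uint64_t fpMask) {
		return 8 * __builtin_popcountll(gprMask) + 16 * __builtin_popcountll(fpMask);
	}
	// Before: PushedBytes(0x1FF80000, 0x0000FF00) == 80 + 128 == 208.
	// After, without skinning: PushedBytes(1 << 16, 0) == 8.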
@@ -645,12 +648,12 @@ void VertexDecoderJitCache::Jit_TcFloatPrescale() {

 void VertexDecoderJitCache::Jit_PosS8() {
 	Jit_AnyS8ToFloat(dec_->posoff);
-	fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
+	fp.STUR(128, srcQ, dstReg, dec_->decFmt.posoff);
 }
 
 void VertexDecoderJitCache::Jit_PosS16() {
 	Jit_AnyS16ToFloat(dec_->posoff);
-	fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
+	fp.STUR(128, srcQ, dstReg, dec_->decFmt.posoff);
 }
 
 void VertexDecoderJitCache::Jit_PosFloat() {
@@ -677,8 +680,8 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
 void VertexDecoderJitCache::Jit_PosS16Through() {
 	// Start with X and Y (which is signed.)
 	fp.LDUR(32, src[0], srcReg, dec_->posoff);
-	fp.SXTL(16, srcD[0], src[0]);
-	fp.SCVTF(32, srcD[0], srcD[0]);
+	fp.SXTL(16, srcD, src[0]);
+	fp.SCVTF(32, srcD, srcD);
 	fp.STUR(64, src[0], dstReg, dec_->decFmt.posoff);
 	// Now load in Z (which is unsigned.)
 	LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4);
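In scalar terms, this path reads X and Y as signed 16-bit and Z as unsigned 16-bit, converting each to float with no scaling (through-mode coordinates are already in screen units). Roughly, as a sketch (the Z conversion continues past the lines shown above):

	// Scalar sketch of what Jit_PosS16Through computes.
	#include <cstdint>
	#include <cstring>
	void PosS16Through(const uint8_t *src, float *out) {
		int16_t xy[2];
		uint16_t z;
		memcpy(xy, src, sizeof(xy));     // X, Y: signed 16-bit
		memcpy(&z, src + 4, sizeof(z));  // Z: unsigned 16-bit
		out[0] = (float)xy[0];
		out[1] = (float)xy[1];
		out[2] = (float)z;
	}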
@@ -744,7 +747,7 @@ void VertexDecoderJitCache::Jit_NormalS16Skin() {
 }
 
 void VertexDecoderJitCache::Jit_NormalFloatSkin() {
-	fp.LDUR(128, srcQ[0], srcReg, dec_->nrmoff);
+	fp.LDUR(128, srcQ, srcReg, dec_->nrmoff);
 	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
 }

@@ -759,28 +762,28 @@ void VertexDecoderJitCache::Jit_PosS16Skin() {
 }
 
 void VertexDecoderJitCache::Jit_PosFloatSkin() {
-	fp.LDUR(128, srcQ[0], srcReg, dec_->posoff);
+	fp.LDUR(128, srcQ, srcReg, dec_->posoff);
 	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
 }
 
 void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
 	fp.LDUR(32, src[0], srcReg, srcoff);
-	fp.SXTL(8, srcD[0], src[0]);
-	fp.SXTL(16, srcQ[0], srcD[0]);
-	fp.SCVTF(32, srcQ[0], srcQ[0], 7);
+	fp.SXTL(8, srcD, src[0]);
+	fp.SXTL(16, srcQ, srcD);
+	fp.SCVTF(32, srcQ, srcQ, 7);
 }
 
 void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
 	fp.LDUR(64, src[0], srcReg, srcoff);
-	fp.SXTL(16, srcQ[0], srcD[0]);
-	fp.SCVTF(32, srcQ[0], srcQ[0], 15);
+	fp.SXTL(16, srcQ, srcD);
+	fp.SCVTF(32, srcQ, srcQ, 15);
 }
 
 void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
-	// Multiply with the matrix sitting in Q4-Q7.
-	fp.FMUL(32, accNEON, Q4, srcQ[0], 0);
-	fp.FMLA(32, accNEON, Q5, srcQ[0], 1);
-	fp.FMLA(32, accNEON, Q6, srcQ[0], 2);
+	// Multiply srcQ with the matrix sitting in Q4-Q7.
+	fp.FMUL(32, accNEON, Q4, srcQ, 0);
+	fp.FMLA(32, accNEON, Q5, srcQ, 1);
+	fp.FMLA(32, accNEON, Q6, srcQ, 2);
 	if (pos) {
 		fp.FADD(32, accNEON, accNEON, Q7);
 	}
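Two details above are easy to miss. First, the last operand of SCVTF here is a fixed-point fraction width: converting with 7 fractional bits divides the widened s8 lanes by 128, and 15 bits divides s16 lanes by 32768, folding the normalize step into the convert. A scalar sketch:

	// Scalar sketch of Jit_AnyS8ToFloat: widen four signed bytes, then
	// scale by 2^-7. Jit_AnyS16ToFloat is the same idea with 2^-15.
	#include <cstdint>
	void AnyS8ToFloat(const int8_t *src, float *out) {
		for (int i = 0; i < 4; i++)
			out[i] = (float)src[i] * (1.0f / 128.0f);
	}

Second, Jit_WriteMatrixMul is a matrix-vector multiply built from by-element FMUL/FMLA: each step multiplies a whole matrix row held in Q4-Q6 by one lane of srcQ, and Q7, the translation row, is added only for positions (normals are direction vectors). A scalar sketch of the same computation:

	// Scalar sketch of Jit_WriteMatrixMul: m[0..2] are the rotation/scale
	// rows (Q4-Q6), m[3] the translation row (Q7), added only for positions.
	void WriteMatrixMul(const float m[4][4], const float v[4], bool pos, float acc[4]) {
		for (int i = 0; i < 4; i++)
			acc[i] = m[0][i] * v[0] + m[1][i] * v[1] + m[2][i] * v[2];
		if (pos) {
			for (int i = 0; i < 4; i++)
				acc[i] += m[3][i];
		}
	}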