From acc409ef8a94c8148c86edc6a94eb8edf2e5e2e5 Mon Sep 17 00:00:00 2001
From: AndyJGraham <84187655+AndyJGraham@users.noreply.github.com>
Date: Fri, 27 Jan 2023 08:07:09 +0000
Subject: [PATCH] Replace successive "ldr" and "str" instructions with "ldp" and "stp" (#77540)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Replace successive "ldr" and "str" instructions with "ldp" and "stp"

This change addresses the following four GitHub tickets:

1. ARM64: Optimize pair of "ldr reg, [fp]" to ldp #35130
2. ARM64: Optimize pair of "ldr reg, [reg]" to ldp #35132
3. ARM64: Optimize pair of "str reg, [reg]" to stp #35133
4. ARM64: Optimize pair of "str reg, [fp]" to stp #35134

The optimisation opportunity is detected as instruction sequences are being
generated: the optimised instruction is generated on top of the previous
instruction, and no second instruction is emitted. Thus, there are no changes
to instruction group size at “emission time” and no changes to jump
instructions. (An illustrative sketch of this roll-back mechanism follows the
emit.cpp hunks below.)

* No longer use a temporary buffer to build the optimized instruction.
* Addressed assorted review comments.
* Now optimizes ascending and descending locations with consecutive STR and LDR instructions.
* Modification to remove last instructions.
* Ongoing improvements to remove the previously-emitted instruction during ldr/str optimization.
* Stopped optimization of consecutive instructions that straddled an instruction group boundary.
* Addressed code change requests in GitHub.
* Various fixes to ldp/stp optimization. Add code to update IP mappings when an instruction is removed.
* Delete unnecessary and incorrect assert.
* Diagnostic change only, to confirm whether a theory is correct or not when chasing an error.
* Revert "Diagnostic change only, to confirm whether a theory is correct or". This reverts commit 4b0e51e87af44cbef5b9e643f9c5bf7aff48b548.
* Do not merge. Temporarily removed calls to "codeGen->genIPmappingUpdateForRemovedInstruction()". Also corrected a minor bug in instruction numbering when removing instructions during optimization.
* Modifications to better update the IP mapping table for a replaced instruction.
* Minor formatting change.
* Check for out of range offsets * Don't optimise during prolog/epilog * Fix windows build error * IGF_HAS_REMOVED_INSTR is ARM64 only * Add OptimizeLdrStr function * Fix formatting * Ensure local variables are tracked * Don't peephole local variables Co-authored-by: Bruce Forstall Co-authored-by: Alan Hayward Co-authored-by: Alan Hayward --- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/codegencommon.cpp | 2 +- src/coreclr/jit/emit.cpp | 91 ++++++++++- src/coreclr/jit/emit.h | 32 +++- src/coreclr/jit/emitarm64.cpp | 254 +++++++++++++++++++++++++++--- src/coreclr/jit/emitarm64.h | 53 +++++++ src/coreclr/jit/emitloongarch64.h | 2 - src/coreclr/jit/emitpub.h | 1 + 8 files changed, 404 insertions(+), 33 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index aa3fbefad7003..e24b7b54a5b56 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -664,7 +664,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ #ifdef DEBUG - void genIPmappingDisp(unsigned mappingNum, IPmappingDsc* ipMapping); + void genIPmappingDisp(unsigned mappingNum, const IPmappingDsc* ipMapping); void genIPmappingListDisp(); #endif // DEBUG diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 1be119806114c..ebca3137aacac 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -7042,7 +7042,7 @@ const char* CodeGen::siStackVarName(size_t offs, size_t size, unsigned reg, unsi * Display a IPmappingDsc. Pass -1 as mappingNum to not display a mapping number. */ -void CodeGen::genIPmappingDisp(unsigned mappingNum, IPmappingDsc* ipMapping) +void CodeGen::genIPmappingDisp(unsigned mappingNum, const IPmappingDsc* ipMapping) { if (mappingNum != unsigned(-1)) { diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 8c5adef145de9..013c141ad872e 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -33,6 +33,22 @@ void emitLocation::CaptureLocation(emitter* emit) assert(Valid()); } +void emitLocation::SetLocation(insGroup* _ig, unsigned _codePos) +{ + ig = _ig; + codePos = _codePos; + + assert(Valid()); +} + +void emitLocation::SetLocation(emitLocation newLocation) +{ + ig = newLocation.ig; + codePos = newLocation.codePos; + + assert(Valid()); +} + bool emitLocation::IsCurrentLocation(emitter* emit) const { assert(Valid()); @@ -50,6 +66,11 @@ int emitLocation::GetInsNum() const return emitGetInsNumFromCodePos(codePos); } +int emitLocation::GetInsOffset() const +{ + return emitGetInsOfsFromCodePos(codePos); +} + // Get the instruction offset in the current instruction group, which must be a funclet prolog group. // This is used to find an instruction offset used in unwind data. // TODO-AMD64-Bug?: We only support a single main function prolog group, but allow for multiple funclet prolog @@ -798,6 +819,7 @@ insGroup* emitter::emitSavIG(bool emitAdd) assert((ig->igFlags & IGF_PLACEHOLDER) == 0); ig->igData = id; + INDEBUG(ig->igDataSize = gs;) memcpy(id, emitCurIGfreeBase, sz); @@ -8724,6 +8746,14 @@ UNATIVE_OFFSET emitter::emitCodeOffset(void* blockPtr, unsigned codePos) { of = ig->igSize; } +#ifdef TARGET_ARM64 + else if ((ig->igFlags & IGF_HAS_REMOVED_INSTR) != 0 && no == ig->igInsCnt + 1U) + { + // This can happen if a instruction was replaced, but the replacement couldn't fit into + // the same IG and instead was place in a new IG. 
+ return ig->igNext->igOffs + emitFindOffset(ig->igNext, 1); + } +#endif else if (ig->igFlags & IGF_UPD_ISZ) { /* @@ -8742,7 +8772,6 @@ UNATIVE_OFFSET emitter::emitCodeOffset(void* blockPtr, unsigned codePos) // printf("[IG=%02u;ID=%03u;OF=%04X] <= %08X\n", ig->igNum, emitGetInsNumFromCodePos(codePos), of, codePos); /* Make sure the offset estimate is accurate */ - assert(of == emitFindOffset(ig, emitGetInsNumFromCodePos(codePos))); } @@ -9198,6 +9227,66 @@ void emitter::emitNxtIG(bool extend) #endif } +//------------------------------------------------------------------------ +// emitRemoveLastInstruction: Remove the last instruction emitted; it has been optimized away by the +// next instruction we are generating. `emitLastIns` must be non-null, meaning there is a +// previous instruction. The previous instruction might have already been saved, or it might +// be in the currently accumulating insGroup buffer. +// +// The `emitLastIns` is set to nullptr after this function. It is expected that a new instruction +// will be immediately generated after this, which will set it again. +// +// Removing an instruction can invalidate any captured emitter location +// (using emitLocation::CaptureLocation()) after the instruction was generated. This is because the +// emitLocation stores the current IG instruction number and code size. If the instruction is +// removed and not replaced (e.g., it is at the end of the IG, and any replacement creates a new +// EXTEND IG), then the saved instruction number is incorrect. The IGF_HAS_REMOVED_INSTR flag is +// used to check for this later. +// +// NOTE: It is expected that the GC effect of the removed instruction will be handled by the newly +// generated replacement(s). +// +#ifdef TARGET_ARM64 +void emitter::emitRemoveLastInstruction() +{ + assert(emitLastIns != nullptr); + assert(emitLastInsIG != nullptr); + + JITDUMP("Removing saved instruction in %s:\n> ", emitLabelString(emitLastInsIG)); + JITDUMPEXEC(dispIns(emitLastIns)) + + // We should assert it's not a jmp, as that would require updating the jump lists, e.g. emitCurIGjmpList. + + BYTE* lastInsActualStartAddr = (BYTE*)emitLastIns - m_debugInfoSize; + unsigned short lastCodeSize = (unsigned short)emitLastIns->idCodeSize(); + + // Check that a new buffer hasn't been create since the last instruction was emitted. + assert((emitCurIGfreeBase <= lastInsActualStartAddr) && (lastInsActualStartAddr < emitCurIGfreeEndp)); + + // Ensure the current IG is non-empty. + assert(emitCurIGnonEmpty()); + assert(lastInsActualStartAddr < emitCurIGfreeNext); + assert(emitCurIGinsCnt >= 1); + assert(emitCurIGsize >= emitLastIns->idCodeSize()); + + size_t insSize = emitCurIGfreeNext - lastInsActualStartAddr; + + emitCurIGfreeNext = lastInsActualStartAddr; + emitCurIGinsCnt -= 1; + emitInsCount -= 1; + emitCurIGsize -= lastCodeSize; + + // We're going to overwrite the memory; zero it. + memset(emitCurIGfreeNext, 0, insSize); + + // Remember this happened. + emitCurIG->igFlags |= IGF_HAS_REMOVED_INSTR; + + emitLastIns = nullptr; + emitLastInsIG = nullptr; +} +#endif + /***************************************************************************** * * emitGetInsSC: Get the instruction's constant value. 
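[Editor's note] As a quick illustration of what the peephole does: two adjacent stores such as "str x1, [fp, #0x10]" followed by "str x2, [fp, #0x18]" collapse into a single "stp x1, x2, [fp, #0x10]". The emit.cpp change above makes this possible by letting the emitter roll back the instruction it has just buffered. The stand-alone C++ sketch below models only that roll-back bookkeeping (rewinding the free pointer, adjusting the counters, zeroing the reclaimed bytes, and flagging the group); the type and field names are illustrative stand-ins, not the JIT's real insGroup/instrDesc machinery, which also tracks debug and GC information in the same buffer.

    // Minimal sketch of the roll-back idea behind emitRemoveLastInstruction().
    // The emitter accumulates the in-progress instruction group in a flat byte
    // buffer, so "removing" the last instruction is just rewinding pointers and
    // counters. All names and sizes here are illustrative.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    struct ToyInsGroup
    {
        uint8_t  buffer[256];               // accumulating instruction-descriptor buffer
        uint8_t* freeNext        = buffer;  // next free byte in the buffer
        unsigned insCount        = 0;       // instructions currently in the group
        unsigned codeSize        = 0;       // estimated encoded size of those instructions
        bool     hasRemovedInstr = false;   // mirrors the IGF_HAS_REMOVED_INSTR flag

        // Append a descriptor of 'descSize' bytes that will encode to 'encodedSize'
        // bytes of machine code; return where it started.
        uint8_t* append(unsigned descSize, unsigned encodedSize)
        {
            uint8_t* start = freeNext;
            memset(start, 0xAA, descSize);  // stand-in for building the descriptor
            freeNext += descSize;
            insCount += 1;
            codeSize += encodedSize;
            return start;
        }

        // Remove the most recently appended instruction, which started at
        // 'lastStart' and would have encoded to 'lastEncodedSize' bytes.
        void removeLast(uint8_t* lastStart, unsigned lastEncodedSize)
        {
            assert(insCount >= 1);
            assert(lastStart >= buffer && lastStart < freeNext);

            size_t reclaimed = (size_t)(freeNext - lastStart);
            freeNext = lastStart;             // rewind the free pointer
            insCount -= 1;                    // one fewer instruction in the group
            codeSize -= lastEncodedSize;      // shrink the size estimate
            memset(freeNext, 0, reclaimed);   // zero the bytes we are about to reuse
            hasRemovedInstr = true;           // remember a removal happened here
        }
    };

    int main()
    {
        ToyInsGroup ig;
        uint8_t* first = ig.append(16, 4);  // e.g. "str x1, [fp, #0x10]"
        // About to emit "str x2, [fp, #0x18]": the peephole notices it pairs with
        // the previous store, removes that store, and emits a single stp instead.
        ig.removeLast(first, 4);
        ig.append(24, 4);                   // e.g. "stp x1, x2, [fp, #0x10]"
        assert(ig.insCount == 1 && ig.codeSize == 4 && ig.hasRemovedInstr);
        return 0;
    }
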
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 4d4b75ad35107..783d927f6661c 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -131,6 +131,16 @@ class emitLocation { } + emitLocation(insGroup* _ig, unsigned _codePos) + { + SetLocation(_ig, _codePos); + } + + emitLocation(emitter* emit) + { + CaptureLocation(emit); + } + emitLocation(void* emitCookie) : ig((insGroup*)emitCookie), codePos(0) { } @@ -142,6 +152,8 @@ class emitLocation } void CaptureLocation(emitter* emit); + void SetLocation(insGroup* _ig, unsigned _codePos); + void SetLocation(emitLocation newLocation); bool IsCurrentLocation(emitter* emit) const; @@ -160,6 +172,7 @@ class emitLocation } int GetInsNum() const; + int GetInsOffset() const; bool operator!=(const emitLocation& other) const { @@ -250,6 +263,7 @@ struct insGroup #ifdef DEBUG BasicBlock* lastGeneratedBlock; // The last block that generated code into this insGroup. jitstd::list igBlocks; // All the blocks that generated code into this insGroup. + size_t igDataSize; // size of instrDesc data pointed to by 'igData' #endif UNATIVE_OFFSET igNum; // for ordering (and display) purposes @@ -280,6 +294,9 @@ struct insGroup #define IGF_REMOVED_ALIGN 0x0800 // IG was marked as having an alignment instruction(s), but was later unmarked // without updating the IG's size/offsets. #define IGF_HAS_REMOVABLE_JMP 0x1000 // this group ends with an unconditional jump which is a candidate for removal +#ifdef TARGET_ARM64 +#define IGF_HAS_REMOVED_INSTR 0x2000 // this group has an instruction that was removed. +#endif // Mask of IGF_* flags that should be propagated to new blocks when they are created. // This allows prologs and epilogs to be any number of IGs, but still be @@ -2170,6 +2187,10 @@ class emitter insGroup* emitSavIG(bool emitAdd = false); void emitNxtIG(bool extend = false); +#ifdef TARGET_ARM64 + void emitRemoveLastInstruction(); +#endif + bool emitCurIGnonEmpty() { return (emitCurIG && emitCurIGfreeNext > emitCurIGfreeBase); @@ -2823,12 +2844,15 @@ inline unsigned emitGetInsOfsFromCodePos(unsigned codePos) inline unsigned emitter::emitCurOffset() { - unsigned codePos = emitCurIGinsCnt + (emitCurIGsize << 16); + return emitSpecifiedOffset(emitCurIGinsCnt, emitCurIGsize); +} - assert(emitGetInsOfsFromCodePos(codePos) == emitCurIGsize); - assert(emitGetInsNumFromCodePos(codePos) == emitCurIGinsCnt); +inline unsigned emitter::emitSpecifiedOffset(unsigned insCount, unsigned igSize) +{ + unsigned codePos = insCount + (igSize << 16); - // printf("[IG=%02u;ID=%03u;OF=%04X] => %08X\n", emitCurIG->igNum, emitCurIGinsCnt, emitCurIGsize, codePos); + assert(emitGetInsOfsFromCodePos(codePos) == igSize); + assert(emitGetInsNumFromCodePos(codePos) == insCount); return codePos; } diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index fa99bb9f6ba2a..10f56ddf77ff3 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -5068,6 +5068,7 @@ void emitter::emitIns_R_R_I( emitAttr elemsize = EA_UNKNOWN; insFormat fmt = IF_NONE; bool isLdSt = false; + bool isLdrStr = false; bool isSIMD = false; bool isAddSub = false; bool setFlags = false; @@ -5529,6 +5530,7 @@ void emitter::emitIns_R_R_I( unscaledOp = false; scale = NaturalScale_helper(size); isLdSt = true; + isLdrStr = true; break; case INS_ldur: @@ -5683,11 +5685,8 @@ void emitter::emitIns_R_R_I( } } - // Is the ldr/str even necessary? 
- // For volatile load/store, there will be memory barrier instruction before/after the load/store - // and in such case, IsRedundantLdStr() returns false, because the method just checks for load/store - // pair next to each other. - if (emitComp->opts.OptimizationEnabled() && IsRedundantLdStr(ins, reg1, reg2, imm, size, fmt)) + // Try to optimize a load/store with an alternative instruction. + if (isLdrStr && emitComp->opts.OptimizationEnabled() && OptimizeLdrStr(ins, attr, reg1, reg2, imm, size, fmt)) { return; } @@ -6641,6 +6640,7 @@ void emitter::emitIns_R_R_R_I(instruction ins, scale = (size == EA_8BYTE) ? 3 : 2; } isLdSt = true; + fmt = IF_LS_3C; break; case INS_ld1: @@ -6919,6 +6919,7 @@ void emitter::emitIns_R_R_R_I(instruction ins, assert(!"Instruction cannot be encoded: Add/Sub IF_DR_3A"); } } + assert(fmt != IF_NONE); instrDesc* id = emitNewInstrCns(attr, imm); @@ -7554,10 +7555,11 @@ void emitter::emitIns_S(instruction ins, emitAttr attr, int varx, int offs) */ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs) { - emitAttr size = EA_SIZE(attr); - insFormat fmt = IF_NONE; - int disp = 0; - unsigned scale = 0; + emitAttr size = EA_SIZE(attr); + insFormat fmt = IF_NONE; + int disp = 0; + unsigned scale = 0; + bool isLdrStr = false; assert(offs >= 0); @@ -7584,7 +7586,8 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va case INS_str: case INS_ldr: assert(isValidGeneralDatasize(size) || isValidVectorDatasize(size)); - scale = genLog2(EA_SIZE_IN_BYTES(size)); + scale = genLog2(EA_SIZE_IN_BYTES(size)); + isLdrStr = true; break; case INS_lea: @@ -7638,8 +7641,7 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va { bool useRegForImm = false; ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate - - imm = disp; + imm = disp; if (imm == 0) { fmt = IF_LS_2A; @@ -7677,14 +7679,15 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va } } - // Is the ldr/str even necessary? - if (emitComp->opts.OptimizationEnabled() && IsRedundantLdStr(ins, reg1, reg2, imm, size, fmt)) + assert(fmt != IF_NONE); + + // Try to optimize a load/store with an alternative instruction. + if (isLdrStr && emitComp->opts.OptimizationEnabled() && + OptimizeLdrStr(ins, attr, reg1, reg2, imm, size, fmt, true, varx, offs)) { return; } - assert(fmt != IF_NONE); - instrDesc* id = emitNewInstrCns(attr, imm); id->idIns(ins); @@ -7811,6 +7814,7 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va int disp = 0; unsigned scale = 0; bool isVectorStore = false; + bool isStr = false; // TODO-ARM64-CQ: use unscaled loads? /* Figure out the encoding format of the instruction */ @@ -7839,6 +7843,7 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va scale = NaturalScale_helper(size); isVectorStore = true; } + isStr = true; break; default: @@ -7908,14 +7913,15 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va fmt = IF_LS_3A; } - // Is the ldr/str even necessary? - if (emitComp->opts.OptimizationEnabled() && IsRedundantLdStr(ins, reg1, reg2, imm, size, fmt)) + assert(fmt != IF_NONE); + + // Try to optimize a store with an alternative instruction. 
+ if (isStr && emitComp->opts.OptimizationEnabled() && + OptimizeLdrStr(ins, attr, reg1, reg2, imm, size, fmt, true, varx, offs)) { return; } - assert(fmt != IF_NONE); - instrDesc* id = emitNewInstrCns(attr, imm); id->idIns(ins); @@ -9921,6 +9927,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) // Backward branches using instruction count must be within the same instruction group. assert(insNum + 1 >= (unsigned)(-instrCount)); } + dstOffs = ig->igOffs + emitFindOffset(ig, (insNum + 1 + instrCount)); dstAddr = emitOffsetToPtr(dstOffs); } @@ -16085,7 +16092,7 @@ bool emitter::IsRedundantMov(instruction ins, emitAttr size, regNumber dst, regN // // str x1, [x2, #56] // ldr x1, [x2, #56] <-- redundant - +// // Arguments: // ins - The current instruction // dst - The current destination @@ -16093,13 +16100,19 @@ bool emitter::IsRedundantMov(instruction ins, emitAttr size, regNumber dst, regN // imm - Immediate offset // size - Operand size // fmt - Format of instruction +// // Return Value: // true if previous instruction already has desired value in register/memory location. - +// +// Notes: +// For volatile load/store, there will be memory barrier instruction before/after the load/store +// and in such case, this method returns false, because the method just checks for load/store +// pair next to each other. +// bool emitter::IsRedundantLdStr( instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt) { - if (((ins != INS_ldr) && (ins != INS_str)) || !emitCanPeepholeLastIns()) + if ((ins != INS_ldr) && (ins != INS_str)) { return false; } @@ -16108,7 +16121,7 @@ bool emitter::IsRedundantLdStr( regNumber prevReg2 = emitLastIns->idReg2(); insFormat lastInsfmt = emitLastIns->idInsFmt(); emitAttr prevSize = emitLastIns->idOpSize(); - ssize_t prevImm = emitLastIns->idIsLargeCns() ? ((instrDescCns*)emitLastIns)->idcCnsVal : emitLastIns->idSmallCns(); + ssize_t prevImm = emitGetInsSC(emitLastIns); // Only optimize if: // 1. "base" or "base plus immediate offset" addressing modes. @@ -16160,4 +16173,197 @@ bool emitter::IsRedundantLdStr( return false; } + +//----------------------------------------------------------------------------------- +// ReplaceLdrStrWithPairInstr: Potentially, overwrite a previously-emitted "ldr" or "str" +// instruction with an "ldp" or "stp" instruction. +// +// Arguments: +// ins - The instruction code +// reg1Attr - The emit attribute for register 1 +// reg1 - Register 1 +// reg2 - Encoded register 2 +// imm - Immediate offset, prior to scaling by operand size +// size - Operand size +// fmt - Instruction format +// +// Return Value: +// "true" if the previous instruction has been overwritten. +// +bool emitter::ReplaceLdrStrWithPairInstr( + instruction ins, emitAttr reg1Attr, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt) +{ + // Register 2 needs conversion to unencoded value. + reg2 = encodingZRtoSP(reg2); + + RegisterOrder optimizationOrder = IsOptimizableLdrStrWithPair(ins, reg1, reg2, imm, size, fmt); + + if (optimizationOrder != eRO_none) + { + regNumber oldReg1 = emitLastIns->idReg1(); + + ssize_t oldImm = emitGetInsSC(emitLastIns); + instruction optIns = (ins == INS_ldr) ? INS_ldp : INS_stp; + + emitAttr oldReg1Attr; + switch (emitLastIns->idGCref()) + { + case GCT_GCREF: + oldReg1Attr = EA_GCREF; + break; + case GCT_BYREF: + oldReg1Attr = EA_BYREF; + break; + default: + oldReg1Attr = emitLastIns->idOpSize(); + break; + } + + // Remove the last instruction written. 
+ emitRemoveLastInstruction(); + + // Emit the new instruction. Make sure to scale the immediate value by the operand size. + if (optimizationOrder == eRO_ascending) + { + // The FIRST register is at the lower offset + emitIns_R_R_R_I(optIns, oldReg1Attr, oldReg1, reg1, reg2, oldImm * size, INS_OPTS_NONE, reg1Attr); + } + else + { + // The SECOND register is at the lower offset + emitIns_R_R_R_I(optIns, reg1Attr, reg1, oldReg1, reg2, imm * size, INS_OPTS_NONE, oldReg1Attr); + } + + return true; + } + + return false; +} + +//----------------------------------------------------------------------------------- +// IsOptimizableLdrStrWithPair: Check if it is possible to optimize two "ldr" or "str" +// instructions into a single "ldp" or "stp" instruction. +// +// Examples: ldr w1, [x20, #0x10] +// ldr w2, [x20, #0x14] => ldp w1, w2, [x20, #0x10] +// +// ldr w1, [x20, #0x14] +// ldr w2, [x20, #0x10] => ldp w2, w1, [x20, #0x10] +// +// Arguments: +// ins - The instruction code +// reg1 - Register 1 number +// reg2 - Register 2 number +// imm - Immediate offset, prior to scaling by operand size +// size - Operand size +// fmt - Instruction format +// +// Return Value: +// eRO_none - No optimization of consecutive instructions is possible +// eRO_ascending - Registers can be loaded/ stored into ascending store locations +// eRO_descending - Registers can be loaded/ stored into decending store locations. +// +emitter::RegisterOrder emitter::IsOptimizableLdrStrWithPair( + instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt) +{ + RegisterOrder optimisationOrder = eRO_none; + + if ((ins != INS_ldr) && (ins != INS_str)) + { + return eRO_none; + } + + if (ins != emitLastIns->idIns()) + { + // Not successive ldr or str instructions + return eRO_none; + } + + regNumber prevReg1 = emitLastIns->idReg1(); + regNumber prevReg2 = emitLastIns->idReg2(); + insFormat lastInsFmt = emitLastIns->idInsFmt(); + emitAttr prevSize = emitLastIns->idOpSize(); + ssize_t prevImm = emitGetInsSC(emitLastIns); + + // Signed, *raw* immediate value fits in 7 bits, so for LDP/ STP the raw value is from -64 to +63. + // For LDR/ STR, there are 9 bits, so we need to limit the range explicitly in software. + if ((imm < -64) || (imm > 63) || (prevImm < -64) || (prevImm > 63)) + { + // Then one or more of the immediate values is out of range, so we cannot optimise. + return eRO_none; + } + + if ((!isGeneralRegisterOrZR(reg1)) || (!isGeneralRegisterOrZR(prevReg1))) + { + // Either register 1 is not a general register or previous register 1 is not a general register + // or the zero register, so we cannot optimise. + return eRO_none; + } + + if (lastInsFmt != fmt) + { + // The formats of the two instructions differ. + return eRO_none; + } + + if ((emitInsIsLoad(ins)) && (prevReg1 == prevReg2)) + { + // Then the previous load overwrote the register that we are indexing against. + return eRO_none; + } + + if ((emitInsIsLoad(ins)) && (reg1 == prevReg1)) + { + // Cannot load to the same register twice. + return eRO_none; + } + + if (prevSize != size) + { + // Operand sizes differ. + return eRO_none; + } + + // There are two possible orders for consecutive registers. + // These may be stored to or loaded from increasing or + // decreasing store locations. 
+ if (imm == (prevImm + 1)) + { + // Previous Register 1 is at a higher offset than This Register 1 + optimisationOrder = eRO_ascending; + } + else if (imm == (prevImm - 1)) + { + // Previous Register 1 is at a lower offset than This Register 1 + optimisationOrder = eRO_descending; + } + else + { + // Not consecutive immediate values. + return eRO_none; + } + + if ((reg2 != prevReg2) || !isGeneralRegisterOrSP(reg2)) + { + // The "register 2" should be same as previous instruction and should either be a general + // register or stack pointer. + return eRO_none; + } + + // Don't remove instructions whilst in prologs or epilogs, as these contain "unwindable" + // parts, where we need to report unwind codes to the OS, + if (emitIGisInProlog(emitCurIG) || emitIGisInEpilog(emitCurIG)) + { + return eRO_none; + } +#ifdef FEATURE_EH_FUNCLETS + if (emitIGisInFuncletProlog(emitCurIG) || emitIGisInFuncletEpilog(emitCurIG)) + { + return eRO_none; + } +#endif + + return optimisationOrder; +} + #endif // defined(TARGET_ARM64) diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index 05939d4f19773..d82f7dd833a1f 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -69,6 +69,17 @@ instrDesc* emitNewInstrCallInd(int argCnt, emitAttr retSize, emitAttr secondRetSize); +/************************************************************************/ +/* enum to allow instruction optimisation to specify register order */ +/************************************************************************/ + +enum RegisterOrder +{ + eRO_none = 0, + eRO_ascending, + eRO_descending +}; + /************************************************************************/ /* Private helpers for instruction output */ /************************************************************************/ @@ -112,7 +123,49 @@ static UINT64 Replicate_helper(UINT64 value, unsigned width, emitAttr size); // If yes, the caller of this method can choose to omit current mov instruction. static bool IsMovInstruction(instruction ins); bool IsRedundantMov(instruction ins, emitAttr size, regNumber dst, regNumber src, bool canSkip); + +// Methods to optimize a Ldr or Str with an alternative instruction. bool IsRedundantLdStr(instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt); +RegisterOrder IsOptimizableLdrStrWithPair( + instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt); +bool ReplaceLdrStrWithPairInstr( + instruction ins, emitAttr reg1Attr, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt); + +// Try to optimize a Ldr or Str with an alternative instruction. +inline bool OptimizeLdrStr(instruction ins, + emitAttr reg1Attr, + regNumber reg1, + regNumber reg2, + ssize_t imm, + emitAttr size, + insFormat fmt, + bool localVar = false, + int varx = 0, + int offs = 0) +{ + assert(ins == INS_ldr || ins == INS_str); + + if (!emitCanPeepholeLastIns()) + { + return false; + } + + // Is the ldr/str even necessary? + if (IsRedundantLdStr(ins, reg1, reg2, imm, size, fmt)) + { + return true; + } + + // If the previous instruction was a matching load/store, then try to replace it instead of emitting. + // Don't do this if either instruction had a local variable. 
+ if ((emitLastIns->idIns() == ins) && !localVar && !emitLastIns->idIsLclVar() && + ReplaceLdrStrWithPairInstr(ins, reg1Attr, reg1, reg2, imm, size, fmt)) + { + return true; + } + + return false; +} /************************************************************************ * diff --git a/src/coreclr/jit/emitloongarch64.h b/src/coreclr/jit/emitloongarch64.h index 2dd9d63750289..fcbb32fa7d17f 100644 --- a/src/coreclr/jit/emitloongarch64.h +++ b/src/coreclr/jit/emitloongarch64.h @@ -73,8 +73,6 @@ unsigned emitOutput_Instr(BYTE* dst, code_t code); // If yes, the caller of this method can choose to omit current mov instruction. static bool IsMovInstruction(instruction ins); bool IsRedundantMov(instruction ins, emitAttr size, regNumber dst, regNumber src, bool canSkip); -bool IsRedundantLdStr( - instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt); // New functions end. /************************************************************************/ /* Public inline informational methods */ diff --git a/src/coreclr/jit/emitpub.h b/src/coreclr/jit/emitpub.h index 723d9ca0563c9..0133fb19f0212 100644 --- a/src/coreclr/jit/emitpub.h +++ b/src/coreclr/jit/emitpub.h @@ -64,6 +64,7 @@ void emitFinishPrologEpilogGeneration(); void* emitCurBlock(); unsigned emitCurOffset(); +unsigned emitSpecifiedOffset(unsigned insCount, unsigned igSize); UNATIVE_OFFSET emitCodeOffset(void* blockPtr, unsigned codeOffs);
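
[Editor's note] For readers who want the pairing rules in one place, here is a stand-alone sketch of the test that IsOptimizableLdrStrWithPair performs before the previous instruction is replaced. The record type and field names are invented for illustration (the real code reads them from emitLastIns), and the sketch omits some of the patch's additional guards (general-register and instruction-format checks, and the prolog/epilog/funclet exclusions). The remaining checks mirror the ones above: same opcode, same base register and operand size, scaled immediates that are adjacent and fit ldp/stp's signed 7-bit field, and no load that clobbers its own base or repeats a destination.

    // Stand-alone sketch of the ldp/stp pairing test; names are illustrative.
    #include <cstdint>

    enum class Order { None, Ascending, Descending };
    enum class Op    { Ldr, Str };

    struct MemInsn
    {
        Op      op;
        int     dstReg;     // register being loaded/stored (reg1)
        int     baseReg;    // base address register (reg2)
        int64_t scaledImm;  // immediate offset, already divided by the operand size
        int     sizeBytes;  // operand size: 4 or 8
    };

    Order CanMergeToPair(const MemInsn& prev, const MemInsn& curr)
    {
        // Must be two ldr's or two str's of the same size against the same base.
        if (prev.op != curr.op || prev.sizeBytes != curr.sizeBytes || prev.baseReg != curr.baseReg)
            return Order::None;

        // ldp/stp encode a signed 7-bit scaled immediate: -64..+63.
        if (prev.scaledImm < -64 || prev.scaledImm > 63 || curr.scaledImm < -64 || curr.scaledImm > 63)
            return Order::None;

        if (curr.op == Op::Ldr)
        {
            // The first load must not have overwritten the base register,
            // and the two loads must target different registers.
            if (prev.dstReg == prev.baseReg || prev.dstReg == curr.dstReg)
                return Order::None;
        }

        // The two accesses must touch adjacent slots.
        if (curr.scaledImm == prev.scaledImm + 1)
            return Order::Ascending;   // ldr w1,[x20,#0x10]; ldr w2,[x20,#0x14] => ldp w1,w2,[x20,#0x10]
        if (curr.scaledImm == prev.scaledImm - 1)
            return Order::Descending;  // ldr w1,[x20,#0x14]; ldr w2,[x20,#0x10] => ldp w2,w1,[x20,#0x10]

        return Order::None;
    }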