From 32878c2065c8005b3ea30c79e16dfd7eed55d645 Mon Sep 17 00:00:00 2001 From: zhongyunde 00443407 Date: Fri, 10 Nov 2023 07:29:03 -0500 Subject: [PATCH] [AArch64] merge index address with large offset into base address A case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE Fold mov w8, #56952 movk w8, #15, lsl #16 ldrb w0, [x0, x8] into add x0, x0, 1036288 ldrb w0, [x0, 3704] Only LDRBBroX is supported for the first time. Fix https://github.com/llvm/llvm-project/issues/71917 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 10 + llvm/lib/Target/AArch64/AArch64InstrInfo.h | 3 + .../AArch64/AArch64LoadStoreOptimizer.cpp | 229 ++++++++++++++++++ llvm/test/CodeGen/AArch64/arm64-addrmode.ll | 15 +- .../AArch64/large-offset-ldr-merge.mir | 5 +- 5 files changed, 250 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 7d71c316bcb0a2..855c7f17102561 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4089,6 +4089,16 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { return MI.getOperand(Idx); } +const MachineOperand & +AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case AArch64::LDRBBroX: + return MI.getOperand(4); + } +} + static const TargetRegisterClass *getRegClass(const MachineInstr &MI, Register Reg) { if (MI.getParent() == nullptr) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 6526f6740747ab..db24a19fe5f8e3 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -111,6 +111,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { /// Returns the immediate offset operator of a load/store. static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI); + /// Returns the shift amount operator of a load/store. + static const MachineOperand &getLdStAmountOp(const MachineInstr &MI); + /// Returns whether the instruction is FP or NEON. static bool isFpOrNEON(const MachineInstr &MI); diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index dc6d5b8950c34a..b435b3ce03e7ee 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); +STATISTIC(NumConstOffsetFolded, + "Number of const offset of index address folded"); DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming", "Controls which pairs are considered for renaming"); @@ -75,6 +77,11 @@ static cl::opt LdStLimit("aarch64-load-store-scan-limit", static cl::opt UpdateLimit("aarch64-update-scan-limit", cl::init(100), cl::Hidden); +// The LdStConstLimit limits how far we search for const offset instructions +// when we form index address load/store instructions. +static cl::opt LdStConstLimit("aarch64-load-store-const-scan-limit", + cl::init(10), cl::Hidden); + // Enable register renaming to find additional store pairing opportunities. static cl::opt EnableRenaming("aarch64-load-store-renaming", cl::init(true), cl::Hidden); @@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit); + // Scan the instruction list to find a register assigned with a const + // value that can be combined with the current instruction (a load or store) + // using base addressing with writeback. Scan forwards. + MachineBasicBlock::iterator + findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit, + unsigned &Offset); + // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan backwards. @@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); + bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI, + unsigned IndexReg, unsigned &Offset); + // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator mergeUpdateInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update, bool IsPreIdx); + MachineBasicBlock::iterator + mergeConstOffsetInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, unsigned Offset, + int Scale); + // Find and merge zero store instructions. bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI); @@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge a base register updates before or after a ld/st instruction. bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI); + // Find and merge a index ldr/st instructions into a base ld/st instruction. + bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale); + bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); bool runOnMachineFunction(MachineFunction &Fn) override; @@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { } } +static unsigned getBaseAddressOpcode(unsigned Opc) { + // TODO: Add more index address loads/stores. + switch (Opc) { + default: + llvm_unreachable("Opcode has no base address equivalent!"); + case AArch64::LDRBBroX: + return AArch64::LDRBBui; + } +} + static unsigned getPostIndexedOpcode(unsigned Opc) { switch (Opc) { default: @@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) { } } +// Make sure this is a reg+reg Ld/St +static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { + unsigned Opc = MI.getOpcode(); + switch (Opc) { + default: + return false; + // Scaled instructions. + // TODO: Add more index address loads/stores. + case AArch64::LDRBBroX: + Scale = 1; + return true; + } +} + static bool isRewritableImplicitDef(unsigned Opc) { switch (Opc) { default: @@ -2018,6 +2067,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, return NextI; } +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, + unsigned Offset, int Scale) { + assert((Update->getOpcode() == AArch64::MOVKWi) && + "Unexpected const mov instruction to merge!"); + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator NextI = next_nodbg(I, E); + MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E); + MachineInstr &MemMI = *I; + unsigned Mask = (1 << 12) * Scale - 1; + unsigned Low = Offset & Mask; + unsigned High = Offset - Low; + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); + Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); + MachineInstrBuilder AddMIB, MemMIB; + + // Add IndexReg, BaseReg, High (the BaseReg may be SP) + AddMIB = + BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri)) + .addDef(IndexReg) + .addUse(BaseReg) + .addImm(High >> 12) // shifted value + .addImm(12); // shift 12 + (void)AddMIB; + // Ld/St DestReg, IndexReg, Imm12 + unsigned NewOpc = getBaseAddressOpcode(I->getOpcode()); + MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .add(getLdStRegOp(MemMI)) + .add(AArch64InstrInfo::getLdStOffsetOp(MemMI)) + .addImm(Low / Scale) + .setMemRefs(I->memoperands()) + .setMIFlags(I->mergeFlagsWith(*Update)); + (void)MemMIB; + + ++NumConstOffsetFolded; + LLVM_DEBUG(dbgs() << "Creating base address load/store.\n"); + LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); + LLVM_DEBUG(PrevI->print(dbgs())); + LLVM_DEBUG(dbgs() << " "); + LLVM_DEBUG(Update->print(dbgs())); + LLVM_DEBUG(dbgs() << " "); + LLVM_DEBUG(I->print(dbgs())); + LLVM_DEBUG(dbgs() << " with instruction:\n "); + LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs())); + LLVM_DEBUG(dbgs() << " "); + LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs())); + LLVM_DEBUG(dbgs() << "\n"); + + // Erase the old instructions for the block. + I->eraseFromParent(); + PrevI->eraseFromParent(); + Update->eraseFromParent(); + + return NextI; +} + bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset) { @@ -2065,6 +2171,31 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, return false; } +bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI, + MachineInstr &MI, + unsigned IndexReg, + unsigned &Offset) { + // The update instruction source and destination register must be the + // same as the load/store index register. + if (MI.getOpcode() == AArch64::MOVKWi && + TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) { + + // movz + movk hold a large offset of a Ld/St instruction. + MachineBasicBlock::iterator B = MI.getParent()->begin(); + MachineBasicBlock::iterator MBBI = &MI; + MBBI = prev_nodbg(MBBI, B); + MachineInstr &MovzMI = *MBBI; + if (MovzMI.getOpcode() == AArch64::MOVZWi) { + unsigned Low = MovzMI.getOperand(1).getImm(); + unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm(); + Offset = High + Low; + // 12-bit optionally shifted immediates are legal for adds. + return Offset >> 24 == 0; + } + } + return false; +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); @@ -2220,6 +2351,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; } +MachineBasicBlock::iterator +AArch64LoadStoreOpt::findMatchingConstOffsetBackward( + MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) { + MachineBasicBlock::iterator B = I->getParent()->begin(); + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr &MemMI = *I; + MachineBasicBlock::iterator MBBI = I; + + // If the load is the first instruction in the block, there's obviously + // not any matching load or store. + if (MBBI == B) + return E; + + // Make sure the IndexReg is killed and the shift amount is zero. + // TODO: Relex this restriction to extend, simplify processing now. + if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() || + !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() || + (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0)) + return E; + + Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); + + // Track which register units have been modified and used between the first + // insn (inclusive) and the second insn. + ModifiedRegUnits.clear(); + UsedRegUnits.clear(); + unsigned Count = 0; + do { + MBBI = prev_nodbg(MBBI, B); + MachineInstr &MI = *MBBI; + + // Don't count transient instructions towards the search limit since there + // may be different numbers of them if e.g. debug information is present. + if (!MI.isTransient()) + ++Count; + + // If we found a match, return it. + if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) { + return MBBI; + } + + // Update the status of what the instruction clobbered and used. + LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); + + // Otherwise, if the index register is used or modified, we have no match, + // so return early. + if (!ModifiedRegUnits.available(IndexReg) || + !UsedRegUnits.available(IndexReg)) + return E; + + } while (MBBI != B && Count < Limit); + return E; +} + bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; @@ -2404,6 +2589,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate return false; } +bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, + int Scale) { + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); + MachineBasicBlock::iterator Update; + + // Don't know how to handle unscaled pre/post-index versions below, so bail. + if (TII->hasUnscaledLdStOffset(MI.getOpcode())) + return false; + + // Look back to try to find a const offset for index LdSt instruction. For + // example, + // mov x8, #LargeImm ; = a * (1<<12) + imm12 + // ldr x1, [x0, x8] + // merged into: + // add x8, x0, a * (1<<12) + // ldr x1, [x8, imm12] + unsigned Offset; + Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset); + if (Update != E && (Offset & (Scale - 1)) == 0) { + // Merge the imm12 into the ld/st. + MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale); + return true; + } + + return false; +} + bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { @@ -2482,6 +2695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; } + // 5) Find a register assigned with a const value that can be combined with + // into the load or store. e.g., + // mov x8, #LargeImm ; = a * (1<<12) + imm12 + // ldr x1, [x0, x8] + // ; becomes + // add x8, x0, a * (1<<12) + // ldr x1, [x8, imm12] + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + int Scale; + if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale)) + Modified = true; + else + ++MBBI; + } + return Modified; } diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll index d39029163a47aa..2181eaaee7db68 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll @@ -214,9 +214,8 @@ define void @t17(i64 %a) { define i8 @LdOffset_i8(ptr %a) { ; CHECK-LABEL: LdOffset_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #56952 // =0xde78 -; CHECK-NEXT: movk w8, #15, lsl #16 -; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 +; CHECK-NEXT: ldrb w0, [x8, #3704] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -227,9 +226,8 @@ define i8 @LdOffset_i8(ptr %a) { define i32 @LdOffset_i8_zext32(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #56952 // =0xde78 -; CHECK-NEXT: movk w8, #15, lsl #16 -; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 +; CHECK-NEXT: ldrb w0, [x8, #3704] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -255,9 +253,8 @@ define i32 @LdOffset_i8_sext32(ptr %a) { define i64 @LdOffset_i8_zext64(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #56952 // =0xde78 -; CHECK-NEXT: movk w8, #15, lsl #16 -; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 +; CHECK-NEXT: ldrb w0, [x8, #3704] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir index 488f1ffdb52f3b..15b6700398ea08 100755 --- a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir +++ b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir @@ -14,9 +14,8 @@ body: | ; CHECK-LABEL: name: LdOffset ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0 - ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 - ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0 + ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12 + ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704 ; CHECK-NEXT: RET undef $lr, implicit $w0 renamable $w8 = MOVZWi 56952, 0 renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8