[RISCV] Reduce LMUL when index is known when lowering insert_vector_elt #66087
Conversation
@llvm/pr-subscribers-backend-risc-v

Changes

Continuing on from #65997, if the index of insert_vector_elt is a constant then we can work out the minimum number of registers needed for the slideup and choose a smaller type to operate on. This reduces the LMUL not just for the slideup but also for the scalar insert.

Patch is 131.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66087.diff

14 Files Affected:
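For illustration, here is one of the test updates from the diff below (insertelt_c6_v8i64_0 in fixed-vectors-insert.ll): with the insert index known to be 0, the scalar insert into a <8 x i64> vector now runs under LMUL=1 instead of LMUL=4:

; CHECK-LABEL: insertelt_c6_v8i64_0:
; CHECK: # %bb.0:
; CHECK-NEXT: li a0, 6
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e64, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret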
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 45f61262faf9391..e36bffc91b91d95 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1871,11 +1871,30 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
       SlotIndex Idx = LIS->getInstructionIndex(*mi);
       for (auto &S : LI.subranges()) {
         if ((S.LaneMask & LaneMask).none()) {
+          // If Idx is 160B, and we have a subrange that isn't in
+          // %reg.subidx like so:
+          //
+          // [152r,160r)[160r,256r)
+          //
+          // Merge the two segments together so the subrange becomes:
+          //
+          // [152r,256r)
           LiveRange::iterator UseSeg = S.FindSegmentContaining(Idx);
-          LiveRange::iterator DefSeg = std::next(UseSeg);
-          S.MergeValueNumberInto(DefSeg->valno, UseSeg->valno);
+          if (UseSeg != S.end()) {
+            LiveRange::iterator DefSeg = std::next(UseSeg);
+            assert(DefSeg != S.end());
+            S.MergeValueNumberInto(DefSeg->valno, UseSeg->valno);
+          }
+          // Otherwise, it should have only one segment that starts at
+          // 160r which we should remove.
+          else {
+            assert(S.containsOneValue());
+            assert(S.begin()->start == Idx.getRegSlot());
+            S.removeSegment(S.begin());
+          }
         }
       }
+      LI.removeEmptySubRanges();
 
       // The COPY no longer has a use of %reg.
       LIS->shrinkToUses(&LI);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4ff264635cda248..db743f1c67a6232 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7345,6 +7345,32 @@ RISCVTargetLowering::lowerVectorFPExtendOrRoundLike(SDValue Op,
   return Result;
 }
 
+// Given a scalable vector type and an index into it, returns the type for the
+// smallest subvector that the index fits in. This can be used to reduce LMUL
+// for operations like vslidedown.
+//
+// E.g. With Zvl128b, index 3 in a nxv4i32 fits within the first nxv2i32.
+static std::optional<MVT>
+getSmallestVTForIndex(MVT VecVT, unsigned MaxIdx, SDLoc DL, SelectionDAG &DAG,
+                      const RISCVSubtarget &Subtarget) {
+  assert(VecVT.isScalableVector());
+  const unsigned EltSize = VecVT.getScalarSizeInBits();
+  const unsigned VectorBitsMin = Subtarget.getRealMinVLen();
+  const unsigned MinVLMAX = VectorBitsMin / EltSize;
+  MVT SmallerVT;
+  if (MaxIdx < MinVLMAX)
+    SmallerVT = getLMUL1VT(VecVT);
+  else if (MaxIdx < MinVLMAX * 2)
+    SmallerVT = getLMUL1VT(VecVT).getDoubleNumVectorElementsVT();
+  else if (MaxIdx < MinVLMAX * 4)
+    SmallerVT = getLMUL1VT(VecVT)
+                    .getDoubleNumVectorElementsVT()
+                    .getDoubleNumVectorElementsVT();
+  if (!SmallerVT.isValid() || !VecVT.bitsGT(SmallerVT))
+    return std::nullopt;
+  return SmallerVT;
+}
+
 // Custom-legalize INSERT_VECTOR_ELT so that the value is inserted into the
 // first position of a vector, and that vector is slid up to the insert index.
 // By limiting the active vector length to index+1 and merging with the
@@ -7375,6 +7401,19 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
     Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
   }
 
+  MVT OrigContainerVT = ContainerVT;
+  SDValue OrigVec = Vec;
+  // If we know the index we're going to insert at, we can shrink down Vec so
+  // we're performing the vslide1down on a smaller LMUL.
+  if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+    if (auto ShrunkVT = getSmallestVTForIndex(ContainerVT, CIdx->getZExtValue(),
+                                              DL, DAG, Subtarget)) {
+      ContainerVT = *ShrunkVT;
+      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+                        DAG.getVectorIdxConstant(0, DL));
+    }
+  }
+
   MVT XLenVT = Subtarget.getXLenVT();
 
   bool IsLegalInsert = Subtarget.is64Bit() || Val.getValueType() != MVT::i64;
@@ -7399,6 +7438,10 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
         VecVT.isFloatingPoint() ? RISCVISD::VFMV_S_F_VL : RISCVISD::VMV_S_X_VL;
     if (isNullConstant(Idx)) {
       Vec = DAG.getNode(Opc, DL, ContainerVT, Vec, Val, VL);
+
+      if (ContainerVT != OrigContainerVT)
+        Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigContainerVT, OrigVec,
+                          Vec, DAG.getVectorIdxConstant(0, DL));
       if (!VecVT.isFixedLengthVector())
         return Vec;
       return convertFromScalableVector(VecVT, Vec, DAG, Subtarget);
@@ -7431,6 +7474,10 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
     // Bitcast back to the right container type.
     ValInVec = DAG.getBitcast(ContainerVT, ValInVec);
 
+    if (ContainerVT != OrigContainerVT)
+      ValInVec =
+          DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigContainerVT, OrigVec,
+                      ValInVec, DAG.getVectorIdxConstant(0, DL));
     if (!VecVT.isFixedLengthVector())
       return ValInVec;
     return convertFromScalableVector(VecVT, ValInVec, DAG, Subtarget);
@@ -7461,37 +7508,15 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
     Policy = RISCVII::TAIL_AGNOSTIC;
   SDValue Slideup = getVSlideup(DAG, Subtarget, DL, ContainerVT, Vec, ValInVec,
                                 Idx, Mask, InsertVL, Policy);
+
+  if (ContainerVT != OrigContainerVT)
+    Slideup = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigContainerVT, OrigVec,
+                          Slideup, DAG.getVectorIdxConstant(0, DL));
   if (!VecVT.isFixedLengthVector())
     return Slideup;
   return convertFromScalableVector(VecVT, Slideup, DAG, Subtarget);
 }
 
-// Given a scalable vector type and an index into it, returns the type for the
-// smallest subvector that the index fits in. This can be used to reduce LMUL
-// for operations like vslidedown.
-//
-// E.g. With Zvl128b, index 3 in a nxv4i32 fits within the first nxv2i32.
-static std::optional<MVT>
-getSmallestVTForIndex(MVT VecVT, unsigned MaxIdx, SDLoc DL, SelectionDAG &DAG,
-                      const RISCVSubtarget &Subtarget) {
-  assert(VecVT.isScalableVector());
-  const unsigned EltSize = VecVT.getScalarSizeInBits();
-  const unsigned VectorBitsMin = Subtarget.getRealMinVLen();
-  const unsigned MinVLMAX = VectorBitsMin / EltSize;
-  MVT SmallerVT;
-  if (MaxIdx < MinVLMAX)
-    SmallerVT = getLMUL1VT(VecVT);
-  else if (MaxIdx < MinVLMAX * 2)
-    SmallerVT = getLMUL1VT(VecVT).getDoubleNumVectorElementsVT();
-  else if (MaxIdx < MinVLMAX * 4)
-    SmallerVT = getLMUL1VT(VecVT)
-                    .getDoubleNumVectorElementsVT()
-                    .getDoubleNumVectorElementsVT();
-  if (!SmallerVT.isValid() || !VecVT.bitsGT(SmallerVT))
-    return std::nullopt;
-  return SmallerVT;
-}
-
 // Custom-lower EXTRACT_VECTOR_ELT operations to slide the vector down, then
 // extract the first element: (extractelt (slidedown vec, idx), 0). For integer
 // types this is done using VMV_X_S to allow us to glean information about the
@@ -8606,6 +8631,18 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     ContainerVT = getContainerForFixedLengthVector(VecVT);
     Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
   }
+
+  // Shrink down Vec so we're performing the slideup on a smaller LMUL.
+  unsigned LastIdx = OrigIdx + SubVecVT.getVectorNumElements() - 1;
+  MVT OrigContainerVT = ContainerVT;
+  SDValue OrigVec = Vec;
+  if (auto ShrunkVT =
+          getSmallestVTForIndex(ContainerVT, LastIdx, DL, DAG, Subtarget)) {
+    ContainerVT = *ShrunkVT;
+    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+                      DAG.getVectorIdxConstant(0, DL));
+  }
+
   SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
                        DAG.getUNDEF(ContainerVT), SubVec,
                        DAG.getConstant(0, DL, XLenVT));
@@ -8636,6 +8673,12 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                      SlideupAmt, Mask, VL, Policy);
   }
 
+  // If we performed the slideup on a smaller LMUL, insert the result back
+  // into the rest of the vector.
+  if (ContainerVT != OrigContainerVT)
+    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigContainerVT, OrigVec,
+                         SubVec, DAG.getVectorIdxConstant(0, DL));
+
   if (VecVT.isFixedLengthVector())
     SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
   return DAG.getBitcast(Op.getValueType(), SubVec);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
index b3cbad3d9e6b1d7..f7737784d4ca57e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
@@ -108,7 +108,7 @@ define <64 x i1> @insertelt_v64i1(<64 x i1> %x, i1 %elt) nounwind {
 ; CHECK-NEXT: vmv.v.i v8, 0
 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT: vmv.s.x v12, a0
-; CHECK-NEXT: vsetivli zero, 2, e8, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 2, e8, m1, tu, ma
 ; CHECK-NEXT: vslideup.vi v8, v12, 1
 ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT: vand.vi v8, v8, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 1d6a45ed36f335c..6a9212ed309a8ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -14,7 +14,7 @@ define @insert_nxv8i32_v2i32_0( %vec, ptr %
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 2, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-NEXT: vmv.v.v v8, v12
 ; CHECK-NEXT: ret
 %sv = load <2 x i32>, ptr %svp
@@ -27,7 +27,7 @@ define @insert_nxv8i32_v2i32_2( %vec, ptr %
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-NEXT: vslideup.vi v8, v12, 2
 ; CHECK-NEXT: ret
 %sv = load <2 x i32>, ptr %svp
@@ -40,7 +40,7 @@ define @insert_nxv8i32_v2i32_6( %vec, ptr %
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma
 ; CHECK-NEXT: vslideup.vi v8, v12, 6
 ; CHECK-NEXT: ret
 %sv = load <2 x i32>, ptr %svp
@@ -51,22 +51,19 @@ define @insert_nxv8i32_v2i32_6( %vec, ptr %
 define @insert_nxv8i32_v8i32_0( %vec, ptr %svp) {
 ; LMULMAX2-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX2: # %bb.0:
-; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-NEXT: vle32.v v12, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX2-NEXT: vmv.v.v v8, v12
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, tu, ma
+; LMULMAX2-NEXT: vle32.v v8, (a0)
 ; LMULMAX2-NEXT: ret
 ;
 ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: addi a1, a0, 16
 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v12, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v16, (a0)
-; LMULMAX1-NEXT: vsetivli zero, 4, e32, m4, tu, ma
-; LMULMAX1-NEXT: vmv.v.v v8, v12
-; LMULMAX1-NEXT: vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX1-NEXT: vslideup.vi v8, v16, 4
+; LMULMAX1-NEXT: vle32.v v12, (a1)
+; LMULMAX1-NEXT: vsetvli zero, zero, e32, m1, tu, ma
+; LMULMAX1-NEXT: vle32.v v8, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 8, e32, m2, tu, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v12, 4
 ; LMULMAX1-NEXT: ret
 %sv = load <8 x i32>, ptr %svp
 %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 0)
@@ -84,14 +81,14 @@ define @insert_nxv8i32_v8i32_8( %vec, ptr %
 ;
 ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_8:
 ; LMULMAX1: # %bb.0:
-; LMULMAX1-NEXT: addi a1, a0, 16
 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v12, (a1)
+; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
 ; LMULMAX1-NEXT: vle32.v v16, (a0)
 ; LMULMAX1-NEXT: vsetivli zero, 12, e32, m4, tu, ma
-; LMULMAX1-NEXT: vslideup.vi v8, v16, 8
+; LMULMAX1-NEXT: vslideup.vi v8, v12, 8
 ; LMULMAX1-NEXT: vsetivli zero, 16, e32, m4, tu, ma
-; LMULMAX1-NEXT: vslideup.vi v8, v12, 12
+; LMULMAX1-NEXT: vslideup.vi v8, v16, 12
 ; LMULMAX1-NEXT: ret
 %sv = load <8 x i32>, ptr %svp
 %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 8)
@@ -166,7 +163,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; LMULMAX2-NEXT: vle32.v v8, (a1)
 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT: vle32.v v10, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; LMULMAX2-NEXT: vsetivli zero, 2, e32, m1, tu, ma
 ; LMULMAX2-NEXT: vmv.v.v v10, v8
 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT: vse32.v v10, (a0)
@@ -197,7 +194,7 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; LMULMAX2-NEXT: vle32.v v8, (a1)
 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT: vle32.v v10, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, tu, ma
 ; LMULMAX2-NEXT: vslideup.vi v10, v8, 2
 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT: vse32.v v10, (a0)
@@ -508,9 +505,9 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, * %o
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: vsetivli zero, 6, e64, m8, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 4
+; CHECK-NEXT: vle64.v v12, (a1)
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 4
 ; CHECK-NEXT: vs8r.v v8, (a2)
 ; CHECK-NEXT: ret
 %sv0 = load <2 x i64>, ptr %psv0
@@ -539,7 +536,7 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, * %out) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e64, m8, ta, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT: vslideup.vi v16, v8, 2
 ; CHECK-NEXT: vs8r.v v16, (a1)
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index 373a96356a207e2..cbcca9d2696f4ba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -40,7 +40,7 @@ define <32 x i32> @insertelt_v32i32_0(<32 x i32> %a, i32 %y) {
 ; CHECK-LABEL: insertelt_v32i32_0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
 ; CHECK-NEXT: vmv.s.x v8, a0
 ; CHECK-NEXT: ret
 %b = insertelement <32 x i32> %a, i32 %y, i32 0
@@ -54,7 +54,7 @@ define <32 x i32> @insertelt_v32i32_4(<32 x i32> %a, i32 %y) {
 ; CHECK-NEXT: li a1, 32
 ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT: vmv.s.x v16, a0
-; CHECK-NEXT: vsetivli zero, 5, e32, m8, tu, ma
+; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-NEXT: vslideup.vi v8, v16, 4
 ; CHECK-NEXT: ret
 %b = insertelement <32 x i32> %a, i32 %y, i32 4
@@ -92,7 +92,7 @@ define <64 x i32> @insertelt_v64i32_0(<64 x i32> %a, i32 %y) {
 ; CHECK-LABEL: insertelt_v64i32_0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
 ; CHECK-NEXT: vmv.s.x v8, a0
 ; CHECK-NEXT: ret
 %b = insertelement <64 x i32> %a, i32 %y, i32 0
@@ -390,7 +390,7 @@ define <8 x i64> @insertelt_v8i64_0(<8 x i64> %a, ptr %x) {
 ; CHECK-LABEL: insertelt_v8i64_0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e64, m1, tu, ma
 ; CHECK-NEXT: vmv.s.x v8, a0
 ; CHECK-NEXT: ret
 %b = insertelement <8 x i64> %a, i64 -1, i32 0
@@ -468,7 +468,7 @@ define <8 x i64> @insertelt_c6_v8i64_0(<8 x i64> %a, ptr %x) {
 ; CHECK-LABEL: insertelt_c6_v8i64_0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: li a0, 6
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e64, m1, tu, ma
 ; CHECK-NEXT: vmv.s.x v8, a0
 ; CHECK-NEXT: ret
 %b = insertelement <8 x i64> %a, i64 6, i32 0
@@ -550,9 +550,9 @@ define void @insertelt_c6_v8i64_0_add(ptr %x, ptr %y) {
 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT: vle64.v v8, (a0)
 ; CHECK-NEXT: li a2, 6
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e64, m1, tu, ma
 ; CHECK-NEXT: vmv.s.x v8, a2
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT: vle64.v v12, (a1)
 ; CHECK-NEXT: vadd.vv v8, v8, v12
 ; CHECK-NEXT: vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index f7352b4659e5a9b..5ae0884068f9bc3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -2426,14 +2426,14 @@ define <8 x i32> @mgather_v8i32(<8 x ptr> %ptrs, <8 x i1> %m, <8 x i32> %passthr
 ; RV64ZVE32F-NEXT: .LBB34_9: # %cond.load
 ; RV64ZVE32F-NEXT: ld a2, 0(a0)
 ; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m1, tu, ma
 ; RV64ZVE32F-NEXT: vmv.s.x v8, a2
 ; RV64ZVE32F-NEXT: andi a2, a1, 2
 ; RV64ZVE32F-NEXT: beqz a2, .LBB34_2
 ; RV64ZVE32F-NEXT: .LBB34_10: # %cond.load1
 ; RV64ZVE32F-NEXT: ld a2, 8(a0)
 ; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma
 ; RV64ZVE32F-NEXT: vmv.s.x v10, a2
 ; RV64ZVE32F-NEXT: vslideup.vi v8, v10, 1
 ; RV64ZVE32F-NEXT: andi a2, a1, 4
@@ -2441,7 +2441,7 @@ define <8 x i32> @mgather...
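To make the selection logic in getSmallestVTForIndex above easier to follow, here is a rough standalone sketch of the same index-to-register-group mapping over plain integers. It is illustrative only: the function name smallestLMULForIndex, the integer LMUL return value, and the OrigLMUL parameter are inventions for this example and are not part of the patch.

#include <cstdio>
#include <initializer_list>

// Illustrative sketch: given the element size in bits, the guaranteed minimum
// VLEN (e.g. 128 for Zvl128b) and the highest lane that will be written,
// return the smallest LMUL (1, 2 or 4) whose register group is known to
// contain that lane, or 0 if no reduction below the original LMUL is possible.
static unsigned smallestLMULForIndex(unsigned EltSizeBits, unsigned MinVLen,
                                     unsigned MaxIdx, unsigned OrigLMUL) {
  const unsigned MinVLMAX = MinVLen / EltSizeBits; // elements per LMUL=1 register
  unsigned LMUL = 0;
  if (MaxIdx < MinVLMAX)
    LMUL = 1;
  else if (MaxIdx < MinVLMAX * 2)
    LMUL = 2;
  else if (MaxIdx < MinVLMAX * 4)
    LMUL = 4;
  if (LMUL == 0 || LMUL >= OrigLMUL)
    return 0; // no type smaller than the original, so keep the original LMUL
  return LMUL;
}

int main() {
  // e64 elements with Zvl128b, original operation at LMUL=4 (e.g. <8 x i64>):
  // index 0 or 1 fits in an LMUL=1 group, index 2 or 3 in LMUL=2, and higher
  // indices need the full LMUL=4 group (printed as 0, meaning "no reduction").
  for (unsigned Idx : {0u, 1u, 2u, 3u, 4u, 7u})
    std::printf("index %u -> reduced LMUL %u\n",
                Idx, smallestLMULForIndex(64, 128, Idx, 4));
  return 0;
}

This mirrors the vsetivli changes in the tests above, e.g. insertelt_v8i64_0 dropping from m4 to m1 and insert_nxv8i32_v2i32_6 dropping from m4 to m2.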
It looks like maybe you pushed the wrong set of commits to this PR? The first couple of changes seem unrelated to your proposed commit message.
See prior comment regarding patch series selection
; LMULMAX2-NEXT: vle32.v v12, (a0)
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m4, tu, ma
; LMULMAX2-NEXT: vmv.v.v v8, v12
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, tu, ma
Oh, nice!
This is performCombineVMergeAndVOps kicking in, https://reviews.llvm.org/D157272 also does this
Force-pushed from bcd5eeb to 0ec8328
MVT OrigContainerVT = ContainerVT;
SDValue OrigVec = Vec;
// If we know the index we're going to insert at, we can shrink down Vec so
// we're performing the vslide1down on a smaller LMUL.
vslide1down? shouldn't that be vslideup?
Yeah, I updated the comment in 0ec8328
Similar to llvm#66267, we can perform a vslideup_vl on a smaller type if we know the highest lane that will be written to, which can be determined from VL. This is an alternative to llvm#65997 and llvm#66087
Please rebase this after landing the previous patch and removing the twoaddress change.
Force-pushed from 0ec8328 to 3399002
LGTM
Continuing on from llvm#65997, if the index of insert_vector_elt is a constant then we can work out what the minimum number of registers will be needed for the slideup and choose a smaller type to operate on. This reduces the LMUL for not just the slideup but also for the scalar insert.
if (ContainerVT != OrigContainerVT)
  Slideup = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigContainerVT, OrigVec,
                        Slideup, DAG.getVectorIdxConstant(0, DL));
if (!VecVT.isFixedLengthVector())
  return Slideup;
return convertFromScalableVector(VecVT, Slideup, DAG, Subtarget);
Making a mental note here re: #65997, looks like all the return paths here are covered by a reinsert into the correct original vector type after shrinking
Force-pushed from 3399002 to 09d1751
…lt (llvm#66087) Continuing on from llvm#65997, if the index of insert_vector_elt is a constant then we can work out what the minimum number of registers will be needed for the slideup and choose a smaller type to operate on. This reduces the LMUL for not just the slideup but also for the scalar insert.
Continuing on from #65997, if the index of insert_vector_elt is a constant then we can work out the minimum number of registers needed for the slideup and choose a smaller type to operate on.
This reduces the LMUL not just for the slideup but also for the scalar insert.
Stacked upon #65997