Arm64: Have CpBlkUnroll and InitBlkUnroll use SIMD registers (#68085)
* Arm64: Have CpBlkUnroll and InitBlkUnroll use SIMD registers

Do not restrict SIMD registers to memory that is 16B aligned.
Motivation: https://godbolt.org/z/eb53xPvYT

* also adjust the LSRA for SIMD registers

* cleanup

* jit formatting
kunalspathak authored Apr 22, 2022
1 parent 7f01fbd commit 12c06c0
Showing 2 changed files with 14 additions and 33 deletions.
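In rough terms, the commit replaces "use SIMD only when the destination's 16-byte alignment is statically known" with "use SIMD whenever the block is large enough". A minimal standalone C++ sketch of that predicate change (names are illustrative; as the InitBlk diff below shows, the real check still requires a 16-byte-aligned destination and encodable offsets before choosing 16-byte-wide stores):

#include <cstdio>

const int FP_REGSIZE_BYTES = 16; // width of an ARM64 SIMD (Q) register

// Before: a SIMD register was considered only when the destination was
// SP- or FP-based, i.e. when its 16-byte alignment was statically known.
bool oldHasAvailableSimdReg(int size, bool isDstRegAddrAlignmentKnown)
{
    return isDstRegAddrAlignmentKnown && (size > FP_REGSIZE_BYTES);
}

// After: block size alone decides whether a SIMD register is worth using.
bool newHasAvailableSimdReg(int size)
{
    return size > FP_REGSIZE_BYTES;
}

int main()
{
    // Zero-initializing 32 bytes through an arbitrary pointer: the unknown
    // alignment used to disqualify SIMD; now it no longer does.
    printf("old: %d, new: %d\n", oldHasAvailableSimdReg(32, false), newHasAvailableSimdReg(32));
    return 0;
}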
36 changes: 10 additions & 26 deletions src/coreclr/jit/codegenarmarch.cpp
@@ -2631,18 +2631,16 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         srcReg = REG_ZR;
     }
 
-    regNumber dstReg                     = dstAddrBaseReg;
-    int       dstRegAddrAlignment        = 0;
-    bool      isDstRegAddrAlignmentKnown = false;
+    regNumber dstReg              = dstAddrBaseReg;
+    int       dstRegAddrAlignment = 0;
 
     if (dstLclNum != BAD_VAR_NUM)
     {
         bool fpBased;
         const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &fpBased);
 
-        dstReg                     = fpBased ? REG_FPBASE : REG_SPBASE;
-        dstRegAddrAlignment        = fpBased ? (genSPtoFPdelta() % 16) : 0;
-        isDstRegAddrAlignmentKnown = true;
+        dstReg              = fpBased ? REG_FPBASE : REG_SPBASE;
+        dstRegAddrAlignment = fpBased ? (genSPtoFPdelta() % 16) : 0;
 
         helper.SetDstOffset(baseAddr + dstOffset);
     }
@@ -2664,11 +2662,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 
     bool shouldUse16ByteWideInstrs = false;
 
-    // Store operations that cross a 16-byte boundary reduce bandwidth or incur additional latency.
-    // The following condition prevents using 16-byte stores when dstRegAddrAlignment is:
-    // 1) unknown (i.e. dstReg is neither FP nor SP) or
-    // 2) non-zero (i.e. dstRegAddr is not 16-byte aligned).
-    const bool hasAvailableSimdReg = isDstRegAddrAlignmentKnown && (size > FP_REGSIZE_BYTES);
+    const bool hasAvailableSimdReg = (size > FP_REGSIZE_BYTES);
     const bool canUse16ByteWideInstrs =
         hasAvailableSimdReg && (dstRegAddrAlignment == 0) && helper.CanEncodeAllOffsets(FP_REGSIZE_BYTES);
 
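For intuition about the size > FP_REGSIZE_BYTES threshold above: once a block is wider than a single Q register, each additional 16-byte chunk can be initialized by one SIMD store. A hedged sketch of the decomposition idea, not CodeGen's actual emitter (which prefers stp pairs, handles tails smaller than 8 bytes, and may still fall back to GPRs):

#include <cstdio>

int main()
{
    const int size   = 40; // example block size in bytes
    int       offset = 0;

    // 16-byte chunks: one store of a zeroed SIMD register each
    // (the real code would emit "stp q0, q0" for 32-byte chunks).
    while (size - offset >= 16)
    {
        printf("str q0, [dst, #%d]\n", offset);
        offset += 16;
    }

    // 8-byte tail: the zero register makes a SIMD register unnecessary.
    // Tails smaller than 8 bytes are ignored here for brevity.
    while (size - offset >= 8)
    {
        printf("str xzr, [dst, #%d]\n", offset);
        offset += 8;
    }
    return 0;
}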
@@ -2819,35 +2813,26 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
 
 #ifdef TARGET_ARM64
     CopyBlockUnrollHelper helper(srcOffset, dstOffset, size);
-
-    regNumber srcReg                     = srcAddrBaseReg;
-    int       srcRegAddrAlignment        = 0;
-    bool      isSrcRegAddrAlignmentKnown = false;
+    regNumber             srcReg = srcAddrBaseReg;
 
     if (srcLclNum != BAD_VAR_NUM)
     {
         bool fpBased;
         const int baseAddr = compiler->lvaFrameAddress(srcLclNum, &fpBased);
 
-        srcReg                     = fpBased ? REG_FPBASE : REG_SPBASE;
-        srcRegAddrAlignment        = fpBased ? (genSPtoFPdelta() % 16) : 0;
-        isSrcRegAddrAlignmentKnown = true;
+        srcReg = fpBased ? REG_FPBASE : REG_SPBASE;
 
         helper.SetSrcOffset(baseAddr + srcOffset);
     }
 
-    regNumber dstReg                     = dstAddrBaseReg;
-    int       dstRegAddrAlignment        = 0;
-    bool      isDstRegAddrAlignmentKnown = false;
+    regNumber dstReg = dstAddrBaseReg;
 
     if (dstLclNum != BAD_VAR_NUM)
     {
         bool fpBased;
         const int baseAddr = compiler->lvaFrameAddress(dstLclNum, &fpBased);
 
-        dstReg                     = fpBased ? REG_FPBASE : REG_SPBASE;
-        dstRegAddrAlignment        = fpBased ? (genSPtoFPdelta() % 16) : 0;
-        isDstRegAddrAlignmentKnown = true;
+        dstReg = fpBased ? REG_FPBASE : REG_SPBASE;
 
         helper.SetDstOffset(baseAddr + dstOffset);
     }
@@ -2908,8 +2893,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     // known and the block size is larger than a single SIMD register size (i.e. when using SIMD instructions can
     // be profitable).
 
-    const bool canUse16ByteWideInstrs = isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown &&
-                                        (size >= 2 * FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment);
+    const bool canUse16ByteWideInstrs = (size >= 2 * FP_REGSIZE_BYTES);
 
     bool shouldUse16ByteWideInstrs = false;
 
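The relaxed CpBlk gate is now a pure size test; the 2 * FP_REGSIZE_BYTES threshold corresponds to one SIMD ldp/stp pair, which moves 32 bytes per load or store. A minimal model under that reading (the separate offset-encoding and profitability checks are omitted):

#include <cassert>

const int FP_REGSIZE_BYTES = 16;

// Relaxed copy gate: worthwhile as soon as one full Q-register pair fits.
bool canUse16ByteWideInstrs(int size)
{
    return size >= 2 * FP_REGSIZE_BYTES;
}

int main()
{
    assert(!canUse16ByteWideInstrs(24)); // less than one ldp/stp Q pair
    assert(canUse16ByteWideInstrs(32));  // e.g. copying a 32-byte struct
    return 0;
}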
11 changes: 4 additions & 7 deletions src/coreclr/jit/lsraarmarch.cpp
@@ -212,7 +212,7 @@ int LinearScan::BuildCall(GenTreeCall* call)
         // the target. We do not handle these constraints on the same
         // refposition too well so we help ourselves a bit here by forcing the
         // null check with LR.
-        regMaskTP candidates = call->IsFastTailCall() ? RBM_LR : 0;
+        regMaskTP candidates = call->IsFastTailCall() ? RBM_LR : RBM_NONE;
         buildInternalIntRegisterDefForNode(call, candidates);
     }
 #endif // TARGET_ARM
@@ -632,9 +632,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
         buildInternalIntRegisterDefForNode(blkNode);
     }
 
-    const bool isDstRegAddrAlignmentKnown = dstAddr->OperIsLocalAddr();
-
-    if (isDstRegAddrAlignmentKnown && (size > FP_REGSIZE_BYTES))
+    if (size > FP_REGSIZE_BYTES)
     {
         // For larger block sizes CodeGen can choose to use 16-byte SIMD instructions.
         buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
@@ -710,10 +708,9 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
         ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr());
     const bool isDstAddrLocal = dstAddr->OperIsLocalAddr();
 
-    // CodeGen can use 16-byte SIMD ldp/stp for larger block sizes
-    // only when both source and destination base address registers have known alignment.
-    bool canUse16ByteWideInstrs = isSrcAddrLocal && isDstAddrLocal && (size >= 2 * FP_REGSIZE_BYTES);
+    // CodeGen can use 16-byte SIMD ldp/stp for larger block sizes.
+    // This is the case, when both registers are either sp or fp.
+    bool canUse16ByteWideInstrs = (size >= 2 * FP_REGSIZE_BYTES);
 
     // Note that the SIMD registers allocation is speculative - LSRA doesn't know at this point
     // whether CodeGen will use SIMD registers (i.e. if such instruction sequence will be more optimal).
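The "speculative" note above explains why LSRA can be this permissive: register requirements are fixed before CodeGen commits to an instruction sequence, so LSRA reserves an internal SIMD register for every block store where SIMD might win, and CodeGen is free to leave it unused. A hypothetical model of that two-phase handshake (the types and function names below are stand-ins, not the JIT's API):

#include <cstdio>

const int FP_REGSIZE_BYTES = 16;

struct BlockStoreNode // hypothetical stand-in for GenTreeBlk
{
    int  size;
    bool internalSimdReserved = false;
};

// LSRA side: reserve whenever SIMD *might* pay off; this mirrors the role of
// buildInternalFloatRegisterDefForNode, not its actual signature.
void buildBlockStore(BlockStoreNode& node)
{
    if (node.size >= 2 * FP_REGSIZE_BYTES)
    {
        node.internalSimdReserved = true;
    }
}

// CodeGen side: may still pick GPR pairs, in which case the reserved SIMD
// register simply goes unused.
void genCodeForBlockStore(const BlockStoreNode& node, bool simdProfitable)
{
    if (node.internalSimdReserved && simdProfitable)
    {
        printf("emit SIMD ldp/stp for %d bytes\n", node.size);
    }
    else
    {
        printf("emit GPR ldp/stp for %d bytes\n", node.size);
    }
}

int main()
{
    BlockStoreNode node{48};
    buildBlockStore(node);
    genCodeForBlockStore(node, /* simdProfitable */ true);
    return 0;
}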