From a930ef591d905cb9212991e5dbbccc561eea78ba Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Wed, 15 May 2024 23:25:44 +0200 Subject: [PATCH] JIT: Reorder stores to make them amenable to stp optimization (#102133) This generalizes the indir reordering optimization (that currently only triggers for loads) to kick in for GT_STOREIND nodes. The main complication with doing this is the fact that the data node of the second indirection needs its own reordering with the previous indirection. The existing logic works by reordering all nodes between the first and second indirection that are unrelated to the second indirection's computation to happen after it. Once that is done we know that there are no uses of the first indirection's result between it and the second indirection, so after doing the necessary interference checks we can safely move the previous indirection to happen after the data node of the second indirection. Example: ```csharp class Body { public double x, y, z, vx, vy, vz, mass; } static void Advance(double dt, Body[] bodies) { foreach (Body b in bodies) { b.x += dt * b.vx; b.y += dt * b.vy; b.z += dt * b.vz; } } ``` Diff: ```diff @@ -1,18 +1,17 @@ -G_M55007_IG04: ;; offset=0x001C +G_M55007_IG04: ;; offset=0x0020 ldr x3, [x0, w1, UXTW #3] ldp d16, d17, [x3, #0x08] ldp d18, d19, [x3, #0x20] fmul d18, d0, d18 fadd d16, d16, d18 - str d16, [x3, #0x08] - fmul d16, d0, d19 - fadd d16, d17, d16 - str d16, [x3, #0x10] + fmul d18, d0, d19 + fadd d17, d17, d18 + stp d16, d17, [x3, #0x08] ldr d16, [x3, #0x18] ldr d17, [x3, #0x30] fmul d17, d0, d17 fadd d16, d16, d17 str d16, [x3, #0x18] add w1, w1, #1 cmp w2, w1 bgt G_M55007_IG04 ``` --- src/coreclr/jit/lower.cpp | 134 +++++++++++++++++++++------ src/coreclr/jit/lower.h | 6 +- src/coreclr/jit/lowerarmarch.cpp | 14 ++- src/coreclr/jit/lowerloongarch64.cpp | 5 +- src/coreclr/jit/lowerriscv64.cpp | 5 +- src/coreclr/jit/lowerxarch.cpp | 14 +-- 6 files changed, 137 insertions(+), 41 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 1c2ae95a65d2..91e52f3bad5c 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -414,8 +414,7 @@ GenTree* Lowering::LowerNode(GenTree* node) } case GT_STOREIND: - LowerStoreIndirCommon(node->AsStoreInd()); - break; + return LowerStoreIndirCommon(node->AsStoreInd()); case GT_ADD: { @@ -8895,7 +8894,10 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) // Arguments: // ind - the store indirection node we are lowering. // -void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind) +// Return Value: +// Next node to lower. +// +GenTree* Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind) { assert(ind->TypeGet() != TYP_STRUCT); @@ -8910,28 +8912,30 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind) #endif TryCreateAddrMode(ind->Addr(), isContainable, ind); - if (!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind)) + if (comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind)) { + return ind->gtNext; + } + #ifndef TARGET_XARCH - if (ind->Data()->IsIconHandle(GTF_ICON_OBJ_HDL)) + if (ind->Data()->IsIconHandle(GTF_ICON_OBJ_HDL)) + { + const ssize_t handle = ind->Data()->AsIntCon()->IconValue(); + if (!comp->info.compCompHnd->isObjectImmutable(reinterpret_cast(handle))) { - const ssize_t handle = ind->Data()->AsIntCon()->IconValue(); - if (!comp->info.compCompHnd->isObjectImmutable(reinterpret_cast(handle))) - { - // On platforms with weaker memory model we need to make sure we use a store with the release semantic - // when we publish a potentially mutable object - // See relevant discussions https://github.com/dotnet/runtime/pull/76135#issuecomment-1257258310 and - // https://github.com/dotnet/runtime/pull/76112#discussion_r980639782 + // On platforms with weaker memory model we need to make sure we use a store with the release semantic + // when we publish a potentially mutable object + // See relevant discussions https://github.com/dotnet/runtime/pull/76135#issuecomment-1257258310 and + // https://github.com/dotnet/runtime/pull/76112#discussion_r980639782 - // This can be relaxed to "just make sure to use stlr/memory barrier" if needed - ind->gtFlags |= GTF_IND_VOLATILE; - } + // This can be relaxed to "just make sure to use stlr/memory barrier" if needed + ind->gtFlags |= GTF_IND_VOLATILE; } + } #endif - LowerStoreIndirCoalescing(ind); - LowerStoreIndir(ind); - } + LowerStoreIndirCoalescing(ind); + return LowerStoreIndir(ind); } //------------------------------------------------------------------------ @@ -9014,7 +9018,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind) #ifdef TARGET_ARM64 if (comp->opts.OptimizationEnabled() && ind->OperIs(GT_IND)) { - OptimizeForLdp(ind); + OptimizeForLdpStp(ind); } #endif @@ -9029,7 +9033,7 @@ GenTree* Lowering::LowerIndir(GenTreeIndir* ind) // cases passing the distance check, but 82 out of these 112 extra cases were // then rejected due to interference. So 16 seems like a good number to balance // the throughput costs. -const int LDP_REORDERING_MAX_DISTANCE = 16; +const int LDP_STP_REORDERING_MAX_DISTANCE = 16; //------------------------------------------------------------------------ // OptimizeForLdp: Record information about an indirection, and try to optimize @@ -9042,7 +9046,7 @@ const int LDP_REORDERING_MAX_DISTANCE = 16; // Returns: // True if the optimization was successful. // -bool Lowering::OptimizeForLdp(GenTreeIndir* ind) +bool Lowering::OptimizeForLdpStp(GenTreeIndir* ind) { if (!ind->TypeIs(TYP_INT, TYP_LONG, TYP_FLOAT, TYP_DOUBLE, TYP_SIMD8, TYP_SIMD16) || ind->IsVolatile()) { @@ -9060,7 +9064,7 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind) // Every indirection takes an expected 2+ nodes, so we only expect at most // half the reordering distance to be candidates for the optimization. - int maxCount = min(m_blockIndirs.Height(), LDP_REORDERING_MAX_DISTANCE / 2); + int maxCount = min(m_blockIndirs.Height(), LDP_STP_REORDERING_MAX_DISTANCE / 2); for (int i = 0; i < maxCount; i++) { SavedIndir& prev = m_blockIndirs.TopRef(i); @@ -9075,11 +9079,22 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind) continue; } + if (prevIndir->gtNext == nullptr) + { + // Deleted by other optimization + continue; + } + + if (prevIndir->OperIsStore() != ind->OperIsStore()) + { + continue; + } + JITDUMP("[%06u] and [%06u] are indirs off the same base with offsets +%03u and +%03u\n", Compiler::dspTreeID(ind), Compiler::dspTreeID(prevIndir), (unsigned)offs, (unsigned)prev.Offset); if (std::abs(offs - prev.Offset) == genTypeSize(ind)) { - JITDUMP(" ..and they are amenable to ldp optimization\n"); + JITDUMP(" ..and they are amenable to ldp/stp optimization\n"); if (TryMakeIndirsAdjacent(prevIndir, ind)) { // Do not let the previous one participate in @@ -9115,7 +9130,7 @@ bool Lowering::OptimizeForLdp(GenTreeIndir* ind) bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir) { GenTree* cur = prevIndir; - for (int i = 0; i < LDP_REORDERING_MAX_DISTANCE; i++) + for (int i = 0; i < LDP_STP_REORDERING_MAX_DISTANCE; i++) { cur = cur->gtNext; if (cur == indir) @@ -9172,8 +9187,16 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi INDEBUG(dumpWithMarks()); JITDUMP("\n"); + if ((prevIndir->gtLIRFlags & LIR::Flags::Mark) != 0) + { + JITDUMP("Previous indir is part of the data flow of current indir\n"); + UnmarkTree(indir); + return false; + } + m_scratchSideEffects.Clear(); + bool sawData = false; for (GenTree* cur = prevIndir->gtNext; cur != indir; cur = cur->gtNext) { if ((cur->gtLIRFlags & LIR::Flags::Mark) != 0) @@ -9186,6 +9209,11 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi UnmarkTree(indir); return false; } + + if (indir->OperIsStore()) + { + sawData |= cur == indir->Data(); + } } else { @@ -9197,6 +9225,13 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi if (m_scratchSideEffects.InterferesWith(comp, indir, true)) { + if (!indir->OperIsLoad()) + { + JITDUMP("Have conservative interference with last store. Giving up.\n"); + UnmarkTree(indir); + return false; + } + // Try a bit harder, making use of the following facts: // // 1. We know the indir is non-faulting, so we do not need to worry @@ -9293,8 +9328,39 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi } } - JITDUMP("Interference checks passed. Moving nodes that are not part of data flow of [%06u]\n\n", - Compiler::dspTreeID(indir)); + JITDUMP("Interference checks passed: can move unrelated nodes past second indir.\n"); + + if (sawData) + { + // If the data node of 'indir' is between 'prevIndir' and 'indir' then + // try to move the previous indir up to happen after the data + // computation. We will be moving all nodes unrelated to the data flow + // past 'indir', so we only need to check interference between + // 'prevIndir' and all nodes that are part of 'indir's dataflow. + m_scratchSideEffects.Clear(); + m_scratchSideEffects.AddNode(comp, prevIndir); + + for (GenTree* cur = prevIndir->gtNext;; cur = cur->gtNext) + { + if ((cur->gtLIRFlags & LIR::Flags::Mark) != 0) + { + if (m_scratchSideEffects.InterferesWith(comp, cur, true)) + { + JITDUMP("Cannot move prev indir [%06u] up past [%06u] to get it past the data computation\n", + Compiler::dspTreeID(prevIndir), Compiler::dspTreeID(cur)); + UnmarkTree(indir); + return false; + } + } + + if (cur == indir->Data()) + { + break; + } + } + } + + JITDUMP("Moving nodes that are not part of data flow of [%06u]\n\n", Compiler::dspTreeID(indir)); GenTree* previous = prevIndir; for (GenTree* node = prevIndir->gtNext;;) @@ -9317,6 +9383,22 @@ bool Lowering::TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indi node = next; } + if (sawData) + { + // For some reason LSRA is not able to reuse a constant if both LIR + // temps are live simultaneously, so skip moving in those cases and + // expect LSRA to reuse the constant instead. + if (indir->Data()->OperIs(GT_CNS_INT, GT_CNS_DBL) && GenTree::Compare(indir->Data(), prevIndir->Data())) + { + JITDUMP("Not moving previous indir since we are expecting constant reuse for the data\n"); + } + else + { + BlockRange().Remove(prevIndir); + BlockRange().InsertAfter(indir->Data(), prevIndir); + } + } + JITDUMP("Result:\n\n"); INDEBUG(dumpWithMarks()); JITDUMP("\n"); diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index dc32f7941f53..898bc8fef538 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -345,13 +345,13 @@ class Lowering final : public Phase bool GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescingData* data) const; // Per tree node member functions - void LowerStoreIndirCommon(GenTreeStoreInd* ind); + GenTree* LowerStoreIndirCommon(GenTreeStoreInd* ind); GenTree* LowerIndir(GenTreeIndir* ind); - bool OptimizeForLdp(GenTreeIndir* ind); + bool OptimizeForLdpStp(GenTreeIndir* ind); bool TryMakeIndirsAdjacent(GenTreeIndir* prevIndir, GenTreeIndir* indir); void MarkTree(GenTree* root); void UnmarkTree(GenTree* root); - void LowerStoreIndir(GenTreeStoreInd* node); + GenTree* LowerStoreIndir(GenTreeStoreInd* node); void LowerStoreIndirCoalescing(GenTreeIndir* node); GenTree* LowerAdd(GenTreeOp* node); GenTree* LowerMul(GenTreeOp* mul); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 3a32d9003e1a..09466230e7ba 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -491,11 +491,21 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) // node - The indirect store node (GT_STORE_IND) of interest // // Return Value: -// None. +// Next node to lower. // -void Lowering::LowerStoreIndir(GenTreeStoreInd* node) +GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node) { + GenTree* next = node->gtNext; ContainCheckStoreIndir(node); + +#ifdef TARGET_ARM64 + if (comp->opts.OptimizationEnabled()) + { + OptimizeForLdpStp(node); + } +#endif + + return next; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerloongarch64.cpp b/src/coreclr/jit/lowerloongarch64.cpp index d3552e478fdf..1bed44ca9e16 100644 --- a/src/coreclr/jit/lowerloongarch64.cpp +++ b/src/coreclr/jit/lowerloongarch64.cpp @@ -266,11 +266,12 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) // node - The indirect store node (GT_STORE_IND) of interest // // Return Value: -// None. +// Next node to lower. // -void Lowering::LowerStoreIndir(GenTreeStoreInd* node) +GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node) { ContainCheckStoreIndir(node); + return node->gtNext; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerriscv64.cpp b/src/coreclr/jit/lowerriscv64.cpp index 5b0bd99df484..e621b1d4f0e3 100644 --- a/src/coreclr/jit/lowerriscv64.cpp +++ b/src/coreclr/jit/lowerriscv64.cpp @@ -215,11 +215,12 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) // node - The indirect store node (GT_STORE_IND) of interest // // Return Value: -// None. +// Next node to lower. // -void Lowering::LowerStoreIndir(GenTreeStoreInd* node) +GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node) { ContainCheckStoreIndir(node); + return node->gtNext; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index dd2bc51ffcfa..e02fb9a6c2ef 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -73,9 +73,9 @@ void Lowering::LowerStoreLoc(GenTreeLclVarCommon* storeLoc) // node - The indirect store node (GT_STORE_IND) of interest // // Return Value: -// None. +// Next node to lower. // -void Lowering::LowerStoreIndir(GenTreeStoreInd* node) +GenTree* Lowering::LowerStoreIndir(GenTreeStoreInd* node) { // Mark all GT_STOREIND nodes to indicate that it is not known // whether it represents a RMW memory op. @@ -92,7 +92,7 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) // SSE2 doesn't support RMW form of instructions. if (LowerRMWMemOp(node)) { - return; + return node->gtNext; } } @@ -109,23 +109,25 @@ void Lowering::LowerStoreIndir(GenTreeStoreInd* node) { if (!node->Data()->IsCnsVec()) { - return; + return node->gtNext; } if (!node->Data()->AsVecCon()->TypeIs(TYP_SIMD32, TYP_SIMD64)) { - return; + return node->gtNext; } if (node->Data()->IsVectorAllBitsSet() || node->Data()->IsVectorZero()) { // To avoid some unexpected regression, this optimization only applies to non-all 1/0 constant vectors. - return; + return node->gtNext; } TryCompressConstVecData(node); } #endif + + return node->gtNext; } //----------------------------------------------------------------------------------------------