From d09e07f0cc39673211d500f7f42bcfe43f72a139 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 4 Aug 2023 16:18:20 -0700 Subject: [PATCH 1/3] proof of concept --- src/coreclr/jit/compiler.cpp | 3 + src/coreclr/jit/compiler.h | 4 + src/coreclr/jit/compphases.h | 1 + src/coreclr/jit/fgopt.cpp | 307 ++++++++++++++++++++++++++++++ src/coreclr/jit/jitconfigvalues.h | 3 + 5 files changed, 318 insertions(+) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index a04bd56057abb0..dac2f126f1cc51 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -4746,6 +4746,9 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl // DoPhase(this, PHASE_STR_ADRLCL, &Compiler::fgMarkAddressExposedLocals); + // Tail duplication + DoPhase(this, PHASE_TAIL_DUP, &Compiler::fgTailDuplicate); + // Do an early pass of liveness for forward sub and morph. This data is // valid until after morph. // diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 3214a8a581c82e..e61b2795879915 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -5521,6 +5521,10 @@ class Compiler PhaseStatus fgTailMerge(); + PhaseStatus fgTailDuplicate(); + bool fgTailDuplicateBlock(BasicBlock* const block); + bool fgLocalIsConstantOut(GenTreeLclVarCommon* lcl, BasicBlock* const block); + enum FG_RELOCATE_TYPE { FG_RELOCATE_TRY, // relocate the 'try' region diff --git a/src/coreclr/jit/compphases.h b/src/coreclr/jit/compphases.h index 9fe31ca846410e..90901d3ecfbde6 100644 --- a/src/coreclr/jit/compphases.h +++ b/src/coreclr/jit/compphases.h @@ -56,6 +56,7 @@ CompPhaseNameMacro(PHASE_COMPUTE_EDGE_WEIGHTS, "Compute edge weights (1, f CompPhaseNameMacro(PHASE_CREATE_FUNCLETS, "Create EH funclets", false, -1, false) #endif // FEATURE_EH_FUNCLETS CompPhaseNameMacro(PHASE_TAIL_MERGE, "Tail merge", false, -1, false) +CompPhaseNameMacro(PHASE_TAIL_DUP, "Tail duplicate", false, -1, false) CompPhaseNameMacro(PHASE_MERGE_THROWS, "Merge throw blocks", false, -1, false) CompPhaseNameMacro(PHASE_INVERT_LOOPS, "Invert loops", false, -1, false) CompPhaseNameMacro(PHASE_TAIL_MERGE2, "Post-morph tail merge", false, -1, false) diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 480739ea693376..918b522ddb5ed3 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -7039,3 +7039,310 @@ PhaseStatus Compiler::fgTailMerge() return madeChanges ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING; } + +//------------------------------------------------------------------------ +// fgTailDuplicate: duplicate sequences of statements in block predecessors +// +// Returns: +// Suitable phase status. +// +// Notes: +// Looks for cases where moving statements from a block to its predecessor +// is likely to result in greatly simplified computations. +// +// Similar in spirit to fgOptimizeUncondBranchToSimpleCond but more general +// as it considers a wider set of computations (not just relops feeding branches). +// +// Ideally we'd do this with SSA available, but our inability to handle +// side effects and to update SSA makes that too challenging. +// +PhaseStatus Compiler::fgTailDuplicate() +{ + bool madeChanges = false; + + const bool isEnabled = JitConfig.JitEnableTailDuplication() > 0; + if (!isEnabled) + { + JITDUMP("Tail duplication disabled by JitEnableTailDuplication\n"); + return PhaseStatus::MODIFIED_NOTHING; + } + + // Must be run when local threading is enabled... + // + assert(fgNodeThreading == NodeThreading::AllLocals); + + // Visit each block + // + for (BasicBlock* const block : Blocks()) + { + madeChanges |= fgTailDuplicateBlock(block); + } + + return madeChanges ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING; +} + +bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) +{ + // We only consider blocks with muliple preds without + // any critical edges. + // + // We could handle critical edges by splitting. + // + unsigned numPreds = 0; + + for (BasicBlock* const predBlock : block->PredBlocks()) + { + if (predBlock->GetUniqueSucc() != block) + { + // critical edge + return false; + } + + numPreds++; + } + + if (numPreds < 2) + { + return false; + } + + if (block->bbJumpKind == BBJ_RETURN) + { + if (!compMethodHasRetVal()) + { + return false; + } + + if (compMethodReturnsRetBufAddr()) + { + return false; + } + + if (genActualType(info.compRetType) == TYP_STRUCT) + { + return false; + } + } + + JITDUMP("Considering tail dup in " FMT_BB ": has %u preds\n", block->bbNum, numPreds); + + bool madeChanges = false; + + // Walk statements from first. + // + Statement* const lastStmt = block->lastStmt(); + for (Statement* stmt : block->Statements()) + { + unsigned numLocal = 0; + unsigned numAllConstant = 0; + unsigned numSomeConstant = 0; + for (GenTreeLclVarCommon* lcl : stmt->LocalsTreeList()) + { + if (lcl->OperIsLocalStore()) + { + continue; + } + + JITDUMP("\nchecking [%06u]\n", dspTreeID(lcl)); + numLocal++; + unsigned numConstantThisLocal = 0; + for (BasicBlock* const predBlock : block->PredBlocks()) + { + // todo: possibly memoize this if we have repeated uses + // of the same local in this or across statements. + // + if (fgLocalIsConstantOut(lcl, predBlock)) + { + numConstantThisLocal++; + } + } + + if (numConstantThisLocal == numPreds) + { + numAllConstant++; + } + else if (numConstantThisLocal > 0) + { + numSomeConstant++; + } + } + + if (numLocal == 0) + { + // Todo: consider allowing reordering stmts... + break; + } + + if (numAllConstant == numLocal) + { + // todo: cost checks... (heuristic) + JITDUMP("All %u locals are constant in all preds, duplicating " FMT_STMT "\n", numLocal, stmt->GetID()); + + // If stmt is GT_RETURN, we need to spill to a new temp, and then dup the spill. + // + bool deleteStmt = true; + + if (stmt->GetRootNode()->OperIs(GT_RETURN)) + { + assert(compMethodHasRetVal()); + GenTree* const retNode = stmt->GetRootNode(); + GenTree* const retVal = retNode->AsOp()->gtOp1; + unsigned const retLclNum = lvaGrabTemp(true DEBUGARG("Tail dup return value temp")); + var_types const retLclType = info.compRetType; + + lvaGetDesc(retLclNum)->lvType = retLclType; + GenTree* const retTemp = gtNewLclvNode(retLclNum, retLclType); + retTemp->gtFlags |= GTF_DONT_CSE; + retNode->AsOp()->gtOp1 = retTemp; + fgSequenceLocals(stmt); + gtUpdateStmtSideEffects(stmt); + + GenTree* const retStore = gtNewTempStore(retLclNum, retVal); + stmt = gtNewStmt(retStore); + fgSequenceLocals(stmt); + deleteStmt = false; + } + + for (BasicBlock* const predBlock : block->PredBlocks()) + { + GenTree* const clone = gtCloneExpr(stmt->GetRootNode()); + noway_assert(clone); + Statement* const cloneStmt = gtNewStmt(clone); + fgInsertStmtAtEnd(predBlock, cloneStmt); + fgSequenceLocals(cloneStmt); + madeChanges = true; + + for (GenTreeLclVarCommon* clonedLcl : cloneStmt->LocalsTreeList()) + { + LclVarDsc* const clonedLclVarDsc = lvaGetDesc(clonedLcl); + clonedLclVarDsc->incLvRefCntSaturating(1, RCS_EARLY); + } + } + + if (deleteStmt) + { + for (GenTreeLclVarCommon* origLcl : stmt->LocalsTreeList()) + { + LclVarDsc* const origLclVarDsc = lvaGetDesc(origLcl); + unsigned short refCnt = origLclVarDsc->lvRefCnt(RCS_EARLY); + assert(refCnt > 0); + origLclVarDsc->setLvRefCnt(refCnt - 1, RCS_EARLY); + } + + fgRemoveStmt(block, stmt); + } + else + { + break; + } + } + else if (numAllConstant > 0) + { + // Could handle this case with a more stringent cost check + JITDUMP("%u of the %u locals are constant in all preds\n", numAllConstant, numLocal); + break; + } + else if (numSomeConstant > 0) + { + // Could handle this case with an even more stringent cost check + JITDUMP("%u of the %u locals are constant in some preds\n", numSomeConstant, numLocal); + break; + } + + if (stmt == lastStmt) + { + break; + } + } + + return madeChanges; +} + +// very similar to fgBlockEndFavorsTailDuplication +bool Compiler::fgLocalIsConstantOut(GenTreeLclVarCommon* lcl, BasicBlock* const block) +{ + // const unsigned lclNum = lcl->GetLclNum(); + + // If the local is address exposed, we currently can't optimize....? + // + // LclVarDsc* const lclDsc = lvaGetDesc(lclNum); + + // if (lclDsc->IsAddressExposed()) + //{ + // return false; + // } + + Statement* const lastStmt = block->lastStmt(); + Statement* const firstStmt = block->FirstNonPhiDef(); + + if (lastStmt != nullptr) + { + // Check up to N statements... + // + const int limit = 10; + int count = 0; + Statement* stmt = lastStmt; + + while (count < limit) + { + count++; + + if ((stmt->GetRootNode()->gtFlags & GTF_ASG) != 0) + { + JITDUMP("Checking " FMT_STMT " for constant def of V%02u\n", stmt->GetID(), lcl->GetLclNum()); + + for (GenTreeLclVarCommon* tree : stmt->LocalsTreeList()) + { + if (tree->OperIsLocalStore() && !tree->OperIsBlkOp() && (tree->GetLclNum() == lcl->GetLclNum())) + { + GenTree* const data = tree->Data(); + if (data->OperIsConst() || data->OperIs(GT_LCL_ADDR)) + { + JITDUMP("[%06u] constant\n", dspTreeID(data)); + return true; + } + else if (data->OperIsLocal()) + { + lcl = data->AsLclVarCommon(); + JITDUMP("Chaining to new local V%02u\n", lcl->GetLclNum()); + } + else + { + JITDUMP("[%06u] not constant or local\n", dspTreeID(data)); + return false; + } + } + } + } + + Statement* const prevStmt = stmt->GetPrevStmt(); + + // The statement list prev links wrap from first->last, so exit + // when we see lastStmt again, as we've now seen all statements. + // + if (prevStmt == lastStmt) + { + break; + } + + stmt = prevStmt; + } + + if (count == limit) + { + return false; + } + } + + BasicBlock* const pred = block->GetUniquePred(this); + + // Walk back in flow...(maybe pass count back...?) + // + if (pred != nullptr) + { + JITDUMP("Chaining back to " FMT_BB "\n", pred->bbNum); + return fgLocalIsConstantOut(lcl, pred); + } + + return false; +} diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index b3384031f355eb..5222d51edc0448 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -624,6 +624,9 @@ CONFIG_INTEGER(JitCFGUseDispatcher, W("JitCFGUseDispatcher"), 2) // Enable tail merging CONFIG_INTEGER(JitEnableTailMerge, W("JitEnableTailMerge"), 1) +// Enable tail duplication +CONFIG_INTEGER(JitEnableTailDuplication, W("JitEnableTailDuplication"), 1) + // Enable physical promotion CONFIG_INTEGER(JitEnablePhysicalPromotion, W("JitEnablePhysicalPromotion"), 1) From fe3b00a2dbcbc347e8a0f4e397d251db5182baab Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sat, 5 Aug 2023 10:45:25 -0700 Subject: [PATCH 2/3] fixes --- dotnet.sh | 2 +- src/coreclr/jit/fgopt.cpp | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/dotnet.sh b/dotnet.sh index a612ebac1ce1e4..54cc9bc8e7146f 100755 --- a/dotnet.sh +++ b/dotnet.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash - +set -x source="${BASH_SOURCE[0]}" # resolve $SOURCE until the file is no longer a symlink while [[ -h $source ]]; do diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 918b522ddb5ed3..7844a356f450b9 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -7058,7 +7058,10 @@ PhaseStatus Compiler::fgTailMerge() // PhaseStatus Compiler::fgTailDuplicate() { - bool madeChanges = false; + if (!opts.OptimizationEnabled()) + { + return PhaseStatus::MODIFIED_NOTHING; + } const bool isEnabled = JitConfig.JitEnableTailDuplication() > 0; if (!isEnabled) @@ -7071,6 +7074,8 @@ PhaseStatus Compiler::fgTailDuplicate() // assert(fgNodeThreading == NodeThreading::AllLocals); + bool madeChanges = false; + // Visit each block // for (BasicBlock* const block : Blocks()) @@ -7173,6 +7178,20 @@ bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) break; } + // leave last statement alone for branches and such. + // + if (stmt == lastStmt) + { + if ((block->bbJumpKind != BBJ_ALWAYS) && (block->bbJumpKind != BBJ_NONE)) + { + // if there are constants here we could split the computation off and hoist it like + // we do for returns... not clear if this is better or not. + // + JITDUMP(FMT_STMT " has control flow impact\n", stmt->GetID()); + break; + } + } + if (numAllConstant == numLocal) { // todo: cost checks... (heuristic) @@ -7239,13 +7258,15 @@ bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) else if (numAllConstant > 0) { // Could handle this case with a more stringent cost check - JITDUMP("%u of the %u locals are constant in all preds\n", numAllConstant, numLocal); + JITDUMP(FMT_STMT " %u of the %u locals are constant in all preds\n", stmt->GetID(), numAllConstant, + numLocal); break; } else if (numSomeConstant > 0) { // Could handle this case with an even more stringent cost check - JITDUMP("%u of the %u locals are constant in some preds\n", numSomeConstant, numLocal); + JITDUMP(FMT_STMT " %u of the %u locals are constant in some preds\n", stmt->GetID(), numSomeConstant, + numLocal); break; } From d0d4dcf455708f054c207f819d9d3d1e23b9006c Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sat, 5 Aug 2023 11:50:16 -0700 Subject: [PATCH 3/3] more fixes --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/fgopt.cpp | 102 ++++++++++++++++++++----------------- 2 files changed, 55 insertions(+), 49 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index e61b2795879915..3685bad132f1ec 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -5523,7 +5523,7 @@ class Compiler PhaseStatus fgTailDuplicate(); bool fgTailDuplicateBlock(BasicBlock* const block); - bool fgLocalIsConstantOut(GenTreeLclVarCommon* lcl, BasicBlock* const block); + bool fgLocalIsConstantOut(GenTreeLclVarCommon* lcl, BasicBlock* const block, unsigned limit = 10); enum FG_RELOCATE_TYPE { diff --git a/src/coreclr/jit/fgopt.cpp b/src/coreclr/jit/fgopt.cpp index 7844a356f450b9..cb68182dec8422 100644 --- a/src/coreclr/jit/fgopt.cpp +++ b/src/coreclr/jit/fgopt.cpp @@ -7103,6 +7103,12 @@ bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) return false; } + if (predBlock->isBBCallAlwaysPairTail()) + { + // this block must remain empty + return false; + } + numPreds++; } @@ -7129,7 +7135,7 @@ bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) } } - JITDUMP("Considering tail dup in " FMT_BB ": has %u preds\n", block->bbNum, numPreds); + JITDUMP("\nConsidering tail dup in " FMT_BB ": has %u preds\n", block->bbNum, numPreds); bool madeChanges = false; @@ -7138,6 +7144,7 @@ bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) Statement* const lastStmt = block->lastStmt(); for (Statement* stmt : block->Statements()) { + JITDUMP("\nScanning " FMT_STMT "\n", stmt->GetID()); unsigned numLocal = 0; unsigned numAllConstant = 0; unsigned numSomeConstant = 0; @@ -7148,7 +7155,7 @@ bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) continue; } - JITDUMP("\nchecking [%06u]\n", dspTreeID(lcl)); + JITDUMP("... checking [%06u] V%02u\n", dspTreeID(lcl), lcl->GetLclNum()); numLocal++; unsigned numConstantThisLocal = 0; for (BasicBlock* const predBlock : block->PredBlocks()) @@ -7178,25 +7185,47 @@ bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) break; } + bool tailDuplicate = false; + + if (numAllConstant == numLocal) + { + // todo: cost checks... (heuristic) + JITDUMP("All %u locals are constant in all preds, duplicating " FMT_STMT "\n", numLocal, stmt->GetID()); + tailDuplicate = true; + } + else if (numAllConstant > 0) + { + // Could handle this case with a more stringent cost check + JITDUMP(FMT_STMT " %u of the %u locals are constant in all preds\n", stmt->GetID(), numAllConstant, + numLocal); + } + else if (numSomeConstant > 0) + { + // Could handle this case with an even more stringent cost check + JITDUMP(FMT_STMT " %u of the %u locals are constant in some preds\n", stmt->GetID(), numSomeConstant, + numLocal); + } + // leave last statement alone for branches and such. // - if (stmt == lastStmt) + if ((stmt == lastStmt) && !(block->KindIs(BBJ_ALWAYS, BBJ_NONE, BBJ_RETURN))) { - if ((block->bbJumpKind != BBJ_ALWAYS) && (block->bbJumpKind != BBJ_NONE)) - { - // if there are constants here we could split the computation off and hoist it like - // we do for returns... not clear if this is better or not. - // - JITDUMP(FMT_STMT " has control flow impact\n", stmt->GetID()); - break; - } + // if there are constants here we could split the computation off and hoist it like + // we do for returns... not clear if this is better or not. + // + JITDUMP(FMT_STMT " has control flow impact\n", stmt->GetID()); + tailDuplicate = false; } - if (numAllConstant == numLocal) + if (!tailDuplicate) { - // todo: cost checks... (heuristic) - JITDUMP("All %u locals are constant in all preds, duplicating " FMT_STMT "\n", numLocal, stmt->GetID()); + // We didn't duplicate this statement, so we can't duplicate any of the ones that follow. + break; + } + if (tailDuplicate) + { + JITDUMP(" ...duplicating " FMT_STMT "\n", numLocal, stmt->GetID()); // If stmt is GT_RETURN, we need to spill to a new temp, and then dup the spill. // bool deleteStmt = true; @@ -7252,23 +7281,11 @@ bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) } else { + // Next statement is now the modified return, leave it as is. + // break; } } - else if (numAllConstant > 0) - { - // Could handle this case with a more stringent cost check - JITDUMP(FMT_STMT " %u of the %u locals are constant in all preds\n", stmt->GetID(), numAllConstant, - numLocal); - break; - } - else if (numSomeConstant > 0) - { - // Could handle this case with an even more stringent cost check - JITDUMP(FMT_STMT " %u of the %u locals are constant in some preds\n", stmt->GetID(), numSomeConstant, - numLocal); - break; - } if (stmt == lastStmt) { @@ -7280,29 +7297,18 @@ bool Compiler::fgTailDuplicateBlock(BasicBlock* const block) } // very similar to fgBlockEndFavorsTailDuplication -bool Compiler::fgLocalIsConstantOut(GenTreeLclVarCommon* lcl, BasicBlock* const block) +bool Compiler::fgLocalIsConstantOut(GenTreeLclVarCommon* lcl, BasicBlock* const block, unsigned limit) { - // const unsigned lclNum = lcl->GetLclNum(); - - // If the local is address exposed, we currently can't optimize....? - // - // LclVarDsc* const lclDsc = lvaGetDesc(lclNum); - - // if (lclDsc->IsAddressExposed()) - //{ - // return false; - // } - Statement* const lastStmt = block->lastStmt(); Statement* const firstStmt = block->FirstNonPhiDef(); + // Check up to limit statements... + // + unsigned count = 0; + if (lastStmt != nullptr) { - // Check up to N statements... - // - const int limit = 10; - int count = 0; - Statement* stmt = lastStmt; + Statement* stmt = lastStmt; while (count < limit) { @@ -7357,12 +7363,12 @@ bool Compiler::fgLocalIsConstantOut(GenTreeLclVarCommon* lcl, BasicBlock* const BasicBlock* const pred = block->GetUniquePred(this); - // Walk back in flow...(maybe pass count back...?) + // Walk back in flow // if (pred != nullptr) { - JITDUMP("Chaining back to " FMT_BB "\n", pred->bbNum); - return fgLocalIsConstantOut(lcl, pred); + JITDUMP("Chaining back to " FMT_BB " remaining limit %u\n", pred->bbNum, limit - count); + return fgLocalIsConstantOut(lcl, pred, limit - count); } return false;