Update how OSR and PGO interact (#61453)
When both OSR and PGO are enabled:
* Enable instrumenting OSR methods, so that the combined profile data from
Tier0 plus any OSR variants provide a full picture for subsequent Tier1
optimization.
* Use block profiles for both Tier0 methods that are likely to have patchpoints
and OSR methods.
* Fix phase ordering so partially jitted methods don't lose probes.
* A few more fixes for partial compilation, since the number of probes we plan
to create and the number we actually end up creating can differ.
* Also improve the DumpJittedMethods output for OSR, and allow selective dumping
of a particular OSR variant by specifying its IL offset.

The updates on the runtime side are to pass BBINSTR to OSR methods, and to
handle the (typical) case where the OSR method instrumentation schema is a subset
of the Tier0 method schema.

We are still allowing OSR methods to read the profile data, so they are both
profile instrumented and profile optimized. It is not clear this will work well,
as the Tier0 data will be incomplete and optimization quality may be poor.
Something to revisit down the road.
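
For local experimentation, a minimal sketch of how this combination might be
enabled (assuming the usual COMPlus_ configuration switches; the exact set used
by the new jitosr_pgo pipeline scenario may differ):

  COMPlus_TieredCompilation=1
  COMPlus_TieredPGO=1
  COMPlus_TC_QuickJitForLoops=1
  COMPlus_TC_OnStackReplacement=1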
AndyAyersMS authored Nov 13, 2021
1 parent a4bb83a commit 26a6f55
Showing 11 changed files with 192 additions and 48 deletions.
3 changes: 3 additions & 0 deletions eng/pipelines/common/templates/runtimes/run-test-job.yml
@@ -554,7 +554,10 @@ jobs:
scenarios:
- jitosr
- jitosr_stress
- jitosr_pgo
- jitpartialcompilation
- jitpartialcompilation_osr
- jitpartialcompilation_osr_pgo
- jitobjectstackallocation
${{ if in(parameters.testGroup, 'ilasm') }}:
scenarios:
29 changes: 29 additions & 0 deletions src/coreclr/inc/pgo_formatprocessing.h
@@ -361,6 +361,35 @@ bool ReadInstrumentationSchemaWithLayout(const uint8_t *pByte, size_t cbDataMax,
});
}


// Return true if the schemaTable entries are a subset of the schema described by pByte, with matching entries in the same order.
// Also update the Offset of each matching schemaTable entry to that of the corresponding pByte schema entry.
//
inline bool CheckIfPgoSchemaIsCompatibleAndSetOffsets(const uint8_t *pByte, size_t cbDataMax, ICorJitInfo::PgoInstrumentationSchema* schemaTable, size_t cSchemas)
{
size_t nMatched = 0;
size_t initialOffset = cbDataMax;

auto handler = [schemaTable, cSchemas, &nMatched](const ICorJitInfo::PgoInstrumentationSchema& schema)
{
if ((nMatched < cSchemas)
&& (schema.InstrumentationKind == schemaTable[nMatched].InstrumentationKind)
&& (schema.ILOffset == schemaTable[nMatched].ILOffset)
&& (schema.Count == schemaTable[nMatched].Count)
&& (schema.Other == schemaTable[nMatched].Other))
{
schemaTable[nMatched].Offset = schema.Offset;
nMatched++;
}

return true;
};

ReadInstrumentationSchemaWithLayout(pByte, cbDataMax, initialOffset, handler);

return (nMatched == cSchemas);
}
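
// A minimal usage sketch (hypothetical caller and variable names, not part of
// this change): tier0SchemaData/tier0SchemaSize describe the Tier0 method's
// serialized schema, and osrSchema/osrSchemaCount are the entries the OSR
// compilation wants to use.
//
//   if (CheckIfPgoSchemaIsCompatibleAndSetOffsets(tier0SchemaData, tier0SchemaSize, osrSchema, osrSchemaCount))
//   {
//       // Each osrSchema[i].Offset now points into the Tier0 profile data
//       // slab, so the OSR method updates the same counters as Tier0.
//   }
//   else
//   {
//       // Schemas diverged; allocate fresh instrumentation data instead.
//   }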

inline bool ReadInstrumentationSchemaWithLayoutIntoSArray(const uint8_t *pByte, size_t cbDataMax, size_t initialOffset, SArray<ICorJitInfo::PgoInstrumentationSchema>* pSchemas)
{
auto lambda = [pSchemas](const ICorJitInfo::PgoInstrumentationSchema &schema)
34 changes: 29 additions & 5 deletions src/coreclr/jit/compiler.cpp
@@ -2777,6 +2777,18 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
verboseDump = (JitConfig.JitDumpTier0() > 0);
}

// Optionally suppress dumping some OSR jit requests.
//
if (verboseDump && jitFlags->IsSet(JitFlags::JIT_FLAG_OSR))
{
const int desiredOffset = JitConfig.JitDumpAtOSROffset();

if (desiredOffset != -1)
{
verboseDump = (((IL_OFFSET)desiredOffset) == info.compILEntry);
}
}
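
// For example, to dump only the OSR compilation whose IL entry offset is 0x20
// (hypothetical values; JitConfig integer settings are typically parsed as hex):
//
//   COMPlus_JitDump=Main
//   COMPlus_JitDumpAtOSROffset=20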

if (verboseDump)
{
verbose = true;
@@ -4447,6 +4459,10 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
//
DoPhase(this, PHASE_IMPORTATION, &Compiler::fgImport);

// Expand any patchpoints
//
DoPhase(this, PHASE_PATCHPOINTS, &Compiler::fgTransformPatchpoints);

// If instrumenting, add block and class probes.
//
if (compileFlags->IsSet(JitFlags::JIT_FLAG_BBINSTR))
@@ -4458,10 +4474,6 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
//
DoPhase(this, PHASE_INDXCALL, &Compiler::fgTransformIndirectCalls);

// Expand any patchpoints
//
DoPhase(this, PHASE_PATCHPOINTS, &Compiler::fgTransformPatchpoints);

// PostImportPhase: cleanup inlinees
//
auto postImportPhase = [this]() {
@@ -6375,9 +6387,21 @@ int Compiler::compCompileHelper(CORINFO_MODULE_HANDLE classPtr,
#ifdef DEBUG
if ((JitConfig.DumpJittedMethods() == 1) && !compIsForInlining())
{
enum
{
BUFSIZE = 20
};
char osrBuffer[BUFSIZE] = {0};
if (opts.IsOSR())
{
// Tiering name already includes "OSR", we just want the IL offset
//
sprintf_s(osrBuffer, BUFSIZE, " @0x%x", info.compILEntry);
}

printf("Compiling %4d %s::%s, IL size = %u, hash=0x%08x %s%s%s\n", Compiler::jitTotalMethodCompiled,
info.compClassName, info.compMethodName, info.compILCodeSize, info.compMethodHash(),
compGetTieringName(), opts.IsOSR() ? " OSR" : "", compGetStressMessage());
compGetTieringName(), osrBuffer, compGetStressMessage());
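
// The resulting dump line for an OSR compilation might look like the
// following (hypothetical values; the tiering name comes from compGetTieringName):
//
//   Compiling   42 Program::Main, IL size = 98, hash=0x1234abcd Tier1-OSR @0x1b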
}
if (compIsForInlining())
{
14 changes: 13 additions & 1 deletion src/coreclr/jit/fgopt.cpp
@@ -1553,7 +1553,18 @@ void Compiler::fgPostImportationCleanup()
//
auto addConditionalFlow = [this, entryStateVar, &entryJumpTarget](BasicBlock* fromBlock,
BasicBlock* toBlock) {
fgSplitBlockAtBeginning(fromBlock);

// We may have previously thought this try entry was unreachable, but now we're going to
// step through it on the way to the OSR entry. So ensure it has plausible profile weight.
//
if (fgHaveProfileData() && !fromBlock->hasProfileWeight())
{
JITDUMP("Updating block weight for now-reachable try entry " FMT_BB " via " FMT_BB "\n",
fromBlock->bbNum, fgFirstBB->bbNum);
fromBlock->inheritWeight(fgFirstBB);
}

BasicBlock* const newBlock = fgSplitBlockAtBeginning(fromBlock);
fromBlock->bbFlags |= BBF_INTERNAL;

GenTree* const entryStateLcl = gtNewLclvNode(entryStateVar, TYP_INT);
@@ -1565,6 +1576,7 @@ void Compiler::fgPostImportationCleanup()
fromBlock->bbJumpKind = BBJ_COND;
fromBlock->bbJumpDest = toBlock;
fgAddRefPred(toBlock, fromBlock);
newBlock->inheritWeight(fromBlock);

entryJumpTarget = fromBlock;
};
113 changes: 83 additions & 30 deletions src/coreclr/jit/fgprofile.cpp
@@ -300,10 +300,10 @@ class Instrumentor
virtual void BuildSchemaElements(BasicBlock* block, Schema& schema)
{
}
virtual void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
virtual void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
}
virtual void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
virtual void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory)
{
}
virtual void SuppressProbes()
@@ -349,8 +349,8 @@ class BlockCountInstrumentor : public Instrumentor
}
void Prepare(bool isPreImport) override;
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory) override;
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;
void InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory) override;
};

//------------------------------------------------------------------------
@@ -428,7 +428,7 @@ void BlockCountInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche
// schema -- instrumentation schema
// profileMemory -- profile data slab
//
void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
const ICorJitInfo::PgoInstrumentationSchema& entry = schema[block->bbCountSchemaIndex];

@@ -464,7 +464,7 @@ void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE*
// Notes:
// When prejitting, add the method entry callback node
//
void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, uint8_t* profileMemory)
{
Compiler::Options& opts = m_comp->opts;
Compiler::Info& info = m_comp->info;
@@ -1002,7 +1002,7 @@ class EfficientEdgeCountInstrumentor : public Instrumentor, public SpanningTreeV
return ((block->bbFlags & BBF_IMPORTED) == BBF_IMPORTED);
}
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;

void Badcode() override
{
@@ -1136,7 +1136,7 @@ void EfficientEdgeCountInstrumentor::BuildSchemaElements(BasicBlock* block, Sche
// schema -- instrumentation schema
// profileMemory -- profile data slab
//
void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
// Inlinee compilers build their blocks in the root compiler's
// graph. So for NumSucc, we use the root compiler instance.
@@ -1311,12 +1311,12 @@ class BuildClassProbeSchemaGen
class ClassProbeInserter
{
Schema& m_schema;
BYTE* m_profileMemory;
uint8_t* m_profileMemory;
int* m_currentSchemaIndex;
unsigned& m_instrCount;

public:
ClassProbeInserter(Schema& schema, BYTE* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
ClassProbeInserter(Schema& schema, uint8_t* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
: m_schema(schema)
, m_profileMemory(profileMemory)
, m_currentSchemaIndex(pCurrentSchemaIndex)
@@ -1353,7 +1353,7 @@ class ClassProbeInserter

// Figure out where the table is located.
//
BYTE* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
uint8_t* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
*m_currentSchemaIndex += 2; // There are 2 schema entries per class probe

// Grab a temp to hold the 'this' object as it will be used three times
@@ -1430,7 +1430,7 @@ class ClassProbeInstrumentor : public Instrumentor
}
void Prepare(bool isPreImport) override;
void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
void Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory) override;
void SuppressProbes() override;
};

@@ -1494,7 +1494,7 @@ void ClassProbeInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& sche
// schema -- instrumentation schema
// profileMemory -- profile data slab
//
void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* profileMemory)
{
if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
{
@@ -1567,21 +1567,43 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
// Choose instrumentation technology.
//
// We enable edge profiling by default, except when:
//
// * disabled by option
// * we are prejitting
// * we are jitting osr methods
// * we are jitting tier0 methods with patchpoints
// * we are jitting an OSR method
//
// Currently, OSR is incompatible with edge profiling. So if OSR is enabled,
// always do block profiling.
// OSR is incompatible with edge profiling. Only portions of the Tier0
// method will be executed, and the bail-outs at patchpoints won't be obvious
// exit points from the method. So for OSR we always do block profiling.
//
// Note this incompatibility only exists for methods that actually have
// patchpoints, but we won't know that until we import.
// patchpoints. Currently we will only place patchpoints in methods with
// backwards jumps.
//
// And because we want the Tier1 method to see the full set of profile data,
// when OSR is enabled, both Tier0 and any OSR methods need to contribute to
// the same profile data set. Since Tier0 has laid down a dense block-based
// schema, the OSR methods must use this schema as well.
//
// Note that OSR methods may also inline. We currently won't instrument
// any inlinee contributions (which would also need to carefully "share"
// the profile data segment with any Tier0 version and/or any other equivalent
// inlinee), so we'll lose a bit of their profile data. We can support this
// eventually if it turns out to matter.
//
// Similar issues arise with partially jitted methods. Because we currently
// only defer jitting for throw blocks, we currently ignore the impact of partial
// jitting on PGO. If we ever implement a broader pattern of deferral -- say deferring
// based on static PGO -- we will need to reconsider.
//
CLANG_FORMAT_COMMENT_ANCHOR;

const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
const bool osr = (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) && (JitConfig.TC_OnStackReplacement() > 0));
const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !osr;
const bool prejit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
const bool tier0WithPatchpoints = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) &&
(JitConfig.TC_OnStackReplacement() > 0) && compHasBackwardJump;
const bool osrMethod = opts.IsOSR();
const bool useEdgeProfiles = (JitConfig.JitEdgeProfiling() > 0) && !prejit && !tier0WithPatchpoints && !osrMethod;

if (useEdgeProfiles)
{
@@ -1590,7 +1612,9 @@ PhaseStatus Compiler::fgPrepareToInstrumentMethod()
else
{
JITDUMP("Using block profiling, because %s\n",
(JitConfig.JitEdgeProfiling() > 0) ? "edge profiles disabled" : prejit ? "prejitting" : "OSR");
(JitConfig.JitEdgeProfiling() > 0)
? "edge profiles disabled"
: prejit ? "prejitting" : osrMethod ? "OSR" : "tier0 with patchpoints");

fgCountInstrumentor = new (this, CMK_Pgo) BlockCountInstrumentor(this);
}
@@ -1640,7 +1664,7 @@ PhaseStatus Compiler::fgInstrumentMethod()
{
noway_assert(!compIsForInlining());

// Make post-import preparations.
// Make post-import preparations.
//
const bool isPreImport = false;
fgCountInstrumentor->Prepare(isPreImport);
@@ -1665,7 +1689,17 @@ PhaseStatus Compiler::fgInstrumentMethod()
// Verify we created schema for the calls needing class probes.
// (we counted those when importing)
//
assert(fgClassInstrumentor->SchemaCount() == info.compClassProbeCount);
// This is not true when we do partial compilation; it can/will erase class probes,
// and there's no easy way to figure out how many should be left.
//
if (doesMethodHavePartialCompilationPatchpoints())
{
assert(fgClassInstrumentor->SchemaCount() <= info.compClassProbeCount);
}
else
{
assert(fgClassInstrumentor->SchemaCount() == info.compClassProbeCount);
}

// Optionally, when jitting, if there were no class probes and only one count probe,
// suppress instrumentation.
@@ -1698,11 +1732,16 @@ PhaseStatus Compiler::fgInstrumentMethod()

assert(schema.size() > 0);

// Allocate the profile buffer
// Allocate/retrieve the profile buffer.
//
BYTE* profileMemory;

HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
// If this is an OSR method, we should use the same buffer that the Tier0 method used.
//
// This is supported by allocPgoInstrumentationBySchema, which will verify the schema
// we provide here matches the one from Tier0, and will fill in the data offsets in
// our schema properly.
//
uint8_t* profileMemory;
HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
(UINT32)schema.size(), &profileMemory);

// Deal with allocation failures.
@@ -1924,6 +1963,14 @@ void Compiler::fgIncorporateBlockCounts()
fgSetProfileWeight(block, profileWeight);
}
}

// For OSR, give the method entry (which will be a scratch BB)
// the same weight as the OSR Entry.
//
if (opts.IsOSR())
{
fgFirstBB->inheritWeight(fgOSREntryBB);
}
}

//------------------------------------------------------------------------
Expand Down Expand Up @@ -3277,11 +3324,17 @@ void Compiler::fgComputeCalledCount(weight_t returnWeight)

BasicBlock* firstILBlock = fgFirstBB; // The first block for IL code (i.e. for the IL code at offset 0)

// Skip past any/all BBF_INTERNAL blocks that may have been added before the first real IL block.
// OSR methods can have complex entry flow, and so
// for OSR we ensure fgFirstBB has plausible profile data.
//
while (firstILBlock->bbFlags & BBF_INTERNAL)
if (!opts.IsOSR())
{
firstILBlock = firstILBlock->bbNext;
// Skip past any/all BBF_INTERNAL blocks that may have been added before the first real IL block.
//
while (firstILBlock->bbFlags & BBF_INTERNAL)
{
firstILBlock = firstILBlock->bbNext;
}
}

// The 'firstILBlock' is now expected to have a profile-derived weight