-
Notifications
You must be signed in to change notification settings - Fork 12.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] Prefer lock or
over mfence
#106555
base: main
Are you sure you want to change the base?
[X86] Prefer lock or
over mfence
#106555
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -754,6 +754,10 @@ def TuningUseGLMDivSqrtCosts | |
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true", | ||
"Target has branch hint feature">; | ||
|
||
def TuningAvoidMFENCE | ||
: SubtargetFeature<"avoid-mfence", "AvoidMFence", "true", | ||
vchuravy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"Avoid MFENCE for fence seq_cst, and instead use lock or">; | ||
|
||
//===----------------------------------------------------------------------===// | ||
// X86 CPU Families | ||
// TODO: Remove these - use general tuning features to determine codegen. | ||
|
@@ -815,7 +819,8 @@ def ProcessorFeatures { | |
TuningSlow3OpsLEA, | ||
TuningSlowDivide64, | ||
TuningSlowIncDec, | ||
TuningInsertVZEROUPPER | ||
TuningInsertVZEROUPPER, | ||
TuningAvoidMFENCE | ||
]; | ||
|
||
list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [ | ||
|
@@ -831,7 +836,8 @@ def ProcessorFeatures { | |
TuningFastSHLDRotate, | ||
TuningFast15ByteNOP, | ||
TuningPOPCNTFalseDeps, | ||
TuningInsertVZEROUPPER | ||
TuningInsertVZEROUPPER, | ||
TuningAvoidMFENCE | ||
]; | ||
|
||
list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [ | ||
|
@@ -850,7 +856,8 @@ def ProcessorFeatures { | |
TuningPOPCNTFalseDeps, | ||
TuningLZCNTFalseDeps, | ||
TuningInsertVZEROUPPER, | ||
TuningAllowLight256Bit | ||
TuningAllowLight256Bit, | ||
TuningAvoidMFENCE | ||
]; | ||
|
||
list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [ | ||
|
@@ -874,15 +881,17 @@ def ProcessorFeatures { | |
TuningFastGather, | ||
TuningPOPCNTFalseDeps, | ||
TuningInsertVZEROUPPER, | ||
TuningAllowLight256Bit | ||
TuningAllowLight256Bit, | ||
TuningAvoidMFENCE | ||
]; | ||
|
||
// Nehalem | ||
list<SubtargetFeature> NHMFeatures = X86_64V2Features; | ||
list<SubtargetFeature> NHMTuning = [TuningMacroFusion, | ||
TuningSlowDivide64, | ||
TuningInsertVZEROUPPER, | ||
TuningNoDomainDelayMov]; | ||
TuningNoDomainDelayMov, | ||
TuningAvoidMFENCE]; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Probably add TuningAvoidMFENCE to tuning for x86-64-v2 and later? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also added it to x86-64-v1, since that ought to be equivalent to the early "core" -march targets? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we do that, we might as well support it on any 64-bit capable CPU. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe that's what GCC did.
|
||
|
||
// Westmere | ||
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL]; | ||
|
@@ -903,7 +912,8 @@ def ProcessorFeatures { | |
TuningFast15ByteNOP, | ||
TuningPOPCNTFalseDeps, | ||
TuningInsertVZEROUPPER, | ||
TuningNoDomainDelayMov]; | ||
TuningNoDomainDelayMov, | ||
TuningAvoidMFENCE]; | ||
list<SubtargetFeature> SNBFeatures = | ||
!listconcat(WSMFeatures, SNBAdditionalFeatures); | ||
|
||
|
@@ -969,7 +979,8 @@ def ProcessorFeatures { | |
TuningAllowLight256Bit, | ||
TuningNoDomainDelayMov, | ||
TuningNoDomainDelayShuffle, | ||
TuningNoDomainDelayBlend]; | ||
TuningNoDomainDelayBlend, | ||
TuningAvoidMFENCE]; | ||
list<SubtargetFeature> SKLFeatures = | ||
!listconcat(BDWFeatures, SKLAdditionalFeatures); | ||
|
||
|
@@ -1004,7 +1015,8 @@ def ProcessorFeatures { | |
TuningNoDomainDelayMov, | ||
TuningNoDomainDelayShuffle, | ||
TuningNoDomainDelayBlend, | ||
TuningFastImmVectorShift]; | ||
TuningFastImmVectorShift, | ||
TuningAvoidMFENCE]; | ||
list<SubtargetFeature> SKXFeatures = | ||
!listconcat(BDWFeatures, SKXAdditionalFeatures); | ||
|
||
|
@@ -1047,7 +1059,8 @@ def ProcessorFeatures { | |
TuningNoDomainDelayMov, | ||
TuningNoDomainDelayShuffle, | ||
TuningNoDomainDelayBlend, | ||
TuningFastImmVectorShift]; | ||
TuningFastImmVectorShift, | ||
TuningAvoidMFENCE]; | ||
list<SubtargetFeature> CNLFeatures = | ||
!listconcat(SKLFeatures, CNLAdditionalFeatures); | ||
|
||
|
@@ -1076,7 +1089,8 @@ def ProcessorFeatures { | |
TuningNoDomainDelayMov, | ||
TuningNoDomainDelayShuffle, | ||
TuningNoDomainDelayBlend, | ||
TuningFastImmVectorShift]; | ||
TuningFastImmVectorShift, | ||
TuningAvoidMFENCE]; | ||
list<SubtargetFeature> ICLFeatures = | ||
!listconcat(CNLFeatures, ICLAdditionalFeatures); | ||
|
||
|
@@ -1222,7 +1236,8 @@ def ProcessorFeatures { | |
// Tremont | ||
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB, | ||
FeatureGFNI]; | ||
list<SubtargetFeature> TRMTuning = GLPTuning; | ||
list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE]; | ||
list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning); | ||
list<SubtargetFeature> TRMFeatures = | ||
!listconcat(GLPFeatures, TRMAdditionalFeatures); | ||
|
||
|
@@ -1400,7 +1415,8 @@ def ProcessorFeatures { | |
TuningFastImm16, | ||
TuningSBBDepBreaking, | ||
TuningSlowDivide64, | ||
TuningSlowSHLD]; | ||
TuningSlowSHLD, | ||
TuningAvoidMFENCE]; | ||
list<SubtargetFeature> BtVer2Features = | ||
!listconcat(BtVer1Features, BtVer2AdditionalFeatures); | ||
|
||
|
@@ -1429,7 +1445,8 @@ def ProcessorFeatures { | |
TuningFastScalarShiftMasks, | ||
TuningBranchFusion, | ||
TuningSBBDepBreaking, | ||
TuningInsertVZEROUPPER]; | ||
TuningInsertVZEROUPPER, | ||
TuningAvoidMFENCE]; | ||
vchuravy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// PileDriver | ||
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C, | ||
|
@@ -1509,7 +1526,8 @@ def ProcessorFeatures { | |
TuningSlowSHLD, | ||
TuningSBBDepBreaking, | ||
TuningInsertVZEROUPPER, | ||
TuningAllowLight256Bit]; | ||
TuningAllowLight256Bit, | ||
TuningAvoidMFENCE]; | ||
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB, | ||
FeatureRDPID, | ||
FeatureRDPRU, | ||
|
@@ -1697,7 +1715,8 @@ def : ProcModel<P, SandyBridgeModel, [ | |
[ | ||
TuningMacroFusion, | ||
TuningSlowUAMem16, | ||
TuningInsertVZEROUPPER | ||
TuningInsertVZEROUPPER, | ||
TuningAvoidMFENCE | ||
]>; | ||
} | ||
foreach P = ["penryn", "core_2_duo_sse4_1"] in { | ||
|
@@ -1716,7 +1735,8 @@ def : ProcModel<P, SandyBridgeModel, [ | |
[ | ||
TuningMacroFusion, | ||
TuningSlowUAMem16, | ||
TuningInsertVZEROUPPER | ||
TuningInsertVZEROUPPER, | ||
TuningAvoidMFENCE | ||
]>; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31422,21 +31422,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { | |
// otherwise, we might be able to be more aggressive on relaxed idempotent | ||
// rmw. In practice, they do not look useful, so we don't try to be | ||
// especially clever. | ||
if (SSID == SyncScope::SingleThread) | ||
// FIXME: we could just insert an ISD::MEMBARRIER here, except we are at | ||
// the IR level, so we must wrap it in an intrinsic. | ||
return nullptr; | ||
|
||
if (!Subtarget.hasMFence()) | ||
// FIXME: it might make sense to use a locked operation here but on a | ||
// different cache-line to prevent cache-line bouncing. In practice it | ||
// is probably a small win, and x86 processors without mfence are rare | ||
// enough that we do not bother. | ||
return nullptr; | ||
|
||
Function *MFence = | ||
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); | ||
Builder.CreateCall(MFence, {}); | ||
// Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct | ||
// lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence | ||
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this change be pulled out into a preliminary patch? |
||
|
||
// Finally we can emit the atomic load. | ||
LoadInst *Loaded = Builder.CreateAlignedLoad( | ||
|
@@ -31524,7 +31513,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, | |
// cross-thread fence. | ||
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && | ||
FenceSSID == SyncScope::System) { | ||
if (Subtarget.hasMFence()) | ||
if (!Subtarget.avoidMFence() && Subtarget.hasMFence()) | ||
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); | ||
|
||
SDValue Chain = Op.getOperand(0); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be better to use the opposite definition, e.g.,
TuningPreferMFENCE
given that modern CPUs don't need it? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am sympathetic to that, but I am also following GCC's lead here.