[X86] Prefer lock or over mfence #106555
base: main
Conversation
@llvm/pr-subscribers-backend-x86

Author: Valentin Churavy (vchuravy)

Changes

Originally opened as https://reviews.llvm.org/D129947

LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in this direction as well, but didn't touch fence seq_cst.

Amusingly this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/

After another 2 years it doesn't look like anyone complained about the GCC switch. And there is still `__builtin_ia32_mfence` for folks who want this precise instruction.

Full diff: https://github.com/llvm/llvm-project/pull/106555.diff

3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 988966fa6a6c46..dfa534a69e7024 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -754,6 +754,10 @@ def TuningUseGLMDivSqrtCosts
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
"Target has branch hint feature">;
+def TuningAvoidMFENCE
+ : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
+ "Avoid MFENCE for fence seq_cst, and instead use lock or">;
+
//===----------------------------------------------------------------------===//
// X86 CPU Families
// TODO: Remove these - use general tuning features to determine codegen.
@@ -882,7 +886,8 @@ def ProcessorFeatures {
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningSlowDivide64,
TuningInsertVZEROUPPER,
- TuningNoDomainDelayMov];
+ TuningNoDomainDelayMov,
+ TuningAvoidMFENCE];
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -903,7 +908,8 @@ def ProcessorFeatures {
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
- TuningNoDomainDelayMov];
+ TuningNoDomainDelayMov,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);
@@ -969,7 +975,8 @@ def ProcessorFeatures {
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
- TuningNoDomainDelayBlend];
+ TuningNoDomainDelayBlend,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);
@@ -1004,7 +1011,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
@@ -1047,7 +1055,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -1076,7 +1085,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -1222,7 +1232,8 @@ def ProcessorFeatures {
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
FeatureGFNI];
- list<SubtargetFeature> TRMTuning = GLPTuning;
+ list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
+ list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPFeatures, TRMAdditionalFeatures);
@@ -1429,7 +1440,8 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningBranchFusion,
TuningSBBDepBreaking,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE];
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1509,7 +1521,8 @@ def ProcessorFeatures {
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit];
+ TuningAllowLight256Bit,
+ TuningAvoidMFENCE];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureRDPRU,
@@ -1664,7 +1677,8 @@ def : ProcModel<"nocona", GenericPostRAModel, [
],
[
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
// Intel Core 2 Solo/Duo.
@@ -1684,7 +1698,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
}
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1703,7 +1718,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f011249d295040..aade718c1efe80 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31103,7 +31103,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
- if (Subtarget.hasMFence())
+ if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a3..e8e0ee0b7ef492 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2096,7 +2096,7 @@ define i64 @nofold_fence(ptr %p) {
; CHECK-LABEL: nofold_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq $15, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
@@ -2170,7 +2170,7 @@ define i64 @fold_constant_fence(i64 %arg) {
; CHECK-LABEL: fold_constant_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq Constant(%rip), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq %rdi, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr @Constant unordered, align 8
@@ -2197,7 +2197,7 @@ define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) {
; CHECK-LABEL: fold_invariant_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq %rsi, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{}
@@ -2321,7 +2321,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O0-LABEL: fold_cmp_over_fence:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movl (%rdi), %eax
-; CHECK-O0-NEXT: mfence
+; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O0-NEXT: cmpl %eax, %esi
; CHECK-O0-NEXT: jne .LBB116_2
; CHECK-O0-NEXT: # %bb.1: # %taken
@@ -2335,7 +2335,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O3-LABEL: fold_cmp_over_fence:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movl (%rdi), %eax
-; CHECK-O3-NEXT: mfence
+; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O3-NEXT: cmpl %eax, %esi
; CHECK-O3-NEXT: jne .LBB116_2
; CHECK-O3-NEXT: # %bb.1: # %taken
@@ -882,7 +886,8 @@ def ProcessorFeatures {
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningSlowDivide64,
TuningInsertVZEROUPPER,
- TuningNoDomainDelayMov];
+ TuningNoDomainDelayMov,
+ TuningAvoidMFENCE];
Probably add TuningAvoidMFENCE to tuning for x86-64-v2 and later?
Also added it to x86-64-v1, since that ought to be equivalent to the early "core" -march values?
If we do that we might as well support it on any 64-bit capable CPU
I believe that's what GCC did.
+DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC)
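For context on why enabling this on any 64-bit capable CPU is plausible (my reading, not something the patch itself asserts): on x86, any LOCK-prefixed read-modify-write already acts as a full barrier for ordinary write-back memory, so a no-op locked `or` on a dead stack slot can stand in for `mfence`. A hand-written sketch of the idiom, purely for illustration:

```cpp
// Illustration only -- this is NOT what LLVM emits; the compiler generates the
// equivalent `lock orl $0, -N(%rsp)` directly during lowering.
static inline void full_barrier_lock_or() {
  unsigned char dummy = 0;
  // A LOCK-prefixed RMW on a private stack byte: no visible side effect,
  // but it orders all earlier loads/stores before all later ones.
  __asm__ __volatile__("lock orb $0, %0" : "+m"(dummy) : : "memory");
}

static inline void full_barrier_mfence() {
  // The instruction this patch avoids on tuned targets.
  __asm__ __volatile__("mfence" : : : "memory");
}
```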
@@ -754,6 +754,10 @@ def TuningUseGLMDivSqrtCosts
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
"Target has branch hint feature">;
+
+def TuningAvoidMFENCE
Would it be better to use the opposite definition, e.g., TuningPreferMFENCE, given that modern CPUs don't need it?
I am sympathetic to that, but I'm also following GCC's lead here.
(force-pushed from 0bfeaaf to 796b0c1)
Thank you
(force-pushed from 796b0c1 to d32af20)
On a Genoa machine (AMD EPYC 9384X), a benchmark of mine takes 14.13s to execute with seq_cst defaulting to …
(force-pushed from d32af20 to 707ca0e)
Friendly bump! Can we get another round of review on this PR? Thanks!
Builder.CreateCall(MFence, {});
// Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
// lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
Should this change be pulled out into a preliminary patch?
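For readers skimming: the quoted lines appear to be the frontend CodeGen side, switching from calling the x86 mfence intrinsic directly to emitting a generic `fence seq_cst`, so the backend gets to pick the lowering. Below is a self-contained sketch of the two approaches through LLVM's IRBuilder; the function and parameter names are mine for illustration, not the actual clang CodeGen code.

```cpp
// A standalone sketch (assumed names; not the actual clang CodeGen code) of the
// two ways a frontend can request a full barrier through LLVM's IRBuilder.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/AtomicOrdering.h"

using namespace llvm;

void emitFullBarrier(Module &M, IRBuilder<> &Builder, bool UseMFenceIntrinsic) {
  if (UseMFenceIntrinsic) {
    // Pins the lowering: llvm.x86.sse2.mfence always becomes `mfence`.
    Function *MFence =
        Intrinsic::getDeclaration(&M, Intrinsic::x86_sse2_mfence);
    Builder.CreateCall(MFence, {});
  } else {
    // Leaves the choice to the target: with TuningAvoidMFENCE set, the X86
    // backend can lower this to `lock orl $0, -N(%rsp)` instead of `mfence`.
    Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
  }
}
```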
Originally opened as https://reviews.llvm.org/D129947

LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in this direction as well, but didn't touch fence seq_cst.

Amusingly this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/

After another 2 years it doesn't look like anyone complained about the GCC switch. And there is still `__builtin_ia32_mfence` for folks who want this precise instruction.

Fixes #91731
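For completeness, a minimal source-level reproducer of the affected construct (a sketch; the `-march` choice and the stack offset in the comments are illustrative, based on the tuning lists and test updates above):

```cpp
// Compile with e.g. `clang++ -O2 -S -march=skylake fence.cpp` and look at the
// code emitted for the fence. The assembly in the comments is illustrative.
#include <atomic>

void publish(int &slot, int value) {
  slot = value;
  // Before this patch:            mfence
  // With TuningAvoidMFENCE (PR):  lock orl $0, -N(%rsp)   (dummy locked RMW)
  std::atomic_thread_fence(std::memory_order_seq_cst);
}
```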