[X86] Prefer lock or over mfence (llvm#106555)
Originally opened as https://reviews.llvm.org/D129947

LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On
modern CPUs a locked `or` to the stack (`lock or`) is more efficient and
provides the same sequential consistency. GCC 11 made the same switch (see
https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and
https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved
in this direction as well, but did not touch `fence seq_cst`.

Amusingly, this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/

After another two years, it doesn't look like anyone has complained about
the GCC switch. And there is still `__builtin_ia32_mfence` for folks who
want this precise instruction.
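
For illustration, a minimal C++ sketch of the affected pattern; the codegen
comments reflect the updated tests in this commit and are illustrative:

#include <atomic>

void seq_cst_fence() {
  // Previously lowered to `mfence`; with this change, CPUs with the
  // avoid-mfence tuning instead emit a locked `or` of 0 to a stack slot,
  // e.g. `lock orl $0, (%esp)` on 32-bit targets.
  std::atomic_thread_fence(std::memory_order_seq_cst);
}

void want_mfence_exactly() {
  // Still emits the literal MFENCE instruction (requires SSE2).
  __builtin_ia32_mfence();
}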

(cherry picked from commit 4d502dd)

(cherry picked from commit 707ca0e)
vchuravy authored and giordano committed Nov 5, 2024
1 parent 94c2702 commit af93c5a
Showing 5 changed files with 847 additions and 107 deletions.
52 changes: 36 additions & 16 deletions llvm/lib/Target/X86/X86.td
@@ -748,6 +748,10 @@ def TuningUseGLMDivSqrtCosts
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
"Target has branch hint feature">;

def TuningAvoidMFENCE
: SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
"Avoid MFENCE for fence seq_cst, and instead use lock or">;

//===----------------------------------------------------------------------===//
// X86 CPU Families
// TODO: Remove these - use general tuning features to determine codegen.
@@ -809,7 +813,8 @@ def ProcessorFeatures {
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningSlowIncDec,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
];

list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
@@ -825,7 +830,8 @@ def ProcessorFeatures {
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
];

list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -844,7 +850,8 @@ def ProcessorFeatures {
TuningPOPCNTFalseDeps,
TuningLZCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit
TuningAllowLight256Bit,
TuningAvoidMFENCE
];

list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
@@ -868,15 +875,17 @@ def ProcessorFeatures {
TuningFastGather,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit
TuningAllowLight256Bit,
TuningAvoidMFENCE
];

// Nehalem
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningSlowDivide64,
TuningInsertVZEROUPPER,
TuningNoDomainDelayMov];
TuningNoDomainDelayMov,
TuningAvoidMFENCE];

// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -897,7 +906,8 @@ def ProcessorFeatures {
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningNoDomainDelayMov];
TuningNoDomainDelayMov,
TuningAvoidMFENCE];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);

@@ -963,7 +973,8 @@ def ProcessorFeatures {
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend];
TuningNoDomainDelayBlend,
TuningAvoidMFENCE];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);

@@ -998,7 +1009,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
TuningFastImmVectorShift];
TuningFastImmVectorShift,
TuningAvoidMFENCE];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);

@@ -1041,7 +1053,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
TuningFastImmVectorShift];
TuningFastImmVectorShift,
TuningAvoidMFENCE];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);

@@ -1070,7 +1083,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
TuningFastImmVectorShift];
TuningFastImmVectorShift,
TuningAvoidMFENCE];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);

@@ -1216,7 +1230,8 @@ def ProcessorFeatures {
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
FeatureGFNI];
list<SubtargetFeature> TRMTuning = GLPTuning;
list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPFeatures, TRMAdditionalFeatures);

@@ -1394,7 +1409,8 @@ def ProcessorFeatures {
TuningFastImm16,
TuningSBBDepBreaking,
TuningSlowDivide64,
TuningSlowSHLD];
TuningSlowSHLD,
TuningAvoidMFENCE];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);

@@ -1423,7 +1439,8 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningBranchFusion,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
TuningInsertVZEROUPPER,
TuningAvoidMFENCE];

// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1503,7 +1520,8 @@ def ProcessorFeatures {
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit];
TuningAllowLight256Bit,
TuningAvoidMFENCE];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureRDPRU,
@@ -1691,7 +1709,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
]>;
}
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1710,7 +1729,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
]>;
}

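The tuning flag is additive: a CPU opts in simply by listing TuningAvoidMFENCE
in its tuning features, as every hunk above does. A minimal TableGen sketch,
using the ProcModel pattern from the end of this file (the CPU name and the
feature subset are hypothetical):

def : ProcModel<"examplecpu", SandyBridgeModel,
                // ISA features (illustrative subset):
                [FeatureX86_64, FeatureSSE42],
                // Tuning features:
                [TuningMacroFusion,
                 TuningSlowUAMem16,
                 TuningInsertVZEROUPPER,
                 TuningAvoidMFENCE]>;

The same lowering can also be toggled on any CPU with -mattr=+avoid-mfence,
since "avoid-mfence" is the feature's command-line name.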
19 changes: 4 additions & 15 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30951,21 +30951,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;

if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;

Function *MFence =
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence, {});
// Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
// lowering when SSID == SyncScope::SingleThread, or when avoidMFence() ||
// !hasMFence() holds.
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);

// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(
@@ -31053,7 +31042,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

SDValue Chain = Op.getOperand(0);
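Taken together, the two hunks work as follows: the idempotent-RMW path now
emits a plain IR fence, and the fence lowering consults the new tuning flag.
A hedged before/after sketch in LLVM IR (value names and alignment are
illustrative; compare the or16 test below):

; Before AtomicExpand runs:
;   %res = atomicrmw or ptr %p, i16 0 acquire
;
; After lowerIdempotentRMWIntoFencedLoad:
;   fence seq_cst                ; LowerATOMIC_FENCE then picks `lock or`
;                                ; or `mfence` per subtarget; a singlethread
;                                ; fence becomes a compiler-only barrier
;   %res = load atomic i16, ptr %p acquire, align 2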
86 changes: 30 additions & 56 deletions llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -27,18 +27,16 @@ define i8 @add8(ptr %p) {
;
; X86-SLM-LABEL: add8:
; X86-SLM: # %bb.0:
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLM-NEXT: xorl %eax, %eax
; X86-SLM-NEXT: lock xaddb %al, (%ecx)
; X86-SLM-NEXT: # kill: def $al killed $al killed $eax
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLM-NEXT: lock orl $0, (%esp)
; X86-SLM-NEXT: movzbl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: add8:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-ATOM-NEXT: xorl %eax, %eax
; X86-ATOM-NEXT: lock xaddb %al, (%ecx)
; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-ATOM-NEXT: lock orl $0, (%esp)
; X86-ATOM-NEXT: movzbl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
@@ -62,26 +60,18 @@ define i16 @or16(ptr %p) {
;
; X86-SLM-LABEL: or16:
; X86-SLM: # %bb.0:
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLM-NEXT: movzwl (%ecx), %eax
; X86-SLM-NEXT: .p2align 4, 0x90
; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx)
; X86-SLM-NEXT: jne .LBB1_1
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLM-NEXT: lock orl $0, (%esp)
; X86-SLM-NEXT: movzwl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: or16:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-ATOM-NEXT: movzwl (%ecx), %eax
; X86-ATOM-NEXT: .p2align 4, 0x90
; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx)
; X86-ATOM-NEXT: jne .LBB1_1
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-ATOM-NEXT: lock orl $0, (%esp)
; X86-ATOM-NEXT: movzwl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw or ptr %p, i16 0 acquire
ret i16 %1
@@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) {
;
; X86-SLM-LABEL: xor32:
; X86-SLM: # %bb.0:
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLM-NEXT: movl (%ecx), %eax
; X86-SLM-NEXT: .p2align 4, 0x90
; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
; X86-SLM-NEXT: jne .LBB2_1
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLM-NEXT: lock orl $0, (%esp)
; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: xor32:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-ATOM-NEXT: movl (%ecx), %eax
; X86-ATOM-NEXT: .p2align 4, 0x90
; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
; X86-ATOM-NEXT: jne .LBB2_1
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-ATOM-NEXT: lock orl $0, (%esp)
; X86-ATOM-NEXT: movl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw xor ptr %p, i32 0 release
ret i32 %1
@@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) {
;
; X86-SLM-LABEL: and32:
; X86-SLM: # %bb.0:
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLM-NEXT: movl (%ecx), %eax
; X86-SLM-NEXT: .p2align 4, 0x90
; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
; X86-SLM-NEXT: jne .LBB5_1
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLM-NEXT: lock orl $0, (%esp)
; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: and32:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-ATOM-NEXT: movl (%ecx), %eax
; X86-ATOM-NEXT: .p2align 4, 0x90
; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
; X86-ATOM-NEXT: jne .LBB5_1
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-ATOM-NEXT: lock orl $0, (%esp)
; X86-ATOM-NEXT: movl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw and ptr %p, i32 -1 acq_rel
ret i32 %1
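The check lines above follow the update_llc_test_checks.py format, so they
were presumably regenerated rather than hand-edited:

llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/atomic-idempotent.ll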
[Diffs for the remaining two changed files were not loaded.]
