[X86] Prefer lock or over mfence (llvm#106555)
Originally opened as https://reviews.llvm.org/D129947

LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On
modern CPUs a locked `or` to the stack (`lock or`) is more efficient and
provides the same sequential consistency. GCC 11 made the same switch (see
https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and
https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved
in this direction as well, but did not touch `fence seq_cst`.

Amusingly, this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/

After another two years, it doesn't look like anyone has complained about
the GCC switch. And there is still `__builtin_ia32_mfence` for folks who
want this precise instruction.
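
For illustration, a minimal C++ sketch of the affected pattern; the codegen
comments reflect the updated tests in this commit and are illustrative:

#include <atomic>

void seq_cst_fence() {
  // Previously lowered to `mfence`; with this change, CPUs with the
  // avoid-mfence tuning instead emit a locked `or` of 0 to a stack slot,
  // e.g. `lock orl $0, (%esp)` on 32-bit targets.
  std::atomic_thread_fence(std::memory_order_seq_cst);
}

void want_mfence_exactly() {
  // Still emits the literal MFENCE instruction (requires SSE2).
  __builtin_ia32_mfence();
}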

(cherry picked from commit 4d502dd)

(cherry picked from commit 707ca0e)
vchuravy authored and giordano committed Nov 5, 2024
1 parent 94c2702 commit af93c5a
Showing 5 changed files with 847 additions and 107 deletions.
52 changes: 36 additions & 16 deletions llvm/lib/Target/X86/X86.td
@@ -748,6 +748,10 @@ def TuningUseGLMDivSqrtCosts
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
"Target has branch hint feature">;

def TuningAvoidMFENCE
: SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
"Avoid MFENCE for fence seq_cst, and instead use lock or">;

//===----------------------------------------------------------------------===//
// X86 CPU Families
// TODO: Remove these - use general tuning features to determine codegen.
@@ -809,7 +813,8 @@ def ProcessorFeatures {
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningSlowIncDec,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
];

list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
@@ -825,7 +830,8 @@ def ProcessorFeatures {
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
];

list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -844,7 +850,8 @@ def ProcessorFeatures {
TuningPOPCNTFalseDeps,
TuningLZCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit
TuningAllowLight256Bit,
TuningAvoidMFENCE
];

list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
@@ -868,15 +875,17 @@ def ProcessorFeatures {
TuningFastGather,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit
TuningAllowLight256Bit,
TuningAvoidMFENCE
];

// Nehalem
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningSlowDivide64,
TuningInsertVZEROUPPER,
TuningNoDomainDelayMov];
TuningNoDomainDelayMov,
TuningAvoidMFENCE];

// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -897,7 +906,8 @@ def ProcessorFeatures {
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningNoDomainDelayMov];
TuningNoDomainDelayMov,
TuningAvoidMFENCE];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);

@@ -963,7 +973,8 @@ def ProcessorFeatures {
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend];
TuningNoDomainDelayBlend,
TuningAvoidMFENCE];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);

@@ -998,7 +1009,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
TuningFastImmVectorShift];
TuningFastImmVectorShift,
TuningAvoidMFENCE];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);

@@ -1041,7 +1053,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
TuningFastImmVectorShift];
TuningFastImmVectorShift,
TuningAvoidMFENCE];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);

@@ -1070,7 +1083,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
TuningFastImmVectorShift];
TuningFastImmVectorShift,
TuningAvoidMFENCE];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);

@@ -1216,7 +1230,8 @@ def ProcessorFeatures {
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
FeatureGFNI];
list<SubtargetFeature> TRMTuning = GLPTuning;
list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPFeatures, TRMAdditionalFeatures);

@@ -1394,7 +1409,8 @@ def ProcessorFeatures {
TuningFastImm16,
TuningSBBDepBreaking,
TuningSlowDivide64,
TuningSlowSHLD];
TuningSlowSHLD,
TuningAvoidMFENCE];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);

@@ -1423,7 +1439,8 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningBranchFusion,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
TuningInsertVZEROUPPER,
TuningAvoidMFENCE];

// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1503,7 +1520,8 @@ def ProcessorFeatures {
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit];
TuningAllowLight256Bit,
TuningAvoidMFENCE];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureRDPRU,
@@ -1691,7 +1709,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
]>;
}
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1710,7 +1729,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
]>;
}

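The tuning flag is additive: a CPU opts in simply by listing TuningAvoidMFENCE
in its tuning features, as every hunk above does. A minimal TableGen sketch,
using the ProcModel pattern from the end of this file (the CPU name and the
feature subset are hypothetical):

def : ProcModel<"examplecpu", SandyBridgeModel,
                // ISA features (illustrative subset):
                [FeatureX86_64, FeatureSSE42],
                // Tuning features:
                [TuningMacroFusion,
                 TuningSlowUAMem16,
                 TuningInsertVZEROUPPER,
                 TuningAvoidMFENCE]>;

The same lowering can also be toggled on any CPU with -mattr=+avoid-mfence,
since "avoid-mfence" is the feature's command-line name.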
19 changes: 4 additions & 15 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30951,21 +30951,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;

if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;

Function *MFence =
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence, {});
// Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
// lowering when SSID == SyncScope::SingleThread, or when avoidMFence() ||
// !hasMFence() holds.
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);

// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(
@@ -31053,7 +31042,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

SDValue Chain = Op.getOperand(0);
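Taken together, the two hunks work as follows: the idempotent-RMW path now
emits a plain IR fence, and the fence lowering consults the new tuning flag.
A hedged before/after sketch in LLVM IR (value names and alignment are
illustrative; compare the or16 test below):

; Before AtomicExpand runs:
;   %res = atomicrmw or ptr %p, i16 0 acquire
;
; After lowerIdempotentRMWIntoFencedLoad:
;   fence seq_cst                ; LowerATOMIC_FENCE then picks `lock or`
;                                ; or `mfence` per subtarget; a singlethread
;                                ; fence becomes a compiler-only barrier
;   %res = load atomic i16, ptr %p acquire, align 2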
86 changes: 30 additions & 56 deletions llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -27,18 +27,16 @@ define i8 @add8(ptr %p) {
;
; X86-SLM-LABEL: add8:
; X86-SLM: # %bb.0:
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLM-NEXT: xorl %eax, %eax
; X86-SLM-NEXT: lock xaddb %al, (%ecx)
; X86-SLM-NEXT: # kill: def $al killed $al killed $eax
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLM-NEXT: lock orl $0, (%esp)
; X86-SLM-NEXT: movzbl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: add8:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-ATOM-NEXT: xorl %eax, %eax
; X86-ATOM-NEXT: lock xaddb %al, (%ecx)
; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-ATOM-NEXT: lock orl $0, (%esp)
; X86-ATOM-NEXT: movzbl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
@@ -62,26 +60,18 @@ define i16 @or16(ptr %p) {
;
; X86-SLM-LABEL: or16:
; X86-SLM: # %bb.0:
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLM-NEXT: movzwl (%ecx), %eax
; X86-SLM-NEXT: .p2align 4, 0x90
; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx)
; X86-SLM-NEXT: jne .LBB1_1
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLM-NEXT: lock orl $0, (%esp)
; X86-SLM-NEXT: movzwl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: or16:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-ATOM-NEXT: movzwl (%ecx), %eax
; X86-ATOM-NEXT: .p2align 4, 0x90
; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx)
; X86-ATOM-NEXT: jne .LBB1_1
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-ATOM-NEXT: lock orl $0, (%esp)
; X86-ATOM-NEXT: movzwl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw or ptr %p, i16 0 acquire
ret i16 %1
@@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) {
;
; X86-SLM-LABEL: xor32:
; X86-SLM: # %bb.0:
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLM-NEXT: movl (%ecx), %eax
; X86-SLM-NEXT: .p2align 4, 0x90
; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
; X86-SLM-NEXT: jne .LBB2_1
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLM-NEXT: lock orl $0, (%esp)
; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: xor32:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-ATOM-NEXT: movl (%ecx), %eax
; X86-ATOM-NEXT: .p2align 4, 0x90
; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
; X86-ATOM-NEXT: jne .LBB2_1
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-ATOM-NEXT: lock orl $0, (%esp)
; X86-ATOM-NEXT: movl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw xor ptr %p, i32 0 release
ret i32 %1
@@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) {
;
; X86-SLM-LABEL: and32:
; X86-SLM: # %bb.0:
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLM-NEXT: movl (%ecx), %eax
; X86-SLM-NEXT: .p2align 4, 0x90
; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start
; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx)
; X86-SLM-NEXT: jne .LBB5_1
; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end
; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLM-NEXT: lock orl $0, (%esp)
; X86-SLM-NEXT: movl (%eax), %eax
; X86-SLM-NEXT: retl
;
; X86-ATOM-LABEL: and32:
; X86-ATOM: # %bb.0:
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-ATOM-NEXT: movl (%ecx), %eax
; X86-ATOM-NEXT: .p2align 4, 0x90
; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start
; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1
; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx)
; X86-ATOM-NEXT: jne .LBB5_1
; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end
; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-ATOM-NEXT: lock orl $0, (%esp)
; X86-ATOM-NEXT: movl (%eax), %eax
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: nop
; X86-ATOM-NEXT: retl
%1 = atomicrmw and ptr %p, i32 -1 acq_rel
ret i32 %1
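The check lines above follow the update_llc_test_checks.py format, so they
were presumably regenerated rather than hand-edited:

llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/atomic-idempotent.ll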
[Diffs for the remaining two changed files were not loaded.]
