diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 6cf37836f921d4..67486ae0ebf842 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -754,6 +754,10 @@ def TuningUseGLMDivSqrtCosts def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true", "Target has branch hint feature">; +def TuningAvoidMFENCE + : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true", + "Avoid MFENCE for fence seq_cst, and instead use lock or">; + //===----------------------------------------------------------------------===// // X86 CPU Families // TODO: Remove these - use general tuning features to determine codegen. @@ -815,7 +819,8 @@ def ProcessorFeatures { TuningSlow3OpsLEA, TuningSlowDivide64, TuningSlowIncDec, - TuningInsertVZEROUPPER + TuningInsertVZEROUPPER, + TuningAvoidMFENCE ]; list X86_64V2Features = !listconcat(X86_64V1Features, [ @@ -831,7 +836,8 @@ def ProcessorFeatures { TuningFastSHLDRotate, TuningFast15ByteNOP, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER + TuningInsertVZEROUPPER, + TuningAvoidMFENCE ]; list X86_64V3Features = !listconcat(X86_64V2Features, [ @@ -850,7 +856,8 @@ def ProcessorFeatures { TuningPOPCNTFalseDeps, TuningLZCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit + TuningAllowLight256Bit, + TuningAvoidMFENCE ]; list X86_64V4Features = !listconcat(X86_64V3Features, [ @@ -874,7 +881,8 @@ def ProcessorFeatures { TuningFastGather, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit + TuningAllowLight256Bit, + TuningAvoidMFENCE ]; // Nehalem @@ -882,7 +890,8 @@ def ProcessorFeatures { list NHMTuning = [TuningMacroFusion, TuningSlowDivide64, TuningInsertVZEROUPPER, - TuningNoDomainDelayMov]; + TuningNoDomainDelayMov, + TuningAvoidMFENCE]; // Westmere list WSMAdditionalFeatures = [FeaturePCLMUL]; @@ -903,7 +912,8 @@ def ProcessorFeatures { TuningFast15ByteNOP, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningNoDomainDelayMov]; + TuningNoDomainDelayMov, + TuningAvoidMFENCE]; list SNBFeatures = !listconcat(WSMFeatures, SNBAdditionalFeatures); @@ -969,7 +979,8 @@ def ProcessorFeatures { TuningAllowLight256Bit, TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, - TuningNoDomainDelayBlend]; + TuningNoDomainDelayBlend, + TuningAvoidMFENCE]; list SKLFeatures = !listconcat(BDWFeatures, SKLAdditionalFeatures); @@ -1004,7 +1015,8 @@ def ProcessorFeatures { TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, TuningNoDomainDelayBlend, - TuningFastImmVectorShift]; + TuningFastImmVectorShift, + TuningAvoidMFENCE]; list SKXFeatures = !listconcat(BDWFeatures, SKXAdditionalFeatures); @@ -1047,7 +1059,8 @@ def ProcessorFeatures { TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, TuningNoDomainDelayBlend, - TuningFastImmVectorShift]; + TuningFastImmVectorShift, + TuningAvoidMFENCE]; list CNLFeatures = !listconcat(SKLFeatures, CNLAdditionalFeatures); @@ -1076,7 +1089,8 @@ def ProcessorFeatures { TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, TuningNoDomainDelayBlend, - TuningFastImmVectorShift]; + TuningFastImmVectorShift, + TuningAvoidMFENCE]; list ICLFeatures = !listconcat(CNLFeatures, ICLAdditionalFeatures); @@ -1222,7 +1236,8 @@ def ProcessorFeatures { // Tremont list TRMAdditionalFeatures = [FeatureCLWB, FeatureGFNI]; - list TRMTuning = GLPTuning; + list TRMAdditionalTuning = [TuningAvoidMFENCE]; + list TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning); list TRMFeatures = !listconcat(GLPFeatures, TRMAdditionalFeatures); @@ -1400,7 +1415,8 @@ def ProcessorFeatures { TuningFastImm16, 
TuningSBBDepBreaking, TuningSlowDivide64, - TuningSlowSHLD]; + TuningSlowSHLD, + TuningAvoidMFENCE]; list BtVer2Features = !listconcat(BtVer1Features, BtVer2AdditionalFeatures); @@ -1429,7 +1445,8 @@ def ProcessorFeatures { TuningFastScalarShiftMasks, TuningBranchFusion, TuningSBBDepBreaking, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningAvoidMFENCE]; // PileDriver list BdVer2AdditionalFeatures = [FeatureF16C, @@ -1509,7 +1526,8 @@ def ProcessorFeatures { TuningSlowSHLD, TuningSBBDepBreaking, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningAvoidMFENCE]; list ZN2AdditionalFeatures = [FeatureCLWB, FeatureRDPID, FeatureRDPRU, @@ -1697,7 +1715,8 @@ def : ProcModel; } foreach P = ["penryn", "core_2_duo_sse4_1"] in { @@ -1716,7 +1735,8 @@ def : ProcModel; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 73f7f52846f625..76df32bef62fea 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31422,21 +31422,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. - if (SSID == SyncScope::SingleThread) - // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at - // the IR level, so we must wrap it in an intrinsic. - return nullptr; - - if (!Subtarget.hasMFence()) - // FIXME: it might make sense to use a locked operation here but on a - // different cache-line to prevent cache-line bouncing. In practice it - // is probably a small win, and x86 processors without mfence are rare - // enough that we do not bother. - return nullptr; - Function *MFence = - llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); - Builder.CreateCall(MFence, {}); + // Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct + // lowering both for SSID == SyncScope::SingleThread and for avoidMFence || !hasMFence. + Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID); // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad( @@ -31524,7 +31513,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, // cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && FenceSSID == SyncScope::System) { - if (Subtarget.hasMFence()) + if (!Subtarget.avoidMFence() && Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll index d5c46485068a64..4deedd5726b244 100644 --- a/llvm/test/CodeGen/X86/atomic-idempotent.ll +++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll @@ -27,18 +27,16 @@ define i8 @add8(ptr %p) { ; ; X86-SLM-LABEL: add8: ; X86-SLM: # %bb.0: -; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLM-NEXT: xorl %eax, %eax -; X86-SLM-NEXT: lock xaddb %al, (%ecx) -; X86-SLM-NEXT: # kill: def $al killed $al killed $eax +; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLM-NEXT: lock orl $0, (%esp) +; X86-SLM-NEXT: movzbl (%eax), %eax ; X86-SLM-NEXT: retl ; ; X86-ATOM-LABEL: add8: ; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-ATOM-NEXT: xorl %eax, %eax -; X86-ATOM-NEXT: lock xaddb %al, (%ecx) -; X86-ATOM-NEXT: # kill: def $al killed $al killed $eax +; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ATOM-NEXT: lock orl $0, (%esp) +; X86-ATOM-NEXT: movzbl (%eax), %eax ; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: retl @@ -62,26 +60,18 @@ define i16 @or16(ptr %p) { ; ; X86-SLM-LABEL: or16: ; X86-SLM: # %bb.0: -; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLM-NEXT: movzwl (%ecx), %eax -; X86-SLM-NEXT: .p2align 4, 0x90 -; X86-SLM-NEXT: .LBB1_1: # %atomicrmw.start -; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SLM-NEXT: lock cmpxchgw %ax, (%ecx) -; X86-SLM-NEXT: jne .LBB1_1 -; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end +; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLM-NEXT: lock orl $0, (%esp) +; X86-SLM-NEXT: movzwl (%eax), %eax ; X86-SLM-NEXT: retl ; ; X86-ATOM-LABEL: or16: ; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-ATOM-NEXT: movzwl (%ecx), %eax -; X86-ATOM-NEXT: .p2align 4, 0x90 -; X86-ATOM-NEXT: .LBB1_1: # %atomicrmw.start -; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-ATOM-NEXT: lock cmpxchgw %ax, (%ecx) -; X86-ATOM-NEXT: jne .LBB1_1 -; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end +; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ATOM-NEXT: lock orl $0, (%esp) +; X86-ATOM-NEXT: movzwl (%eax), %eax +; X86-ATOM-NEXT: nop +; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: retl %1 = atomicrmw or ptr %p, i16 0 acquire ret i16 %1 @@ -103,26 +93,18 @@ define i32 @xor32(ptr %p) { ; ; X86-SLM-LABEL: xor32: ; X86-SLM: # %bb.0: -; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLM-NEXT: movl (%ecx), %eax -; X86-SLM-NEXT: .p2align 4, 0x90 -; X86-SLM-NEXT: .LBB2_1: # %atomicrmw.start -; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx) -; X86-SLM-NEXT: jne .LBB2_1 -; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end +; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLM-NEXT: lock orl $0, (%esp) +; X86-SLM-NEXT: movl (%eax), %eax ; X86-SLM-NEXT: retl ; ; X86-ATOM-LABEL: xor32: ; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-ATOM-NEXT: movl (%ecx), %eax -; X86-ATOM-NEXT: .p2align 4, 0x90 -; X86-ATOM-NEXT: .LBB2_1: # %atomicrmw.start -; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx) -; X86-ATOM-NEXT: jne .LBB2_1 -; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end +; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ATOM-NEXT: lock orl $0, 
(%esp) +; X86-ATOM-NEXT: movl (%eax), %eax +; X86-ATOM-NEXT: nop +; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: retl %1 = atomicrmw xor ptr %p, i32 0 release ret i32 %1 @@ -318,26 +300,18 @@ define i32 @and32 (ptr %p) { ; ; X86-SLM-LABEL: and32: ; X86-SLM: # %bb.0: -; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLM-NEXT: movl (%ecx), %eax -; X86-SLM-NEXT: .p2align 4, 0x90 -; X86-SLM-NEXT: .LBB5_1: # %atomicrmw.start -; X86-SLM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-SLM-NEXT: lock cmpxchgl %eax, (%ecx) -; X86-SLM-NEXT: jne .LBB5_1 -; X86-SLM-NEXT: # %bb.2: # %atomicrmw.end +; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLM-NEXT: lock orl $0, (%esp) +; X86-SLM-NEXT: movl (%eax), %eax ; X86-SLM-NEXT: retl ; ; X86-ATOM-LABEL: and32: ; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-ATOM-NEXT: movl (%ecx), %eax -; X86-ATOM-NEXT: .p2align 4, 0x90 -; X86-ATOM-NEXT: .LBB5_1: # %atomicrmw.start -; X86-ATOM-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-ATOM-NEXT: lock cmpxchgl %eax, (%ecx) -; X86-ATOM-NEXT: jne .LBB5_1 -; X86-ATOM-NEXT: # %bb.2: # %atomicrmw.end +; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ATOM-NEXT: lock orl $0, (%esp) +; X86-ATOM-NEXT: movl (%eax), %eax +; X86-ATOM-NEXT: nop +; X86-ATOM-NEXT: nop ; X86-ATOM-NEXT: retl %1 = atomicrmw and ptr %p, i32 -1 acq_rel ret i32 %1 diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index 3fb994cdb751a3..ff101b9037f0ef 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O0 %s ; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O3 %s +; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -mattr=-avoid-mfence | FileCheck --check-prefixes=CHECK,CHECK-MFENCE %s define i8 @load_i8(ptr %ptr) { ; CHECK-O0-LABEL: load_i8: @@ -12,6 +13,11 @@ define i8 @load_i8(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i8: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 1 ret i8 %v } @@ -27,6 +33,11 @@ define void @store_i8(ptr %ptr, i8 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movb %sil, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i8: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movb %sil, (%rdi) +; CHECK-MFENCE-NEXT: retq store atomic i8 %v, ptr %ptr unordered, align 1 ret void } @@ -41,6 +52,11 @@ define i16 @load_i16(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzwl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i16: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 2 ret i16 %v } @@ -57,6 +73,11 @@ define void @store_i16(ptr %ptr, i16 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movw %si, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i16: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movw %si, (%rdi) +; CHECK-MFENCE-NEXT: retq store atomic i16 %v, ptr %ptr unordered, align 2 ret void } @@ -116,6 +137,11 @@ define void @narrow_writeback_or(ptr %ptr) { ; CHECK-O3: # %bb.0: ; 
CHECK-O3-NEXT: orq $7, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: narrow_writeback_or: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: orq $7, (%rdi) +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = or i64 %v, 7 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -138,6 +164,12 @@ define void @narrow_writeback_and(ptr %ptr) { ; CHECK-O3-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00 ; CHECK-O3-NEXT: andq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: narrow_writeback_and: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movl $4294967040, %eax # imm = 0xFFFFFF00 +; CHECK-MFENCE-NEXT: andq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = and i64 %v, 4294967040 ;; 0xFFFF_FF00 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -157,6 +189,11 @@ define void @narrow_writeback_xor(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq $7, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: narrow_writeback_xor: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: xorq $7, (%rdi) +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %ptr unordered, align 8 %v.new = xor i64 %v, 7 store atomic i64 %v.new, ptr %ptr unordered, align 8 @@ -254,6 +291,14 @@ define void @store_i128(ptr %ptr, i128 %v) { ; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i128: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: vmovq %rdx, %xmm0 +; CHECK-MFENCE-NEXT: vmovq %rsi, %xmm1 +; CHECK-MFENCE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-MFENCE-NEXT: vmovdqa %xmm0, (%rdi) +; CHECK-MFENCE-NEXT: retq store atomic i128 %v, ptr %ptr unordered, align 16 ret void } @@ -305,6 +350,28 @@ define i256 @load_i256(ptr %ptr) { ; CHECK-O3-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O3-NEXT: vzeroupper ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i256: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: pushq %rbx +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-MFENCE-NEXT: subq $32, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48 +; CHECK-MFENCE-NEXT: .cfi_offset %rbx, -16 +; CHECK-MFENCE-NEXT: movq %rdi, %rbx +; CHECK-MFENCE-NEXT: movq %rsp, %rdx +; CHECK-MFENCE-NEXT: movl $32, %edi +; CHECK-MFENCE-NEXT: xorl %ecx, %ecx +; CHECK-MFENCE-NEXT: callq __atomic_load@PLT +; CHECK-MFENCE-NEXT: vmovups (%rsp), %ymm0 +; CHECK-MFENCE-NEXT: vmovups %ymm0, (%rbx) +; CHECK-MFENCE-NEXT: movq %rbx, %rax +; CHECK-MFENCE-NEXT: addq $32, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-MFENCE-NEXT: popq %rbx +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8 +; CHECK-MFENCE-NEXT: vzeroupper +; CHECK-MFENCE-NEXT: retq %v = load atomic i256, ptr %ptr unordered, align 16 ret i256 %v } @@ -345,6 +412,24 @@ define void @store_i256(ptr %ptr, i256 %v) { ; CHECK-O3-NEXT: addq $40, %rsp ; CHECK-O3-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: store_i256: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: subq $40, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 48 +; CHECK-MFENCE-NEXT: movq %rdi, %rax +; CHECK-MFENCE-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; CHECK-MFENCE-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; CHECK-MFENCE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-MFENCE-NEXT: movq %rsi, (%rsp) +; CHECK-MFENCE-NEXT: movq %rsp, %rdx +; CHECK-MFENCE-NEXT: movl $32, %edi +; CHECK-MFENCE-NEXT: movq %rax, %rsi +; CHECK-MFENCE-NEXT: xorl %ecx, %ecx +; CHECK-MFENCE-NEXT: callq __atomic_store@PLT +; 
CHECK-MFENCE-NEXT: addq $40, %rsp +; CHECK-MFENCE-NEXT: .cfi_def_cfa_offset 8 +; CHECK-MFENCE-NEXT: retq store atomic i256 %v, ptr %ptr unordered, align 16 ret void } @@ -366,6 +451,14 @@ define void @vec_store(ptr %p0, <2 x i32> %vec) { ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: vec_store: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax +; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx +; CHECK-MFENCE-NEXT: movl %eax, (%rdi) +; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi) +; CHECK-MFENCE-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 %p1 = getelementptr i32, ptr %p0, i64 1 @@ -391,6 +484,14 @@ define void @vec_store_unaligned(ptr %p0, <2 x i32> %vec) { ; CHECK-O3-NEXT: movl %eax, (%rdi) ; CHECK-O3-NEXT: movl %ecx, 4(%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: vec_store_unaligned: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: vmovd %xmm0, %eax +; CHECK-MFENCE-NEXT: vpextrd $1, %xmm0, %ecx +; CHECK-MFENCE-NEXT: movl %eax, (%rdi) +; CHECK-MFENCE-NEXT: movl %ecx, 4(%rdi) +; CHECK-MFENCE-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 %p1 = getelementptr i32, ptr %p0, i64 1 @@ -496,6 +597,12 @@ define i64 @load_fold_add3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: addq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_add3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: addq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = add i64 %v, %v2 @@ -515,6 +622,12 @@ define i64 @load_fold_sub1(ptr %p) { ; CHECK-O3-NEXT: movq (%rdi), %rax ; CHECK-O3-NEXT: addq $-15, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sub1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: addq $-15, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sub i64 %v, 15 ret i64 %ret @@ -556,6 +669,13 @@ define i64 @load_fold_mul1(ptr %p) { ; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax ; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_mul1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = mul i64 %v, 15 ret i64 %ret @@ -584,6 +704,12 @@ define i64 @load_fold_mul3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: imulq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_mul3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: imulq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = mul i64 %v, %v2 @@ -613,6 +739,20 @@ define i64 @load_fold_sdiv1(ptr %p) { ; CHECK-O3-NEXT: addq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sdiv1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rdx, %rcx +; CHECK-MFENCE-NEXT: movq 
%rcx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rcx +; CHECK-MFENCE-NEXT: addq %rax, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sdiv i64 %v, 15 ret i64 %ret @@ -644,6 +784,24 @@ define i64 @load_fold_sdiv2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sdiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB35_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB35_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = sdiv i64 %v, %v2 ret i64 %ret @@ -675,6 +833,25 @@ define i64 @load_fold_sdiv3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_sdiv3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB36_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rcx +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB36_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = sdiv i64 %v, %v2 @@ -699,6 +876,14 @@ define i64 @load_fold_udiv1(ptr %p) { ; CHECK-O3-NEXT: mulxq %rax, %rax, %rax ; CHECK-O3-NEXT: shrq $3, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_udiv1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rdx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax +; CHECK-MFENCE-NEXT: shrq $3, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = udiv i64 %v, 15 ret i64 %ret @@ -730,6 +915,24 @@ define i64 @load_fold_udiv2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_udiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB38_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB38_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = udiv i64 %v, %v2 ret i64 %ret @@ -762,6 
+965,25 @@ define i64 @load_fold_udiv3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_udiv3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB39_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rcx +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB39_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = udiv i64 %v, %v2 @@ -795,6 +1017,23 @@ define i64 @load_fold_srem1(ptr %p) { ; CHECK-O3-NEXT: subq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_srem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rcx, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rdx +; CHECK-MFENCE-NEXT: addq %rax, %rdx +; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: subq %rax, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = srem i64 %v, 15 ret i64 %ret @@ -828,6 +1067,25 @@ define i64 @load_fold_srem2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_srem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB41_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB41_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = srem i64 %v, %v2 ret i64 %ret @@ -861,6 +1119,26 @@ define i64 @load_fold_srem3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_srem3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB42_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rcx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB42_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v 
= load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = srem i64 %v, %v2 @@ -890,6 +1168,18 @@ define i64 @load_fold_urem1(ptr %p) { ; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx ; CHECK-O3-NEXT: subq %rcx, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_urem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: mulxq %rcx, %rcx, %rcx +; CHECK-MFENCE-NEXT: shrq $3, %rcx +; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,4), %rcx +; CHECK-MFENCE-NEXT: leaq (%rcx,%rcx,2), %rcx +; CHECK-MFENCE-NEXT: subq %rcx, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = urem i64 %v, 15 ret i64 %ret @@ -924,6 +1214,25 @@ define i64 @load_fold_urem2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: divl %esi ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_urem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB44_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB44_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = urem i64 %v, %v2 ret i64 %ret @@ -958,6 +1267,26 @@ define i64 @load_fold_urem3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: divl %ecx ; CHECK-O3-NEXT: movl %edx, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_urem3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq (%rsi), %rcx +; CHECK-MFENCE-NEXT: movq %rax, %rdx +; CHECK-MFENCE-NEXT: orq %rcx, %rdx +; CHECK-MFENCE-NEXT: shrq $32, %rdx +; CHECK-MFENCE-NEXT: je .LBB45_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rcx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB45_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %ecx +; CHECK-MFENCE-NEXT: movl %edx, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = urem i64 %v, %v2 @@ -989,6 +1318,11 @@ define i64 @load_fold_shl2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_shl2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = shl i64 %v, %v2 ret i64 %ret @@ -1008,6 +1342,12 @@ define i64 @load_fold_shl3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: shlxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_shl3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: shlxq %rax, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = shl i64 %v, %v2 @@ -1039,6 
+1379,11 @@ define i64 @load_fold_lshr2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: shrxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_lshr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = lshr i64 %v, %v2 ret i64 %ret @@ -1058,6 +1403,12 @@ define i64 @load_fold_lshr3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: shrxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_lshr3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: shrxq %rax, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = lshr i64 %v, %v2 @@ -1089,6 +1440,11 @@ define i64 @load_fold_ashr2(ptr %p, i64 %v2) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_ashr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = ashr i64 %v, %v2 ret i64 %ret @@ -1108,6 +1464,12 @@ define i64 @load_fold_ashr3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: sarxq %rax, (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_ashr3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: sarxq %rax, (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = ashr i64 %v, %v2 @@ -1127,6 +1489,12 @@ define i64 @load_fold_and1(ptr %p) { ; CHECK-O3-NEXT: movq (%rdi), %rax ; CHECK-O3-NEXT: andl $15, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_and1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: andl $15, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = and i64 %v, 15 ret i64 %ret @@ -1155,6 +1523,12 @@ define i64 @load_fold_and3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: andq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_and3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: andq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = and i64 %v, %v2 @@ -1196,6 +1570,12 @@ define i64 @load_fold_or3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: orq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_or3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: orq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = or i64 %v, %v2 @@ -1237,6 +1617,12 @@ define i64 @load_fold_xor3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: movq (%rsi), %rax ; CHECK-O3-NEXT: xorq (%rdi), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_xor3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: xorq (%rdi), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = xor i64 %v, %v2 @@ -1256,6 +1642,12 @@ define i1 @load_fold_icmp1(ptr %p) { ; 
CHECK-O3-NEXT: cmpq $15, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_icmp1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: cmpq $15, (%rdi) +; CHECK-MFENCE-NEXT: sete %al +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = icmp eq i64 %v, 15 ret i1 %ret @@ -1274,6 +1666,12 @@ define i1 @load_fold_icmp2(ptr %p, i64 %v2) { ; CHECK-O3-NEXT: cmpq %rsi, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_icmp2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: cmpq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: sete %al +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %ret = icmp eq i64 %v, %v2 ret i1 %ret @@ -1294,6 +1692,13 @@ define i1 @load_fold_icmp3(ptr %p1, ptr %p2) { ; CHECK-O3-NEXT: cmpq %rax, (%rdi) ; CHECK-O3-NEXT: sete %al ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_fold_icmp3: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rsi), %rax +; CHECK-MFENCE-NEXT: cmpq %rax, (%rdi) +; CHECK-MFENCE-NEXT: sete %al +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p1 unordered, align 8 %v2 = load atomic i64, ptr %p2 unordered, align 8 %ret = icmp eq i64 %v, %v2 @@ -1319,6 +1724,11 @@ define void @rmw_fold_add1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_add1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: addq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = add i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1338,6 +1748,11 @@ define void @rmw_fold_add2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_add2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: addq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = add i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1357,6 +1772,11 @@ define void @rmw_fold_sub1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: addq $-15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sub1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: addq $-15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sub i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1376,6 +1796,11 @@ define void @rmw_fold_sub2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: subq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sub2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: subq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sub i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1411,6 +1836,12 @@ define void @rmw_fold_mul2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: imulq (%rdi), %rsi ; CHECK-O3-NEXT: movq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_mul2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: imulq (%rdi), %rsi +; CHECK-MFENCE-NEXT: movq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = mul i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1447,6 +1878,20 @@ define void @rmw_fold_sdiv1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: addq %rax, %rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sdiv1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: 
movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rcx, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rdx +; CHECK-MFENCE-NEXT: addq %rax, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sdiv i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1482,6 +1927,26 @@ define void @rmw_fold_sdiv2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_sdiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB74_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB74_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = sdiv i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1534,6 +1999,26 @@ define void @rmw_fold_udiv2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $eax killed $eax def $rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_udiv2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB76_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB76_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax def $rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = udiv i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1577,6 +2062,23 @@ define void @rmw_fold_srem1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: subq %rax, %rcx ; CHECK-O3-NEXT: movq %rcx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_srem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rcx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: movq %rcx, %rax +; CHECK-MFENCE-NEXT: imulq %rdx +; CHECK-MFENCE-NEXT: addq %rcx, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, %rax +; CHECK-MFENCE-NEXT: shrq $63, %rax +; CHECK-MFENCE-NEXT: sarq $3, %rdx +; CHECK-MFENCE-NEXT: addq %rax, %rdx +; CHECK-MFENCE-NEXT: leaq (%rdx,%rdx,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: subq %rax, %rcx +; CHECK-MFENCE-NEXT: movq %rcx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = srem i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1612,6 +2114,26 @@ define 
void @rmw_fold_srem2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_srem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB78_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: cqto +; CHECK-MFENCE-NEXT: idivq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB78_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = srem i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1644,6 +2166,18 @@ define void @rmw_fold_urem1(ptr %p, i64 %v) { ; CHECK-O3-NEXT: subq %rax, %rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_urem1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rdx +; CHECK-MFENCE-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889 +; CHECK-MFENCE-NEXT: mulxq %rax, %rax, %rax +; CHECK-MFENCE-NEXT: shrq $3, %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,4), %rax +; CHECK-MFENCE-NEXT: leaq (%rax,%rax,2), %rax +; CHECK-MFENCE-NEXT: subq %rax, %rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = urem i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1680,6 +2214,26 @@ define void @rmw_fold_urem2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-O3-NEXT: movq %rdx, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_urem2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: orq %rsi, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: je .LBB80_1 +; CHECK-MFENCE-NEXT: # %bb.2: +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divq %rsi +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB80_1: +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: xorl %edx, %edx +; CHECK-MFENCE-NEXT: divl %esi +; CHECK-MFENCE-NEXT: # kill: def $edx killed $edx def $rdx +; CHECK-MFENCE-NEXT: movq %rdx, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = urem i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1717,6 +2271,12 @@ define void @rmw_fold_shl2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: shlxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_shl2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shlxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = shl i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1754,6 +2314,12 @@ define void @rmw_fold_lshr2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: shrxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_lshr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: shrxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, 
(%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = lshr i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1791,6 +2357,12 @@ define void @rmw_fold_ashr2(ptr %p, i64 %v) { ; CHECK-O3-NEXT: sarxq %rsi, (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_ashr2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: sarxq %rsi, (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = ashr i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1812,6 +2384,11 @@ define void @rmw_fold_and1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: andq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_and1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: andq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = and i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1831,6 +2408,11 @@ define void @rmw_fold_and2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: andq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_and2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: andq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = and i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1850,6 +2432,11 @@ define void @rmw_fold_or1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_or1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: orq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = or i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1869,6 +2456,11 @@ define void @rmw_fold_or2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: orq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_or2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: orq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = or i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1888,6 +2480,11 @@ define void @rmw_fold_xor1(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq $15, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_xor1: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: xorq $15, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = xor i64 %prev, 15 store atomic i64 %val, ptr %p unordered, align 8 @@ -1907,6 +2504,11 @@ define void @rmw_fold_xor2(ptr %p, i64 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: xorq %rsi, (%rdi) ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: rmw_fold_xor2: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: xorq %rsi, (%rdi) +; CHECK-MFENCE-NEXT: retq %prev = load atomic i64, ptr %p unordered, align 8 %val = xor i64 %prev, %v store atomic i64 %val, ptr %p unordered, align 8 @@ -1943,6 +2545,13 @@ define i32 @fold_trunc_add(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: addl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_trunc_add: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: addl %esi, %eax +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = 
add i32 %trunc, %v2 @@ -1964,6 +2573,13 @@ define i32 @fold_trunc_and(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: andl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_trunc_and: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: andl %esi, %eax +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = and i32 %trunc, %v2 @@ -1985,6 +2601,13 @@ define i32 @fold_trunc_or(ptr %p, i32 %v2) { ; CHECK-O3-NEXT: orl %esi, %eax ; CHECK-O3-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_trunc_or: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: orl %esi, %eax +; CHECK-MFENCE-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %trunc = trunc i64 %v to i32 %ret = or i32 %trunc, %v2 @@ -2012,6 +2635,15 @@ define i32 @split_load(ptr %p) { ; CHECK-O3-NEXT: orl %eax, %ecx ; CHECK-O3-NEXT: movzbl %cl, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: split_load: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: movq %rax, %rcx +; CHECK-MFENCE-NEXT: shrq $32, %rcx +; CHECK-MFENCE-NEXT: orl %eax, %ecx +; CHECK-MFENCE-NEXT: movzbl %cl, %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 %b1 = trunc i64 %v to i8 %v.shift = lshr i64 %v, 32 @@ -2093,12 +2725,26 @@ define void @dead_store(ptr %p, i64 %v) { ;; isn't violated. define i64 @nofold_fence(ptr %p) { -; CHECK-LABEL: nofold_fence: -; CHECK: # %bb.0: -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: mfence -; CHECK-NEXT: addq $15, %rax -; CHECK-NEXT: retq +; CHECK-O0-LABEL: nofold_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: addq $15, %rax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: nofold_fence: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O3-NEXT: addq $15, %rax +; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: nofold_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: addq $15, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 fence seq_cst %ret = add i64 %v, 15 @@ -2148,6 +2794,12 @@ define i64 @fold_constant(i64 %arg) { ; CHECK-O3-NEXT: movq %rdi, %rax ; CHECK-O3-NEXT: addq Constant(%rip), %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_constant: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq %rdi, %rax +; CHECK-MFENCE-NEXT: addq Constant(%rip), %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr @Constant unordered, align 8 %ret = add i64 %v, %arg ret i64 %ret @@ -2167,12 +2819,26 @@ define i64 @fold_constant_clobber(ptr %p, i64 %arg) { } define i64 @fold_constant_fence(i64 %arg) { -; CHECK-LABEL: fold_constant_fence: -; CHECK: # %bb.0: -; CHECK-NEXT: movq Constant(%rip), %rax -; CHECK-NEXT: mfence -; CHECK-NEXT: addq %rdi, %rax -; CHECK-NEXT: retq +; CHECK-O0-LABEL: fold_constant_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq Constant(%rip), %rax +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: addq %rdi, %rax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: fold_constant_fence: +; CHECK-O3: # %bb.0: +; 
CHECK-O3-NEXT: movq Constant(%rip), %rax +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O3-NEXT: addq %rdi, %rax +; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_constant_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq Constant(%rip), %rax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: addq %rdi, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr @Constant unordered, align 8 fence seq_cst %ret = add i64 %v, %arg @@ -2194,12 +2860,26 @@ define i64 @fold_invariant_clobber(ptr dereferenceable(8) %p, i64 %arg) { define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) { -; CHECK-LABEL: fold_invariant_fence: -; CHECK: # %bb.0: -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: mfence -; CHECK-NEXT: addq %rsi, %rax -; CHECK-NEXT: retq +; CHECK-O0-LABEL: fold_invariant_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: addq %rsi, %rax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: fold_invariant_fence: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; CHECK-O3-NEXT: addq %rsi, %rax +; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_invariant_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movq (%rdi), %rax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: addq %rsi, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{} fence seq_cst %ret = add i64 %v, %arg @@ -2222,6 +2902,12 @@ define i16 @load_i8_anyext_i16(ptr %ptr) { ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i8_anyext_i16: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax +; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 2 %vec = insertelement <2 x i8> undef, i8 %v, i32 0 %res = bitcast <2 x i8> %vec to i16 @@ -2239,6 +2925,11 @@ define i32 @load_i8_anyext_i32(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzbl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i8_anyext_i32: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i8, ptr %ptr unordered, align 4 %vec = insertelement <4 x i8> undef, i8 %v, i32 0 %res = bitcast <4 x i8> %vec to i32 @@ -2257,6 +2948,11 @@ define i32 @load_i16_anyext_i32(ptr %ptr) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movzwl (%rdi), %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i16_anyext_i32: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax +; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 4 %vec = insertelement <2 x i16> undef, i16 %v, i64 0 %res = bitcast <2 x i16> %vec to i32 @@ -2279,6 +2975,13 @@ define i64 @load_i16_anyext_i64(ptr %ptr) { ; CHECK-O3-NEXT: vmovd %eax, %xmm0 ; CHECK-O3-NEXT: vmovq %xmm0, %rax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_i16_anyext_i64: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzwl (%rdi), %eax +; CHECK-MFENCE-NEXT: vmovd %eax, %xmm0 +; CHECK-MFENCE-NEXT: vmovq %xmm0, %rax +; CHECK-MFENCE-NEXT: retq %v = load atomic i16, ptr %ptr unordered, align 8 %vec = insertelement <4 x i16> undef, i16 %v, i64 0 %res = bitcast <4 x i16> %vec to i64 @@ -2307,6 +3010,15 @@ define i16 @load_combine(ptr %p) { ; CHECK-O3-NEXT: orl %ecx, %eax ; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax ; 
CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: load_combine: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movzbl (%rdi), %ecx +; CHECK-MFENCE-NEXT: movzbl 1(%rdi), %eax +; CHECK-MFENCE-NEXT: shll $8, %eax +; CHECK-MFENCE-NEXT: orl %ecx, %eax +; CHECK-MFENCE-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-MFENCE-NEXT: retq %v1 = load atomic i8, ptr %p unordered, align 2 %p2 = getelementptr i8, ptr %p, i64 1 %v2 = load atomic i8, ptr %p2 unordered, align 1 @@ -2321,7 +3033,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O0-LABEL: fold_cmp_over_fence: ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movl (%rdi), %eax -; CHECK-O0-NEXT: mfence +; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: cmpl %eax, %esi ; CHECK-O0-NEXT: jne .LBB116_2 ; CHECK-O0-NEXT: # %bb.1: # %taken @@ -2335,7 +3047,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O3-LABEL: fold_cmp_over_fence: ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: movl (%rdi), %eax -; CHECK-O3-NEXT: mfence +; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) ; CHECK-O3-NEXT: cmpl %eax, %esi ; CHECK-O3-NEXT: jne .LBB116_2 ; CHECK-O3-NEXT: # %bb.1: # %taken @@ -2344,6 +3056,19 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) { ; CHECK-O3-NEXT: .LBB116_2: # %untaken ; CHECK-O3-NEXT: xorl %eax, %eax ; CHECK-O3-NEXT: retq +; +; CHECK-MFENCE-LABEL: fold_cmp_over_fence: +; CHECK-MFENCE: # %bb.0: +; CHECK-MFENCE-NEXT: movl (%rdi), %eax +; CHECK-MFENCE-NEXT: mfence +; CHECK-MFENCE-NEXT: cmpl %eax, %esi +; CHECK-MFENCE-NEXT: jne .LBB116_2 +; CHECK-MFENCE-NEXT: # %bb.1: # %taken +; CHECK-MFENCE-NEXT: movb $1, %al +; CHECK-MFENCE-NEXT: retq +; CHECK-MFENCE-NEXT: .LBB116_2: # %untaken +; CHECK-MFENCE-NEXT: xorl %eax, %eax +; CHECK-MFENCE-NEXT: retq %v2 = load atomic i32, ptr %p unordered, align 4 fence seq_cst %cmp = icmp eq i32 %v1, %v2 diff --git a/llvm/test/CodeGen/X86/mfence.ll b/llvm/test/CodeGen/X86/mfence.ll index f34657b3f240c9..a7b4790bf801ee 100644 --- a/llvm/test/CodeGen/X86/mfence.ll +++ b/llvm/test/CodeGen/X86/mfence.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+avoid-mfence | FileCheck %s --check-prefix=X64-NO-MFENCE ; It doesn't matter if an x86-64 target has specified "no-sse2"; we still can use mfence. 
@@ -14,6 +15,11 @@ define void @test() { ; X64: # %bb.0: ; X64-NEXT: mfence ; X64-NEXT: retq +; +; X64-NO-MFENCE-LABEL: test: +; X64-NO-MFENCE: # %bb.0: +; X64-NO-MFENCE-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; X64-NO-MFENCE-NEXT: retq fence seq_cst ret void } @@ -31,7 +37,33 @@ define i32 @fence(ptr %ptr) { ; X64-NEXT: mfence ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: retq +; +; X64-NO-MFENCE-LABEL: fence: +; X64-NO-MFENCE: # %bb.0: +; X64-NO-MFENCE-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; X64-NO-MFENCE-NEXT: movl (%rdi), %eax +; X64-NO-MFENCE-NEXT: retq %atomic = atomicrmw add ptr %ptr, i32 0 seq_cst ret i32 %atomic } +define void @mfence() nounwind { +; X32-LABEL: mfence: +; X32: # %bb.0: +; X32-NEXT: mfence +; X32-NEXT: retl +; +; X64-LABEL: mfence: +; X64: # %bb.0: +; X64-NEXT: mfence +; X64-NEXT: retq +; +; X64-NO-MFENCE-LABEL: mfence: +; X64-NO-MFENCE: # %bb.0: +; X64-NO-MFENCE-NEXT: mfence +; X64-NO-MFENCE-NEXT: retq + call void @llvm.x86.sse2.mfence() + ret void +} +declare void @llvm.x86.sse2.mfence() nounwind readnone +
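
For reference, a minimal standalone reproducer (not part of the patch; the file name and the exact stack-slot offset are illustrative) showing how the new tuning flag changes the lowering of a bare seq_cst fence on a CPU that enables it by default:

; fence-avoid-mfence.ll -- hypothetical example, assuming llc is built with this patch.
;   llc -mtriple=x86_64-linux-generic -mcpu=skylake fence-avoid-mfence.ll -o -
;     emits "lock orl $0, -<offset>(%rsp)", since TuningAvoidMFENCE is on by default for Skylake
;   llc -mtriple=x86_64-linux-generic -mcpu=skylake -mattr=-avoid-mfence fence-avoid-mfence.ll -o -
;     emits "mfence", matching the CHECK-MFENCE expectations above
define void @seq_cst_fence() {
  fence seq_cst
  ret void
}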