JuliaLang · Zentrik · Jul 30, 2024 · Oct 16, 2024 · Aug 29, 2024 · Nov 18, 2024
diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h
@@ -39,12 +39,8 @@ enum EdgeKind_i386 : Edge::Kind {
   /// Represents a data/control flow instruction using PC-relative addressing
   /// to a target.
   ///
-  /// The fixup expression for this kind includes an implicit offset to account
-  /// for the PC (unlike the Delta edges) so that a PCRel32 with a target
-  /// T and addend zero is a call/branch to the start (offset zero) of T.
-  ///
   /// Fixup expression:
-  ///   Fixup <- Target - (Fixup + 4) + Addend : int32
+  ///   Fixup <- Target - Fixup + Addend : int32
   ///
   /// Errors:
   ///   - The result of the fixup expression must fit into an int32, otherwise
@@ -68,12 +64,8 @@ enum EdgeKind_i386 : Edge::Kind {
   /// Represents a data/control flow instruction using PC-relative addressing
   /// to a target.
   ///
-  /// The fixup expression for this kind includes an implicit offset to account
-  /// for the PC (unlike the Delta edges) so that a PCRel16 with a target
-  /// T and addend zero is a call/branch to the start (offset zero) of T.
-  ///
   /// Fixup expression:
-  ///   Fixup <- Target - (Fixup + 4) + Addend : int16
+  ///   Fixup <- Target - Fixup + Addend : int16
   ///
   /// Errors:
   ///   - The result of the fixup expression must fit into an int16, otherwise
@@ -86,7 +78,7 @@ enum EdgeKind_i386 : Edge::Kind {
   /// Delta from the fixup to the target.
   ///
   /// Fixup expression:
-  ///   Fixup <- Target - Fixup + Addend : int64
+  ///   Fixup <- Target - Fixup + Addend : int32
   ///
   /// Errors:
   ///   - The result of the fixup expression must fit into an int32, otherwise
@@ -130,12 +122,8 @@ enum EdgeKind_i386 : Edge::Kind {
   /// Represents a PC-relative call or branch to a target. This can be used to
   /// identify, record, and/or patch call sites.
   ///
-  /// The fixup expression for this kind includes an implicit offset to account
-  /// for the PC (unlike the Delta edges) so that a Branch32PCRel with a target
-  /// T and addend zero is a call/branch to the start (offset zero) of T.
-  ///
   /// Fixup expression:
-  ///   Fixup <- Target - (Fixup + 4) + Addend : int32
+  ///   Fixup <- Target - Fixup + Addend : int32
   ///
   /// Errors:
   ///   - The result of the fixup expression must fit into an int32, otherwise
@@ -164,7 +152,7 @@ enum EdgeKind_i386 : Edge::Kind {
   /// target may be recorded to allow manipulation at runtime.
   ///
   /// Fixup expression:
-  ///   Fixup <- Target - Fixup + Addend - 4 : int32
+  ///   Fixup <- Target - Fixup + Addend : int32
   ///
   /// Errors:
   ///   - The result of the fixup expression must fit into an int32, otherwise
@@ -180,7 +168,7 @@ enum EdgeKind_i386 : Edge::Kind {
   /// is within range of the fixup location.
   ///
   /// Fixup expression:
-  ///   Fixup <- Target - Fixup + Addend - 4: int32
+  ///   Fixup <- Target - Fixup + Addend : int32
   ///
   /// Errors:
   ///   - The result of the fixup expression must fit into an int32, otherwise
@@ -215,8 +203,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E,
   }
 
   case i386::PCRel32: {
-    int32_t Value =
-        E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend();
+    int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend();
     *(little32_t *)FixupPtr = Value;
     break;
   }
@@ -231,8 +218,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E,
   }
 
   case i386::PCRel16: {
-    int32_t Value =
-        E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend();
+    int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend();
     if (LLVM_LIKELY(isInt<16>(Value)))
       *(little16_t *)FixupPtr = Value;
     else
@@ -257,8 +243,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E,
   case i386::BranchPCRel32:
   case i386::BranchPCRel32ToPtrJumpStub:
   case i386::BranchPCRel32ToPtrJumpStubBypassable: {
-    int32_t Value =
-        E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend();
+    int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend();
     *(little32_t *)FixupPtr = Value;
     break;
   }

diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp
@@ -186,15 +186,29 @@ class ELFLinkGraphBuilder_i386 : public ELFLinkGraphBuilder<ELFT> {
     int64_t Addend = 0;
 
     switch (*Kind) {
-    case i386::EdgeKind_i386::Delta32: {
+    case i386::EdgeKind_i386::None:
+      break;
+    case i386::EdgeKind_i386::Pointer32:
+    case i386::EdgeKind_i386::PCRel32:
+    case i386::EdgeKind_i386::RequestGOTAndTransformToDelta32FromGOT:
+    case i386::EdgeKind_i386::Delta32:
+    case i386::EdgeKind_i386::Delta32FromGOT:
+    case i386::EdgeKind_i386::BranchPCRel32:
+    case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStub:
+    case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStubBypassable: {
       const char *FixupContent = BlockToFix.getContent().data() +
                                  (FixupAddress - BlockToFix.getAddress());
-      Addend = *(const support::ulittle32_t *)FixupContent;
+      Addend = *(const support::little32_t *)FixupContent;
       break;
     }
-    default:
+    case i386::EdgeKind_i386::Pointer16:
+    case i386::EdgeKind_i386::PCRel16: {
+      const char *FixupContent = BlockToFix.getContent().data() +
+                                 (FixupAddress - BlockToFix.getAddress());
+      Addend = *(const support::little16_t *)FixupContent;
       break;
     }
+    }
 
     Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress();
     Edge GE(*Kind, Offset, *GraphSymbol, Addend);

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
@@ -748,6 +748,10 @@ def TuningUseGLMDivSqrtCosts
 def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
                                         "Target has branch hint feature">;
 
+def TuningAvoidMFENCE
+   : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
+        "Avoid MFENCE for fence seq_cst, and instead use lock or">;
+
 //===----------------------------------------------------------------------===//
 // X86 CPU Families
 // TODO: Remove these - use general tuning features to determine codegen.
@@ -809,7 +813,8 @@ def ProcessorFeatures {
     TuningSlow3OpsLEA,
     TuningSlowDivide64,
     TuningSlowIncDec,
-    TuningInsertVZEROUPPER
+    TuningInsertVZEROUPPER,
+    TuningAvoidMFENCE
   ];
 
   list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
@@ -825,7 +830,8 @@ def ProcessorFeatures {
     TuningFastSHLDRotate,
     TuningFast15ByteNOP,
     TuningPOPCNTFalseDeps,
-    TuningInsertVZEROUPPER
+    TuningInsertVZEROUPPER,
+    TuningAvoidMFENCE
   ];
 
   list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
@@ -844,7 +850,8 @@ def ProcessorFeatures {
     TuningPOPCNTFalseDeps,
     TuningLZCNTFalseDeps,
     TuningInsertVZEROUPPER,
-    TuningAllowLight256Bit
+    TuningAllowLight256Bit,
+    TuningAvoidMFENCE
   ];
 
   list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
@@ -868,15 +875,17 @@ def ProcessorFeatures {
     TuningFastGather,
     TuningPOPCNTFalseDeps,
     TuningInsertVZEROUPPER,
-    TuningAllowLight256Bit
+    TuningAllowLight256Bit,
+    TuningAvoidMFENCE
   ];
 
   // Nehalem
   list<SubtargetFeature> NHMFeatures = X86_64V2Features;
   list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
                                       TuningSlowDivide64,
                                       TuningInsertVZEROUPPER,
-                                      TuningNoDomainDelayMov];
+                                      TuningNoDomainDelayMov,
+                                      TuningAvoidMFENCE];
 
   // Westmere
   list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -897,7 +906,8 @@ def ProcessorFeatures {
                                       TuningFast15ByteNOP,
                                       TuningPOPCNTFalseDeps,
                                       TuningInsertVZEROUPPER,
-                                      TuningNoDomainDelayMov];
+                                      TuningNoDomainDelayMov,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> SNBFeatures =
     !listconcat(WSMFeatures, SNBAdditionalFeatures);
 
@@ -963,7 +973,8 @@ def ProcessorFeatures {
                                       TuningAllowLight256Bit,
                                       TuningNoDomainDelayMov,
                                       TuningNoDomainDelayShuffle,
-                                      TuningNoDomainDelayBlend];
+                                      TuningNoDomainDelayBlend,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> SKLFeatures =
     !listconcat(BDWFeatures, SKLAdditionalFeatures);
 
@@ -998,7 +1009,8 @@ def ProcessorFeatures {
                                       TuningNoDomainDelayMov,
                                       TuningNoDomainDelayShuffle,
                                       TuningNoDomainDelayBlend,
-                                      TuningFastImmVectorShift];
+                                      TuningFastImmVectorShift,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> SKXFeatures =
     !listconcat(BDWFeatures, SKXAdditionalFeatures);
 
@@ -1041,7 +1053,8 @@ def ProcessorFeatures {
                                       TuningNoDomainDelayMov,
                                       TuningNoDomainDelayShuffle,
                                       TuningNoDomainDelayBlend,
-                                      TuningFastImmVectorShift];
+                                      TuningFastImmVectorShift,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> CNLFeatures =
     !listconcat(SKLFeatures, CNLAdditionalFeatures);
 
@@ -1070,7 +1083,8 @@ def ProcessorFeatures {
                                       TuningNoDomainDelayMov,
                                       TuningNoDomainDelayShuffle,
                                       TuningNoDomainDelayBlend,
-                                      TuningFastImmVectorShift];
+                                      TuningFastImmVectorShift,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> ICLFeatures =
     !listconcat(CNLFeatures, ICLAdditionalFeatures);
 
@@ -1216,7 +1230,8 @@ def ProcessorFeatures {
   // Tremont
   list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
                                                   FeatureGFNI];
-  list<SubtargetFeature> TRMTuning = GLPTuning;
+  list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
+  list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
   list<SubtargetFeature> TRMFeatures =
     !listconcat(GLPFeatures, TRMAdditionalFeatures);
 
@@ -1394,7 +1409,8 @@ def ProcessorFeatures {
                                          TuningFastImm16,
                                          TuningSBBDepBreaking,
                                          TuningSlowDivide64,
-                                         TuningSlowSHLD];
+                                         TuningSlowSHLD,
+                                         TuningAvoidMFENCE];
   list<SubtargetFeature> BtVer2Features =
     !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
 
@@ -1423,7 +1439,8 @@ def ProcessorFeatures {
                                          TuningFastScalarShiftMasks,
                                          TuningBranchFusion,
                                          TuningSBBDepBreaking,
-                                         TuningInsertVZEROUPPER];
+                                         TuningInsertVZEROUPPER,
+                                         TuningAvoidMFENCE];
 
   // PileDriver
   list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1503,7 +1520,8 @@ def ProcessorFeatures {
                                      TuningSlowSHLD,
                                      TuningSBBDepBreaking,
                                      TuningInsertVZEROUPPER,
-                                     TuningAllowLight256Bit];
+                                     TuningAllowLight256Bit,
+                                     TuningAvoidMFENCE];
   list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
                                                   FeatureRDPID,
                                                   FeatureRDPRU,
@@ -1691,7 +1709,8 @@ def : ProcModel<P, SandyBridgeModel, [
 [
   TuningMacroFusion,
   TuningSlowUAMem16,
-  TuningInsertVZEROUPPER
+  TuningInsertVZEROUPPER,
+  TuningAvoidMFENCE
 ]>;
 }
 foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1710,7 +1729,8 @@ def : ProcModel<P, SandyBridgeModel, [
 [
   TuningMacroFusion,
   TuningSlowUAMem16,
-  TuningInsertVZEROUPPER
+  TuningInsertVZEROUPPER,
+  TuningAvoidMFENCE
 ]>;
 }
 

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30951,21 +30951,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   // otherwise, we might be able to be more aggressive on relaxed idempotent
   // rmw. In practice, they do not look useful, so we don't try to be
   // especially clever.
-  if (SSID == SyncScope::SingleThread)
-    // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
-    // the IR level, so we must wrap it in an intrinsic.
-    return nullptr;
-
-  if (!Subtarget.hasMFence())
-    // FIXME: it might make sense to use a locked operation here but on a
-    // different cache-line to prevent cache-line bouncing. In practice it
-    // is probably a small win, and x86 processors without mfence are rare
-    // enough that we do not bother.
-    return nullptr;
 
-  Function *MFence =
-      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
-  Builder.CreateCall(MFence, {});
+  // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
+  // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
+  Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
 
   // Finally we can emit the atomic load.
   LoadInst *Loaded = Builder.CreateAlignedLoad(
@@ -31053,7 +31042,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
   // cross-thread fence.
   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
       FenceSSID == SyncScope::System) {
-    if (Subtarget.hasMFence())
+    if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 
     SDValue Chain = Op.getOperand(0);

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13889,11 +13889,16 @@ Value *BoUpSLP::vectorizeTree(
         }
         if (!Ex) {
           // "Reuse" the existing extract to improve final codegen.
-          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
+              ES && isa<Instruction>(Vec)) {
             Value *V = ES->getVectorOperand();
             if (const TreeEntry *ETE = getTreeEntry(V))
               V = ETE->VectorizedValue;
-            Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
+            if (auto *IV = dyn_cast<Instruction>(V);
+                !IV || IV == Vec || (IV->getParent() == cast<Instruction>(Vec)->getParent() && IV->comesBefore(cast<Instruction>(Vec))))
+              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
+            else
+              Ex = Builder.CreateExtractElement(Vec, Lane);
           } else if (ReplaceGEP) {
             // Leave the GEPs as is, they are free in most cases and better to
             // keep them as GEPs.