Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix 7f17639803fdc73f9ab9bd60a315596ea8881af9 #38

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 9 additions & 24 deletions llvm/include/llvm/ExecutionEngine/JITLink/i386.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,8 @@ enum EdgeKind_i386 : Edge::Kind {
/// Represents a data/control flow instruction using PC-relative addressing
/// to a target.
///
/// The fixup expression for this kind includes an implicit offset to account
/// for the PC (unlike the Delta edges) so that a PCRel32 with a target
/// T and addend zero is a call/branch to the start (offset zero) of T.
///
/// Fixup expression:
/// Fixup <- Target - (Fixup + 4) + Addend : int32
/// Fixup <- Target - Fixup + Addend : int32
///
/// Errors:
/// - The result of the fixup expression must fit into an int32, otherwise
Expand All @@ -68,12 +64,8 @@ enum EdgeKind_i386 : Edge::Kind {
/// Represents a data/control flow instruction using PC-relative addressing
/// to a target.
///
/// The fixup expression for this kind includes an implicit offset to account
/// for the PC (unlike the Delta edges) so that a PCRel16 with a target
/// T and addend zero is a call/branch to the start (offset zero) of T.
///
/// Fixup expression:
/// Fixup <- Target - (Fixup + 4) + Addend : int16
/// Fixup <- Target - Fixup + Addend : int16
///
/// Errors:
/// - The result of the fixup expression must fit into an int16, otherwise
Expand All @@ -86,7 +78,7 @@ enum EdgeKind_i386 : Edge::Kind {
/// Delta from the fixup to the target.
///
/// Fixup expression:
/// Fixup <- Target - Fixup + Addend : int64
/// Fixup <- Target - Fixup + Addend : int32
///
/// Errors:
/// - The result of the fixup expression must fit into an int32, otherwise
Expand Down Expand Up @@ -130,12 +122,8 @@ enum EdgeKind_i386 : Edge::Kind {
/// Represents a PC-relative call or branch to a target. This can be used to
/// identify, record, and/or patch call sites.
///
/// The fixup expression for this kind includes an implicit offset to account
/// for the PC (unlike the Delta edges) so that a Branch32PCRel with a target
/// T and addend zero is a call/branch to the start (offset zero) of T.
///
/// Fixup expression:
/// Fixup <- Target - (Fixup + 4) + Addend : int32
/// Fixup <- Target - Fixup + Addend : int32
///
/// Errors:
/// - The result of the fixup expression must fit into an int32, otherwise
Expand Down Expand Up @@ -164,7 +152,7 @@ enum EdgeKind_i386 : Edge::Kind {
/// target may be recorded to allow manipulation at runtime.
///
/// Fixup expression:
/// Fixup <- Target - Fixup + Addend - 4 : int32
/// Fixup <- Target - Fixup + Addend : int32
///
/// Errors:
/// - The result of the fixup expression must fit into an int32, otherwise
Expand All @@ -180,7 +168,7 @@ enum EdgeKind_i386 : Edge::Kind {
/// is within range of the fixup location.
///
/// Fixup expression:
/// Fixup <- Target - Fixup + Addend - 4: int32
/// Fixup <- Target - Fixup + Addend : int32
///
/// Errors:
/// - The result of the fixup expression must fit into an int32, otherwise
Expand Down Expand Up @@ -215,8 +203,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E,
}

case i386::PCRel32: {
int32_t Value =
E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend();
int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend();
*(little32_t *)FixupPtr = Value;
break;
}
Expand All @@ -231,8 +218,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E,
}

case i386::PCRel16: {
int32_t Value =
E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend();
int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend();
if (LLVM_LIKELY(isInt<16>(Value)))
*(little16_t *)FixupPtr = Value;
else
Expand All @@ -257,8 +243,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E,
case i386::BranchPCRel32:
case i386::BranchPCRel32ToPtrJumpStub:
case i386::BranchPCRel32ToPtrJumpStubBypassable: {
int32_t Value =
E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend();
int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend();
*(little32_t *)FixupPtr = Value;
break;
}
Expand Down
20 changes: 17 additions & 3 deletions llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,15 +186,29 @@ class ELFLinkGraphBuilder_i386 : public ELFLinkGraphBuilder<ELFT> {
int64_t Addend = 0;

switch (*Kind) {
case i386::EdgeKind_i386::Delta32: {
case i386::EdgeKind_i386::None:
break;
case i386::EdgeKind_i386::Pointer32:
case i386::EdgeKind_i386::PCRel32:
case i386::EdgeKind_i386::RequestGOTAndTransformToDelta32FromGOT:
case i386::EdgeKind_i386::Delta32:
case i386::EdgeKind_i386::Delta32FromGOT:
case i386::EdgeKind_i386::BranchPCRel32:
case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStub:
case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStubBypassable: {
const char *FixupContent = BlockToFix.getContent().data() +
(FixupAddress - BlockToFix.getAddress());
Addend = *(const support::ulittle32_t *)FixupContent;
Addend = *(const support::little32_t *)FixupContent;
break;
}
default:
case i386::EdgeKind_i386::Pointer16:
case i386::EdgeKind_i386::PCRel16: {
const char *FixupContent = BlockToFix.getContent().data() +
(FixupAddress - BlockToFix.getAddress());
Addend = *(const support::little16_t *)FixupContent;
break;
}
}

Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress();
Edge GE(*Kind, Offset, *GraphSymbol, Addend);
Expand Down
52 changes: 36 additions & 16 deletions llvm/lib/Target/X86/X86.td
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,10 @@ def TuningUseGLMDivSqrtCosts
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
"Target has branch hint feature">;

def TuningAvoidMFENCE
: SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
"Avoid MFENCE for fence seq_cst, and instead use lock or">;

//===----------------------------------------------------------------------===//
// X86 CPU Families
// TODO: Remove these - use general tuning features to determine codegen.
Expand Down Expand Up @@ -809,7 +813,8 @@ def ProcessorFeatures {
TuningSlow3OpsLEA,
TuningSlowDivide64,
TuningSlowIncDec,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
];

list<SubtargetFeature> X86_64V2Features = !listconcat(X86_64V1Features, [
Expand All @@ -825,7 +830,8 @@ def ProcessorFeatures {
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
];

list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
Expand All @@ -844,7 +850,8 @@ def ProcessorFeatures {
TuningPOPCNTFalseDeps,
TuningLZCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit
TuningAllowLight256Bit,
TuningAvoidMFENCE
];

list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
Expand All @@ -868,15 +875,17 @@ def ProcessorFeatures {
TuningFastGather,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit
TuningAllowLight256Bit,
TuningAvoidMFENCE
];

// Nehalem
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningSlowDivide64,
TuningInsertVZEROUPPER,
TuningNoDomainDelayMov];
TuningNoDomainDelayMov,
TuningAvoidMFENCE];

// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
Expand All @@ -897,7 +906,8 @@ def ProcessorFeatures {
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningNoDomainDelayMov];
TuningNoDomainDelayMov,
TuningAvoidMFENCE];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);

Expand Down Expand Up @@ -963,7 +973,8 @@ def ProcessorFeatures {
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend];
TuningNoDomainDelayBlend,
TuningAvoidMFENCE];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);

Expand Down Expand Up @@ -998,7 +1009,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
TuningFastImmVectorShift];
TuningFastImmVectorShift,
TuningAvoidMFENCE];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);

Expand Down Expand Up @@ -1041,7 +1053,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
TuningFastImmVectorShift];
TuningFastImmVectorShift,
TuningAvoidMFENCE];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);

Expand Down Expand Up @@ -1070,7 +1083,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
TuningFastImmVectorShift];
TuningFastImmVectorShift,
TuningAvoidMFENCE];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);

Expand Down Expand Up @@ -1216,7 +1230,8 @@ def ProcessorFeatures {
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
FeatureGFNI];
list<SubtargetFeature> TRMTuning = GLPTuning;
list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPFeatures, TRMAdditionalFeatures);

Expand Down Expand Up @@ -1394,7 +1409,8 @@ def ProcessorFeatures {
TuningFastImm16,
TuningSBBDepBreaking,
TuningSlowDivide64,
TuningSlowSHLD];
TuningSlowSHLD,
TuningAvoidMFENCE];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);

Expand Down Expand Up @@ -1423,7 +1439,8 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningBranchFusion,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
TuningInsertVZEROUPPER,
TuningAvoidMFENCE];

// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
Expand Down Expand Up @@ -1503,7 +1520,8 @@ def ProcessorFeatures {
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
TuningAllowLight256Bit];
TuningAllowLight256Bit,
TuningAvoidMFENCE];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureRDPRU,
Expand Down Expand Up @@ -1691,7 +1709,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
]>;
}
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
Expand All @@ -1710,7 +1729,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
TuningInsertVZEROUPPER
TuningInsertVZEROUPPER,
TuningAvoidMFENCE
]>;
}

Expand Down
19 changes: 4 additions & 15 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30951,21 +30951,10 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;

if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;

Function *MFence =
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence, {});
// Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
// lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence.
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);

// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(
Expand Down Expand Up @@ -31053,7 +31042,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

SDValue Chain = Op.getOperand(0);
Expand Down
9 changes: 7 additions & 2 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13889,11 +13889,16 @@ Value *BoUpSLP::vectorizeTree(
}
if (!Ex) {
// "Reuse" the existing extract to improve final codegen.
if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
ES && isa<Instruction>(Vec)) {
Value *V = ES->getVectorOperand();
if (const TreeEntry *ETE = getTreeEntry(V))
V = ETE->VectorizedValue;
Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
if (auto *IV = dyn_cast<Instruction>(V);
!IV || IV == Vec || (IV->getParent() == cast<Instruction>(Vec)->getParent() && IV->comesBefore(cast<Instruction>(Vec))))
Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
else
Ex = Builder.CreateExtractElement(Vec, Lane);
} else if (ReplaceGEP) {
// Leave the GEPs as is, they are free in most cases and better to
// keep them as GEPs.
Expand Down
Loading