Skip to content

Commit

Permalink
Merge pull request #3469 from alyssarosenzweig/opt/df
Browse files Browse the repository at this point in the history
Optimize DF representation
  • Loading branch information
Sonicadvance1 authored Mar 14, 2024
2 parents 4e269d8 + 063b81d commit cd2a6ce
Show file tree
Hide file tree
Showing 14 changed files with 384 additions and 419 deletions.
10 changes: 10 additions & 0 deletions FEXCore/Source/Interface/Core/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ namespace FEXCore::Context {
case X86State::RFLAG_ZF_RAW_LOC:
case X86State::RFLAG_SF_RAW_LOC:
case X86State::RFLAG_OF_RAW_LOC:
case X86State::RFLAG_DF_RAW_LOC:
// Intentionally do nothing.
// These contain multiple bits which can corrupt other members when compacted.
break;
Expand Down Expand Up @@ -215,6 +216,11 @@ namespace FEXCore::Context {
uint32_t AF = ((Frame->State.af_raw ^ PFByte) & (1 << 4)) ? 1 : 0;
EFLAGS |= AF << X86State::RFLAG_AF_RAW_LOC;

// DF is pretransformed, undo the transform from 1/-1 back to 0/1
uint8_t DFByte = Frame->State.flags[X86State::RFLAG_DF_RAW_LOC];
if (DFByte & 0x80)
EFLAGS |= 1 << X86State::RFLAG_DF_RAW_LOC;

return EFLAGS;
}

Expand All @@ -238,6 +244,10 @@ namespace FEXCore::Context {
// PF is inverted in our internal representation.
Frame->State.pf_raw = (EFLAGS & (1U << i)) ? 0 : 1;
break;
case X86State::RFLAG_DF_RAW_LOC:
// DF is encoded as 1/-1
Frame->State.flags[i] = (EFLAGS & (1U << i)) ? 0xff : 1;
break;
default:
Frame->State.flags[i] = (EFLAGS & (1U << i)) ? 1 : 0;
break;
Expand Down
24 changes: 16 additions & 8 deletions FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,14 @@ DEF_OP(StoreNZCV) {
msr(ARMEmitter::SystemRegister::NZCV, GetReg(Op->Value.ID()));
}

DEF_OP(LoadDF) {
  auto Result = GetReg(Node);

  // Byte offset of the DF slot inside the guest CPU state.
  const auto DFOffset = offsetof(FEXCore::Core::CPUState, flags[X86State::RFLAG_DF_RAW_LOC]);

  // The stored DF byte is 0x1 or 0xFF; a sign-extending byte load widens it
  // to 1 or -1 in the full 64-bit destination register.
  ldrsb(Result.X(), STATE, DFOffset);
}

DEF_OP(LoadFlag) {
auto Op = IROp->C<IR::IROp_LoadFlag>();
auto Dst = GetReg(Node);
Expand Down Expand Up @@ -1708,7 +1716,7 @@ DEF_OP(MemSet) {
DirectionReg = GetReg(Op->Direction.ID());
}

// If Direction == 0 then:
// If Direction > 0 then:
// MemReg is incremented (by size)
// else:
// MemReg is decremented (by size)
Expand All @@ -1729,7 +1737,7 @@ DEF_OP(MemSet) {

if (!DirectionIsInline) {
// Backward or forwards implementation depends on flag
cbnz(ARMEmitter::Size::i64Bit, DirectionReg, &BackwardImpl);
tbnz(DirectionReg, 1, &BackwardImpl);
}

auto MemStore = [this](auto Value, uint32_t OpSize, int32_t Size) {
Expand Down Expand Up @@ -1847,8 +1855,8 @@ DEF_OP(MemSet) {
};

if (DirectionIsInline) {
// If the direction constant is set then the direction is negative.
EmitMemset(DirectionConstant ? -1 : 1);
LOGMAN_THROW_AA_FMT(DirectionConstant == 1 || DirectionConstant == -1, "unexpected direction");
EmitMemset(DirectionConstant);
}
else {
// Emit forward direction memset then backward direction memset.
Expand Down Expand Up @@ -1886,7 +1894,7 @@ DEF_OP(MemCpy) {
}

auto Dst = GetRegPair(Node);
// If Direction == 0 then:
// If Direction > 0 then:
// MemRegDest is incremented (by size)
// MemRegSrc is incremented (by size)
// else:
Expand Down Expand Up @@ -1922,7 +1930,7 @@ DEF_OP(MemCpy) {

if (!DirectionIsInline) {
// Backward or forwards implementation depends on flag
cbnz(ARMEmitter::Size::i64Bit, DirectionReg, &BackwardImpl);
tbnz(DirectionReg, 1, &BackwardImpl);
}

auto MemCpy = [this](uint32_t OpSize, int32_t Size) {
Expand Down Expand Up @@ -2121,8 +2129,8 @@ DEF_OP(MemCpy) {
};

if (DirectionIsInline) {
// If the direction constant is set then the direction is negative.
EmitMemcpy(DirectionConstant ? -1 : 1);
LOGMAN_THROW_AA_FMT(DirectionConstant == 1 || DirectionConstant == -1, "unexpected direction");
EmitMemcpy(DirectionConstant);
}
else {
// Emit forward direction memcpy then backward direction memcpy.
Expand Down
61 changes: 15 additions & 46 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1379,10 +1379,10 @@ void OpDispatchBuilder::FLAGControlOp(OpcodeArgs) {
SetRFLAG(_Constant(1), FEXCore::X86State::RFLAG_CF_RAW_LOC);
break;
case 0xFC: // CLD
SetRFLAG(_Constant(0), FEXCore::X86State::RFLAG_DF_LOC);
SetRFLAG(_Constant(0), FEXCore::X86State::RFLAG_DF_RAW_LOC);
break;
case 0xFD: // STD
SetRFLAG(_Constant(1), FEXCore::X86State::RFLAG_DF_LOC);
SetRFLAG(_Constant(1), FEXCore::X86State::RFLAG_DF_RAW_LOC);
break;
}
}
Expand Down Expand Up @@ -3627,7 +3627,8 @@ void OpDispatchBuilder::STOSOp(OpcodeArgs) {
const bool Repeat = (Op->Flags & (FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX | FEXCore::X86Tables::DecodeFlags::FLAG_REPNE_PREFIX)) != 0;

if (!Repeat) {
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
// Src is used only for a store of the same size so allow garbage
OrderedNode *Src = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
OrderedNode *Dest = LoadGPRRegister(X86State::REG_RDI);

// Only ES prefix
Expand All @@ -3636,16 +3637,9 @@ void OpDispatchBuilder::STOSOp(OpcodeArgs) {
// Store to memory where RDI points
_StoreMemAutoTSO(GPRClass, Size, Dest, Src, Size);

// Calculate direction.
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);

// Offset the pointer
OrderedNode *TailDest = LoadGPRRegister(X86State::REG_RDI);
TailDest = _Add(OpSize::i64Bit, TailDest, PtrDir);

StoreGPRRegister(X86State::REG_RDI, TailDest);
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest, Size));
}
else {
// FEX doesn't support partial faulting REP instructions.
Expand All @@ -3658,9 +3652,8 @@ void OpDispatchBuilder::STOSOp(OpcodeArgs) {
auto Segment = GetSegment(0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);

OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX);
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);

auto Result = _MemSet(CTX->IsAtomicTSOEnabled(), Size, Segment ?: InvalidNode, Dest, Src, Counter, DF);
auto Result = _MemSet(CTX->IsAtomicTSOEnabled(), Size, Segment ?: InvalidNode, Dest, Src, Counter, LoadDir(1));
StoreGPRRegister(X86State::REG_RCX, _Constant(0));
StoreGPRRegister(X86State::REG_RDI, Result);
}
Expand All @@ -3676,9 +3669,6 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
// RA now can handle these to be here, to avoid DF accesses
const auto Size = GetSrcSize(Op);

// Calculate direction.
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);

if (Op->Flags & (FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX | FEXCore::X86Tables::DecodeFlags::FLAG_REPNE_PREFIX)) {
auto SrcAddr = LoadGPRRegister(X86State::REG_RSI);
auto DstAddr = LoadGPRRegister(X86State::REG_RDI);
Expand All @@ -3690,7 +3680,8 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
auto Result = _MemCpy(CTX->IsAtomicTSOEnabled(), Size,
DstSegment ?: InvalidNode,
SrcSegment ?: InvalidNode,
DstAddr, SrcAddr, Counter, DF);
DstAddr, SrcAddr, Counter,
LoadDir(1));

OrderedNode *Result_Dst = _ExtractElementPair(OpSize::i64Bit, Result, 0);
OrderedNode *Result_Src = _ExtractElementPair(OpSize::i64Bit, Result, 1);
Expand All @@ -3700,9 +3691,6 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
StoreGPRRegister(X86State::REG_RSI, Result_Src);
}
else {
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);

OrderedNode *RSI = LoadGPRRegister(X86State::REG_RSI);
OrderedNode *RDI = LoadGPRRegister(X86State::REG_RDI);
RDI= AppendSegmentOffset(RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true);
Expand All @@ -3713,6 +3701,7 @@ void OpDispatchBuilder::MOVSOp(OpcodeArgs) {
// Store to memory where RDI points
_StoreMemAutoTSO(GPRClass, Size, RDI, Src, Size);

auto PtrDir = LoadDir(Size);
RSI = _Add(OpSize::i64Bit, RSI, PtrDir);
RDI = _Add(OpSize::i64Bit, RDI, PtrDir);

Expand Down Expand Up @@ -3745,9 +3734,7 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) {

GenerateFlags_SUB(Op, Src2, Src1);

auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);
auto PtrDir = LoadDir(Size);

// Offset the pointer
Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, PtrDir);
Expand All @@ -3764,9 +3751,7 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) {
bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX;

// read DF once
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);
auto PtrDir = LoadDir(Size);

auto JumpStart = Jump();
// Make sure to start a new block after ending this one
Expand Down Expand Up @@ -3856,15 +3841,9 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) {

StoreResult(GPRClass, Op, Src, -1);

auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);

// Offset the pointer
OrderedNode *TailDest_RSI = LoadGPRRegister(X86State::REG_RSI);

TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, PtrDir);
StoreGPRRegister(X86State::REG_RSI, TailDest_RSI);
StoreGPRRegister(X86State::REG_RSI, OffsetByDir(TailDest_RSI, Size));
}
else {
// Calculate flags early. because end of block
Expand All @@ -3877,9 +3856,7 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) {
// May or may not matter

// Read DF once
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);
auto PtrDir = LoadDir(Size);

auto JumpStart = Jump();
// Make sure to start a new block after ending this one
Expand Down Expand Up @@ -3953,15 +3930,9 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) {

GenerateFlags_SUB(Op, Src1, Src2);

auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);

// Offset the pointer
OrderedNode *TailDest_RDI = LoadGPRRegister(X86State::REG_RDI);

TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, PtrDir);
StoreGPRRegister(X86State::REG_RDI, TailDest_RDI);
StoreGPRRegister(X86State::REG_RDI, OffsetByDir(TailDest_RDI, Size));
}
else {
// Calculate flags early. because end of block
Expand All @@ -3970,9 +3941,7 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) {
bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX;

// read DF once
auto DF = GetRFLAG(FEXCore::X86State::RFLAG_DF_LOC);
auto SizeConst = _Constant(Size);
auto PtrDir = _SubShift(IR::SizeToOpSize(CTX->GetGPRSize()), SizeConst, DF, ShiftType::LSL, FEXCore::ilog2(Size) + 1);
auto PtrDir = LoadDir(Size);

auto JumpStart = Jump();
// Make sure to start a new block after ending this one
Expand Down
26 changes: 26 additions & 0 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -1401,6 +1401,11 @@ friend class FEXCore::IR::PassManager;
if (ValueOffset || MustMask)
Value = _Bfe(OpSize::i32Bit, 1, ValueOffset, Value);

// For DF, we need to transform 0/1 into 1/-1
if (BitOffset == FEXCore::X86State::RFLAG_DF_RAW_LOC) {
Value = _SubShift(OpSize::i64Bit, _Constant(1), Value, ShiftType::LSL, 1);
}

_StoreFlag(Value, BitOffset);
}
}
Expand Down Expand Up @@ -1453,11 +1458,32 @@ friend class FEXCore::IR::PassManager;
return _LoadRegister(false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize());
} else if (BitOffset == FEXCore::X86State::RFLAG_AF_RAW_LOC) {
return _LoadRegister(false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize());
} else if (BitOffset == FEXCore::X86State::RFLAG_DF_RAW_LOC) {
// Recover the sign bit, it is the logical DF value
return _Lshr(OpSize::i64Bit, _LoadDF(), _Constant(63));
} else {
return _LoadFlag(BitOffset);
}
}

// Produces the signed per-element step for string operations:
// returns (DF ? -Size : Size).
// NOTE(review): the shift is by ilog2(Size), so this presumably expects Size
// to be a power of two — confirm against callers.
OrderedNode *LoadDir(const unsigned Size) {
  // DF is loaded in its 1/-1 form, so scaling it by Size yields the step.
  auto Direction = _LoadDF();
  auto Log2Size = FEXCore::ilog2(Size);

  // A shift amount of zero needs no scaling; hand back the raw direction.
  if (!Log2Size) {
    return Direction;
  }

  return _Lshl(IR::SizeToOpSize(CTX->GetGPRSize()), Direction, _Constant(Log2Size));
}

// Steps X by Size in the direction selected by DF:
// returns DF ? (X - Size) : (X + Size).
// NOTE(review): the shift is by ilog2(Size), so this presumably expects Size
// to be a power of two — confirm against callers.
OrderedNode *OffsetByDir(OrderedNode *X, const unsigned Size) {
  auto Log2Size = FEXCore::ilog2(Size);

  // DF is loaded in its 1/-1 form; the shift that scales it to +/-Size is
  // folded into the add itself.
  auto Direction = _LoadDF();
  return _AddShift(OpSize::i64Bit, X, Direction, ShiftType::LSL, Log2Size);
}

// Set SSE comparison flags based on the result set by Arm FCMP. This converts
// NZCV from the Arm representation to an eXternal representation that's
// totally not a euphemism for x86 or anything, nuh-uh.
Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ constexpr std::array<uint32_t, 17> FlagOffsets = {
FEXCore::X86State::RFLAG_SF_RAW_LOC,
FEXCore::X86State::RFLAG_TF_LOC,
FEXCore::X86State::RFLAG_IF_LOC,
FEXCore::X86State::RFLAG_DF_LOC,
FEXCore::X86State::RFLAG_DF_RAW_LOC,
FEXCore::X86State::RFLAG_OF_RAW_LOC,
FEXCore::X86State::RFLAG_IOPL_LOC,
FEXCore::X86State::RFLAG_NT_LOC,
Expand Down
7 changes: 7 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,13 @@
"DestSize": "4"
},

"GPR = LoadDF": {
"Desc": ["Loads the direction flag from the context object in -1/1",
"representation for easy consumption"
],
"DestSize": "8"
},

"GPR = LoadFlag u32:$Flag": {
"Desc": ["Loads an x86-64 flag from the context object",
"Specialized to allow flexible implementation of flag handling"
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/IR/Passes/ConstProp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1330,7 +1330,7 @@ bool ConstProp::ConstantInlining(IREmitter *IREmit, const IRListView& CurrentIR)
if (IREmit->IsValueConstant(Op->Direction, &Constant)) {
IREmit->SetWriteCursor(CurrentIR.GetNode(Op->Direction));

IREmit->ReplaceNodeArgument(CodeNode, Op->Direction_Index, CreateInlineConstant(IREmit, Constant & 1));
IREmit->ReplaceNodeArgument(CodeNode, Op->Direction_Index, CreateInlineConstant(IREmit, Constant));

Changed = true;
}
Expand All @@ -1344,7 +1344,7 @@ bool ConstProp::ConstantInlining(IREmitter *IREmit, const IRListView& CurrentIR)
if (IREmit->IsValueConstant(Op->Direction, &Constant)) {
IREmit->SetWriteCursor(CurrentIR.GetNode(Op->Direction));

IREmit->ReplaceNodeArgument(CodeNode, Op->Direction_Index, CreateInlineConstant(IREmit, Constant & 1));
IREmit->ReplaceNodeArgument(CodeNode, Op->Direction_Index, CreateInlineConstant(IREmit, Constant));

Changed = true;
}
Expand Down
Loading

0 comments on commit cd2a6ce

Please sign in to comment.