diff --git a/External/FEXCore/Source/Interface/Core/Interpreter/ALUOps.cpp b/External/FEXCore/Source/Interface/Core/Interpreter/ALUOps.cpp index 5b01f56f49..947669f8d7 100644 --- a/External/FEXCore/Source/Interface/Core/Interpreter/ALUOps.cpp +++ b/External/FEXCore/Source/Interface/Core/Interpreter/ALUOps.cpp @@ -351,6 +351,26 @@ DEF_OP(And) { } } +DEF_OP(Andn) { + auto Op = IROp->C(); + const uint8_t OpSize = IROp->Size; + + void *Src1 = GetSrc(Data->SSAData, Op->Header.Args[0]); + void *Src2 = GetSrc(Data->SSAData, Op->Header.Args[1]); + constexpr auto Func = [](auto a, auto b) { + using Type = decltype(a); + return static_cast(a & static_cast(~b)); + }; + + switch (OpSize) { + DO_OP(1, uint8_t, Func) + DO_OP(2, uint16_t, Func) + DO_OP(4, uint32_t, Func) + DO_OP(8, uint64_t, Func) + default: LOGMAN_MSG_A_FMT("Unknown size: {}", OpSize); break; + } +} + DEF_OP(Xor) { auto Op = IROp->C(); uint8_t OpSize = IROp->Size; @@ -975,6 +995,7 @@ void InterpreterOps::RegisterALUHandlers() { REGISTER_OP(UMULH, UMulH); REGISTER_OP(OR, Or); REGISTER_OP(AND, And); + REGISTER_OP(ANDN, Andn); REGISTER_OP(XOR, Xor); REGISTER_OP(LSHL, Lshl); REGISTER_OP(LSHR, Lshr); diff --git a/External/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h b/External/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h index 9c26109a74..84d2e12ad5 100644 --- a/External/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h +++ b/External/FEXCore/Source/Interface/Core/Interpreter/InterpreterOps.h @@ -103,6 +103,7 @@ namespace FEXCore::CPU { DEF_OP(UMulH); DEF_OP(Or); DEF_OP(And); + DEF_OP(Andn); DEF_OP(Xor); DEF_OP(Lshl); DEF_OP(Lshr); diff --git a/External/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp b/External/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp index f47444baa4..4735bcc414 100644 --- a/External/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp +++ b/External/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp @@ -390,6 +390,19 @@ DEF_OP(And) { } } +DEF_OP(Andn) { + auto Op = IROp->C(); + const auto& Lhs = Op->Header.Args[0]; + const auto& Rhs = Op->Header.Args[1]; + uint64_t Const{}; + + if (IsInlineConstant(Rhs, &Const)) { + bic(GRS(Node), GRS(Lhs.ID()), Const); + } else { + bic(GRS(Node), GRS(Lhs.ID()), GRS(Rhs.ID())); + } +} + DEF_OP(Xor) { auto Op = IROp->C(); uint64_t Const; @@ -1079,6 +1092,7 @@ void Arm64JITCore::RegisterALUHandlers() { REGISTER_OP(UMULH, UMulH); REGISTER_OP(OR, Or); REGISTER_OP(AND, And); + REGISTER_OP(ANDN, Andn); REGISTER_OP(XOR, Xor); REGISTER_OP(LSHL, Lshl); REGISTER_OP(LSHR, Lshr); diff --git a/External/FEXCore/Source/Interface/Core/JIT/Arm64/JITClass.h b/External/FEXCore/Source/Interface/Core/JIT/Arm64/JITClass.h index c33955edaf..6740444355 100644 --- a/External/FEXCore/Source/Interface/Core/JIT/Arm64/JITClass.h +++ b/External/FEXCore/Source/Interface/Core/JIT/Arm64/JITClass.h @@ -223,6 +223,7 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter { DEF_OP(UMulH); DEF_OP(Or); DEF_OP(And); + DEF_OP(Andn); DEF_OP(Xor); DEF_OP(Lshl); DEF_OP(Lshr); diff --git a/External/FEXCore/Source/Interface/Core/JIT/x86_64/ALUOps.cpp b/External/FEXCore/Source/Interface/Core/JIT/x86_64/ALUOps.cpp index 4b6632ab49..8a09305177 100644 --- a/External/FEXCore/Source/Interface/Core/JIT/x86_64/ALUOps.cpp +++ b/External/FEXCore/Source/Interface/Core/JIT/x86_64/ALUOps.cpp @@ -15,6 +15,11 @@ tags: backend|x86-64 #include namespace FEXCore::CPU { + +#define GRS(Node) (IROp->Size <= 4 ? GetSrc(Node) : GetSrc(Node)) +#define GRD(Node) (IROp->Size <= 4 ? GetDst(Node) : GetDst(Node)) +#define GRCMP(Node) (Op->CompareSize == 4 ? GetSrc(Node) : GetSrc(Node)) + #define DEF_OP(x) void X86JITCore::Op_##x(FEXCore::IR::IROp_Header *IROp, uint32_t Node) DEF_OP(TruncElementPair) { auto Op = IROp->C(); @@ -417,6 +422,25 @@ DEF_OP(And) { mov(Dst, rax); } +DEF_OP(Andn) { + auto Op = IROp->C(); + const auto& Lhs = Op->Header.Args[0]; + const auto& Rhs = Op->Header.Args[1]; + auto Dst = GRD(Node); + + uint64_t Const{}; + if (IsInlineConstant(Rhs, &Const)) { + mov(Dst, GRS(Lhs.ID())); + and_(Dst, ~Const); + } else { + const auto Temp = IROp->Size <= 4 ? Xbyak::Reg{rax.cvt32()} : Xbyak::Reg{rax}; + mov(Temp, GRS(Rhs.ID())); + not_(Temp); + and_(Temp, GRS(Lhs.ID())); + mov(Dst, Temp); + } +} + DEF_OP(Xor) { auto Op = IROp->C(); auto Dst = GetDst(Node); @@ -1048,10 +1072,6 @@ DEF_OP(Sbfe) { } } -#define GRS(Node) (IROp->Size <= 4 ? GetSrc(Node) : GetSrc(Node)) -#define GRD(Node) (IROp->Size <= 4 ? GetDst(Node) : GetDst(Node)) -#define GRCMP(Node) (Op->CompareSize == 4 ? GetSrc(Node) : GetSrc(Node)) - DEF_OP(Select) { auto Op = IROp->C(); auto Dst = GRD(Node); @@ -1221,6 +1241,7 @@ void X86JITCore::RegisterALUHandlers() { REGISTER_OP(UMULH, UMulH); REGISTER_OP(OR, Or); REGISTER_OP(AND, And); + REGISTER_OP(ANDN, Andn); REGISTER_OP(XOR, Xor); REGISTER_OP(LSHL, Lshl); REGISTER_OP(LSHR, Lshr); diff --git a/External/FEXCore/Source/Interface/Core/JIT/x86_64/JITClass.h b/External/FEXCore/Source/Interface/Core/JIT/x86_64/JITClass.h index 7d2023518a..9b4175df17 100644 --- a/External/FEXCore/Source/Interface/Core/JIT/x86_64/JITClass.h +++ b/External/FEXCore/Source/Interface/Core/JIT/x86_64/JITClass.h @@ -220,6 +220,7 @@ class X86JITCore final : public CPUBackend, public Xbyak::CodeGenerator { DEF_OP(UMulH); DEF_OP(Or); DEF_OP(And); + DEF_OP(Andn); DEF_OP(Xor); DEF_OP(Lshl); DEF_OP(Lshr); diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 8e08ad1929..01b174599e 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -2134,8 +2134,7 @@ void OpDispatchBuilder::ANDNBMIOp(OpcodeArgs) { auto* Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, -1); auto* Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, -1); - // TODO: This can be replaced with a BIC IR op once it's implemented. - auto Dest = _And(_Not(Src1), Src2); + auto Dest = _Andn(Src2, Src1); StoreResult(GPRClass, Op, Dest, -1); GenerateFlags_Logical(Op, Dest, Src1, Src2); @@ -2633,8 +2632,7 @@ void OpDispatchBuilder::BTROp(OpcodeArgs) { Result = _Lshr(Dest, BitSelect); OrderedNode *BitMask = _Lshl(_Constant(1), BitSelect); - BitMask = _Not(BitMask); - Dest = _And(Dest, BitMask); + Dest = _Andn(Dest, BitMask); StoreResult(GPRClass, Op, Dest, -1); } else { @@ -2655,10 +2653,10 @@ void OpDispatchBuilder::BTROp(OpcodeArgs) { // Now add the addresses together and load the memory OrderedNode *MemoryLocation = _Add(Dest, Src); OrderedNode *BitMask = _Lshl(_Constant(1), BitSelect); - BitMask = _Not(BitMask); if (DestIsLockedMem(Op)) { HandledLock = true; + BitMask = _Not(BitMask); // XXX: Technically this can optimize to an AArch64 ldclralb // We don't current support this IR op though Result = _AtomicFetchAnd(MemoryLocation, BitMask, 1); @@ -2670,7 +2668,7 @@ void OpDispatchBuilder::BTROp(OpcodeArgs) { // Now shift in to the correct bit location Result = _Lshr(Value, BitSelect); - Value = _And(Value, BitMask); + Value = _Andn(Value, BitMask); _StoreMemAutoTSO(GPRClass, 1, MemoryLocation, Value, 1); } } diff --git a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp index a80cdcabca..7cc9007e8a 100644 --- a/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp +++ b/External/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp @@ -33,9 +33,7 @@ void OpDispatchBuilder::SetX87TopTag(OrderedNode *Value, uint32_t Tag) { OrderedNode *Mask = _Constant(0b11); auto TopOffset = _Lshl(Value, _Constant(1)); Mask = _Lshl(Mask, TopOffset); - // XXX: This Neg can be removed if we support BIC - Mask = _Not(Mask); - OrderedNode *NewFTW = _And(FTW, Mask); + OrderedNode *NewFTW = _Andn(FTW, Mask); if (Tag != 0) { auto TagVal = _Lshl(_Constant(Tag), TopOffset); NewFTW = _Or(NewFTW, TagVal); diff --git a/External/FEXCore/Source/Interface/IR/IR.json b/External/FEXCore/Source/Interface/IR/IR.json index 6fb065e609..0c97e35d35 100644 --- a/External/FEXCore/Source/Interface/IR/IR.json +++ b/External/FEXCore/Source/Interface/IR/IR.json @@ -948,6 +948,15 @@ "SSAArgs": "2" }, + "Andn": { + "Desc": ["Integer binary AND NOT. Performs the equivalent of Src1 & ~Src2"], + "OpClass": "ALU", + "HasDest": true, + "DestClass": "GPR", + "DestSize": "std::max(4, GetOpSize(ssa0))", + "SSAArgs": "2" + }, + "Xor": { "Desc": ["Integer binary exclusive or" ],