From fcbf0de05a470c1e92f1a5b828a855f5a2717f90 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Sun, 1 Dec 2024 12:53:07 +0100 Subject: [PATCH 1/5] Enable RA of SVE Predicate Registers --- FEXCore/Scripts/json_ir_generator.py | 10 +++-- .../Core/ArchHelpers/Arm64Emitter.cpp | 21 ++++++++++ .../Interface/Core/ArchHelpers/Arm64Emitter.h | 1 + FEXCore/Source/Interface/Core/JIT/JIT.cpp | 1 + FEXCore/Source/Interface/Core/JIT/JITClass.h | 13 ++++++ .../Source/Interface/Core/JIT/MemoryOps.cpp | 40 +++++++++++++++++++ FEXCore/Source/Interface/IR/IR.json | 20 +++++++++- FEXCore/Source/Interface/IR/IRDumper.cpp | 3 ++ FEXCore/Source/Interface/IR/PassManager.cpp | 2 +- FEXCore/Source/Interface/IR/Passes.h | 3 +- .../Interface/IR/Passes/RAValidation.cpp | 3 ++ 11 files changed, 109 insertions(+), 8 deletions(-) diff --git a/FEXCore/Scripts/json_ir_generator.py b/FEXCore/Scripts/json_ir_generator.py index d1548a340a..425c966e85 100755 --- a/FEXCore/Scripts/json_ir_generator.py +++ b/FEXCore/Scripts/json_ir_generator.py @@ -101,7 +101,8 @@ def is_ssa_type(type): if (type == "SSA" or type == "GPR" or type == "GPRPair" or - type == "FPR"): + type == "FPR" or + type == "PRED"): return True return False @@ -150,8 +151,8 @@ def parse_ops(ops): RHS += f", {DType}:$Out{Name}" else: # Single anonymous destination - if LHS not in ["SSA", "GPR", "GPRPair", "FPR"]: - ExitError(f"Unknown destination class type {LHS}. Needs to be one of SSA, GPR, GPRPair, FPR") + if LHS not in ["SSA", "GPR", "GPRPair", "FPR", "PRED"]: + ExitError(f"Unknown destination class type {LHS}. 
Needs to be one of SSA, GPR, GPRPair, FPR, PRED") OpDef.HasDest = True OpDef.DestType = LHS @@ -221,7 +222,8 @@ def parse_ops(ops): if (OpArg.IsSSA and (OpArg.Type == "GPR" or OpArg.Type == "GPRPair" or - OpArg.Type == "FPR")): + OpArg.Type == "FPR" or + OpArg.Type == "PRED")): OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class") OpArg.Name = ArgName diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp index e1472e9d0e..1ce51f831b 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp @@ -57,6 +57,12 @@ namespace x64 { ARMEmitter::Reg::r24, ARMEmitter::Reg::r25, ARMEmitter::Reg::r30, ARMEmitter::Reg::r18, }; + // p6 and p7 registers are used as temporaries no not added here for RA + // See PREF_TMP_16B and PREF_TMP_32B + // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p0-p5. + constexpr std::array PR = {ARMEmitter::PReg::p0, ARMEmitter::PReg::p1, ARMEmitter::PReg::p2, + ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + constexpr unsigned RAPairs = 6; // All are caller saved @@ -103,6 +109,12 @@ namespace x64 { ARMEmitter::Reg::r16, ARMEmitter::Reg::r17, ARMEmitter::Reg::r30, }; + // p6 and p7 registers are used as temporaries no not added here for RA + // See PREF_TMP_16B and PREF_TMP_32B + // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p0-p5. 
+ constexpr std::array PR = {ARMEmitter::PReg::p0, ARMEmitter::PReg::p1, ARMEmitter::PReg::p2, + ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + constexpr unsigned RAPairs = 6; constexpr std::array SRAFPR = { @@ -234,6 +246,12 @@ namespace x32 { constexpr unsigned RAPairs = 12; + // p6 and p7 registers are used as temporaries no not added here for RA + // See PREF_TMP_16B and PREF_TMP_32B + // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p0-p5. + constexpr std::array PR = {ARMEmitter::PReg::p0, ARMEmitter::PReg::p1, ARMEmitter::PReg::p2, + ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // All are caller saved constexpr std::array SRAFPR = { ARMEmitter::VReg::v16, ARMEmitter::VReg::v17, ARMEmitter::VReg::v18, ARMEmitter::VReg::v19, @@ -357,6 +375,7 @@ Arm64Emitter::Arm64Emitter(FEXCore::Context::ContextImpl* ctx, void* EmissionPtr GeneralRegisters = x64::RA; StaticFPRegisters = x64::SRAFPR; GeneralFPRegisters = x64::RAFPR; + PredicateRegisters = x64::PR; PairRegisters = x64::RAPairs; #ifdef _M_ARM_64EC ConfiguredDynamicRegisterBase = std::span(x64::RA.begin(), 7); @@ -370,6 +389,8 @@ Arm64Emitter::Arm64Emitter(FEXCore::Context::ContextImpl* ctx, void* EmissionPtr StaticFPRegisters = x32::SRAFPR; GeneralFPRegisters = x32::RAFPR; + + PredicateRegisters = x32::PR; } } diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h index f3bcf16584..2348d9cd6a 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h @@ -94,6 +94,7 @@ class Arm64Emitter : public ARMEmitter::Emitter { std::span ConfiguredDynamicRegisterBase {}; std::span StaticRegisters {}; std::span GeneralRegisters {}; + std::span PredicateRegisters {}; std::span StaticFPRegisters {}; std::span GeneralFPRegisters {}; uint32_t PairRegisters = 0; diff --git a/FEXCore/Source/Interface/Core/JIT/JIT.cpp 
b/FEXCore/Source/Interface/Core/JIT/JIT.cpp index 993e9cc272..8d05022f56 100644 --- a/FEXCore/Source/Interface/Core/JIT/JIT.cpp +++ b/FEXCore/Source/Interface/Core/JIT/JIT.cpp @@ -534,6 +534,7 @@ Arm64JITCore::Arm64JITCore(FEXCore::Context::ContextImpl* ctx, FEXCore::Core::In RAPass->AddRegisters(FEXCore::IR::GPRFixedClass, StaticRegisters.size()); RAPass->AddRegisters(FEXCore::IR::FPRClass, GeneralFPRegisters.size()); RAPass->AddRegisters(FEXCore::IR::FPRFixedClass, StaticFPRegisters.size()); + RAPass->AddRegisters(FEXCore::IR::PREDClass, PredicateRegisters.size()); RAPass->PairRegs = PairRegisters; { diff --git a/FEXCore/Source/Interface/Core/JIT/JITClass.h b/FEXCore/Source/Interface/Core/JIT/JITClass.h index ba3e24442d..223c19b82f 100644 --- a/FEXCore/Source/Interface/Core/JIT/JITClass.h +++ b/FEXCore/Source/Interface/Core/JIT/JITClass.h @@ -94,6 +94,19 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter { FEX_UNREACHABLE; } + [[nodiscard]] + ARMEmitter::PRegister GetPReg(IR::NodeID Node) const { + const auto Reg = GetPhys(Node); + + LOGMAN_THROW_AA_FMT(Reg.Class == IR::PREDClass.Val, "Unexpected Class: {}", Reg.Class); + + if (Reg.Class == IR::PREDClass.Val) { + return PredicateRegisters[Reg.Reg]; + } + + FEX_UNREACHABLE; + } + [[nodiscard]] FEXCore::IR::RegisterClassType GetRegClass(IR::NodeID Node) const; diff --git a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp index d9ce167ec8..e9dbad86b8 100644 --- a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp @@ -10,6 +10,7 @@ tags: backend|arm64 #include "Interface/Context/Context.h" #include "Interface/Core/CPUID.h" #include "Interface/Core/JIT/JITClass.h" +#include "Interface/IR/IR.h" #include #include @@ -1551,6 +1552,45 @@ DEF_OP(StoreMem) { } } +DEF_OP(InitPredicate) { + const auto Op = IROp->C(); + const auto OpSize = IROp->Size; + ptrue(ConvertSubRegSize16(OpSize), GetPReg(Node), 
static_cast(Op->Pattern)); +} + +DEF_OP(StoreMemPredicate) { + const auto Op = IROp->C(); + // FIXME: Dont we need OpSize then? + const auto Predicate = GetPReg(Op->Mask.ID()); + + const auto RegData = GetVReg(Op->Value.ID()); + const auto MemReg = GetReg(Op->Addr.ID()); + + LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "StoreMemPredicate needs SVE support"); + + const auto MemDst = ARMEmitter::SVEMemOperand(MemReg.X(), 0); + + switch (IROp->ElementSize) { + case IR::OpSize::i8Bit: { + st1b(RegData.Z(), Predicate, MemDst); + break; + } + case IR::OpSize::i16Bit: { + st1h(RegData.Z(), Predicate, MemDst); + break; + } + case IR::OpSize::i32Bit: { + st1w(RegData.Z(), Predicate, MemDst); + break; + } + case IR::OpSize::i64Bit: { + st1d(RegData.Z(), Predicate, MemDst); + break; + } + default: break; + } +} + DEF_OP(StoreMemPair) { const auto Op = IROp->C(); const auto OpSize = IROp->Size; diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index cad007c6a2..e29b54d043 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -7,6 +7,7 @@ " SSA = untyped", " GPR = GPR class type", " FPR = FPR class type", + " PRED = Predicate register class type", "Declaring the SSA types correctly will allow validation passes to ensure the op is getting passed correct arguments", "", "Arguments must always follow a particular order. 
:", @@ -83,6 +84,7 @@ "constexpr FEXCore::IR::RegisterClassType GPRFixedClass {1}", "constexpr FEXCore::IR::RegisterClassType FPRClass {2}", "constexpr FEXCore::IR::RegisterClassType FPRFixedClass {3}", + "constexpr FEXCore::IR::RegisterClassType PREDClass {4}", "constexpr FEXCore::IR::RegisterClassType ComplexClass {5}", "constexpr FEXCore::IR::RegisterClassType InvalidClass {7}", "", @@ -148,6 +150,7 @@ "SSA": "OrderedNode*", "GPR": "OrderedNode*", "FPR": "OrderedNode*", + "PRED": "OrderedNode*", "FenceType": "FenceType", "RegisterClass": "RegisterClassType", "CondClass": "CondClassType", @@ -560,8 +563,21 @@ "HasSideEffects": true, "DestSize": "Size", "EmitValidation": [ - "WalkFindRegClass($Value1) == $Class", - "WalkFindRegClass($Value2) == $Class" + "WalkFindRegClass($Value1) == $Class" + ] + }, + + "PRED = InitPredicate OpSize:#Size, u8:$Pattern": { + "Desc": ["Initialize predicate register from Pattern"], + "DestSize": "Size" + }, + + "StoreMemPredicate RegisterClass:$Class, OpSize:#Size, SSA:$Value, PRED:$Mask, GPR:$Addr": { + "Desc": [ "Stores a value to memory using SVE predicate mask." 
], + "HasSideEffects": true, + "DestSize": "Size", + "EmitValidation": [ + "WalkFindRegClass($Value) == $Class" ] }, diff --git a/FEXCore/Source/Interface/IR/IRDumper.cpp b/FEXCore/Source/Interface/IR/IRDumper.cpp index c1fef63d10..b4a94aa1b8 100644 --- a/FEXCore/Source/Interface/IR/IRDumper.cpp +++ b/FEXCore/Source/Interface/IR/IRDumper.cpp @@ -77,6 +77,8 @@ static void PrintArg(fextl::stringstream* out, [[maybe_unused]] const IRListView *out << "FPR"; } else if (Arg == FPRFixedClass.Val) { *out << "FPRFixed"; + } else if (Arg == PREDClass.Val) { + *out << "PRED"; } else { *out << "Unknown Registerclass " << Arg; } @@ -98,6 +100,7 @@ static void PrintArg(fextl::stringstream* out, const IRListView* IR, OrderedNode case FEXCore::IR::GPRFixedClass.Val: *out << "(GPRFixed"; break; case FEXCore::IR::FPRClass.Val: *out << "(FPR"; break; case FEXCore::IR::FPRFixedClass.Val: *out << "(FPRFixed"; break; + case FEXCore::IR::PREDClass.Val: *out << "(PRED"; break; case FEXCore::IR::ComplexClass.Val: *out << "(Complex"; break; case FEXCore::IR::InvalidClass.Val: *out << "(Invalid"; break; default: *out << "(Unknown"; break; diff --git a/FEXCore/Source/Interface/IR/PassManager.cpp b/FEXCore/Source/Interface/IR/PassManager.cpp index 5072364a68..fccbd41c56 100644 --- a/FEXCore/Source/Interface/IR/PassManager.cpp +++ b/FEXCore/Source/Interface/IR/PassManager.cpp @@ -70,7 +70,7 @@ void PassManager::AddDefaultPasses(FEXCore::Context::ContextImpl* ctx) { FEX_CONFIG_OPT(DisablePasses, O0); if (!DisablePasses()) { - InsertPass(CreateX87StackOptimizationPass()); + InsertPass(CreateX87StackOptimizationPass(ctx->HostFeatures)); InsertPass(CreateConstProp(ctx->HostFeatures.SupportsTSOImm9, &ctx->CPUID)); InsertPass(CreateDeadFlagCalculationEliminination()); } diff --git a/FEXCore/Source/Interface/IR/Passes.h b/FEXCore/Source/Interface/IR/Passes.h index a8a3d1cb64..fa230264fc 100644 --- a/FEXCore/Source/Interface/IR/Passes.h +++ b/FEXCore/Source/Interface/IR/Passes.h @@ -5,6 +5,7 @@ 
namespace FEXCore { class CPUIDEmu; +struct HostFeatures; } namespace FEXCore::Utils { @@ -19,7 +20,7 @@ class RegisterAllocationData; fextl::unique_ptr CreateConstProp(bool SupportsTSOImm9, const FEXCore::CPUIDEmu* CPUID); fextl::unique_ptr CreateDeadFlagCalculationEliminination(); fextl::unique_ptr CreateRegisterAllocationPass(); -fextl::unique_ptr CreateX87StackOptimizationPass(); +fextl::unique_ptr CreateX87StackOptimizationPass(const FEXCore::HostFeatures&); namespace Validation { fextl::unique_ptr CreateIRValidation(); diff --git a/FEXCore/Source/Interface/IR/Passes/RAValidation.cpp b/FEXCore/Source/Interface/IR/Passes/RAValidation.cpp index 75e4ede6f7..bd10556b96 100644 --- a/FEXCore/Source/Interface/IR/Passes/RAValidation.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RAValidation.cpp @@ -47,6 +47,7 @@ struct RegState { // On arm64, there are 16 Fixed and 12 normal FPRsFixed[Reg.Reg] = ssa; return true; + case PREDClass: PREGs[Reg.Reg] = ssa; return true; } return false; } @@ -59,6 +60,7 @@ struct RegState { case GPRFixedClass: return GPRsFixed[Reg.Reg]; case FPRClass: return FPRs[Reg.Reg]; case FPRFixedClass: return FPRsFixed[Reg.Reg]; + case PREDClass: return PREGs[Reg.Reg]; } return InvalidReg; } @@ -82,6 +84,7 @@ struct RegState { std::array FPRsFixed = {}; std::array GPRs = {}; std::array FPRs = {}; + std::array PREGs = {}; fextl::unordered_map Spills; }; From 1d3ce30e5041de81c6b2914b5e50b835e7084f23 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Sun, 1 Dec 2024 12:52:40 +0100 Subject: [PATCH 2/5] Generate SVE for 80bit load/stores when possible Fixes #4166. 
--- .../Core/ArchHelpers/Arm64Emitter.cpp | 18 +++++----- .../Source/Interface/Core/JIT/MemoryOps.cpp | 34 +++++++++++++++++-- .../Interface/Core/OpcodeDispatcher.cpp | 28 ++++++++++----- FEXCore/Source/Interface/IR/IR.json | 13 ++++--- .../IR/Passes/x87StackOptimizationPass.cpp | 30 ++++++++++------ 5 files changed, 87 insertions(+), 36 deletions(-) diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp index 1ce51f831b..746afc47ba 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp @@ -59,9 +59,9 @@ namespace x64 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p0-p5. - constexpr std::array PR = {ARMEmitter::PReg::p0, ARMEmitter::PReg::p1, ARMEmitter::PReg::p2, - ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // p0-p1 are also used in the jit as temps. + // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. + constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; constexpr unsigned RAPairs = 6; @@ -111,9 +111,9 @@ namespace x64 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p0-p5. - constexpr std::array PR = {ARMEmitter::PReg::p0, ARMEmitter::PReg::p1, ARMEmitter::PReg::p2, - ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // p0-p1 are also used in the jit as temps. + // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. 
+ constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; constexpr unsigned RAPairs = 6; @@ -248,9 +248,9 @@ namespace x32 { // p6 and p7 registers are used as temporaries no not added here for RA // See PREF_TMP_16B and PREF_TMP_32B - // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p0-p5. - constexpr std::array PR = {ARMEmitter::PReg::p0, ARMEmitter::PReg::p1, ARMEmitter::PReg::p2, - ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; + // p0-p1 are also used in the jit as temps. + // Also p8-p15 cannot be used can only encode p0-p7, so we're left with p2-p5. + constexpr std::array PR = {ARMEmitter::PReg::p2, ARMEmitter::PReg::p3, ARMEmitter::PReg::p4, ARMEmitter::PReg::p5}; // All are caller saved constexpr std::array SRAFPR = { diff --git a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp index e9dbad86b8..81461600ec 100644 --- a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp @@ -1560,7 +1560,6 @@ DEF_OP(InitPredicate) { DEF_OP(StoreMemPredicate) { const auto Op = IROp->C(); - // FIXME: Dont we need OpSize then? 
const auto Predicate = GetPReg(Op->Mask.ID()); const auto RegData = GetVReg(Op->Value.ID()); @@ -1587,7 +1586,38 @@ DEF_OP(StoreMemPredicate) { st1d(RegData.Z(), Predicate, MemDst); break; } - default: break; + default: LOGMAN_MSG_A_FMT("Unhandled {} element size: {}", __func__, IROp->ElementSize); break; + } +} + +DEF_OP(LoadMemPredicate) { + const auto Op = IROp->C(); + const auto Dst = GetVReg(Node); + const auto Predicate = GetPReg(Op->Mask.ID()); + const auto MemReg = GetReg(Op->Addr.ID()); + + LOGMAN_THROW_A_FMT(HostSupportsSVE128 || HostSupportsSVE256, "LoadMemPredicate needs SVE support"); + + const auto MemDst = ARMEmitter::SVEMemOperand(MemReg.X(), 0); + + switch (IROp->ElementSize) { + case IR::OpSize::i8Bit: { + ld1b(Dst.Z(), Predicate.Zeroing(), MemDst); + break; + } + case IR::OpSize::i16Bit: { + ld1h(Dst.Z(), Predicate.Zeroing(), MemDst); + break; + } + case IR::OpSize::i32Bit: { + ld1w(Dst.Z(), Predicate.Zeroing(), MemDst); + break; + } + case IR::OpSize::i64Bit: { + ld1d(Dst.Z(), Predicate.Zeroing(), MemDst); + break; + } + default: LOGMAN_MSG_A_FMT("Unhandled {} element size: {}", __func__, IROp->ElementSize); break; } } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 6020dcd41c..37efb44008 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -6,6 +6,7 @@ desc: Handles x86/64 ops to IR, no-pf opt, local-flags opt $end_info$ */ +#include "FEXCore/Core/HostFeatures.h" #include "FEXCore/Utils/Telemetry.h" #include "Interface/Context/Context.h" #include "Interface/Core/OpcodeDispatcher.h" @@ -4309,10 +4310,15 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T if ((IsOperandMem(Operand, true) && LoadData) || ForceLoad) { if (OpSize == OpSize::f80Bit) { Ref MemSrc = LoadEffectiveAddress(A, true); - - // For X87 extended doubles, Split the load. 
- auto Res = _LoadMem(Class, OpSize::i64Bit, MemSrc, Align == OpSize::iInvalid ? OpSize : Align); - return _VLoadVectorElement(OpSize::i128Bit, OpSize::i16Bit, Res, 4, _Add(OpSize::i64Bit, MemSrc, _InlineConstant(8))); + if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { + // Using SVE we can load this with a single instruction. + auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); + } else { + // For X87 extended doubles, Split the load. + auto Res = _LoadMem(Class, OpSize::i64Bit, MemSrc, Align == OpSize::iInvalid ? OpSize : Align); + return _VLoadVectorElement(OpSize::i128Bit, OpSize::i16Bit, Res, 4, _Add(OpSize::i64Bit, MemSrc, _InlineConstant(8))); + } } return _LoadMemAutoTSO(Class, OpSize, A, Align == OpSize::iInvalid ? OpSize : Align); @@ -4439,11 +4445,15 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); - - // For X87 extended doubles, split before storing - _StoreMem(FPRClass, OpSize::i64Bit, MemStoreDst, Src, Align); - auto Upper = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Src, 1); - _StoreMem(GPRClass, OpSize::i16Bit, Upper, MemStoreDst, _Constant(8), std::min(Align, OpSize::i64Bit), MEM_OFFSET_SXTX, 1); + if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { + auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); + } else { + // For X87 extended doubles, split before storing + _StoreMem(FPRClass, OpSize::i64Bit, MemStoreDst, Src, Align); + auto Upper = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Src, 1); + _StoreMem(GPRClass, OpSize::i16Bit, Upper, MemStoreDst, _Constant(8), std::min(Align, OpSize::i64Bit), MEM_OFFSET_SXTX, 1); 
+ } } else { _StoreMemAutoTSO(Class, OpSize, A, Src, Align == OpSize::iInvalid ? OpSize : Align); } diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index e29b54d043..8fd523e192 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -572,13 +572,16 @@ "DestSize": "Size" }, - "StoreMemPredicate RegisterClass:$Class, OpSize:#Size, SSA:$Value, PRED:$Mask, GPR:$Addr": { + "StoreMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, FPR:$Value, PRED:$Mask, GPR:$Addr": { "Desc": [ "Stores a value to memory using SVE predicate mask." ], + "DestSize": "RegisterSize", "HasSideEffects": true, - "DestSize": "Size", - "EmitValidation": [ - "WalkFindRegClass($Value) == $Class" - ] + "NumElements": "IR::NumElements(RegisterSize, ElementSize)" + }, + "FPR = LoadMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, PRED:$Mask, GPR:$Addr": { + "Desc": [ "Loads a value from memory using SVE predicate mask." ], + "DestSize": "RegisterSize", + "NumElements": "IR::NumElements(RegisterSize, ElementSize)" }, "SSA = LoadMemTSO RegisterClass:$Class, OpSize:#Size, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": { diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index f342006530..a34b4fdd3a 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -3,9 +3,10 @@ #include "Interface/IR/IR.h" #include "Interface/IR/IREmitter.h" #include "Interface/IR/PassManager.h" -#include -#include -#include +#include "FEXCore/IR/IR.h" +#include "FEXCore/Utils/Profiler.h" +#include "FEXCore/Core/HostFeatures.h" +#include "CodeEmitter/Emitter.h" #include #include @@ -146,13 +147,15 @@ class FixedSizeStack { class X87StackOptimization final : public Pass { public: - X87StackOptimization() { + 
X87StackOptimization(const FEXCore::HostFeatures& Features) + : Features(Features) { FEX_CONFIG_OPT(ReducedPrecision, X87REDUCEDPRECISION); ReducedPrecisionMode = ReducedPrecision; } void Run(IREmitter* Emit) override; private: + const FEXCore::HostFeatures& Features; bool ReducedPrecisionMode; // Helpers @@ -820,11 +823,16 @@ void X87StackOptimization::Run(IREmitter* Emit) { StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() - // For X87 extended doubles, split before storing - IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode); - auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1); - auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8)); - IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit); + if (Features.SupportsSVE128 || Features.SupportsSVE256) { + auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + } else { + // For X87 extended doubles, split before storing + IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode); + auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1); + auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8)); + IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit); + } } else { IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode); } @@ -1025,7 +1033,7 @@ void X87StackOptimization::Run(IREmitter* Emit) { return; } -fextl::unique_ptr CreateX87StackOptimizationPass() { - return fextl::make_unique(); +fextl::unique_ptr CreateX87StackOptimizationPass(const FEXCore::HostFeatures& Features) { + return fextl::make_unique(Features); } } // namespace FEXCore::IR From 0b1229da55c7efbf96714758be7602ed10bd5048 Mon Sep 17 00:00:00 2001 
From: Paulo Matos Date: Sun, 1 Dec 2024 12:52:13 +0100 Subject: [PATCH 3/5] instcountci: Generate SVE for 80bit load/stores when possible --- unittests/InstructionCountCI/X87ldst-SVE.json | 133 +++++++----------- 1 file changed, 50 insertions(+), 83 deletions(-) diff --git a/unittests/InstructionCountCI/X87ldst-SVE.json b/unittests/InstructionCountCI/X87ldst-SVE.json index 4b63bd77e8..d82b68d9b1 100644 --- a/unittests/InstructionCountCI/X87ldst-SVE.json +++ b/unittests/InstructionCountCI/X87ldst-SVE.json @@ -14,16 +14,14 @@ }, "Instructions": { "fstp tword [rax]": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 13, "Comment": "Single 80-bit store.", "ExpectedArm64ASM": [ "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x4]", - "mov x21, v2.d[1]", - "add x22, x4, #0x8 (8)", - "strh w21, [x22]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", "lsl w22, w22, w20", @@ -36,7 +34,7 @@ }, "2-store 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 29, + "ExpectedInstructionCount": 25, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]" @@ -45,10 +43,8 @@ "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x4]", - "mov x21, v2.d[1]", - "add x22, x4, #0x8 (8)", - "strh w21, [x22]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", "lsl w23, w22, w20", @@ -60,10 +56,8 @@ "add x21, x4, #0xa (10)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x21]", - "mov x23, v2.d[1]", - "add x21, x21, #0x8 (8)", - "strh w23, [x21]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w22, w22, w20", "bic w21, w21, w22", @@ -75,7 +69,7 @@ }, "8-store 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 113, + "ExpectedInstructionCount": 97, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]", @@ -90,10 +84,8 @@ "ldrb w20, [x28, 
#1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x4]", - "mov x21, v2.d[1]", - "add x22, x4, #0x8 (8)", - "strh w21, [x22]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", "lsl w23, w22, w20", @@ -105,10 +97,8 @@ "add x21, x4, #0xa (10)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x21]", - "mov x23, v2.d[1]", - "add x21, x21, #0x8 (8)", - "strh w23, [x21]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", "bic w21, w21, w23", @@ -119,10 +109,8 @@ "add x21, x4, #0x14 (20)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x21]", - "mov x23, v2.d[1]", - "add x21, x21, #0x8 (8)", - "strh w23, [x21]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", "bic w21, w21, w23", @@ -133,10 +121,8 @@ "add x21, x4, #0x1e (30)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x21]", - "mov x23, v2.d[1]", - "add x21, x21, #0x8 (8)", - "strh w23, [x21]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", "bic w21, w21, w23", @@ -147,10 +133,8 @@ "add x21, x4, #0x28 (40)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x21]", - "mov x23, v2.d[1]", - "add x21, x21, #0x8 (8)", - "strh w23, [x21]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", "bic w21, w21, w23", @@ -161,10 +145,8 @@ "add x21, x4, #0x32 (50)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x21]", - "mov x23, v2.d[1]", - "add x21, x21, #0x8 (8)", - "strh w23, [x21]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", "bic w21, w21, w23", @@ -175,10 +157,8 @@ "add x21, x4, #0x3c (60)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x21]", - "mov x23, v2.d[1]", - "add x21, x21, #0x8 (8)", - "strh w23, [x21]", + "ptrue p2.h, vl5", + 
"st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", "bic w21, w21, w23", @@ -189,10 +169,8 @@ "add x21, x4, #0x46 (70)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "str d2, [x21]", - "mov x23, v2.d[1]", - "add x21, x21, #0x8 (8)", - "strh w23, [x21]", + "ptrue p2.h, vl5", + "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w22, w22, w20", "bic w21, w21, w22", @@ -203,12 +181,11 @@ ] }, "fld tword [rax]": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 13, "Comment": "Single 80-bit store.", "ExpectedArm64ASM": [ - "ldr d2, [x4]", - "add x20, x4, #0x8 (8)", - "ld1 {v2.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z2.h}, p2/z, [x4]", "ldrb w20, [x28, #1019]", "mov w21, #0x1", "sub w20, w20, #0x1 (1)", @@ -224,19 +201,17 @@ }, "2-load 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 24, + "ExpectedInstructionCount": 22, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]" ], "ExpectedArm64ASM": [ - "ldr d2, [x4]", - "add x20, x4, #0x8 (8)", - "ld1 {v2.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ldr d3, [x20]", - "add x20, x20, #0x8 (8)", - "ld1 {v3.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z3.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x2 (2)", "and w20, w20, #0x7", @@ -258,7 +233,7 @@ }, "8-load 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 67, + "ExpectedInstructionCount": 59, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]", @@ -270,37 +245,29 @@ "fld tword [rax+70]" ], "ExpectedArm64ASM": [ - "ldr d2, [x4]", - "add x20, x4, #0x8 (8)", - "ld1 {v2.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ldr d3, [x20]", - "add x20, x20, #0x8 (8)", - "ld1 {v3.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z3.h}, p2/z, [x20]", "add x20, x4, #0x14 (20)", - "ldr d4, [x20]", - "add x20, x20, #0x8 (8)", - "ld1 {v4.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z4.h}, p2/z, 
[x20]", "add x20, x4, #0x1e (30)", - "ldr d5, [x20]", - "add x20, x20, #0x8 (8)", - "ld1 {v5.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z5.h}, p2/z, [x20]", "add x20, x4, #0x28 (40)", - "ldr d6, [x20]", - "add x20, x20, #0x8 (8)", - "ld1 {v6.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z6.h}, p2/z, [x20]", "add x20, x4, #0x32 (50)", - "ldr d7, [x20]", - "add x20, x20, #0x8 (8)", - "ld1 {v7.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z7.h}, p2/z, [x20]", "add x20, x4, #0x3c (60)", - "ldr d8, [x20]", - "add x20, x20, #0x8 (8)", - "ld1 {v8.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z8.h}, p2/z, [x20]", "add x20, x4, #0x46 (70)", - "ldr d9, [x20]", - "add x20, x20, #0x8 (8)", - "ld1 {v9.h}[4], [x20]", + "ptrue p2.h, vl5", + "ld1h {z9.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x8 (8)", "and w20, w20, #0x7", From 72a40636515aaa8986f500ce7f4c5cb546d6127f Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Tue, 3 Dec 2024 17:40:25 +0100 Subject: [PATCH 4/5] Cache predicate register generation from pattern --- FEXCore/Scripts/json_ir_generator.py | 2 +- .../Interface/Core/OpcodeDispatcher.cpp | 4 +-- .../Source/Interface/Core/OpcodeDispatcher.h | 3 ++ FEXCore/Source/Interface/IR/IREmitter.cpp | 1 + FEXCore/Source/Interface/IR/IREmitter.h | 34 ++++++++++++++++++- .../IR/Passes/x87StackOptimizationPass.cpp | 2 +- 6 files changed, 41 insertions(+), 5 deletions(-) diff --git a/FEXCore/Scripts/json_ir_generator.py b/FEXCore/Scripts/json_ir_generator.py index 425c966e85..4a53d074a7 100755 --- a/FEXCore/Scripts/json_ir_generator.py +++ b/FEXCore/Scripts/json_ir_generator.py @@ -223,7 +223,7 @@ def parse_ops(ops): (OpArg.Type == "GPR" or OpArg.Type == "GPRPair" or OpArg.Type == "FPR" or - OpArg.Type == "PR")): + OpArg.Type == "PRED")): OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class") OpArg.Name = ArgName diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp 
b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 37efb44008..bbd5498c43 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4312,7 +4312,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T Ref MemSrc = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { // Using SVE we can load this with a single instruction. - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); } else { // For X87 extended doubles, Split the load. @@ -4446,7 +4446,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); } else { // For X87 extended doubles, split before storing diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index ec6589c9c7..61036a2b57 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -125,6 +125,9 @@ class OpDispatchBuilder final : public IREmitter { // Need to clear any named constants that were cached. 
ClearCachedNamedConstants(); + + // Clear predicate cache for x87 ldst + ResetInitPredicateCache(); } IRPair Jump() { diff --git a/FEXCore/Source/Interface/IR/IREmitter.cpp b/FEXCore/Source/Interface/IR/IREmitter.cpp index 0850187b1c..95cb2e73dd 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.cpp +++ b/FEXCore/Source/Interface/IR/IREmitter.cpp @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) { case FPRClass: case GPRFixedClass: case FPRFixedClass: + case PREDClass: case InvalidClass: return Class; default: break; } diff --git a/FEXCore/Source/Interface/IR/IREmitter.h b/FEXCore/Source/Interface/IR/IREmitter.h index 0cfc4027be..c5af4efdd3 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.h +++ b/FEXCore/Source/Interface/IR/IREmitter.h @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT #pragma once +#include "CodeEmitter/Emitter.h" #include "Interface/IR/IR.h" #include "Interface/IR/IntrusiveIRList.h" @@ -9,9 +10,9 @@ #include #include +#include #include -#include #include #include @@ -45,6 +46,37 @@ class IREmitter { } void ResetWorkingList(); + // Predicate Cache Implementation + // This lives here rather than OpcodeDispatcher because x87StackOptimization Pass + // also needs it. 
+ struct PredicateKey { + ARMEmitter::PredicatePattern Pattern; + OpSize Size; + bool operator==(const PredicateKey& rhs) const = default; + }; + + struct PredicateKeyHash { + size_t operator()(const PredicateKey& key) const { + return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid)); + } + }; + fextl::unordered_map InitPredicateCache; + + Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) { + PredicateKey Key {Pattern, Size}; + auto ValIt = InitPredicateCache.find(Key); + if (ValIt == InitPredicateCache.end()) { + auto Predicate = _InitPredicate(Size, static_cast(FEXCore::ToUnderlying(Pattern))); + InitPredicateCache[Key] = Predicate; + return Predicate; + } + return ValIt->second; + } + + void ResetInitPredicateCache() { + InitPredicateCache.clear(); + } + /** * @name IR allocation routines * diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index a34b4fdd3a..9c45c769a0 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -824,7 +824,7 @@ void X87StackOptimization::Run(IREmitter* Emit) { } if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = IREmit->InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); } else { // For X87 extended doubles, split before storing From 8f8aa55c7f88aee940b7333fcd854a075e6c0b60 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Tue, 3 Dec 2024 17:41:18 +0100 Subject: [PATCH 5/5] instcountci: Cache predicate register generation from pattern 
--- unittests/InstructionCountCI/X87ldst-SVE.json | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/unittests/InstructionCountCI/X87ldst-SVE.json b/unittests/InstructionCountCI/X87ldst-SVE.json index d82b68d9b1..ec0abde349 100644 --- a/unittests/InstructionCountCI/X87ldst-SVE.json +++ b/unittests/InstructionCountCI/X87ldst-SVE.json @@ -34,7 +34,7 @@ }, "2-store 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 25, + "ExpectedInstructionCount": 24, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]" @@ -56,7 +56,6 @@ "add x21, x4, #0xa (10)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w22, w22, w20", @@ -69,7 +68,7 @@ }, "8-store 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 97, + "ExpectedInstructionCount": 90, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]", @@ -97,7 +96,6 @@ "add x21, x4, #0xa (10)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -109,7 +107,6 @@ "add x21, x4, #0x14 (20)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -121,7 +118,6 @@ "add x21, x4, #0x1e (30)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -133,7 +129,6 @@ "add x21, x4, #0x28 (40)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -145,7 +140,6 @@ "add x21, x4, #0x32 (50)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -157,7 +151,6 @@ "add x21, x4, #0x3c (60)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - 
"ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -169,7 +162,6 @@ "add x21, x4, #0x46 (70)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w22, w22, w20", @@ -201,7 +193,7 @@ }, "2-load 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 22, + "ExpectedInstructionCount": 21, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]" @@ -210,7 +202,6 @@ "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x2 (2)", @@ -233,7 +224,7 @@ }, "8-load 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 59, + "ExpectedInstructionCount": 52, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]", @@ -248,25 +239,18 @@ "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "add x20, x4, #0x14 (20)", - "ptrue p2.h, vl5", "ld1h {z4.h}, p2/z, [x20]", "add x20, x4, #0x1e (30)", - "ptrue p2.h, vl5", "ld1h {z5.h}, p2/z, [x20]", "add x20, x4, #0x28 (40)", - "ptrue p2.h, vl5", "ld1h {z6.h}, p2/z, [x20]", "add x20, x4, #0x32 (50)", - "ptrue p2.h, vl5", "ld1h {z7.h}, p2/z, [x20]", "add x20, x4, #0x3c (60)", - "ptrue p2.h, vl5", "ld1h {z8.h}, p2/z, [x20]", "add x20, x4, #0x46 (70)", - "ptrue p2.h, vl5", "ld1h {z9.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x8 (8)",