diff --git a/CMakeLists.txt b/CMakeLists.txt
index 485be291de..d4d62cb805 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,7 @@ endif()
# Renderer options.
option(ENABLE_OPENGL "Build with OpenGL renderer" ON)
option(ENABLE_VULKAN "Build with Vulkan renderer" ON)
+option(ENABLE_NEWREC "Build with experimental new dynarec (needed for RISC-V)" ON)
# Global options.
if(NOT ANDROID)
@@ -171,6 +172,9 @@ elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm" OR "${CMAKE_SYSTEM_PROCESSOR}"
endif()
elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "riscv64")
set(CPU_ARCH "riscv64")
+
+ # Not done automatically for us. Or should we inline atomics?
+ link_libraries("-latomic")
else()
message(FATAL_ERROR "Unknown system processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index d2e070ddb7..ee3cdf6072 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -121,6 +121,11 @@ set(RECOMPILER_SRCS
cpu_recompiler_types.h
)
+set(NEWREC_SOURCES
+ cpu_newrec_compiler.cpp
+ cpu_newrec_compiler.h
+)
+
target_precompile_headers(core PRIVATE "pch.h")
target_include_directories(core PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/..")
target_include_directories(core PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/..")
@@ -134,6 +139,15 @@ if(${CPU_ARCH} STREQUAL "x64")
cpu_recompiler_code_generator_x64.cpp
)
message("Building x64 recompiler")
+
+ if(ENABLE_NEWREC)
+ target_compile_definitions(core PUBLIC "ENABLE_NEWREC=1")
+ target_sources(core PRIVATE ${NEWREC_SOURCES}
+ cpu_newrec_compiler_x64.cpp
+ cpu_newrec_compiler_x64.h
+ )
+ message("Building x64 newrec")
+ endif()
elseif(${CPU_ARCH} STREQUAL "aarch32")
target_compile_definitions(core PUBLIC "ENABLE_RECOMPILER=1")
target_sources(core PRIVATE ${RECOMPILER_SRCS}
@@ -148,6 +162,25 @@ elseif(${CPU_ARCH} STREQUAL "aarch64")
)
target_link_libraries(core PUBLIC vixl)
message("Building AArch64 recompiler")
+ if(ENABLE_NEWREC)
+ target_compile_definitions(core PUBLIC "ENABLE_NEWREC=1")
+ target_sources(core PRIVATE ${NEWREC_SOURCES}
+ cpu_newrec_compiler_aarch64.cpp
+ cpu_newrec_compiler_aarch64.h
+ )
+ message("Building AArch64 newrec")
+ endif()
+elseif(${CPU_ARCH} STREQUAL "riscv64")
+ target_compile_definitions(core PUBLIC "ENABLE_MMAP_FASTMEM=1")
+ if(ENABLE_NEWREC)
+ target_compile_definitions(core PUBLIC "ENABLE_NEWREC=1")
+ target_sources(core PRIVATE ${NEWREC_SOURCES}
+ cpu_newrec_compiler_riscv64.cpp
+ cpu_newrec_compiler_riscv64.h
+ )
+ target_link_libraries(core PUBLIC biscuit::biscuit riscv-disas)
+ message("Building RISC-V 64-bit newrec")
+ endif()
else()
message("Not building recompiler")
endif()
diff --git a/src/core/core.props b/src/core/core.props
index a4ceaf16b2..9bd5357f4e 100644
--- a/src/core/core.props
+++ b/src/core/core.props
@@ -8,6 +8,7 @@
ENABLE_RAINTEGRATION=1;%(PreprocessorDefinitions)
ENABLE_RECOMPILER=1;%(PreprocessorDefinitions)
ENABLE_MMAP_FASTMEM=1;%(PreprocessorDefinitions)
+ ENABLE_NEWREC=1;%(PreprocessorDefinitions)
%(AdditionalIncludeDirectories);$(SolutionDir)dep\xxhash\include;$(SolutionDir)dep\zlib\include;$(SolutionDir)dep\rcheevos\include;$(SolutionDir)dep\rapidjson\include;$(SolutionDir)dep\discord-rpc\include
%(AdditionalIncludeDirectories);$(SolutionDir)dep\rainterface
diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj
index 6366658d7a..9846a201cf 100644
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@@ -13,6 +13,13 @@
+
+
+ true
+
+
+ true
+
true
@@ -90,6 +97,13 @@
+
+
+ true
+
+
+ true
+
true
diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters
index f0bd545d4d..e6dea09a4c 100644
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@@ -60,6 +60,9 @@
+
+
+
@@ -125,5 +128,8 @@
+
+
+
\ No newline at end of file
diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index 3edef3860f..8bf28e1963 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -21,6 +21,10 @@ Log_SetChannel(CPU::CodeCache);
#include "cpu_recompiler_code_generator.h"
#endif
+#ifdef ENABLE_NEWREC
+#include "cpu_newrec_compiler.h"
+#endif
+
#include
#include
@@ -144,7 +148,8 @@ static u32 s_total_host_instructions_emitted = 0;
bool CPU::CodeCache::IsUsingAnyRecompiler()
{
#ifdef ENABLE_RECOMPILER_SUPPORT
- return g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler;
+ return (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler ||
+ g_settings.cpu_execution_mode == CPUExecutionMode::NewRec);
#else
return false;
#endif
@@ -498,8 +503,8 @@ CPU::CodeCache::Block* CPU::CodeCache::CreateBlock(u32 pc, const BlockInstructio
return block;
}
- // TODO: Only used by NewRec for now, don't waste time filling it.
- if constexpr (false)
+ // Only the new recompiler uses backprop info, so don't waste time filling it otherwise.
+ if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec)
FillBlockRegInfo(block);
// add it to the tracking list for its page
@@ -1419,6 +1424,10 @@ bool CPU::CodeCache::CompileBlock(Block* block)
host_code = codegen.CompileBlock(block, &host_code_size, &host_far_code_size);
}
#endif
+#ifdef ENABLE_NEWREC
+ if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec)
+ host_code = NewRec::g_compiler->CompileBlock(block, &host_code_size, &host_far_code_size);
+#endif
s_code_buffer.WriteProtect(true);
@@ -1570,6 +1579,10 @@ void CPU::CodeCache::BackpatchLoadStore(void* host_pc, const LoadstoreBackpatchI
if (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler)
Recompiler::CodeGenerator::BackpatchLoadStore(host_pc, info);
#endif
+#ifdef ENABLE_NEWREC
+ if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec)
+ NewRec::BackpatchLoadStore(host_pc, info);
+#endif
s_code_buffer.WriteProtect(true);
}
diff --git a/src/core/cpu_code_cache_private.h b/src/core/cpu_code_cache_private.h
index f1392e0d88..341fde10f9 100644
--- a/src/core/cpu_code_cache_private.h
+++ b/src/core/cpu_code_cache_private.h
@@ -227,7 +227,7 @@ void InterpretUncachedBlock();
void LogCurrentState();
-#if defined(ENABLE_RECOMPILER)
+#if defined(ENABLE_RECOMPILER) || defined(ENABLE_NEWREC)
#define ENABLE_RECOMPILER_SUPPORT 1
#if defined(_DEBUG) || false
diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp
index 531453bdef..cedabfde0c 100644
--- a/src/core/cpu_core.cpp
+++ b/src/core/cpu_core.cpp
@@ -2231,6 +2231,7 @@ void CPU::Execute()
{
case CPUExecutionMode::Recompiler:
case CPUExecutionMode::CachedInterpreter:
+ case CPUExecutionMode::NewRec:
CodeCache::Execute();
break;
diff --git a/src/core/cpu_newrec_compiler.cpp b/src/core/cpu_newrec_compiler.cpp
new file mode 100644
index 0000000000..5a3fb9b429
--- /dev/null
+++ b/src/core/cpu_newrec_compiler.cpp
@@ -0,0 +1,2277 @@
+// SPDX-FileCopyrightText: 2023 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "cpu_newrec_compiler.h"
+#include "common/assert.h"
+#include "common/log.h"
+#include "common/small_string.h"
+#include "cpu_code_cache.h"
+#include "cpu_core_private.h"
+#include "cpu_disasm.h"
+#include "pgxp.h"
+#include "settings.h"
+#include <cstdint>
+#include <limits>
+Log_SetChannel(NewRec::Compiler);
+
+// TODO: direct link skip delay slot check
+// TODO: speculative constants
+// TODO: std::bitset in msvc has bounds checks even in release...
+
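+// PGXP handlers for memory accesses: loads are indexed by [size][is_signed], stores by access size only.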
+const std::array<std::array<const void*, 2>, 3> CPU::NewRec::Compiler::s_pgxp_mem_load_functions = {
+ {{{reinterpret_cast<const void*>(&PGXP::CPU_LBx), reinterpret_cast<const void*>(&PGXP::CPU_LBx)}},
+ {{reinterpret_cast<const void*>(&PGXP::CPU_LHU), reinterpret_cast<const void*>(&PGXP::CPU_LH)}},
+ {{reinterpret_cast<const void*>(&PGXP::CPU_LW)}}}};
+const std::array<const void*, 3> CPU::NewRec::Compiler::s_pgxp_mem_store_functions = {
+ {reinterpret_cast<const void*>(&PGXP::CPU_SB), reinterpret_cast<const void*>(&PGXP::CPU_SH),
+ reinterpret_cast<const void*>(&PGXP::CPU_SW)}};
+
+CPU::NewRec::Compiler::Compiler() = default;
+
+CPU::NewRec::Compiler::~Compiler() = default;
+
+void CPU::NewRec::Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
+ u32 far_code_space)
+{
+ m_block = block;
+ m_compiler_pc = block->pc;
+ m_cycles = 0;
+ m_gte_done_cycle = 0;
+ inst = nullptr;
+ iinfo = nullptr;
+ m_current_instruction_pc = 0;
+ m_current_instruction_branch_delay_slot = false;
+ m_dirty_pc = false;
+ m_dirty_instruction_bits = false;
+ m_dirty_gte_done_cycle = true;
+ m_block_ended = false;
+ m_constant_reg_values.fill(0);
+ m_constant_regs_valid.reset();
+ m_constant_regs_dirty.reset();
+
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ ClearHostReg(i);
+ m_register_alloc_counter = 0;
+
+ m_constant_reg_values[static_cast<u32>(Reg::zero)] = 0;
+ m_constant_regs_valid.set(static_cast<u32>(Reg::zero));
+
+ m_load_delay_dirty = EMULATE_LOAD_DELAYS;
+ m_load_delay_register = Reg::count;
+ m_load_delay_value_register = NUM_HOST_REGS;
+}
+
+void CPU::NewRec::Compiler::BeginBlock()
+{
+#if 0
+ GenerateCall(reinterpret_cast<const void*>(&CPU::CodeCache::LogCurrentState));
+#endif
+
+ if (m_block->protection == CodeCache::PageProtectionMode::ManualCheck)
+ {
+ Log_DebugPrintf("Generate manual protection for PC %08X", m_block->pc);
+ const u8* ram_ptr = Bus::g_ram + VirtualAddressToPhysical(m_block->pc);
+ const u8* shadow_ptr = reinterpret_cast<const u8*>(m_block->Instructions());
+ GenerateBlockProtectCheck(ram_ptr, shadow_ptr, m_block->size * sizeof(Instruction));
+ }
+
+ if (m_block->uncached_fetch_ticks > 0 || m_block->icache_line_count > 0)
+ GenerateICacheCheckAndUpdate();
+
+ if (g_settings.bios_tty_logging)
+ {
+ if (m_block->pc == 0xa0)
+ GenerateCall(reinterpret_cast<const void*>(&CPU::HandleA0Syscall));
+ else if (m_block->pc == 0xb0)
+ GenerateCall(reinterpret_cast<const void*>(&CPU::HandleB0Syscall));
+ }
+
+ inst = m_block->Instructions();
+ iinfo = m_block->InstructionsInfo();
+ m_current_instruction_pc = m_block->pc;
+ m_current_instruction_branch_delay_slot = false;
+ m_compiler_pc += sizeof(Instruction);
+ m_dirty_pc = true;
+ m_dirty_instruction_bits = true;
+}
+
+const void* CPU::NewRec::Compiler::CompileBlock(CodeCache::Block* block, u32* host_code_size, u32* host_far_code_size)
+{
+ JitCodeBuffer& buffer = CodeCache::GetCodeBuffer();
+ Reset(block, buffer.GetFreeCodePointer(), buffer.GetFreeCodeSpace(), buffer.GetFreeFarCodePointer(),
+ buffer.GetFreeFarCodeSpace());
+
+ Log_DebugPrintf("Block range: %08X -> %08X", block->pc, block->pc + block->size * 4);
+
+ BeginBlock();
+
+ for (;;)
+ {
+ CompileInstruction();
+
+ if (iinfo->is_last_instruction || m_block_ended)
+ {
+ if (!m_block_ended)
+ {
+ // Block was truncated. Link it.
+ EndBlock(m_compiler_pc, false);
+ }
+
+ break;
+ }
+
+ inst++;
+ iinfo++;
+ m_current_instruction_pc += sizeof(Instruction);
+ m_compiler_pc += sizeof(Instruction);
+ m_dirty_pc = true;
+ m_dirty_instruction_bits = true;
+ }
+
+ // Nothing should be valid anymore
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ DebugAssert(!IsHostRegAllocated(i));
+ for (u32 i = 1; i < static_cast<u32>(Reg::count); i++)
+ DebugAssert(!m_constant_regs_dirty.test(i) && !m_constant_regs_valid.test(i));
+
+ u32 code_size, far_code_size;
+ const void* code = EndCompile(&code_size, &far_code_size);
+ *host_code_size = code_size;
+ *host_far_code_size = far_code_size;
+ buffer.CommitCode(code_size);
+ buffer.CommitFarCode(far_code_size);
+
+ return code;
+}
+
+void CPU::NewRec::Compiler::SetConstantReg(Reg r, u32 v)
+{
+ DebugAssert(r < Reg::count && r != Reg::zero);
+
+ // There might still be an incoming load delay which we need to cancel.
+ CancelLoadDelaysToReg(r);
+
+ if (m_constant_regs_valid.test(static_cast<u32>(r)) && m_constant_reg_values[static_cast<u32>(r)] == v)
+ {
+ // Shouldn't be any host regs though.
+ DebugAssert(!CheckHostReg(0, HR_TYPE_CPU_REG, r).has_value());
+ return;
+ }
+
+ m_constant_reg_values[static_cast<u32>(r)] = v;
+ m_constant_regs_valid.set(static_cast<u32>(r));
+ m_constant_regs_dirty.set(static_cast<u32>(r));
+
+ if (const std::optional<u32> hostreg = CheckHostReg(0, HR_TYPE_CPU_REG, r); hostreg.has_value())
+ {
+ Log_DebugPrintf("Discarding guest register %s in host register %s due to constant set", GetRegName(r),
+ GetHostRegName(hostreg.value()));
+ FreeHostReg(hostreg.value());
+ }
+}
+
+void CPU::NewRec::Compiler::CancelLoadDelaysToReg(Reg reg)
+{
+ if (m_load_delay_register != reg)
+ return;
+
+ Log_DebugPrintf("Cancelling load delay to %s", GetRegName(reg));
+ m_load_delay_register = Reg::count;
+ if (m_load_delay_value_register != NUM_HOST_REGS)
+ ClearHostReg(m_load_delay_value_register);
+}
+
+void CPU::NewRec::Compiler::UpdateLoadDelay()
+{
+ if (m_load_delay_dirty)
+ {
+ // we shouldn't have a static load delay.
+ DebugAssert(!HasLoadDelay());
+
+ // have to invalidate registers, we might have one of them cached
+ // TODO: double check the order here, will we trash a new value? we shouldn't...
+ // thankfully since this only happens on the first instruction, we can get away with just killing anything which
+ // isn't in write mode, because nothing could've been written before it, and the new value overwrites any
+ // load-delayed value
+ Log_DebugPrintf("Invalidating non-dirty registers, and flushing load delay from state");
+
+ constexpr u32 req_flags = (HR_ALLOCATED | HR_MODE_WRITE);
+
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+ if (ra.type != HR_TYPE_CPU_REG || !IsHostRegAllocated(i) || ((ra.flags & req_flags) == req_flags))
+ continue;
+
+ Log_DebugPrintf("Freeing non-dirty cached register %s in %s", GetRegName(ra.reg), GetHostRegName(i));
+ DebugAssert(!(ra.flags & HR_MODE_WRITE));
+ ClearHostReg(i);
+ }
+
+ // remove any non-dirty constants too
+ for (u32 i = 1; i < static_cast<u32>(Reg::count); i++)
+ {
+ if (!HasConstantReg(static_cast<Reg>(i)) || HasDirtyConstantReg(static_cast<Reg>(i)))
+ continue;
+
+ Log_DebugPrintf("Clearing non-dirty constant %s", GetRegName(static_cast<Reg>(i)));
+ ClearConstantReg(static_cast<Reg>(i));
+ }
+
+ Flush(FLUSH_LOAD_DELAY_FROM_STATE);
+ }
+
+ // commit the delayed register load
+ FinishLoadDelay();
+
+ // move next load delay forward
+ if (m_next_load_delay_register != Reg::count)
+ {
+ // if it somehow got flushed, read it back in.
+ if (m_next_load_delay_value_register == NUM_HOST_REGS)
+ {
+ AllocateHostReg(HR_MODE_READ, HR_TYPE_NEXT_LOAD_DELAY_VALUE, m_next_load_delay_register);
+ DebugAssert(m_next_load_delay_value_register != NUM_HOST_REGS);
+ }
+
+ HostRegAlloc& ra = m_host_regs[m_next_load_delay_value_register];
+ ra.flags |= HR_MODE_WRITE;
+ ra.type = HR_TYPE_LOAD_DELAY_VALUE;
+
+ m_load_delay_register = m_next_load_delay_register;
+ m_load_delay_value_register = m_next_load_delay_value_register;
+ m_next_load_delay_register = Reg::count;
+ m_next_load_delay_value_register = NUM_HOST_REGS;
+ }
+}
+
+void CPU::NewRec::Compiler::FinishLoadDelay()
+{
+ DebugAssert(!m_load_delay_dirty);
+ if (!HasLoadDelay())
+ return;
+
+ // we may need to reload the value..
+ if (m_load_delay_value_register == NUM_HOST_REGS)
+ {
+ AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, m_load_delay_register);
+ DebugAssert(m_load_delay_value_register != NUM_HOST_REGS);
+ }
+
+ // kill any (old) cached value for this register
+ DeleteMIPSReg(m_load_delay_register, false);
+
+ Log_DebugPrintf("Finished delayed load to %s in host register %s", GetRegName(m_load_delay_register),
+ GetHostRegName(m_load_delay_value_register));
+
+ // and swap the mode over so it gets written back later
+ HostRegAlloc& ra = m_host_regs[m_load_delay_value_register];
+ DebugAssert(ra.reg == m_load_delay_register);
+ ra.flags = (ra.flags & IMMUTABLE_HR_FLAGS) | HR_ALLOCATED | HR_MODE_READ | HR_MODE_WRITE;
+ ra.counter = m_register_alloc_counter++;
+ ra.type = HR_TYPE_CPU_REG;
+
+ // constants are gone
+ Log_DebugPrintf("Clearing constant in %s due to load delay", GetRegName(m_load_delay_register));
+ ClearConstantReg(m_load_delay_register);
+
+ m_load_delay_register = Reg::count;
+ m_load_delay_value_register = NUM_HOST_REGS;
+}
+
+void CPU::NewRec::Compiler::FinishLoadDelayToReg(Reg reg)
+{
+ if (m_load_delay_dirty)
+ {
+ // inter-block :(
+ UpdateLoadDelay();
+ return;
+ }
+
+ if (m_load_delay_register != reg)
+ return;
+
+ FinishLoadDelay();
+}
+
+u32 CPU::NewRec::Compiler::GetFlagsForNewLoadDelayedReg() const
+{
+ return g_settings.gpu_pgxp_enable ? (HR_MODE_WRITE | HR_CALLEE_SAVED) : (HR_MODE_WRITE);
+}
+
+void CPU::NewRec::Compiler::ClearConstantReg(Reg r)
+{
+ DebugAssert(r < Reg::count && r != Reg::zero);
+ m_constant_reg_values[static_cast<u32>(r)] = 0;
+ m_constant_regs_valid.reset(static_cast<u32>(r));
+ m_constant_regs_dirty.reset(static_cast<u32>(r));
+}
+
+void CPU::NewRec::Compiler::FlushConstantRegs(bool invalidate)
+{
+ for (u32 i = 1; i < static_cast<u32>(Reg::count); i++)
+ {
+ if (m_constant_regs_dirty.test(static_cast<u32>(i)))
+ FlushConstantReg(static_cast<Reg>(i));
+ if (invalidate)
+ ClearConstantReg(static_cast<Reg>(i));
+ }
+}
+
+CPU::Reg CPU::NewRec::Compiler::MipsD() const
+{
+ return inst->r.rd;
+}
+
+u32 CPU::NewRec::Compiler::GetConditionalBranchTarget(CompileFlags cf) const
+{
+ // compiler pc has already been advanced when swapping branch delay slots
+ const u32 current_pc = m_compiler_pc - (cf.delay_slot_swapped ? sizeof(Instruction) : 0);
+ return current_pc + (inst->i.imm_sext32() << 2);
+}
+
+u32 CPU::NewRec::Compiler::GetBranchReturnAddress(CompileFlags cf) const
+{
+ // compiler pc has already been advanced when swapping branch delay slots
+ return m_compiler_pc + (cf.delay_slot_swapped ? 0 : sizeof(Instruction));
+}
+
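+// Attempts to compile the branch delay slot ahead of the branch itself so the branch can be emitted last.
+// Bails out whenever the slot instruction touches a register the branch depends on, or a load delay is pending.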
+bool CPU::NewRec::Compiler::TrySwapDelaySlot(Reg rs, Reg rt, Reg rd)
+{
+ if constexpr (!SWAP_BRANCH_DELAY_SLOTS)
+ return false;
+
+ const Instruction* next_instruction = inst + 1;
+ DebugAssert(next_instruction < (m_block->Instructions() + m_block->size));
+
+ const Reg opcode_rs = next_instruction->r.rs;
+ const Reg opcode_rt = next_instruction->r.rt;
+ const Reg opcode_rd = next_instruction->r.rd;
+
+#ifdef _DEBUG
+ TinyString disasm;
+ DisassembleInstruction(&disasm, m_current_instruction_pc + 4, next_instruction->bits);
+#endif
+
+ // Just in case we read it in the instruction.. but the block should end after this.
+ const Instruction* const backup_instruction = inst;
+ const u32 backup_instruction_pc = m_current_instruction_pc;
+ const bool backup_instruction_delay_slot = m_current_instruction_branch_delay_slot;
+
+ if (next_instruction->bits == 0)
+ {
+ // nop
+ goto is_safe;
+ }
+
+ // can't swap when the branch is the first instruction because of bloody load delays
+ if ((EMULATE_LOAD_DELAYS && m_block->pc == m_current_instruction_pc) || m_load_delay_dirty ||
+ (HasLoadDelay() && (m_load_delay_register == rs || m_load_delay_register == rt || m_load_delay_register == rd)))
+ {
+ goto is_unsafe;
+ }
+
+ switch (next_instruction->op)
+ {
+ case InstructionOp::addi:
+ case InstructionOp::addiu:
+ case InstructionOp::slti:
+ case InstructionOp::sltiu:
+ case InstructionOp::andi:
+ case InstructionOp::ori:
+ case InstructionOp::xori:
+ case InstructionOp::lui:
+ case InstructionOp::lb:
+ case InstructionOp::lh:
+ case InstructionOp::lwl:
+ case InstructionOp::lw:
+ case InstructionOp::lbu:
+ case InstructionOp::lhu:
+ case InstructionOp::lwr:
+ case InstructionOp::sb:
+ case InstructionOp::sh:
+ case InstructionOp::swl:
+ case InstructionOp::sw:
+ case InstructionOp::swr:
+ {
+ if ((rs != Reg::zero && rs == opcode_rt) || (rt != Reg::zero && rt == opcode_rt) ||
+ (rd != Reg::zero && (rd == opcode_rs || rd == opcode_rt)) ||
+ (HasLoadDelay() && (m_load_delay_register == opcode_rs || m_load_delay_register == opcode_rt)))
+ {
+ goto is_unsafe;
+ }
+ }
+ break;
+
+ case InstructionOp::lwc2: // LWC2
+ case InstructionOp::swc2: // SWC2
+ break;
+
+ case InstructionOp::funct: // SPECIAL
+ {
+ switch (next_instruction->r.funct)
+ {
+ case InstructionFunct::sll:
+ case InstructionFunct::srl:
+ case InstructionFunct::sra:
+ case InstructionFunct::sllv:
+ case InstructionFunct::srlv:
+ case InstructionFunct::srav:
+ case InstructionFunct::add:
+ case InstructionFunct::addu:
+ case InstructionFunct::sub:
+ case InstructionFunct::subu:
+ case InstructionFunct::and_:
+ case InstructionFunct::or_:
+ case InstructionFunct::xor_:
+ case InstructionFunct::nor:
+ case InstructionFunct::slt:
+ case InstructionFunct::sltu:
+ {
+ if ((rs != Reg::zero && rs == opcode_rd) || (rt != Reg::zero && rt == opcode_rd) ||
+ (rd != Reg::zero && (rd == opcode_rs || rd == opcode_rt)) ||
+ (HasLoadDelay() && (m_load_delay_register == opcode_rs || m_load_delay_register == opcode_rt ||
+ m_load_delay_register == opcode_rd)))
+ {
+ goto is_unsafe;
+ }
+ }
+ break;
+
+ case InstructionFunct::mult:
+ case InstructionFunct::multu:
+ case InstructionFunct::div:
+ case InstructionFunct::divu:
+ {
+ if (HasLoadDelay() && (m_load_delay_register == opcode_rs || m_load_delay_register == opcode_rt))
+ goto is_unsafe;
+ }
+ break;
+
+ default:
+ goto is_unsafe;
+ }
+ }
+ break;
+
+ case InstructionOp::cop0: // COP0
+ case InstructionOp::cop1: // COP1
+ case InstructionOp::cop2: // COP2
+ case InstructionOp::cop3: // COP3
+ {
+ if (next_instruction->cop.IsCommonInstruction())
+ {
+ switch (next_instruction->cop.CommonOp())
+ {
+ case CopCommonInstruction::mfcn: // MFC0
+ case CopCommonInstruction::cfcn: // CFC0
+ {
+ if ((rs != Reg::zero && rs == opcode_rt) || (rt != Reg::zero && rt == opcode_rt) ||
+ (rd != Reg::zero && rd == opcode_rt) || (HasLoadDelay() && m_load_delay_register == opcode_rt))
+ {
+ goto is_unsafe;
+ }
+ }
+ break;
+
+ case CopCommonInstruction::mtcn: // MTC0
+ case CopCommonInstruction::ctcn: // CTC0
+ break;
+ }
+ }
+ else
+ {
+ // swap when it's GTE
+ if (next_instruction->op != InstructionOp::cop2)
+ goto is_unsafe;
+ }
+ }
+ break;
+
+ default:
+ goto is_unsafe;
+ }
+
+is_safe:
+#ifdef _DEBUG
+ Log_DevFmt("Swapping delay slot {:08X} {}", m_current_instruction_pc + 4, disasm);
+#endif
+
+ CompileBranchDelaySlot();
+
+ inst = backup_instruction;
+ m_current_instruction_pc = backup_instruction_pc;
+ m_current_instruction_branch_delay_slot = backup_instruction_delay_slot;
+ return true;
+
+is_unsafe:
+#ifdef _DEBUG
+ Log_DevFmt("NOT swapping delay slot {:08X} {}", m_current_instruction_pc + 4, disasm);
+#endif
+
+ return false;
+}
+
+void CPU::NewRec::Compiler::SetCompilerPC(u32 newpc)
+{
+ m_compiler_pc = newpc;
+ m_dirty_pc = true;
+}
+
+u32 CPU::NewRec::Compiler::GetFreeHostReg(u32 flags)
+{
+ const u32 req_flags = HR_USABLE | (flags & HR_CALLEE_SAVED);
+
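+ // First pass: prefer a usable register that is neither needed nor allocated (honouring the callee-saved request).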
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ if ((m_host_regs[i].flags & (req_flags | HR_NEEDED | HR_ALLOCATED)) == req_flags)
+ return i;
+ }
+
+ // find register with lowest counter
+ u32 lowest = NUM_HOST_REGS;
+ u16 lowest_count = std::numeric_limits<u16>::max();
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ const HostRegAlloc& ra = m_host_regs[i];
+ if ((ra.flags & (req_flags | HR_NEEDED)) != req_flags)
+ continue;
+
+ DebugAssert(ra.flags & HR_ALLOCATED);
+ if (ra.type == HR_TYPE_TEMP)
+ {
+ // can't punt temps
+ continue;
+ }
+
+ if (ra.counter < lowest_count)
+ {
+ lowest = i;
+ lowest_count = ra.counter;
+ }
+ }
+
+
+ AssertMsg(lowest != NUM_HOST_REGS, "Register allocation failed.");
+
+ const HostRegAlloc& ra = m_host_regs[lowest];
+ switch (ra.type)
+ {
+ case HR_TYPE_CPU_REG:
+ {
+ // If the register is needed later, and we're allocating a callee-saved register, try moving it to a caller-saved
+ // register.
+ if (iinfo->UsedTest(ra.reg) && flags & HR_CALLEE_SAVED)
+ {
+ u32 caller_saved_lowest = NUM_HOST_REGS;
+ u16 caller_saved_lowest_count = std::numeric_limits<u16>::max();
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ constexpr u32 caller_req_flags = HR_USABLE;
+ constexpr u32 caller_req_mask = HR_USABLE | HR_NEEDED | HR_CALLEE_SAVED;
+ const HostRegAlloc& caller_ra = m_host_regs[i];
+ if ((caller_ra.flags & caller_req_mask) != caller_req_flags)
+ continue;
+
+ if (!(caller_ra.flags & HR_ALLOCATED))
+ {
+ caller_saved_lowest = i;
+ caller_saved_lowest_count = 0;
+ break;
+ }
+
+ if (caller_ra.type == HR_TYPE_TEMP)
+ continue;
+
+ if (caller_ra.counter < caller_saved_lowest_count)
+ {
+ caller_saved_lowest = i;
+ caller_saved_lowest_count = caller_ra.counter;
+ }
+ }
+
+ if (caller_saved_lowest_count < lowest_count)
+ {
+ Log_DebugPrintf("Moving caller-saved host register %s with MIPS register %s to %s for allocation",
+ GetHostRegName(lowest), GetRegName(ra.reg), GetHostRegName(caller_saved_lowest));
+ if (IsHostRegAllocated(caller_saved_lowest))
+ FreeHostReg(caller_saved_lowest);
+ CopyHostReg(caller_saved_lowest, lowest);
+ SwapHostRegAlloc(caller_saved_lowest, lowest);
+ DebugAssert(!IsHostRegAllocated(lowest));
+ return lowest;
+ }
+ }
+
+ Log_DebugPrintf("Freeing register %s in host register %s for allocation", GetHostRegName(lowest),
+ GetRegName(ra.reg));
+ }
+ break;
+ case HR_TYPE_LOAD_DELAY_VALUE:
+ {
+ Log_DebugPrintf("Freeing load delay register %s in host register %s for allocation", GetHostRegName(lowest),
+ GetRegName(ra.reg));
+ }
+ break;
+ case HR_TYPE_NEXT_LOAD_DELAY_VALUE:
+ {
+ Log_DebugPrintf("Freeing next load delay register %s in host register %s due for allocation",
+ GetHostRegName(lowest), GetRegName(ra.reg));
+ }
+ break;
+ default:
+ {
+ Panic("Unknown type freed");
+ }
+ break;
+ }
+
+ FreeHostReg(lowest);
+ return lowest;
+}
+
+const char* CPU::NewRec::Compiler::GetReadWriteModeString(u32 flags)
+{
+ if ((flags & (HR_MODE_READ | HR_MODE_WRITE)) == (HR_MODE_READ | HR_MODE_WRITE))
+ return "read-write";
+ else if (flags & HR_MODE_READ)
+ return "read-only";
+ else if (flags & HR_MODE_WRITE)
+ return "write-only";
+ else
+ return "UNKNOWN";
+}
+
+u32 CPU::NewRec::Compiler::AllocateHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */,
+ Reg reg /* = Reg::count */)
+{
+ // Cancel any load delays before booting anything out
+ if (flags & HR_MODE_WRITE && (type == HR_TYPE_CPU_REG || type == HR_TYPE_NEXT_LOAD_DELAY_VALUE))
+ CancelLoadDelaysToReg(reg);
+
+ // Already have a matching type?
+ if (type != HR_TYPE_TEMP)
+ {
+ const std::optional<u32> check_reg = CheckHostReg(flags, type, reg);
+
+ // shouldn't be allocating >1 load delay in a single instruction..
+ // TODO: prefer callee saved registers for load delay
+ DebugAssert((type != HR_TYPE_LOAD_DELAY_VALUE && type != HR_TYPE_NEXT_LOAD_DELAY_VALUE) || !check_reg.has_value());
+ if (check_reg.has_value())
+ return check_reg.value();
+ }
+
+ const u32 hreg = GetFreeHostReg(flags);
+ HostRegAlloc& ra = m_host_regs[hreg];
+ ra.flags = (ra.flags & IMMUTABLE_HR_FLAGS) | (flags & ALLOWED_HR_FLAGS) | HR_ALLOCATED | HR_NEEDED;
+ ra.type = type;
+ ra.reg = reg;
+ ra.counter = m_register_alloc_counter++;
+
+ switch (type)
+ {
+ case HR_TYPE_CPU_REG:
+ {
+ DebugAssert(reg != Reg::zero);
+
+ Log_DebugPrintf("Allocate host reg %s to guest reg %s in %s mode", GetHostRegName(hreg), GetRegName(reg),
+ GetReadWriteModeString(flags));
+
+ if (flags & HR_MODE_READ)
+ {
+ DebugAssert(ra.reg > Reg::zero && ra.reg < Reg::count);
+
+ if (HasConstantReg(reg))
+ {
+ // may as well flush it now
+ Log_DebugPrintf("Flush constant register in guest reg %s to host reg %s", GetRegName(reg),
+ GetHostRegName(hreg));
+ LoadHostRegWithConstant(hreg, GetConstantRegU32(reg));
+ m_constant_regs_dirty.reset(static_cast<u32>(reg));
+ ra.flags |= HR_MODE_WRITE;
+ }
+ else
+ {
+ LoadHostRegFromCPUPointer(hreg, &g_state.regs.r[static_cast<u32>(reg)]);
+ }
+ }
+
+ if (flags & HR_MODE_WRITE && HasConstantReg(reg))
+ {
+ DebugAssert(reg != Reg::zero);
+ Log_DebugPrintf("Clearing constant register in guest reg %s due to write mode in %s", GetRegName(reg),
+ GetHostRegName(hreg));
+
+ ClearConstantReg(reg);
+ }
+ }
+ break;
+
+ case HR_TYPE_LOAD_DELAY_VALUE:
+ {
+ DebugAssert(!m_load_delay_dirty && (!HasLoadDelay() || !(flags & HR_MODE_WRITE)));
+ Log_DebugPrintf("Allocating load delayed guest register %s in host reg %s in %s mode", GetRegName(reg),
+ GetHostRegName(hreg), GetReadWriteModeString(flags));
+ m_load_delay_register = reg;
+ m_load_delay_value_register = hreg;
+ if (flags & HR_MODE_READ)
+ LoadHostRegFromCPUPointer(hreg, &g_state.load_delay_value);
+ }
+ break;
+
+ case HR_TYPE_NEXT_LOAD_DELAY_VALUE:
+ {
+ Log_DebugPrintf("Allocating next load delayed guest register %s in host reg %s in %s mode", GetRegName(reg),
+ GetHostRegName(hreg), GetReadWriteModeString(flags));
+ m_next_load_delay_register = reg;
+ m_next_load_delay_value_register = hreg;
+ if (flags & HR_MODE_READ)
+ LoadHostRegFromCPUPointer(hreg, &g_state.next_load_delay_value);
+ }
+ break;
+
+ case HR_TYPE_TEMP:
+ {
+ DebugAssert(!(flags & (HR_MODE_READ | HR_MODE_WRITE)));
+ Log_DebugPrintf("Allocate host reg %s as temporary", GetHostRegName(hreg));
+ }
+ break;
+
+ default:
+ Panic("Unknown type");
+ break;
+ }
+
+ return hreg;
+}
+
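+// Returns the host register already caching (type, reg) after upgrading its mode/needed flags, moving it to a
+// callee-saved register if the caller asked for one; returns std::nullopt if the value isn't currently cached.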
+std::optional<u32> CPU::NewRec::Compiler::CheckHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */,
+ Reg reg /* = Reg::count */)
+{
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+ if (!(ra.flags & HR_ALLOCATED) || ra.type != type || ra.reg != reg)
+ continue;
+
+ DebugAssert(ra.flags & HR_MODE_READ);
+ if (flags & HR_MODE_WRITE)
+ {
+ DebugAssert(type == HR_TYPE_CPU_REG);
+ if (!(ra.flags & HR_MODE_WRITE))
+ {
+ Log_DebugPrintf("Switch guest reg %s from read to read-write in host reg %s", GetRegName(reg),
+ GetHostRegName(i));
+ }
+
+ if (HasConstantReg(reg))
+ {
+ DebugAssert(reg != Reg::zero);
+ Log_DebugPrintf("Clearing constant register in guest reg %s due to write mode in %s", GetRegName(reg),
+ GetHostRegName(i));
+
+ ClearConstantReg(reg);
+ }
+ }
+
+ ra.flags |= (flags & ALLOWED_HR_FLAGS) | HR_NEEDED;
+ ra.counter = m_register_alloc_counter++;
+
+ // Need a callee saved reg?
+ if (flags & HR_CALLEE_SAVED && !(ra.flags & HR_CALLEE_SAVED))
+ {
+ // Need to move it to one which is
+ const u32 new_reg = GetFreeHostReg(HR_CALLEE_SAVED);
+ Log_DebugPrintf("Rename host reg %s to %s for callee saved", GetHostRegName(i), GetHostRegName(new_reg));
+
+ CopyHostReg(new_reg, i);
+ SwapHostRegAlloc(i, new_reg);
+ DebugAssert(!IsHostRegAllocated(i));
+ return new_reg;
+ }
+
+ return i;
+ }
+
+ return std::nullopt;
+}
+
+u32 CPU::NewRec::Compiler::AllocateTempHostReg(u32 flags)
+{
+ return AllocateHostReg(flags, HR_TYPE_TEMP);
+}
+
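+// Swaps the allocation bookkeeping of two host registers, leaving each register's immutable flags in place.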
+void CPU::NewRec::Compiler::SwapHostRegAlloc(u32 lhs, u32 rhs)
+{
+ HostRegAlloc& lra = m_host_regs[lhs];
+ HostRegAlloc& rra = m_host_regs[rhs];
+
+ const u8 lra_flags = lra.flags;
+ lra.flags = (lra.flags & IMMUTABLE_HR_FLAGS) | (rra.flags & ~IMMUTABLE_HR_FLAGS);
+ rra.flags = (rra.flags & IMMUTABLE_HR_FLAGS) | (lra_flags & ~IMMUTABLE_HR_FLAGS);
+ std::swap(lra.type, rra.type);
+ std::swap(lra.reg, rra.reg);
+ std::swap(lra.counter, rra.counter);
+}
+
+void CPU::NewRec::Compiler::FlushHostReg(u32 reg)
+{
+ HostRegAlloc& ra = m_host_regs[reg];
+ if (ra.flags & HR_MODE_WRITE)
+ {
+ switch (ra.type)
+ {
+ case HR_TYPE_CPU_REG:
+ {
+ DebugAssert(ra.reg > Reg::zero && ra.reg < Reg::count);
+ Log_DebugPrintf("Flushing register %s in host register %s to state", GetRegName(ra.reg), GetHostRegName(reg));
+ StoreHostRegToCPUPointer(reg, &g_state.regs.r[static_cast<u32>(ra.reg)]);
+ }
+ break;
+
+ case HR_TYPE_LOAD_DELAY_VALUE:
+ {
+ DebugAssert(m_load_delay_value_register == reg);
+ Log_DebugPrintf("Flushing load delayed register %s in host register %s to state", GetRegName(ra.reg),
+ GetHostRegName(reg));
+
+ StoreHostRegToCPUPointer(reg, &g_state.load_delay_value);
+ m_load_delay_value_register = NUM_HOST_REGS;
+ }
+ break;
+
+ case HR_TYPE_NEXT_LOAD_DELAY_VALUE:
+ {
+ DebugAssert(m_next_load_delay_value_register == reg);
+ Log_WarningPrintf("Flushing NEXT load delayed register %s in host register %s to state", GetRegName(ra.reg),
+ GetHostRegName(reg));
+
+ StoreHostRegToCPUPointer(reg, &g_state.next_load_delay_value);
+ m_next_load_delay_value_register = NUM_HOST_REGS;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ ra.flags = (ra.flags & ~HR_MODE_WRITE) | HR_MODE_READ;
+ }
+}
+
+void CPU::NewRec::Compiler::FreeHostReg(u32 reg)
+{
+ DebugAssert(IsHostRegAllocated(reg));
+ FlushHostReg(reg);
+ ClearHostReg(reg);
+}
+
+void CPU::NewRec::Compiler::ClearHostReg(u32 reg)
+{
+ HostRegAlloc& ra = m_host_regs[reg];
+ ra.flags &= IMMUTABLE_HR_FLAGS;
+ ra.type = HR_TYPE_TEMP;
+ ra.counter = 0;
+ ra.reg = Reg::count;
+}
+
+void CPU::NewRec::Compiler::MarkRegsNeeded(HostRegAllocType type, Reg reg)
+{
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+ if (ra.flags & HR_ALLOCATED && ra.type == type && ra.reg == reg)
+ ra.flags |= HR_NEEDED;
+ }
+}
+
+void CPU::NewRec::Compiler::RenameHostReg(u32 reg, u32 new_flags, HostRegAllocType new_type, Reg new_reg)
+{
+ // only supported for cpu regs for now
+ DebugAssert(new_type == HR_TYPE_TEMP || new_type == HR_TYPE_CPU_REG || new_type == HR_TYPE_NEXT_LOAD_DELAY_VALUE);
+
+ const std::optional<u32> old_reg = CheckHostReg(0, new_type, new_reg);
+ if (old_reg.has_value())
+ {
+ // don't writeback
+ ClearHostReg(old_reg.value());
+ }
+
+ // kill any load delay to this reg
+ if (new_type == HR_TYPE_CPU_REG || new_type == HR_TYPE_NEXT_LOAD_DELAY_VALUE)
+ CancelLoadDelaysToReg(new_reg);
+
+ if (new_type == HR_TYPE_CPU_REG)
+ {
+ Log_DebugPrintf("Renaming host reg %s to guest reg %s", GetHostRegName(reg), GetRegName(new_reg));
+ }
+ else if (new_type == HR_TYPE_NEXT_LOAD_DELAY_VALUE)
+ {
+ Log_DebugPrintf("Renaming host reg %s to load delayed guest reg %s", GetHostRegName(reg), GetRegName(new_reg));
+ DebugAssert(m_next_load_delay_register == Reg::count && m_next_load_delay_value_register == NUM_HOST_REGS);
+ m_next_load_delay_register = new_reg;
+ m_next_load_delay_value_register = reg;
+ }
+ else
+ {
+ Log_DebugPrintf("Renaming host reg %s to temp", GetHostRegName(reg));
+ }
+
+ HostRegAlloc& ra = m_host_regs[reg];
+ ra.flags = (ra.flags & IMMUTABLE_HR_FLAGS) | HR_NEEDED | HR_ALLOCATED | (new_flags & ALLOWED_HR_FLAGS);
+ ra.counter = m_register_alloc_counter++;
+ ra.type = new_type;
+ ra.reg = new_reg;
+}
+
+void CPU::NewRec::Compiler::ClearHostRegNeeded(u32 reg)
+{
+ DebugAssert(reg < NUM_HOST_REGS && IsHostRegAllocated(reg));
+ HostRegAlloc& ra = m_host_regs[reg];
+ if (ra.flags & HR_MODE_WRITE)
+ ra.flags |= HR_MODE_READ;
+
+ ra.flags &= ~HR_NEEDED;
+}
+
+void CPU::NewRec::Compiler::ClearHostRegsNeeded()
+{
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+ if (!(ra.flags & HR_ALLOCATED))
+ continue;
+
+ // shouldn't have any temps left
+ DebugAssert(ra.type != HR_TYPE_TEMP);
+
+ if (ra.flags & HR_MODE_WRITE)
+ ra.flags |= HR_MODE_READ;
+
+ ra.flags &= ~HR_NEEDED;
+ }
+}
+
+void CPU::NewRec::Compiler::DeleteMIPSReg(Reg reg, bool flush)
+{
+ DebugAssert(reg != Reg::zero);
+
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+ if (ra.flags & HR_ALLOCATED && ra.type == HR_TYPE_CPU_REG && ra.reg == reg)
+ {
+ if (flush)
+ FlushHostReg(i);
+ ClearHostReg(i);
+ ClearConstantReg(reg);
+ return;
+ }
+ }
+
+ if (flush)
+ FlushConstantReg(reg);
+ ClearConstantReg(reg);
+}
+
+bool CPU::NewRec::Compiler::TryRenameMIPSReg(Reg to, Reg from, u32 fromhost, Reg other)
+{
+ // can't rename when in form Rd = Rs op Rt and Rd == Rs or Rd == Rt
+ if (to == from || to == other || !iinfo->RenameTest(from))
+ return false;
+
+ Log_DebugPrintf("Renaming MIPS register %s to %s", GetRegName(from), GetRegName(to));
+
+ if (iinfo->LiveTest(from))
+ FlushHostReg(fromhost);
+
+ // remove all references to renamed-to register
+ DeleteMIPSReg(to, false);
+
+ // and do the actual rename, new register has been modified.
+ m_host_regs[fromhost].reg = to;
+ m_host_regs[fromhost].flags |= HR_MODE_READ | HR_MODE_WRITE;
+ return true;
+}
+
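+// Re-ranks allocated registers by how soon their guest register is next read: soon-to-be-read registers keep a
+// high counter and survive longer in GetFreeHostReg(), unused ones drop to zero and are evicted first.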
+void CPU::NewRec::Compiler::UpdateHostRegCounters()
+{
+ const CodeCache::InstructionInfo* const info_end = m_block->InstructionsInfo() + m_block->size;
+
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+ if ((ra.flags & (HR_ALLOCATED | HR_NEEDED)) != HR_ALLOCATED)
+ continue;
+
+ // Try not to punt out load delays.
+ if (ra.type != HR_TYPE_CPU_REG)
+ {
+ ra.counter = std::numeric_limits<u16>::max();
+ continue;
+ }
+
+ DebugAssert(IsHostRegAllocated(i));
+ const CodeCache::InstructionInfo* cur = iinfo;
+ const Reg reg = ra.reg;
+ if (!(cur->reg_flags[static_cast<u32>(reg)] & CodeCache::RI_USED))
+ {
+ ra.counter = 0;
+ continue;
+ }
+
+ // order based on the number of instructions until this register is used
+ u16 counter_val = std::numeric_limits<u16>::max();
+ for (; cur != info_end; cur++, counter_val--)
+ {
+ if (cur->ReadsReg(reg))
+ break;
+ }
+
+ ra.counter = counter_val;
+ }
+}
+
+void CPU::NewRec::Compiler::Flush(u32 flags)
+{
+ // TODO: Flush unneeded caller-saved regs (backup/replace needed callee-saved regs with caller-saved)
+ if (flags &
+ (FLUSH_FREE_UNNEEDED_CALLER_SAVED_REGISTERS | FLUSH_FREE_CALLER_SAVED_REGISTERS | FLUSH_FREE_ALL_REGISTERS))
+ {
+ const u32 req_mask = (flags & FLUSH_FREE_ALL_REGISTERS) ?
+ HR_ALLOCATED :
+ ((flags & FLUSH_FREE_CALLER_SAVED_REGISTERS) ? (HR_ALLOCATED | HR_CALLEE_SAVED) :
+ (HR_ALLOCATED | HR_CALLEE_SAVED | HR_NEEDED));
+ constexpr u32 req_flags = HR_ALLOCATED;
+
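+ // A register is freed only when it is allocated and has none of the other req_mask bits set, so the mask
+ // above keeps callee-saved (and, for the unneeded-only variant, still-needed) registers alive.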
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+ if ((ra.flags & req_mask) == req_flags)
+ FreeHostReg(i);
+ }
+ }
+
+ if (flags & FLUSH_INVALIDATE_MIPS_REGISTERS)
+ {
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+ if (ra.flags & HR_ALLOCATED && ra.type == HR_TYPE_CPU_REG)
+ FreeHostReg(i);
+ }
+
+ FlushConstantRegs(true);
+ }
+ else
+ {
+ if (flags & FLUSH_FLUSH_MIPS_REGISTERS)
+ {
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+ if ((ra.flags & (HR_ALLOCATED | HR_MODE_WRITE)) == (HR_ALLOCATED | HR_MODE_WRITE) && ra.type == HR_TYPE_CPU_REG)
+ FlushHostReg(i);
+ }
+
+ // flush any constant registers which are dirty too
+ FlushConstantRegs(false);
+ }
+ }
+}
+
+void CPU::NewRec::Compiler::FlushConstantReg(Reg r)
+{
+ DebugAssert(m_constant_regs_valid.test(static_cast<u32>(r)));
+ Log_DebugPrintf("Writing back register %s with constant value 0x%08X", GetRegName(r),
+ m_constant_reg_values[static_cast<u32>(r)]);
+ StoreConstantToCPUPointer(m_constant_reg_values[static_cast<u32>(r)], &g_state.regs.r[static_cast<u32>(r)]);
+ m_constant_regs_dirty.reset(static_cast<u32>(r));
+}
+
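+// Pushes a complete snapshot of the compiler's register, constant and load-delay state onto a small stack so
+// it can later be rewound with RestoreHostState().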
+void CPU::NewRec::Compiler::BackupHostState()
+{
+ DebugAssert(m_host_state_backup_count < m_host_state_backup.size());
+
+ // need to back up everything...
+ HostStateBackup& bu = m_host_state_backup[m_host_state_backup_count];
+ bu.cycles = m_cycles;
+ bu.gte_done_cycle = m_gte_done_cycle;
+ bu.compiler_pc = m_compiler_pc;
+ bu.dirty_pc = m_dirty_pc;
+ bu.dirty_instruction_bits = m_dirty_instruction_bits;
+ bu.dirty_gte_done_cycle = m_dirty_gte_done_cycle;
+ bu.block_ended = m_block_ended;
+ bu.inst = inst;
+ bu.current_instruction_pc = m_current_instruction_pc;
+ bu.current_instruction_delay_slot = m_current_instruction_branch_delay_slot;
+ bu.const_regs_valid = m_constant_regs_valid;
+ bu.const_regs_dirty = m_constant_regs_dirty;
+ bu.const_regs_values = m_constant_reg_values;
+ bu.host_regs = m_host_regs;
+ bu.register_alloc_counter = m_register_alloc_counter;
+ bu.load_delay_dirty = m_load_delay_dirty;
+ bu.load_delay_register = m_load_delay_register;
+ bu.load_delay_value_register = m_load_delay_value_register;
+ bu.next_load_delay_register = m_next_load_delay_register;
+ bu.next_load_delay_value_register = m_next_load_delay_value_register;
+ m_host_state_backup_count++;
+}
+
+void CPU::NewRec::Compiler::RestoreHostState()
+{
+ DebugAssert(m_host_state_backup_count > 0);
+ m_host_state_backup_count--;
+
+ HostStateBackup& bu = m_host_state_backup[m_host_state_backup_count];
+ m_host_regs = std::move(bu.host_regs);
+ m_constant_reg_values = std::move(bu.const_regs_values);
+ m_constant_regs_dirty = std::move(bu.const_regs_dirty);
+ m_constant_regs_valid = std::move(bu.const_regs_valid);
+ m_current_instruction_branch_delay_slot = bu.current_instruction_delay_slot;
+ m_current_instruction_pc = bu.current_instruction_pc;
+ inst = bu.inst;
+ m_block_ended = bu.block_ended;
+ m_dirty_gte_done_cycle = bu.dirty_gte_done_cycle;
+ m_dirty_instruction_bits = bu.dirty_instruction_bits;
+ m_dirty_pc = bu.dirty_pc;
+ m_compiler_pc = bu.compiler_pc;
+ m_register_alloc_counter = bu.register_alloc_counter;
+ m_load_delay_dirty = bu.load_delay_dirty;
+ m_load_delay_register = bu.load_delay_register;
+ m_load_delay_value_register = bu.load_delay_value_register;
+ m_next_load_delay_register = bu.next_load_delay_register;
+ m_next_load_delay_value_register = bu.next_load_delay_value_register;
+ m_gte_done_cycle = bu.gte_done_cycle;
+ m_cycles = bu.cycles;
+}
+
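+// Records everything needed to backpatch this fastmem access if it faults: the live host register bitmask,
+// the address/data registers, the access size/signedness/direction, and the guest PC and cycle count.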
+void CPU::NewRec::Compiler::AddLoadStoreInfo(void* code_address, u32 code_size, u32 address_register, u32 data_register,
+ MemoryAccessSize size, bool is_signed, bool is_load)
+{
+ DebugAssert(CodeCache::IsUsingFastmem());
+ DebugAssert(address_register < NUM_HOST_REGS);
+ DebugAssert(data_register < NUM_HOST_REGS);
+
+ u32 gpr_bitmask = 0;
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ if (IsHostRegAllocated(i))
+ gpr_bitmask |= (1u << i);
+ }
+
+ CPU::CodeCache::AddLoadStoreInfo(code_address, code_size, m_current_instruction_pc, m_cycles, gpr_bitmask,
+ static_cast<u8>(address_register), static_cast<u8>(data_register), size, is_signed,
+ is_load);
+}
+
+void CPU::NewRec::Compiler::CompileInstruction()
+{
+#ifdef _DEBUG
+ TinyString str;
+ DisassembleInstruction(&str, m_current_instruction_pc, inst->bits);
+ Log_DebugFmt("Compiling{} {:08X}: {}", m_current_instruction_branch_delay_slot ? " branch delay slot" : "",
+ m_current_instruction_pc, str);
+#endif
+
+ m_cycles++;
+
+ if (IsNopInstruction(*inst))
+ {
+ UpdateLoadDelay();
+ return;
+ }
+
+ switch (inst->op)
+ {
+#define PGXPFN(x) reinterpret_cast<const void*>(&PGXP::x)
+
+ // clang-format off
+ // TODO: PGXP for jalr
+
+ case InstructionOp::funct:
+ {
+ switch (inst->r.funct)
+ {
+ case InstructionFunct::sll: CompileTemplate(&Compiler::Compile_sll_const, &Compiler::Compile_sll, PGXPFN(CPU_SLL), TF_WRITES_D | TF_READS_T); break;
+ case InstructionFunct::srl: CompileTemplate(&Compiler::Compile_srl_const, &Compiler::Compile_srl, PGXPFN(CPU_SRL), TF_WRITES_D | TF_READS_T); break;
+ case InstructionFunct::sra: CompileTemplate(&Compiler::Compile_sra_const, &Compiler::Compile_sra, PGXPFN(CPU_SRA), TF_WRITES_D | TF_READS_T); break;
+ case InstructionFunct::sllv: CompileTemplate(&Compiler::Compile_sllv_const, &Compiler::Compile_sllv, PGXPFN(CPU_SLLV), TF_WRITES_D | TF_READS_S | TF_READS_T); break;
+ case InstructionFunct::srlv: CompileTemplate(&Compiler::Compile_srlv_const, &Compiler::Compile_srlv, PGXPFN(CPU_SRLV), TF_WRITES_D | TF_READS_S | TF_READS_T); break;
+ case InstructionFunct::srav: CompileTemplate(&Compiler::Compile_srav_const, &Compiler::Compile_srav, PGXPFN(CPU_SRAV), TF_WRITES_D | TF_READS_S | TF_READS_T); break;
+ case InstructionFunct::jr: CompileTemplate(&Compiler::Compile_jr_const, &Compiler::Compile_jr, nullptr, TF_READS_S); break;
+ case InstructionFunct::jalr: CompileTemplate(&Compiler::Compile_jalr_const, &Compiler::Compile_jalr, nullptr, /*TF_WRITES_D |*/ TF_READS_S | TF_NO_NOP); break;
+ case InstructionFunct::syscall: Compile_syscall(); break;
+ case InstructionFunct::break_: Compile_break(); break;
+ case InstructionFunct::mfhi: CompileMoveRegTemplate(inst->r.rd, Reg::hi, g_settings.gpu_pgxp_cpu); break;
+ case InstructionFunct::mthi: CompileMoveRegTemplate(Reg::hi, inst->r.rs, g_settings.gpu_pgxp_cpu); break;
+ case InstructionFunct::mflo: CompileMoveRegTemplate(inst->r.rd, Reg::lo, g_settings.gpu_pgxp_cpu); break;
+ case InstructionFunct::mtlo: CompileMoveRegTemplate(Reg::lo, inst->r.rs, g_settings.gpu_pgxp_cpu); break;
+ case InstructionFunct::mult: CompileTemplate(&Compiler::Compile_mult_const, &Compiler::Compile_mult, PGXPFN(CPU_MULT), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI | TF_COMMUTATIVE); break;
+ case InstructionFunct::multu: CompileTemplate(&Compiler::Compile_multu_const, &Compiler::Compile_multu, PGXPFN(CPU_MULTU), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI | TF_COMMUTATIVE); break;
+ case InstructionFunct::div: CompileTemplate(&Compiler::Compile_div_const, &Compiler::Compile_div, PGXPFN(CPU_DIV), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI); break;
+ case InstructionFunct::divu: CompileTemplate(&Compiler::Compile_divu_const, &Compiler::Compile_divu, PGXPFN(CPU_DIVU), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI); break;
+ case InstructionFunct::add: CompileTemplate(&Compiler::Compile_add_const, &Compiler::Compile_add, PGXPFN(CPU_ADD), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_CAN_OVERFLOW | TF_RENAME_WITH_ZERO_T); break;
+ case InstructionFunct::addu: CompileTemplate(&Compiler::Compile_addu_const, &Compiler::Compile_addu, PGXPFN(CPU_ADD), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_T); break;
+ case InstructionFunct::sub: CompileTemplate(&Compiler::Compile_sub_const, &Compiler::Compile_sub, PGXPFN(CPU_SUB), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_CAN_OVERFLOW | TF_RENAME_WITH_ZERO_T); break;
+ case InstructionFunct::subu: CompileTemplate(&Compiler::Compile_subu_const, &Compiler::Compile_subu, PGXPFN(CPU_SUB), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_RENAME_WITH_ZERO_T); break;
+ case InstructionFunct::and_: CompileTemplate(&Compiler::Compile_and_const, &Compiler::Compile_and, PGXPFN(CPU_AND_), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE); break;
+ case InstructionFunct::or_: CompileTemplate(&Compiler::Compile_or_const, &Compiler::Compile_or, PGXPFN(CPU_OR_), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_T); break;
+ case InstructionFunct::xor_: CompileTemplate(&Compiler::Compile_xor_const, &Compiler::Compile_xor, PGXPFN(CPU_XOR_), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_T); break;
+ case InstructionFunct::nor: CompileTemplate(&Compiler::Compile_nor_const, &Compiler::Compile_nor, PGXPFN(CPU_NOR), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE); break;
+ case InstructionFunct::slt: CompileTemplate(&Compiler::Compile_slt_const, &Compiler::Compile_slt, PGXPFN(CPU_SLT), TF_WRITES_D | TF_READS_T | TF_READS_S); break;
+ case InstructionFunct::sltu: CompileTemplate(&Compiler::Compile_sltu_const, &Compiler::Compile_sltu, PGXPFN(CPU_SLTU), TF_WRITES_D | TF_READS_T | TF_READS_S); break;
+
+ default: Panic("fixme funct"); break;
+ }
+ }
+ break;
+
+ case InstructionOp::j: Compile_j(); break;
+ case InstructionOp::jal: Compile_jal(); break;
+
+ case InstructionOp::b: CompileTemplate(&Compiler::Compile_b_const, &Compiler::Compile_b, nullptr, TF_READS_S | TF_CAN_SWAP_DELAY_SLOT); break;
+ case InstructionOp::blez: CompileTemplate(&Compiler::Compile_blez_const, &Compiler::Compile_blez, nullptr, TF_READS_S | TF_CAN_SWAP_DELAY_SLOT); break;
+ case InstructionOp::bgtz: CompileTemplate(&Compiler::Compile_bgtz_const, &Compiler::Compile_bgtz, nullptr, TF_READS_S | TF_CAN_SWAP_DELAY_SLOT); break;
+ case InstructionOp::beq: CompileTemplate(&Compiler::Compile_beq_const, &Compiler::Compile_beq, nullptr, TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_CAN_SWAP_DELAY_SLOT); break;
+ case InstructionOp::bne: CompileTemplate(&Compiler::Compile_bne_const, &Compiler::Compile_bne, nullptr, TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_CAN_SWAP_DELAY_SLOT); break;
+
+ case InstructionOp::addi: CompileTemplate(&Compiler::Compile_addi_const, &Compiler::Compile_addi, PGXPFN(CPU_ADDI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_CAN_OVERFLOW | TF_RENAME_WITH_ZERO_IMM); break;
+ case InstructionOp::addiu: CompileTemplate(&Compiler::Compile_addiu_const, &Compiler::Compile_addiu, PGXPFN(CPU_ADDI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_IMM); break;
+ case InstructionOp::slti: CompileTemplate(&Compiler::Compile_slti_const, &Compiler::Compile_slti, PGXPFN(CPU_SLTI), TF_WRITES_T | TF_READS_S); break;
+ case InstructionOp::sltiu: CompileTemplate(&Compiler::Compile_sltiu_const, &Compiler::Compile_sltiu, PGXPFN(CPU_SLTIU), TF_WRITES_T | TF_READS_S); break;
+ case InstructionOp::andi: CompileTemplate(&Compiler::Compile_andi_const, &Compiler::Compile_andi, PGXPFN(CPU_ANDI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE); break;
+ case InstructionOp::ori: CompileTemplate(&Compiler::Compile_ori_const, &Compiler::Compile_ori, PGXPFN(CPU_ORI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_IMM); break;
+ case InstructionOp::xori: CompileTemplate(&Compiler::Compile_xori_const, &Compiler::Compile_xori, PGXPFN(CPU_XORI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_IMM); break;
+ case InstructionOp::lui: Compile_lui(); break;
+
+ case InstructionOp::lb: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::Byte, false, true, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break;
+ case InstructionOp::lbu: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::Byte, false, false, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break;
+ case InstructionOp::lh: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::HalfWord, false, true, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break;
+ case InstructionOp::lhu: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::HalfWord, false, false, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break;
+ case InstructionOp::lw: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::Word, false, false, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break;
+ case InstructionOp::lwl: CompileLoadStoreTemplate(&Compiler::Compile_lwx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break;
+ case InstructionOp::lwr: CompileLoadStoreTemplate(&Compiler::Compile_lwx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break;
+ case InstructionOp::sb: CompileLoadStoreTemplate(&Compiler::Compile_sxx, MemoryAccessSize::Byte, true, false, TF_READS_S | TF_READS_T); break;
+ case InstructionOp::sh: CompileLoadStoreTemplate(&Compiler::Compile_sxx, MemoryAccessSize::HalfWord, true, false, TF_READS_S | TF_READS_T); break;
+ case InstructionOp::sw: CompileLoadStoreTemplate(&Compiler::Compile_sxx, MemoryAccessSize::Word, true, false, TF_READS_S | TF_READS_T); break;
+ case InstructionOp::swl: CompileLoadStoreTemplate(&Compiler::Compile_swx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break;
+ case InstructionOp::swr: CompileLoadStoreTemplate(&Compiler::Compile_swx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break;
+
+ case InstructionOp::cop0:
+ {
+ if (inst->cop.IsCommonInstruction())
+ {
+ switch (inst->cop.CommonOp())
+ {
+ case CopCommonInstruction::mfcn: if (inst->r.rt != Reg::zero) { CompileTemplate(nullptr, &Compiler::Compile_mfc0, nullptr, TF_WRITES_T | TF_LOAD_DELAY); } break;
+ case CopCommonInstruction::mtcn: CompileTemplate(nullptr, &Compiler::Compile_mtc0, PGXPFN(CPU_MTC0), TF_READS_T); break;
+ default: Compile_Fallback(); break;
+ }
+ }
+ else
+ {
+ switch (inst->cop.Cop0Op())
+ {
+ case Cop0Instruction::rfe: CompileTemplate(nullptr, &Compiler::Compile_rfe, nullptr, 0); break;
+ default: Compile_Fallback(); break;
+ }
+ }
+ }
+ break;
+
+ case InstructionOp::cop2:
+ {
+ if (inst->cop.IsCommonInstruction())
+ {
+ switch (inst->cop.CommonOp())
+ {
+ case CopCommonInstruction::mfcn: if (inst->r.rt != Reg::zero) { CompileTemplate(nullptr, &Compiler::Compile_mfc2, nullptr, TF_GTE_STALL); } break;
+ case CopCommonInstruction::cfcn: if (inst->r.rt != Reg::zero) { CompileTemplate(nullptr, &Compiler::Compile_mfc2, nullptr, TF_GTE_STALL); } break;
+ case CopCommonInstruction::mtcn: CompileTemplate(nullptr, &Compiler::Compile_mtc2, PGXPFN(CPU_MTC2), TF_GTE_STALL | TF_READS_T | TF_PGXP_WITHOUT_CPU); break;
+ case CopCommonInstruction::ctcn: CompileTemplate(nullptr, &Compiler::Compile_mtc2, PGXPFN(CPU_MTC2), TF_GTE_STALL | TF_READS_T | TF_PGXP_WITHOUT_CPU); break;
+ default: Compile_Fallback(); break;
+ }
+ }
+ else
+ {
+ // GTE ops
+ CompileTemplate(nullptr, &Compiler::Compile_cop2, nullptr, TF_GTE_STALL);
+ }
+ }
+ break;
+
+ case InstructionOp::lwc2: CompileLoadStoreTemplate(&Compiler::Compile_lwc2, MemoryAccessSize::Word, false, false, TF_GTE_STALL | TF_READS_S | TF_LOAD_DELAY); break;
+ case InstructionOp::swc2: CompileLoadStoreTemplate(&Compiler::Compile_swc2, MemoryAccessSize::Word, true, false, TF_GTE_STALL | TF_READS_S); break;
+
+ default: Panic("Fixme"); break;
+ // clang-format on
+
+#undef PGXPFN
+ }
+
+ ClearHostRegsNeeded();
+ UpdateLoadDelay();
+
+#if 0
+ const void* end = GetCurrentCodePointer();
+ if (start != end && !m_current_instruction_branch_delay_slot)
+ {
+ CodeCache::DisassembleAndLogHostCode(start,
+ static_cast(static_cast(end) - static_cast(start)));
+ }
+#endif
+}
+
+void CPU::NewRec::Compiler::CompileBranchDelaySlot(bool dirty_pc /* = true */)
+{
+ // Update load delay at the end of the previous instruction.
+ UpdateLoadDelay();
+
+ // TODO: Move cycle add before this.
+ inst++;
+ iinfo++;
+ m_current_instruction_pc += sizeof(Instruction);
+ m_current_instruction_branch_delay_slot = true;
+ m_compiler_pc += sizeof(Instruction);
+ m_dirty_pc = dirty_pc;
+ m_dirty_instruction_bits = true;
+
+ CompileInstruction();
+
+ m_current_instruction_branch_delay_slot = false;
+}
+
+void CPU::NewRec::Compiler::CompileTemplate(void (Compiler::*const_func)(CompileFlags),
+ void (Compiler::*func)(CompileFlags), const void* pgxp_cpu_func, u32 tflags)
+{
+ // TODO: This is where we will do memory operand optimization. Remember to kill constants!
+ // TODO: Swap S and T if commutative
+ // TODO: For and, treat as zeroing if imm is zero
+ // TODO: Optimize slt + bne to cmp + jump
+ // TODO: Prefer memory operands when load delay is dirty, since we're going to invalidate immediately after the first
+ // instruction..
+ // TODO: andi with zero -> zero const
+ // TODO: load constant so it can be flushed if it's not overwritten later
+ // TODO: inline PGXP ops.
+ // TODO: don't rename on sltu.
+
+ bool allow_constant = static_cast<bool>(const_func);
+ Reg rs = inst->r.rs.GetValue();
+ Reg rt = inst->r.rt.GetValue();
+ Reg rd = inst->r.rd.GetValue();
+
+ if (tflags & TF_GTE_STALL)
+ StallUntilGTEComplete();
+
+ // throw away instructions writing to $zero
+ if (!(tflags & TF_NO_NOP) && (!g_settings.cpu_recompiler_memory_exceptions || !(tflags & TF_CAN_OVERFLOW)) &&
+ ((tflags & TF_WRITES_T && rt == Reg::zero) || (tflags & TF_WRITES_D && rd == Reg::zero)))
+ {
+ Log_DebugPrintf("Skipping instruction because it writes to zero");
+ return;
+ }
+
+ // handle rename operations
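+ // e.g. addu rd, rs, $zero or ori rt, rs, 0 is really just a register move, so compile it as one.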
+ if ((tflags & TF_RENAME_WITH_ZERO_T && HasConstantRegValue(rt, 0)))
+ {
+ DebugAssert((tflags & (TF_WRITES_D | TF_READS_S | TF_READS_T)) == (TF_WRITES_D | TF_READS_S | TF_READS_T));
+ CompileMoveRegTemplate(rd, rs, true);
+ return;
+ }
+ else if ((tflags & (TF_RENAME_WITH_ZERO_T | TF_COMMUTATIVE)) == (TF_RENAME_WITH_ZERO_T | TF_COMMUTATIVE) &&
+ HasConstantRegValue(rs, 0))
+ {
+ DebugAssert((tflags & (TF_WRITES_D | TF_READS_S | TF_READS_T)) == (TF_WRITES_D | TF_READS_S | TF_READS_T));
+ CompileMoveRegTemplate(rd, rt, true);
+ return;
+ }
+ else if (tflags & TF_RENAME_WITH_ZERO_IMM && inst->i.imm == 0)
+ {
+ CompileMoveRegTemplate(rt, rs, true);
+ return;
+ }
+
+ if (pgxp_cpu_func && g_settings.gpu_pgxp_enable && ((tflags & TF_PGXP_WITHOUT_CPU) || g_settings.UsingPGXPCPUMode()))
+ {
+ std::array<Reg, 2> reg_args = {{Reg::count, Reg::count}};
+ u32 num_reg_args = 0;
+ if (tflags & TF_READS_S)
+ reg_args[num_reg_args++] = rs;
+ if (tflags & TF_READS_T)
+ reg_args[num_reg_args++] = rt;
+ if (tflags & TF_READS_LO)
+ reg_args[num_reg_args++] = Reg::lo;
+ if (tflags & TF_READS_HI)
+ reg_args[num_reg_args++] = Reg::hi;
+
+ DebugAssert(num_reg_args <= 2);
+ GeneratePGXPCallWithMIPSRegs(pgxp_cpu_func, inst->bits, reg_args[0], reg_args[1]);
+ }
+
+ // if it's a commutative op, and we have one constant reg but not the other, swap them
+ // TODO: make it swap when writing to T as well
+ // TODO: drop the hack for rd == rt
+ if (tflags & TF_COMMUTATIVE && !(tflags & TF_WRITES_T) &&
+ ((HasConstantReg(rs) && !HasConstantReg(rt)) || (tflags & TF_WRITES_D && rd == rt)))
+ {
+ Log_DebugPrintf("Swapping S:%s and T:%s due to commutative op and constants", GetRegName(rs), GetRegName(rt));
+ std::swap(rs, rt);
+ }
+
+ CompileFlags cf = {};
+
+ if (tflags & TF_READS_S)
+ {
+ MarkRegsNeeded(HR_TYPE_CPU_REG, rs);
+ if (HasConstantReg(rs))
+ cf.const_s = true;
+ else
+ allow_constant = false;
+ }
+ if (tflags & TF_READS_T)
+ {
+ MarkRegsNeeded(HR_TYPE_CPU_REG, rt);
+ if (HasConstantReg(rt))
+ cf.const_t = true;
+ else
+ allow_constant = false;
+ }
+ if (tflags & TF_READS_LO)
+ {
+ MarkRegsNeeded(HR_TYPE_CPU_REG, Reg::lo);
+ if (HasConstantReg(Reg::lo))
+ cf.const_lo = true;
+ else
+ allow_constant = false;
+ }
+ if (tflags & TF_READS_HI)
+ {
+ MarkRegsNeeded(HR_TYPE_CPU_REG, Reg::hi);
+ if (HasConstantReg(Reg::hi))
+ cf.const_hi = true;
+ else
+ allow_constant = false;
+ }
+
+ // Needed because of potential swapping
+ if (tflags & TF_READS_S)
+ cf.mips_s = static_cast<u8>(rs);
+ if (tflags & (TF_READS_T | TF_WRITES_T))
+ cf.mips_t = static_cast<u8>(rt);
+
+ if (allow_constant)
+ {
+ // woot, constant path
+ (this->*const_func)(cf);
+ return;
+ }
+
+ UpdateHostRegCounters();
+
+ if (tflags & TF_CAN_SWAP_DELAY_SLOT && TrySwapDelaySlot(cf.MipsS(), cf.MipsT()))
+ cf.delay_slot_swapped = true;
+
+ if (tflags & TF_READS_S &&
+ (tflags & TF_NEEDS_REG_S || !cf.const_s || (tflags & TF_WRITES_D && rd != Reg::zero && rd == rs)))
+ {
+ cf.host_s = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rs);
+ cf.const_s = false;
+ cf.valid_host_s = true;
+ }
+
+ if (tflags & TF_READS_T &&
+ (tflags & (TF_NEEDS_REG_T | TF_WRITES_T) || !cf.const_t || (tflags & TF_WRITES_D && rd != Reg::zero && rd == rt)))
+ {
+ cf.host_t = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt);
+ cf.const_t = false;
+ cf.valid_host_t = true;
+ }
+
+ if (tflags & (TF_READS_LO | TF_WRITES_LO))
+ {
+ cf.host_lo =
+ AllocateHostReg(((tflags & TF_READS_LO) ? HR_MODE_READ : 0u) | ((tflags & TF_WRITES_LO) ? HR_MODE_WRITE : 0u),
+ HR_TYPE_CPU_REG, Reg::lo);
+ cf.const_lo = false;
+ cf.valid_host_lo = true;
+ }
+
+ if (tflags & (TF_READS_HI | TF_WRITES_HI))
+ {
+ cf.host_hi =
+ AllocateHostReg(((tflags & TF_READS_HI) ? HR_MODE_READ : 0u) | ((tflags & TF_WRITES_HI) ? HR_MODE_WRITE : 0u),
+ HR_TYPE_CPU_REG, Reg::hi);
+ cf.const_hi = false;
+ cf.valid_host_hi = true;
+ }
+
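+  // Instructions with a load delay write their result to the next-load-delay slot rather than the
+  // target register, so the following instruction still observes the old value.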
+ const HostRegAllocType write_type =
+ (tflags & TF_LOAD_DELAY && EMULATE_LOAD_DELAYS) ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG;
+
+ if (tflags & TF_CAN_OVERFLOW && g_settings.cpu_recompiler_memory_exceptions)
+ {
+ // allocate a temp register for the result, then swap it back
+    const u32 tempreg = AllocateHostReg(0, HR_TYPE_TEMP);
+ if (tflags & TF_WRITES_D)
+ {
+ cf.host_d = tempreg;
+ cf.valid_host_d = true;
+ }
+ else if (tflags & TF_WRITES_T)
+ {
+ cf.host_t = tempreg;
+ cf.valid_host_t = true;
+ }
+
+ (this->*func)(cf);
+
+ if (tflags & TF_WRITES_D && rd != Reg::zero)
+ {
+ DeleteMIPSReg(rd, false);
+ RenameHostReg(tempreg, HR_MODE_WRITE, write_type, rd);
+ }
+ else if (tflags & TF_WRITES_T && rt != Reg::zero)
+ {
+ DeleteMIPSReg(rt, false);
+ RenameHostReg(tempreg, HR_MODE_WRITE, write_type, rt);
+ }
+ else
+ {
+ FreeHostReg(tempreg);
+ }
+ }
+ else
+ {
+ if (tflags & TF_WRITES_D && rd != Reg::zero)
+ {
+ if (tflags & TF_READS_S && cf.valid_host_s && TryRenameMIPSReg(rd, rs, cf.host_s, Reg::count))
+ cf.host_d = cf.host_s;
+ else
+ cf.host_d = AllocateHostReg(HR_MODE_WRITE, write_type, rd);
+ cf.valid_host_d = true;
+ }
+
+ if (tflags & TF_WRITES_T && rt != Reg::zero)
+ {
+ if (tflags & TF_READS_S && cf.valid_host_s && TryRenameMIPSReg(rt, rs, cf.host_s, Reg::count))
+ cf.host_t = cf.host_s;
+ else
+ cf.host_t = AllocateHostReg(HR_MODE_WRITE, write_type, rt);
+ cf.valid_host_t = true;
+ }
+
+ (this->*func)(cf);
+ }
+}
+
+void CPU::NewRec::Compiler::CompileLoadStoreTemplate(void (Compiler::*func)(CompileFlags, MemoryAccessSize, bool,
+                                                                            const std::optional<VirtualMemoryAddress>&),
+ MemoryAccessSize size, bool store, bool sign, u32 tflags)
+{
+ const Reg rs = inst->i.rs;
+ const Reg rt = inst->i.rt;
+
+ if (tflags & TF_GTE_STALL)
+ StallUntilGTEComplete();
+
+ CompileFlags cf = {};
+
+ if (tflags & TF_READS_S)
+ {
+ MarkRegsNeeded(HR_TYPE_CPU_REG, rs);
+    cf.mips_s = static_cast<u8>(rs);
+ }
+ if (tflags & (TF_READS_T | TF_WRITES_T))
+ {
+ if (tflags & TF_READS_T)
+ MarkRegsNeeded(HR_TYPE_CPU_REG, rt);
+    cf.mips_t = static_cast<u8>(rt);
+ }
+
+ UpdateHostRegCounters();
+
+ // constant address?
+  std::optional<VirtualMemoryAddress> addr;
+ if (HasConstantReg(rs))
+ {
+ addr = GetConstantRegU32(rs) + inst->i.imm_sext32();
+ cf.const_s = true;
+ }
+ else
+ {
+ if constexpr (HAS_MEMORY_OPERANDS)
+ {
+ // don't bother caching it since we're going to flush anyway
+ // TODO: make less rubbish, if it's caller saved we don't need to flush...
+      const std::optional<u32> hreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rs);
+ if (hreg.has_value())
+ {
+ cf.valid_host_s = true;
+ cf.host_s = hreg.value();
+ }
+ }
+ else
+ {
+ // need rs in a register
+ cf.host_s = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rs);
+ cf.valid_host_s = true;
+ }
+ }
+
+ // reads T -> store, writes T -> load
+ // for now, we defer the allocation to afterwards, because C call
+ if (tflags & TF_READS_T)
+ {
+ if (HasConstantReg(rt))
+ {
+ cf.const_t = true;
+ }
+ else
+ {
+ if constexpr (HAS_MEMORY_OPERANDS)
+ {
+        const std::optional<u32> hreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt);
+ if (hreg.has_value())
+ {
+ cf.valid_host_t = true;
+ cf.host_t = hreg.value();
+ }
+ }
+ else
+ {
+ cf.host_t = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt);
+ cf.valid_host_t = true;
+ }
+ }
+ }
+
+ (this->*func)(cf, size, sign, addr);
+}
+
+void CPU::NewRec::Compiler::FlushForLoadStore(const std::optional<VirtualMemoryAddress>& address, bool store)
+{
+ if (CodeCache::IsUsingFastmem() && !g_settings.cpu_recompiler_memory_exceptions)
+ return;
+
+ // TODO: Stores don't need to flush GTE cycles...
+ Flush(FLUSH_FOR_C_CALL | FLUSH_FOR_LOADSTORE);
+}
+
+void CPU::NewRec::Compiler::CompileMoveRegTemplate(Reg dst, Reg src, bool pgxp_move)
+{
+ if (dst == src || dst == Reg::zero)
+ return;
+
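+  // A constant source simply propagates the constant; otherwise try to rename the source's host
+  // register onto the destination before falling back to a real copy.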
+ if (HasConstantReg(src))
+ {
+ DeleteMIPSReg(dst, false);
+ SetConstantReg(dst, GetConstantRegU32(src));
+ }
+ else
+ {
+ const u32 srcreg = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, src);
+ if (!TryRenameMIPSReg(dst, src, srcreg, Reg::count))
+ {
+ const u32 dstreg = AllocateHostReg(HR_MODE_WRITE, HR_TYPE_CPU_REG, dst);
+ CopyHostReg(dstreg, srcreg);
+ ClearHostRegNeeded(dstreg);
+ }
+ }
+
+ // TODO: This could be made better if we only did it for registers where there was a previous MFC2.
+ if (g_settings.gpu_pgxp_enable && pgxp_move)
+ {
+ // might've been renamed, so use dst here
+    GeneratePGXPCallWithMIPSRegs(reinterpret_cast<const void*>(&PGXP::CPU_MOVE),
+                                 (static_cast<u32>(dst) << 8) | (static_cast<u32>(src)), dst);
+ }
+}
+
+void CPU::NewRec::Compiler::Compile_j()
+{
+ const u32 newpc = (m_compiler_pc & UINT32_C(0xF0000000)) | (inst->j.target << 2);
+
+ // TODO: Delay slot swap.
+ // We could also move the cycle commit back.
+ CompileBranchDelaySlot();
+ EndBlock(newpc, true);
+}
+
+void CPU::NewRec::Compiler::Compile_jr_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()));
+ const u32 newpc = GetConstantRegU32(cf.MipsS());
+ if (newpc & 3 && g_settings.cpu_recompiler_memory_exceptions)
+ {
+ EndBlockWithException(Exception::AdEL);
+ return;
+ }
+
+ CompileBranchDelaySlot();
+ EndBlock(newpc, true);
+}
+
+void CPU::NewRec::Compiler::Compile_jal()
+{
+ const u32 newpc = (m_compiler_pc & UINT32_C(0xF0000000)) | (inst->j.target << 2);
+ SetConstantReg(Reg::ra, GetBranchReturnAddress({}));
+ CompileBranchDelaySlot();
+ EndBlock(newpc, true);
+}
+
+void CPU::NewRec::Compiler::Compile_jalr_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()));
+ const u32 newpc = GetConstantRegU32(cf.MipsS());
+ if (MipsD() != Reg::zero)
+ SetConstantReg(MipsD(), GetBranchReturnAddress({}));
+
+ CompileBranchDelaySlot();
+ EndBlock(newpc, true);
+}
+
+void CPU::NewRec::Compiler::Compile_syscall()
+{
+ EndBlockWithException(Exception::Syscall);
+}
+
+void CPU::NewRec::Compiler::Compile_break()
+{
+ EndBlockWithException(Exception::BP);
+}
+
+void CPU::NewRec::Compiler::Compile_b_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()));
+
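+  // bltz/bgez family: bit 0 of the rt field selects BGEZ vs BLTZ, and (rt & 0x1E) == 0x10 selects
+  // the linking variants (BLTZAL/BGEZAL).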
+  const u8 irt = static_cast<u8>(inst->i.rt.GetValue());
+ const bool bgez = ConvertToBoolUnchecked(irt & u8(1));
+ const bool link = (irt & u8(0x1E)) == u8(0x10);
+
+ const s32 rs = GetConstantRegS32(cf.MipsS());
+ const bool taken = bgez ? (rs >= 0) : (rs < 0);
+ const u32 taken_pc = GetConditionalBranchTarget(cf);
+
+ if (link)
+ SetConstantReg(Reg::ra, GetBranchReturnAddress(cf));
+
+ CompileBranchDelaySlot();
+ EndBlock(taken ? taken_pc : m_compiler_pc, true);
+}
+
+void CPU::NewRec::Compiler::Compile_b(CompileFlags cf)
+{
+  const u8 irt = static_cast<u8>(inst->i.rt.GetValue());
+ const bool bgez = ConvertToBoolUnchecked(irt & u8(1));
+ const bool link = (irt & u8(0x1E)) == u8(0x10);
+
+ if (link)
+ SetConstantReg(Reg::ra, GetBranchReturnAddress(cf));
+
+ Compile_bxx(cf, bgez ? BranchCondition::GreaterEqualZero : BranchCondition::LessThanZero);
+}
+
+void CPU::NewRec::Compiler::Compile_blez(CompileFlags cf)
+{
+ Compile_bxx(cf, BranchCondition::LessEqualZero);
+}
+
+void CPU::NewRec::Compiler::Compile_blez_const(CompileFlags cf)
+{
+ Compile_bxx_const(cf, BranchCondition::LessEqualZero);
+}
+
+void CPU::NewRec::Compiler::Compile_bgtz(CompileFlags cf)
+{
+ Compile_bxx(cf, BranchCondition::GreaterThanZero);
+}
+
+void CPU::NewRec::Compiler::Compile_bgtz_const(CompileFlags cf)
+{
+ Compile_bxx_const(cf, BranchCondition::GreaterThanZero);
+}
+
+void CPU::NewRec::Compiler::Compile_beq(CompileFlags cf)
+{
+ Compile_bxx(cf, BranchCondition::Equal);
+}
+
+void CPU::NewRec::Compiler::Compile_beq_const(CompileFlags cf)
+{
+ Compile_bxx_const(cf, BranchCondition::Equal);
+}
+
+void CPU::NewRec::Compiler::Compile_bne(CompileFlags cf)
+{
+ Compile_bxx(cf, BranchCondition::NotEqual);
+}
+
+void CPU::NewRec::Compiler::Compile_bne_const(CompileFlags cf)
+{
+ Compile_bxx_const(cf, BranchCondition::NotEqual);
+}
+
+void CPU::NewRec::Compiler::Compile_bxx_const(CompileFlags cf, BranchCondition cond)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+
+ bool taken;
+ switch (cond)
+ {
+ case BranchCondition::Equal:
+ taken = GetConstantRegU32(cf.MipsS()) == GetConstantRegU32(cf.MipsT());
+ break;
+
+ case BranchCondition::NotEqual:
+ taken = GetConstantRegU32(cf.MipsS()) != GetConstantRegU32(cf.MipsT());
+ break;
+
+ case BranchCondition::GreaterThanZero:
+ taken = GetConstantRegS32(cf.MipsS()) > 0;
+ break;
+
+ case BranchCondition::GreaterEqualZero:
+ taken = GetConstantRegS32(cf.MipsS()) >= 0;
+ break;
+
+ case BranchCondition::LessThanZero:
+ taken = GetConstantRegS32(cf.MipsS()) < 0;
+ break;
+
+ case BranchCondition::LessEqualZero:
+ taken = GetConstantRegS32(cf.MipsS()) <= 0;
+ break;
+
+ default:
+ Panic("Unhandled condition");
+ return;
+ }
+
+ const u32 taken_pc = GetConditionalBranchTarget(cf);
+ CompileBranchDelaySlot();
+ EndBlock(taken ? taken_pc : m_compiler_pc, true);
+}
+
+void CPU::NewRec::Compiler::Compile_sll_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) << inst->r.shamt);
+}
+
+void CPU::NewRec::Compiler::Compile_srl_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) >> inst->r.shamt);
+}
+
+void CPU::NewRec::Compiler::Compile_sra_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsT()));
+  SetConstantReg(MipsD(), static_cast<u32>(GetConstantRegS32(cf.MipsT()) >> inst->r.shamt));
+}
+
+void CPU::NewRec::Compiler::Compile_sllv_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) << (GetConstantRegU32(cf.MipsS()) & 0x1Fu));
+}
+
+void CPU::NewRec::Compiler::Compile_srlv_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) >> (GetConstantRegU32(cf.MipsS()) & 0x1Fu));
+}
+
+void CPU::NewRec::Compiler::Compile_srav_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+  SetConstantReg(MipsD(), static_cast<u32>(GetConstantRegS32(cf.MipsT()) >> (GetConstantRegU32(cf.MipsS()) & 0x1Fu)));
+}
+
+void CPU::NewRec::Compiler::Compile_and_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) & GetConstantRegU32(cf.MipsT()));
+}
+
+void CPU::NewRec::Compiler::Compile_or_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) | GetConstantRegU32(cf.MipsT()));
+}
+
+void CPU::NewRec::Compiler::Compile_xor_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) ^ GetConstantRegU32(cf.MipsT()));
+}
+
+void CPU::NewRec::Compiler::Compile_nor_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), ~(GetConstantRegU32(cf.MipsS()) | GetConstantRegU32(cf.MipsT())));
+}
+
+void CPU::NewRec::Compiler::Compile_slt_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), BoolToUInt32(GetConstantRegS32(cf.MipsS()) < GetConstantRegS32(cf.MipsT())));
+}
+
+void CPU::NewRec::Compiler::Compile_sltu_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), BoolToUInt32(GetConstantRegU32(cf.MipsS()) < GetConstantRegU32(cf.MipsT())));
+}
+
+void CPU::NewRec::Compiler::Compile_mult_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+
+  const u64 res = static_cast<u64>(static_cast<s64>(GetConstantRegS32(cf.MipsS())) *
+                                   static_cast<s64>(GetConstantRegS32(cf.MipsT())));
+  SetConstantReg(Reg::hi, static_cast<u32>(res >> 32));
+  SetConstantReg(Reg::lo, static_cast<u32>(res));
+}
+
+void CPU::NewRec::Compiler::Compile_multu_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+
+  const u64 res = static_cast<u64>(GetConstantRegU32(cf.MipsS())) * static_cast<u64>(GetConstantRegU32(cf.MipsT()));
+  SetConstantReg(Reg::hi, static_cast<u32>(res >> 32));
+  SetConstantReg(Reg::lo, static_cast<u32>(res));
+}
+
+void CPU::NewRec::Compiler::Compile_div_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+
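+  // DIV never traps on the R3000A; divide-by-zero and 0x80000000 / -1 produce fixed results,
+  // reproduced below.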
+ const s32 num = GetConstantRegS32(cf.MipsS());
+ const s32 denom = GetConstantRegS32(cf.MipsT());
+
+ s32 lo, hi;
+ if (denom == 0)
+ {
+ // divide by zero
+ lo = (num >= 0) ? UINT32_C(0xFFFFFFFF) : UINT32_C(1);
+    hi = static_cast<u32>(num);
+ }
+  else if (static_cast<u32>(num) == UINT32_C(0x80000000) && denom == -1)
+ {
+ // unrepresentable
+ lo = UINT32_C(0x80000000);
+ hi = 0;
+ }
+ else
+ {
+ lo = num / denom;
+ hi = num % denom;
+ }
+
+ SetConstantReg(Reg::hi, hi);
+ SetConstantReg(Reg::lo, lo);
+}
+
+void CPU::NewRec::Compiler::Compile_divu_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+
+ const u32 num = GetConstantRegU32(cf.MipsS());
+ const u32 denom = GetConstantRegU32(cf.MipsT());
+
+ u32 lo, hi;
+
+ if (denom == 0)
+ {
+ // divide by zero
+ lo = UINT32_C(0xFFFFFFFF);
+    hi = static_cast<u32>(num);
+ }
+ else
+ {
+ lo = num / denom;
+ hi = num % denom;
+ }
+
+ SetConstantReg(Reg::hi, hi);
+ SetConstantReg(Reg::lo, lo);
+}
+
+void CPU::NewRec::Compiler::Compile_add_const(CompileFlags cf)
+{
+ // TODO: Overflow
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ if (MipsD() != Reg::zero)
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) + GetConstantRegU32(cf.MipsT()));
+}
+
+void CPU::NewRec::Compiler::Compile_addu_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) + GetConstantRegU32(cf.MipsT()));
+}
+
+void CPU::NewRec::Compiler::Compile_sub_const(CompileFlags cf)
+{
+ // TODO: Overflow
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ if (MipsD() != Reg::zero)
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) - GetConstantRegU32(cf.MipsT()));
+}
+
+void CPU::NewRec::Compiler::Compile_subu_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT()));
+ SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) - GetConstantRegU32(cf.MipsT()));
+}
+
+void CPU::NewRec::Compiler::Compile_addi_const(CompileFlags cf)
+{
+ // TODO: Overflow
+ DebugAssert(HasConstantReg(cf.MipsS()));
+ if (cf.MipsT() != Reg::zero)
+ SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) + inst->i.imm_sext32());
+}
+
+void CPU::NewRec::Compiler::Compile_addiu_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()));
+ SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) + inst->i.imm_sext32());
+}
+
+void CPU::NewRec::Compiler::Compile_slti_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()));
+  SetConstantReg(cf.MipsT(), BoolToUInt32(GetConstantRegS32(cf.MipsS()) < static_cast<s32>(inst->i.imm_sext32())));
+}
+
+void CPU::NewRec::Compiler::Compile_sltiu_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()));
+ SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) < inst->i.imm_sext32());
+}
+
+void CPU::NewRec::Compiler::Compile_andi_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()));
+ SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) & inst->i.imm_zext32());
+}
+
+void CPU::NewRec::Compiler::Compile_ori_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()));
+ SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) | inst->i.imm_zext32());
+}
+
+void CPU::NewRec::Compiler::Compile_xori_const(CompileFlags cf)
+{
+ DebugAssert(HasConstantReg(cf.MipsS()));
+ SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) ^ inst->i.imm_zext32());
+}
+
+void CPU::NewRec::Compiler::Compile_lui()
+{
+ if (inst->i.rt == Reg::zero)
+ return;
+
+ SetConstantReg(inst->i.rt, inst->i.imm_zext32() << 16);
+}
+
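+// COP0 register table: a pointer to the register's storage in g_state, plus the mask of writable
+// bits (zero for read-only registers, null pointer for unhandled ones).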
+static constexpr const std::array<std::pair<u32*, u32>, 16> s_cop0_table = {
+ {{nullptr, 0x00000000u},
+ {nullptr, 0x00000000u},
+ {nullptr, 0x00000000u},
+ {&CPU::g_state.cop0_regs.BPC, 0xffffffffu},
+ {nullptr, 0},
+ {&CPU::g_state.cop0_regs.BDA, 0xffffffffu},
+ {&CPU::g_state.cop0_regs.TAR, 0x00000000u},
+ {&CPU::g_state.cop0_regs.dcic.bits, CPU::Cop0Registers::DCIC::WRITE_MASK},
+ {&CPU::g_state.cop0_regs.BadVaddr, 0x00000000u},
+ {&CPU::g_state.cop0_regs.BDAM, 0xffffffffu},
+ {nullptr, 0x00000000u},
+ {&CPU::g_state.cop0_regs.BPCM, 0xffffffffu},
+ {&CPU::g_state.cop0_regs.sr.bits, CPU::Cop0Registers::SR::WRITE_MASK},
+ {&CPU::g_state.cop0_regs.cause.bits, CPU::Cop0Registers::CAUSE::WRITE_MASK},
+ {&CPU::g_state.cop0_regs.EPC, 0x00000000u},
+ {&CPU::g_state.cop0_regs.PRID, 0x00000000u}}};
+
+u32* CPU::NewRec::Compiler::GetCop0RegPtr(Cop0Reg reg)
+{
+  return (static_cast<u8>(reg) < s_cop0_table.size()) ? s_cop0_table[static_cast<u8>(reg)].first : nullptr;
+}
+
+u32 CPU::NewRec::Compiler::GetCop0RegWriteMask(Cop0Reg reg)
+{
+  return (static_cast<u8>(reg) < s_cop0_table.size()) ? s_cop0_table[static_cast<u8>(reg)].second : 0;
+}
+
+void CPU::NewRec::Compiler::Compile_mfc0(CompileFlags cf)
+{
+  const Cop0Reg r = static_cast<Cop0Reg>(MipsD());
+ const u32* ptr = GetCop0RegPtr(r);
+ if (!ptr)
+ {
+    Log_ErrorPrintf("Read from unknown cop0 reg %u", static_cast<u32>(r));
+ Compile_Fallback();
+ return;
+ }
+
+ DebugAssert(cf.valid_host_t);
+ LoadHostRegFromCPUPointer(cf.host_t, ptr);
+}
+
+std::pair<u32*, CPU::NewRec::Compiler::GTERegisterAccessAction>
+CPU::NewRec::Compiler::GetGTERegisterPointer(u32 index, bool writing)
+{
+ if (!writing)
+ {
+ // Most GTE registers can be read directly. Handle the special cases here.
+ if (index == 15) // SXY3
+ {
+ // mirror of SXY2
+ index = 14;
+ }
+
+ switch (index)
+ {
+ case 28: // IRGB
+ case 29: // ORGB
+ {
+ return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::CallHandler);
+ }
+ break;
+
+ default:
+ {
+ return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::Direct);
+ }
+ break;
+ }
+ }
+ else
+ {
+ switch (index)
+ {
+ case 1: // V0[z]
+ case 3: // V1[z]
+ case 5: // V2[z]
+ case 8: // IR0
+ case 9: // IR1
+ case 10: // IR2
+ case 11: // IR3
+ case 36: // RT33
+ case 44: // L33
+ case 52: // LR33
+ case 58: // H - sign-extended on read but zext on use
+ case 59: // DQA
+ case 61: // ZSF3
+ case 62: // ZSF4
+ {
+ // sign-extend z component of vector registers
+ return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::SignExtend16);
+ }
+ break;
+
+ case 7: // OTZ
+ case 16: // SZ0
+ case 17: // SZ1
+ case 18: // SZ2
+ case 19: // SZ3
+ {
+ // zero-extend unsigned values
+ return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::ZeroExtend16);
+ }
+ break;
+
+ case 15: // SXY3
+ {
+ // writing to SXYP pushes to the FIFO
+ return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::PushFIFO);
+ }
+ break;
+
+ case 28: // IRGB
+ case 30: // LZCS
+ case 63: // FLAG
+ {
+ return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::CallHandler);
+ }
+
+ case 29: // ORGB
+ case 31: // LZCR
+ {
+ // read-only registers
+ return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::Ignore);
+ }
+
+ default:
+ {
+ // written as-is, 2x16 or 1x32 bits
+ return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::Direct);
+ }
+ }
+ }
+}
+
+void CPU::NewRec::Compiler::AddGTETicks(TickCount ticks)
+{
+  // TODO: check this; the interpreter has a +1 here.
+ m_gte_done_cycle = m_cycles + ticks;
+ Log_DebugPrintf("Adding %d GTE ticks", ticks);
+}
+
+void CPU::NewRec::Compiler::StallUntilGTEComplete()
+{
+ // TODO: hack to match old rec.. this may or may not be correct behavior
+ // it's the difference between stalling before and after the current instruction's cycle
+ DebugAssert(m_cycles > 0);
+ m_cycles--;
+
+ if (!m_dirty_gte_done_cycle)
+ {
+ // simple case - in block scheduling
+ if (m_gte_done_cycle > m_cycles)
+ {
+ Log_DebugPrintf("Stalling for %d ticks from GTE", m_gte_done_cycle - m_cycles);
+ m_cycles += (m_gte_done_cycle - m_cycles);
+ }
+ }
+ else
+ {
+ // switch to in block scheduling
+ Log_DebugPrintf("Flushing GTE stall from state");
+ Flush(FLUSH_GTE_STALL_FROM_STATE);
+ }
+
+ m_cycles++;
+}
+
+void CPU::NewRec::BackpatchLoadStore(void* exception_pc, const CodeCache::LoadstoreBackpatchInfo& info)
+{
+ // remove the cycles we added for the memory read, then take them off again after the backpatch
+ // the normal rec path will add the ram read ticks later, so we need to take them off at the end
+ DebugAssert(!info.is_load || info.cycles >= Bus::RAM_READ_TICKS);
+ const TickCount cycles_to_add =
+ static_cast(static_cast(info.cycles)) - (info.is_load ? Bus::RAM_READ_TICKS : 0);
+ const TickCount cycles_to_remove = static_cast(static_cast(info.cycles));
+
+ JitCodeBuffer& buffer = CodeCache::GetCodeBuffer();
+ void* thunk_address = buffer.GetFreeFarCodePointer();
+ const u32 thunk_size = CompileLoadStoreThunk(
+ thunk_address, buffer.GetFreeFarCodeSpace(), exception_pc, info.code_size, cycles_to_add, cycles_to_remove,
+ info.gpr_bitmask, info.address_register, info.data_register, info.AccessSize(), info.is_signed, info.is_load);
+
+#if 0
+ Log_DebugPrintf("**Backpatch Thunk**");
+ CodeCache::DisassembleAndLogHostCode(thunk_address, thunk_size);
+#endif
+
+ // backpatch to a jump to the slowmem handler
+ CodeCache::EmitJump(exception_pc, thunk_address, true);
+
+ buffer.CommitFarCode(thunk_size);
+}
diff --git a/src/core/cpu_newrec_compiler.h b/src/core/cpu_newrec_compiler.h
new file mode 100644
index 0000000000..7781006cf6
--- /dev/null
+++ b/src/core/cpu_newrec_compiler.h
@@ -0,0 +1,465 @@
+// SPDX-FileCopyrightText: 2023 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+#include "cpu_code_cache_private.h"
+#include "cpu_recompiler_types.h"
+#include "cpu_types.h"
+#include <array>
+#include <bitset>
+#include <optional>
+#include <utility>
+#include <vector>
+
+namespace CPU::NewRec {
+
+// Global options
+static constexpr bool EMULATE_LOAD_DELAYS = true;
+static constexpr bool SWAP_BRANCH_DELAY_SLOTS = true;
+
+// Arch-specific options
+#if defined(CPU_ARCH_X64)
+static constexpr u32 NUM_HOST_REGS = 16;
+static constexpr bool HAS_MEMORY_OPERANDS = true;
+#elif defined(CPU_ARCH_ARM64)
+static constexpr u32 NUM_HOST_REGS = 32;
+static constexpr bool HAS_MEMORY_OPERANDS = false;
+#elif defined(CPU_ARCH_RISCV64)
+static constexpr u32 NUM_HOST_REGS = 32;
+static constexpr bool HAS_MEMORY_OPERANDS = false;
+#endif
+
+// TODO: Get rid of the virtuals... somehow.
+class Compiler
+{
+public:
+ Compiler();
+ virtual ~Compiler();
+
+ const void* CompileBlock(CodeCache::Block* block, u32* host_code_size, u32* host_far_code_size);
+
+protected:
+ enum FlushFlags : u32
+ {
+ FLUSH_FLUSH_MIPS_REGISTERS = (1 << 0),
+ FLUSH_INVALIDATE_MIPS_REGISTERS = (1 << 1),
+ FLUSH_FREE_CALLER_SAVED_REGISTERS = (1 << 2),
+ FLUSH_FREE_UNNEEDED_CALLER_SAVED_REGISTERS = (1 << 3),
+ FLUSH_FREE_ALL_REGISTERS = (1 << 4),
+ FLUSH_PC = (1 << 5),
+ FLUSH_INSTRUCTION_BITS = (1 << 6),
+ FLUSH_CYCLES = (1 << 7),
+ FLUSH_LOAD_DELAY = (1 << 8),
+ FLUSH_LOAD_DELAY_FROM_STATE = (1 << 9),
+ FLUSH_GTE_DONE_CYCLE = (1 << 10),
+ FLUSH_GTE_STALL_FROM_STATE = (1 << 11),
+
+ FLUSH_FOR_C_CALL = (FLUSH_FREE_CALLER_SAVED_REGISTERS),
+ FLUSH_FOR_LOADSTORE = (FLUSH_FREE_CALLER_SAVED_REGISTERS | FLUSH_CYCLES),
+ FLUSH_FOR_BRANCH = (FLUSH_FLUSH_MIPS_REGISTERS),
+ FLUSH_FOR_EXCEPTION =
+ (FLUSH_CYCLES | FLUSH_GTE_DONE_CYCLE), // GTE cycles needed because it stalls when a GTE instruction is next.
+ FLUSH_FOR_INTERPRETER =
+ (FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_INVALIDATE_MIPS_REGISTERS | FLUSH_FREE_CALLER_SAVED_REGISTERS | FLUSH_PC |
+ FLUSH_CYCLES | FLUSH_INSTRUCTION_BITS | FLUSH_LOAD_DELAY | FLUSH_GTE_DONE_CYCLE),
+ FLUSH_END_BLOCK = 0xFFFFFFFFu & ~(FLUSH_PC | FLUSH_CYCLES | FLUSH_GTE_DONE_CYCLE | FLUSH_INSTRUCTION_BITS |
+ FLUSH_GTE_STALL_FROM_STATE),
+ };
+
+ union CompileFlags
+ {
+ struct
+ {
+ u32 const_s : 1; // S is constant
+ u32 const_t : 1; // T is constant
+ u32 const_lo : 1; // LO is constant
+ u32 const_hi : 1; // HI is constant
+
+ u32 valid_host_d : 1; // D is valid in host register
+ u32 valid_host_s : 1; // S is valid in host register
+ u32 valid_host_t : 1; // T is valid in host register
+ u32 valid_host_lo : 1; // LO is valid in host register
+ u32 valid_host_hi : 1; // HI is valid in host register
+
+ u32 host_d : 5; // D host register
+ u32 host_s : 5; // S host register
+ u32 host_t : 5; // T host register
+ u32 host_lo : 5; // LO host register
+
+ u32 delay_slot_swapped : 1;
+ u32 pad1 : 2; // 28..31
+
+ u32 host_hi : 5; // HI host register
+
+ u32 mips_s : 5; // S guest register
+ u32 mips_t : 5; // T guest register
+
+ u32 pad2 : 15; // 32 bits
+ };
+
+ u64 bits;
+
+    ALWAYS_INLINE Reg MipsS() const { return static_cast<Reg>(mips_s); }
+    ALWAYS_INLINE Reg MipsT() const { return static_cast<Reg>(mips_t); }
+ };
+ static_assert(sizeof(CompileFlags) == sizeof(u64));
+
+ enum TemplateFlag : u32
+ {
+ TF_READS_S = (1 << 0),
+ TF_READS_T = (1 << 1),
+ TF_READS_LO = (1 << 2),
+ TF_READS_HI = (1 << 3),
+ TF_WRITES_D = (1 << 4),
+ TF_WRITES_T = (1 << 5),
+ TF_WRITES_LO = (1 << 6),
+ TF_WRITES_HI = (1 << 7),
+ TF_COMMUTATIVE = (1 << 8), // S op T == T op S
+ TF_CAN_OVERFLOW = (1 << 9),
+
+ // TF_NORENAME = // TODO
+ TF_LOAD_DELAY = (1 << 10),
+ TF_GTE_STALL = (1 << 11),
+
+ TF_NO_NOP = (1 << 12),
+ TF_NEEDS_REG_S = (1 << 13),
+ TF_NEEDS_REG_T = (1 << 14),
+ TF_CAN_SWAP_DELAY_SLOT = (1 << 15),
+
+ TF_RENAME_WITH_ZERO_T = (1 << 16), // add commutative for S as well
+ TF_RENAME_WITH_ZERO_IMM = (1 << 17),
+
+ TF_PGXP_WITHOUT_CPU = (1 << 18),
+ };
+
+ enum HostRegFlags : u8
+ {
+ HR_ALLOCATED = (1 << 0),
+ HR_NEEDED = (1 << 1),
+ HR_MODE_READ = (1 << 2), // valid
+ HR_MODE_WRITE = (1 << 3), // dirty
+
+ HR_USABLE = (1 << 7),
+ HR_CALLEE_SAVED = (1 << 6),
+
+ ALLOWED_HR_FLAGS = HR_MODE_READ | HR_MODE_WRITE,
+ IMMUTABLE_HR_FLAGS = HR_USABLE | HR_CALLEE_SAVED,
+ };
+
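+  // What a host register currently holds: a scratch temporary, a cached guest (MIPS) register,
+  // a PC value pending writeback, or the value of the current/next load delay.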
+ enum HostRegAllocType : u8
+ {
+ HR_TYPE_TEMP,
+ HR_TYPE_CPU_REG,
+ HR_TYPE_PC_WRITEBACK,
+ HR_TYPE_LOAD_DELAY_VALUE,
+ HR_TYPE_NEXT_LOAD_DELAY_VALUE,
+ };
+
+ struct HostRegAlloc
+ {
+ u8 flags;
+ HostRegAllocType type;
+ Reg reg;
+ u16 counter;
+ };
+
+ enum class BranchCondition : u8
+ {
+ Equal,
+ NotEqual,
+ GreaterThanZero,
+ GreaterEqualZero,
+ LessThanZero,
+ LessEqualZero,
+ };
+
+  ALWAYS_INLINE bool HasConstantReg(Reg r) const { return m_constant_regs_valid.test(static_cast<u32>(r)); }
+  ALWAYS_INLINE bool HasDirtyConstantReg(Reg r) const { return m_constant_regs_dirty.test(static_cast<u32>(r)); }
+  ALWAYS_INLINE bool HasConstantRegValue(Reg r, u32 val) const
+  {
+    return m_constant_regs_valid.test(static_cast<u32>(r)) && m_constant_reg_values[static_cast<u32>(r)] == val;
+  }
+  ALWAYS_INLINE u32 GetConstantRegU32(Reg r) const { return m_constant_reg_values[static_cast<u32>(r)]; }
+  ALWAYS_INLINE s32 GetConstantRegS32(Reg r) const
+  {
+    return static_cast<s32>(m_constant_reg_values[static_cast<u32>(r)]);
+  }
+ void SetConstantReg(Reg r, u32 v);
+ void ClearConstantReg(Reg r);
+ void FlushConstantReg(Reg r);
+ void FlushConstantRegs(bool invalidate);
+
+ Reg MipsD() const;
+ u32 GetConditionalBranchTarget(CompileFlags cf) const;
+ u32 GetBranchReturnAddress(CompileFlags cf) const;
+ bool TrySwapDelaySlot(Reg rs = Reg::zero, Reg rt = Reg::zero, Reg rd = Reg::zero);
+ void SetCompilerPC(u32 newpc);
+
+ virtual const void* GetCurrentCodePointer() = 0;
+
+ virtual void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
+ u32 far_code_space);
+ virtual void BeginBlock();
+ virtual void GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) = 0;
+ virtual void GenerateICacheCheckAndUpdate() = 0;
+ virtual void GenerateCall(const void* func, s32 arg1reg = -1, s32 arg2reg = -1, s32 arg3reg = -1) = 0;
+  virtual void EndBlock(const std::optional<u32>& newpc, bool do_event_test) = 0;
+ virtual void EndBlockWithException(Exception excode) = 0;
+ virtual const void* EndCompile(u32* code_size, u32* far_code_size) = 0;
+
+ ALWAYS_INLINE bool IsHostRegAllocated(u32 r) const { return (m_host_regs[r].flags & HR_ALLOCATED) != 0; }
+ static const char* GetReadWriteModeString(u32 flags);
+ virtual const char* GetHostRegName(u32 reg) const = 0;
+ u32 GetFreeHostReg(u32 flags);
+ u32 AllocateHostReg(u32 flags, HostRegAllocType type = HR_TYPE_TEMP, Reg reg = Reg::count);
+  std::optional<u32> CheckHostReg(u32 flags, HostRegAllocType type = HR_TYPE_TEMP, Reg reg = Reg::count);
+ u32 AllocateTempHostReg(u32 flags = 0);
+ void SwapHostRegAlloc(u32 lhs, u32 rhs);
+ void FlushHostReg(u32 reg);
+ void FreeHostReg(u32 reg);
+ void ClearHostReg(u32 reg);
+ void MarkRegsNeeded(HostRegAllocType type, Reg reg);
+ void RenameHostReg(u32 reg, u32 new_flags, HostRegAllocType new_type, Reg new_reg);
+ void ClearHostRegNeeded(u32 reg);
+ void ClearHostRegsNeeded();
+ void DeleteMIPSReg(Reg reg, bool flush);
+ bool TryRenameMIPSReg(Reg to, Reg from, u32 fromhost, Reg other);
+ void UpdateHostRegCounters();
+
+ virtual void LoadHostRegWithConstant(u32 reg, u32 val) = 0;
+ virtual void LoadHostRegFromCPUPointer(u32 reg, const void* ptr) = 0;
+ virtual void StoreConstantToCPUPointer(u32 val, const void* ptr) = 0;
+ virtual void StoreHostRegToCPUPointer(u32 reg, const void* ptr) = 0;
+ virtual void CopyHostReg(u32 dst, u32 src) = 0;
+ virtual void Flush(u32 flags);
+
+ /// Returns true if there is a load delay which will be stored at the end of the instruction.
+ bool HasLoadDelay() const { return m_load_delay_register != Reg::count; }
+
+ /// Cancels any pending load delay to the specified register.
+ void CancelLoadDelaysToReg(Reg reg);
+
+ /// Moves load delay to the next load delay, and writes any previous load delay to the destination register.
+ void UpdateLoadDelay();
+
+ /// Flushes the load delay, i.e. writes it to the destination register.
+ void FinishLoadDelay();
+
+ /// Flushes the load delay, but only if it matches the specified register.
+ void FinishLoadDelayToReg(Reg reg);
+
+ /// Uses a caller-saved register for load delays when PGXP is enabled.
+ u32 GetFlagsForNewLoadDelayedReg() const;
+
+ void BackupHostState();
+ void RestoreHostState();
+
+ /// Registers loadstore for possible backpatching.
+ void AddLoadStoreInfo(void* code_address, u32 code_size, u32 address_register, u32 data_register,
+ MemoryAccessSize size, bool is_signed, bool is_load);
+
+ void CompileInstruction();
+ void CompileBranchDelaySlot(bool dirty_pc = true);
+
+ void CompileTemplate(void (Compiler::*const_func)(CompileFlags), void (Compiler::*func)(CompileFlags),
+ const void* pgxp_cpu_func, u32 tflags);
+ void CompileLoadStoreTemplate(void (Compiler::*func)(CompileFlags, MemoryAccessSize, bool,
+                                                       const std::optional<VirtualMemoryAddress>&),
+                                MemoryAccessSize size, bool store, bool sign, u32 tflags);
+  void FlushForLoadStore(const std::optional<VirtualMemoryAddress>& address, bool store);
+ void CompileMoveRegTemplate(Reg dst, Reg src, bool pgxp_move);
+
+ virtual void GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg = Reg::count,
+ Reg arg3reg = Reg::count) = 0;
+
+ virtual void Compile_Fallback() = 0;
+
+ void Compile_j();
+ virtual void Compile_jr(CompileFlags cf) = 0;
+ void Compile_jr_const(CompileFlags cf);
+ void Compile_jal();
+ virtual void Compile_jalr(CompileFlags cf) = 0;
+ void Compile_jalr_const(CompileFlags cf);
+ void Compile_syscall();
+ void Compile_break();
+
+ void Compile_b_const(CompileFlags cf);
+ void Compile_b(CompileFlags cf);
+ void Compile_blez(CompileFlags cf);
+ void Compile_blez_const(CompileFlags cf);
+ void Compile_bgtz(CompileFlags cf);
+ void Compile_bgtz_const(CompileFlags cf);
+ void Compile_beq(CompileFlags cf);
+ void Compile_beq_const(CompileFlags cf);
+ void Compile_bne(CompileFlags cf);
+ void Compile_bne_const(CompileFlags cf);
+ virtual void Compile_bxx(CompileFlags cf, BranchCondition cond) = 0;
+ void Compile_bxx_const(CompileFlags cf, BranchCondition cond);
+
+ void Compile_sll_const(CompileFlags cf);
+ virtual void Compile_sll(CompileFlags cf) = 0;
+ void Compile_srl_const(CompileFlags cf);
+ virtual void Compile_srl(CompileFlags cf) = 0;
+ void Compile_sra_const(CompileFlags cf);
+ virtual void Compile_sra(CompileFlags cf) = 0;
+ void Compile_sllv_const(CompileFlags cf);
+ virtual void Compile_sllv(CompileFlags cf) = 0;
+ void Compile_srlv_const(CompileFlags cf);
+ virtual void Compile_srlv(CompileFlags cf) = 0;
+ void Compile_srav_const(CompileFlags cf);
+ virtual void Compile_srav(CompileFlags cf) = 0;
+ void Compile_mult_const(CompileFlags cf);
+ virtual void Compile_mult(CompileFlags cf) = 0;
+ void Compile_multu_const(CompileFlags cf);
+ virtual void Compile_multu(CompileFlags cf) = 0;
+ void Compile_div_const(CompileFlags cf);
+ virtual void Compile_div(CompileFlags cf) = 0;
+ void Compile_divu_const(CompileFlags cf);
+ virtual void Compile_divu(CompileFlags cf) = 0;
+ void Compile_add_const(CompileFlags cf);
+ virtual void Compile_add(CompileFlags cf) = 0;
+ void Compile_addu_const(CompileFlags cf);
+ virtual void Compile_addu(CompileFlags cf) = 0;
+ void Compile_sub_const(CompileFlags cf);
+ virtual void Compile_sub(CompileFlags cf) = 0;
+ void Compile_subu_const(CompileFlags cf);
+ virtual void Compile_subu(CompileFlags cf) = 0;
+ void Compile_and_const(CompileFlags cf);
+ virtual void Compile_and(CompileFlags cf) = 0;
+ void Compile_or_const(CompileFlags cf);
+ virtual void Compile_or(CompileFlags cf) = 0;
+ void Compile_xor_const(CompileFlags cf);
+ virtual void Compile_xor(CompileFlags cf) = 0;
+ void Compile_nor_const(CompileFlags cf);
+ virtual void Compile_nor(CompileFlags cf) = 0;
+ void Compile_slt_const(CompileFlags cf);
+ virtual void Compile_slt(CompileFlags cf) = 0;
+ void Compile_sltu_const(CompileFlags cf);
+ virtual void Compile_sltu(CompileFlags cf) = 0;
+
+ void Compile_addi_const(CompileFlags cf);
+ virtual void Compile_addi(CompileFlags cf) = 0;
+ void Compile_addiu_const(CompileFlags cf);
+ virtual void Compile_addiu(CompileFlags cf) = 0;
+ void Compile_slti_const(CompileFlags cf);
+ virtual void Compile_slti(CompileFlags cf) = 0;
+ void Compile_sltiu_const(CompileFlags cf);
+ virtual void Compile_sltiu(CompileFlags cf) = 0;
+ void Compile_andi_const(CompileFlags cf);
+ virtual void Compile_andi(CompileFlags cf) = 0;
+ void Compile_ori_const(CompileFlags cf);
+ virtual void Compile_ori(CompileFlags cf) = 0;
+ void Compile_xori_const(CompileFlags cf);
+ virtual void Compile_xori(CompileFlags cf) = 0;
+ void Compile_lui();
+
+  virtual void Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign,
+                           const std::optional<VirtualMemoryAddress>& address) = 0;
+  virtual void Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign,
+                           const std::optional<VirtualMemoryAddress>& address) = 0; // lwl/lwr
+  virtual void Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign,
+                            const std::optional<VirtualMemoryAddress>& address) = 0;
+  virtual void Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign,
+                           const std::optional<VirtualMemoryAddress>& address) = 0;
+  virtual void Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign,
+                           const std::optional<VirtualMemoryAddress>& address) = 0; // swl/swr
+  virtual void Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign,
+                            const std::optional<VirtualMemoryAddress>& address) = 0;
+
+ static u32* GetCop0RegPtr(Cop0Reg reg);
+ static u32 GetCop0RegWriteMask(Cop0Reg reg);
+
+ void Compile_mfc0(CompileFlags cf);
+ virtual void Compile_mtc0(CompileFlags cf) = 0;
+ virtual void Compile_rfe(CompileFlags cf) = 0;
+
+ void AddGTETicks(TickCount ticks);
+ void StallUntilGTEComplete();
+ virtual void Compile_mfc2(CompileFlags cf) = 0;
+ virtual void Compile_mtc2(CompileFlags cf) = 0;
+ virtual void Compile_cop2(CompileFlags cf) = 0;
+
+ enum GTERegisterAccessAction : u8
+ {
+ Ignore,
+ Direct,
+ ZeroExtend16,
+ SignExtend16,
+ CallHandler,
+ PushFIFO,
+ };
+
+  static std::pair<u32*, GTERegisterAccessAction> GetGTERegisterPointer(u32 index, bool writing);
+
+ CodeCache::Block* m_block = nullptr;
+ u32 m_compiler_pc = 0;
+ TickCount m_cycles = 0;
+ TickCount m_gte_done_cycle = 0;
+
+ const Instruction* inst = nullptr;
+ const CodeCache::InstructionInfo* iinfo = nullptr;
+ u32 m_current_instruction_pc = 0;
+ bool m_current_instruction_branch_delay_slot = false;
+ bool m_branch_delay_slot_swapped = false;
+
+ bool m_dirty_pc = false;
+ bool m_dirty_instruction_bits = false;
+ bool m_dirty_gte_done_cycle = false;
+ bool m_block_ended = false;
+
+  std::bitset<static_cast<size_t>(Reg::count)> m_constant_regs_valid = {};
+  std::bitset<static_cast<size_t>(Reg::count)> m_constant_regs_dirty = {};
+  std::array<u32, static_cast<size_t>(Reg::count)> m_constant_reg_values = {};
+
+  std::array<HostRegAlloc, NUM_HOST_REGS> m_host_regs = {};
+ u16 m_register_alloc_counter = 0;
+
+ bool m_load_delay_dirty = true;
+ Reg m_load_delay_register = Reg::count;
+ u32 m_load_delay_value_register = 0;
+
+ Reg m_next_load_delay_register = Reg::count;
+ u32 m_next_load_delay_value_register = 0;
+
+ struct HostStateBackup
+ {
+ TickCount cycles;
+ TickCount gte_done_cycle;
+ u32 compiler_pc;
+ bool dirty_pc;
+ bool dirty_instruction_bits;
+ bool dirty_gte_done_cycle;
+ bool block_ended;
+ const Instruction* inst;
+ const CodeCache::InstructionInfo* iinfo;
+ u32 current_instruction_pc;
+ bool current_instruction_delay_slot;
+    std::bitset<static_cast<size_t>(Reg::count)> const_regs_valid;
+    std::bitset<static_cast<size_t>(Reg::count)> const_regs_dirty;
+    std::array<u32, static_cast<size_t>(Reg::count)> const_regs_values;
+    std::array<HostRegAlloc, NUM_HOST_REGS> host_regs;
+ u16 register_alloc_counter;
+ bool load_delay_dirty;
+ Reg load_delay_register;
+ u32 load_delay_value_register;
+ Reg next_load_delay_register;
+ u32 next_load_delay_value_register;
+ };
+
+ // we need two of these, one for branch delays, and another if we have an overflow in the delay slot
+  std::array<HostStateBackup, 2> m_host_state_backup = {};
+ u32 m_host_state_backup_count = 0;
+
+ // PGXP memory callbacks
+  static const std::array<std::array<const void*, 2>, 3> s_pgxp_mem_load_functions;
+  static const std::array<const void*, 3> s_pgxp_mem_store_functions;
+};
+
+void BackpatchLoadStore(void* exception_pc, const CodeCache::LoadstoreBackpatchInfo& info);
+
+u32 CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size, TickCount cycles_to_add,
+ TickCount cycles_to_remove, u32 gpr_bitmask, u8 address_register, u8 data_register,
+ MemoryAccessSize size, bool is_signed, bool is_load);
+
+extern Compiler* g_compiler;
+} // namespace CPU::NewRec
diff --git a/src/core/cpu_newrec_compiler_aarch64.cpp b/src/core/cpu_newrec_compiler_aarch64.cpp
new file mode 100644
index 0000000000..4d05927bd5
--- /dev/null
+++ b/src/core/cpu_newrec_compiler_aarch64.cpp
@@ -0,0 +1,2235 @@
+// SPDX-FileCopyrightText: 2023 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "cpu_newrec_compiler_aarch64.h"
+#include "common/align.h"
+#include "common/assert.h"
+#include "common/log.h"
+#include "common/string_util.h"
+#include "cpu_core_private.h"
+#include "cpu_recompiler_thunks.h"
+#include "gte.h"
+#include "pgxp.h"
+#include "settings.h"
+#include "timing_event.h"
+#include
+Log_SetChannel(CPU::NewRec);
+
+#define DUMP_BLOCKS
+
+#ifdef DUMP_BLOCKS
+#include "vixl/aarch64/disasm-aarch64.h"
+#endif
+
+using namespace vixl::aarch64;
+
+#define RWRET vixl::aarch64::w0
+#define RXRET vixl::aarch64::x0
+#define RWARG1 vixl::aarch64::w0
+#define RXARG1 vixl::aarch64::x0
+#define RWARG2 vixl::aarch64::w1
+#define RXARG2 vixl::aarch64::x1
+#define RWARG3 vixl::aarch64::w2
+#define RXARG3 vixl::aarch64::x2
+#define RWSCRATCH vixl::aarch64::w16
+#define RXSCRATCH vixl::aarch64::x16
+#define RSTATE vixl::aarch64::x19
+#define RMEMBASE vixl::aarch64::x20
+
+#define PTR(x) vixl::aarch64::MemOperand(RSTATE, (u32)(((u8*)(x)) - ((u8*)&g_state)))
+
+namespace CPU::NewRec {
+
+using CPU::Recompiler::armEmitCall;
+using CPU::Recompiler::armEmitCondBranch;
+using CPU::Recompiler::armEmitJmp;
+using CPU::Recompiler::armEmitMov;
+using CPU::Recompiler::armGetJumpTrampoline;
+using CPU::Recompiler::armGetPCDisplacement;
+using CPU::Recompiler::armIsCallerSavedRegister;
+using CPU::Recompiler::armMoveAddressToReg;
+
+AArch64Compiler s_instance;
+Compiler* g_compiler = &s_instance;
+
+} // namespace CPU::NewRec
+
+CPU::NewRec::AArch64Compiler::AArch64Compiler() = default;
+
+CPU::NewRec::AArch64Compiler::~AArch64Compiler() = default;
+
+const void* CPU::NewRec::AArch64Compiler::GetCurrentCodePointer()
+{
+  return armAsm->GetCursorAddress<const void*>();
+}
+
+void CPU::NewRec::AArch64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,
+ u8* far_code_buffer, u32 far_code_space)
+{
+ Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
+
+ // TODO: don't recreate this every time..
+ DebugAssert(!m_emitter && !m_far_emitter && !armAsm);
+  m_emitter = std::make_unique<Assembler>(code_buffer, code_buffer_space, PositionDependentCode);
+  m_far_emitter = std::make_unique<Assembler>(far_code_buffer, far_code_space, PositionDependentCode);
+ armAsm = m_emitter.get();
+
+#ifdef VIXL_DEBUG
+  m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(m_emitter.get(), code_buffer_space,
+                                                                 vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
+  m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
+    m_far_emitter.get(), far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
+#endif
+
+ // Need to wipe it out so it's correct when toggling fastmem.
+ m_host_regs = {};
+
+ const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS;
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ HostRegAlloc& ra = m_host_regs[i];
+
+    if (i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() || i == RWSCRATCH.GetCode() ||
+        i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i >= 30)
+ {
+ continue;
+ }
+
+ ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond)
+{
+ DebugAssert(armAsm == m_emitter.get());
+ if (emit_jump)
+ {
+    const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress<const void*>());
+ if (cond != Condition::al)
+ {
+ if (vixl::IsInt19(disp))
+ {
+ armAsm->b(disp, cond);
+ }
+ else
+ {
+ Label skip;
+ armAsm->b(&skip, vixl::aarch64::InvertCondition(cond));
+        armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress<const void*>()));
+ armAsm->bind(&skip);
+ }
+ }
+ else
+ {
+ armAsm->b(disp);
+ }
+ }
+ armAsm = m_far_emitter.get();
+}
+
+void CPU::NewRec::AArch64Compiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit)
+{
+  const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress<const void*>());
+ if (vixl::IsInt14(disp))
+ {
+ armAsm->tbnz(reg, bit, disp);
+ }
+ else
+ {
+ Label skip;
+ armAsm->tbz(reg, bit, &skip);
+    armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress<const void*>()));
+ armAsm->bind(&skip);
+ }
+
+ armAsm = m_far_emitter.get();
+}
+
+void CPU::NewRec::AArch64Compiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero)
+{
+  const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress<const void*>());
+ if (vixl::IsInt19(disp))
+ {
+ nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp);
+ }
+ else
+ {
+ Label skip;
+ nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip);
+    armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress<const void*>()));
+ armAsm->bind(&skip);
+ }
+
+ armAsm = m_far_emitter.get();
+}
+
+void CPU::NewRec::AArch64Compiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond)
+{
+ DebugAssert(armAsm == m_far_emitter.get());
+ if (emit_jump)
+ {
+    const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter->GetCursorAddress<const void*>());
+ (cond != Condition::al) ? armAsm->b(disp, cond) : armAsm->b(disp);
+ }
+ armAsm = m_emitter.get();
+}
+
+void CPU::NewRec::AArch64Compiler::EmitMov(const vixl::aarch64::WRegister& dst, u32 val)
+{
+ armEmitMov(armAsm, dst, val);
+}
+
+void CPU::NewRec::AArch64Compiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
+{
+ armEmitCall(armAsm, ptr, force_inline);
+}
+
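+// Returns the constant as an immediate operand when it fits AArch64's add/sub immediate encoding;
+// otherwise materializes it into the scratch register and returns that.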
+vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckAddSubConstant(s32 val)
+{
+ if (Assembler::IsImmAddSub(val))
+ return vixl::aarch64::Operand(static_cast(val));
+
+ EmitMov(RWSCRATCH, static_cast(val));
+ return vixl::aarch64::Operand(RWSCRATCH);
+}
+
+vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckAddSubConstant(u32 val)
+{
+ return armCheckAddSubConstant(static_cast(val));
+}
+
+vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckCompareConstant(s32 val)
+{
+ if (Assembler::IsImmConditionalCompare(val))
+ return vixl::aarch64::Operand(static_cast(val));
+
+ EmitMov(RWSCRATCH, static_cast(val));
+ return vixl::aarch64::Operand(RWSCRATCH);
+}
+
+vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckLogicalConstant(u32 val)
+{
+ if (Assembler::IsImmLogical(val, 32))
+ return vixl::aarch64::Operand(static_cast(static_cast(val)));
+
+ EmitMov(RWSCRATCH, val);
+ return vixl::aarch64::Operand(RWSCRATCH);
+}
+
+void CPU::NewRec::AArch64Compiler::BeginBlock()
+{
+ Compiler::BeginBlock();
+}
+
+void CPU::NewRec::AArch64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
+{
+ // store it first to reduce code size, because we can offset
+ armMoveAddressToReg(armAsm, RXARG1, ram_ptr);
+ armMoveAddressToReg(armAsm, RXARG2, shadow_ptr);
+
+ bool first = true;
+ u32 offset = 0;
+ Label block_changed;
+
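+  // Compare the block's RAM against its shadow copy 16 bytes at a time with NEON, accumulating the
+  // equality mask, then fall back to 8- and 4-byte scalar compares for the tail.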
+ while (size >= 16)
+ {
+ const VRegister vtmp = v2.V4S();
+ const VRegister dst = first ? v0.V4S() : v1.V4S();
+ armAsm->ldr(dst, MemOperand(RXARG1, offset));
+ armAsm->ldr(vtmp, MemOperand(RXARG2, offset));
+ armAsm->cmeq(dst, dst, vtmp);
+ if (!first)
+ armAsm->and_(dst.V16B(), dst.V16B(), vtmp.V16B());
+ else
+ first = false;
+
+ offset += 16;
+ size -= 16;
+ }
+
+ if (!first)
+ {
+ // TODO: make sure this doesn't choke on ffffffff
+ armAsm->uminv(s0, v0.V4S());
+ armAsm->fcmp(s0, 0.0);
+ armAsm->b(&block_changed, eq);
+ }
+
+ while (size >= 8)
+ {
+ armAsm->ldr(RXARG3, MemOperand(RXARG1, offset));
+ armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset));
+ armAsm->cmp(RXARG3, RXSCRATCH);
+ armAsm->b(&block_changed, ne);
+ offset += 8;
+ size -= 8;
+ }
+
+ while (size >= 4)
+ {
+ armAsm->ldr(RWARG3, MemOperand(RXARG1, offset));
+ armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset));
+ armAsm->cmp(RWARG3, RWSCRATCH);
+ armAsm->b(&block_changed, ne);
+ offset += 4;
+ size -= 4;
+ }
+
+ DebugAssert(size == 0);
+
+ Label block_unchanged;
+ armAsm->b(&block_unchanged);
+ armAsm->bind(&block_changed);
+ armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
+ armAsm->bind(&block_unchanged);
+}
+
+void CPU::NewRec::AArch64Compiler::GenerateICacheCheckAndUpdate()
+{
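+  // KSEG1 and above bypass the I-cache, so just charge the block's uncached fetch time; otherwise
+  // check each I-cache line tag and add the fill penalty for lines that miss.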
+ if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1)
+ {
+ armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
+    armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
+ armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
+ }
+ else
+ {
+ const auto& ticks_reg = RWARG1;
+ const auto& current_tag_reg = RWARG2;
+ const auto& existing_tag_reg = RWARG3;
+
+ VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
+ armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
+ armEmitMov(armAsm, current_tag_reg, current_pc);
+
+ for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
+ {
+ const TickCount fill_ticks = GetICacheFillTicks(current_pc);
+ if (fill_ticks <= 0)
+ continue;
+
+ const u32 line = GetICacheLine(current_pc);
+ const u32 offset = offsetof(State, icache_tags) + (line * sizeof(u32));
+
+ Label cache_hit;
+ armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
+ armAsm->cmp(existing_tag_reg, current_tag_reg);
+ armAsm->b(&cache_hit, eq);
+
+ armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
+      armAsm->add(ticks_reg, ticks_reg, armCheckAddSubConstant(static_cast<u32>(fill_ticks)));
+ armAsm->bind(&cache_hit);
+
+ if (i != (m_block->icache_line_count - 1))
+ armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
+ }
+
+ armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
+ s32 arg3reg /*= -1*/)
+{
+  if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.GetCode()))
+    armAsm->mov(RXARG1, XRegister(arg1reg));
+  if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.GetCode()))
+    armAsm->mov(RXARG2, XRegister(arg2reg));
+  if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.GetCode()))
+    armAsm->mov(RXARG3, XRegister(arg3reg));
+ EmitCall(func);
+}
+
+void CPU::NewRec::AArch64Compiler::EndBlock(const std::optional& newpc, bool do_event_test)
+{
+ if (newpc.has_value())
+ {
+ if (m_dirty_pc || m_compiler_pc != newpc)
+ {
+ EmitMov(RWSCRATCH, newpc.value());
+ armAsm->str(RWSCRATCH, PTR(&g_state.pc));
+ }
+ }
+ m_dirty_pc = false;
+
+ // flush regs
+ Flush(FLUSH_END_BLOCK);
+ EndAndLinkBlock(newpc, do_event_test);
+}
+
+void CPU::NewRec::AArch64Compiler::EndBlockWithException(Exception excode)
+{
+ // flush regs, but not pc, it's going to get overwritten
+ // flush cycles because of the GTE instruction stuff...
+ Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION);
+
+ // TODO: flush load delay
+ // TODO: break for pcdrv
+
+ EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
+ inst->cop.cop_n));
+ EmitMov(RWARG2, m_current_instruction_pc);
+  EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
+ m_dirty_pc = false;
+
+ EndAndLinkBlock(std::nullopt, true);
+}
+
+void CPU::NewRec::AArch64Compiler::EndAndLinkBlock(const std::optional& newpc, bool do_event_test)
+{
+ // event test
+ // pc should've been flushed
+ DebugAssert(!m_dirty_pc);
+
+ // TODO: try extracting this to a function
+ // TODO: move the cycle flush in here..
+
+ // save cycles for event test
+ const TickCount cycles = std::exchange(m_cycles, 0);
+
+ // pending_ticks += cycles
+ // if (pending_ticks >= downcount) { dispatch_event(); }
+ if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
+ armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
+ if (do_event_test)
+ armAsm->ldr(RWARG2, PTR(&g_state.downcount));
+ if (cycles > 0)
+ armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles));
+ if (m_gte_done_cycle > cycles)
+ {
+ armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
+ armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick));
+ }
+ if (do_event_test)
+ armAsm->cmp(RWARG1, RWARG2);
+ if (cycles > 0)
+ armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
+ if (do_event_test)
+ armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);
+
+ // jump to dispatcher or next block
+ if (!newpc.has_value())
+ {
+ armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
+ }
+ else
+ {
+ if (newpc.value() == m_block->pc)
+ {
+ // Special case: ourselves! No need to backlink then.
+ Log_DebugPrintf("Linking block at %08X to self", m_block->pc);
+      armEmitJmp(armAsm, armAsm->GetBuffer()->GetStartAddress<const void*>(), true);
+ }
+ else
+ {
+      const void* target = CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
+ armEmitJmp(armAsm, target, true);
+ }
+ }
+
+ m_block_ended = true;
+}
+
+const void* CPU::NewRec::AArch64Compiler::EndCompile(u32* code_size, u32* far_code_size)
+{
+#ifdef VIXL_DEBUG
+ m_emitter_check.reset();
+ m_far_emitter_check.reset();
+#endif
+
+ m_emitter->FinalizeCode();
+ m_far_emitter->FinalizeCode();
+
+  u8* const code = m_emitter->GetBuffer()->GetStartAddress<u8*>();
+  *code_size = static_cast<u32>(m_emitter->GetCursorOffset());
+  *far_code_size = static_cast<u32>(m_far_emitter->GetCursorOffset());
+ armAsm = nullptr;
+ m_far_emitter.reset();
+ m_emitter.reset();
+ return code;
+}
+
+const char* CPU::NewRec::AArch64Compiler::GetHostRegName(u32 reg) const
+{
+  static constexpr std::array<const char*, 32> reg64_names = {
+ {"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+ "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}};
+ return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
+}
+
+void CPU::NewRec::AArch64Compiler::LoadHostRegWithConstant(u32 reg, u32 val)
+{
+ EmitMov(WRegister(reg), val);
+}
+
+void CPU::NewRec::AArch64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
+{
+ armAsm->ldr(WRegister(reg), PTR(ptr));
+}
+
+void CPU::NewRec::AArch64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
+{
+ armAsm->str(WRegister(reg), PTR(ptr));
+}
+
+void CPU::NewRec::AArch64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
+{
+ if (val == 0)
+ {
+ armAsm->str(wzr, PTR(ptr));
+ return;
+ }
+
+ EmitMov(RWSCRATCH, val);
+ armAsm->str(RWSCRATCH, PTR(ptr));
+}
+
+void CPU::NewRec::AArch64Compiler::CopyHostReg(u32 dst, u32 src)
+{
+ if (src != dst)
+ armAsm->mov(WRegister(dst), WRegister(src));
+}
+
+void CPU::NewRec::AArch64Compiler::AssertRegOrConstS(CompileFlags cf) const
+{
+ DebugAssert(cf.valid_host_s || cf.const_s);
+}
+
+void CPU::NewRec::AArch64Compiler::AssertRegOrConstT(CompileFlags cf) const
+{
+ DebugAssert(cf.valid_host_t || cf.const_t);
+}
+
+vixl::aarch64::MemOperand CPU::NewRec::AArch64Compiler::MipsPtr(Reg r) const
+{
+ DebugAssert(r < Reg::count);
+  return PTR(&g_state.regs.r[static_cast<u8>(r)]);
+}
+
+vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegD(CompileFlags cf) const
+{
+ DebugAssert(cf.valid_host_d);
+ return WRegister(cf.host_d);
+}
+
+vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegS(CompileFlags cf) const
+{
+ DebugAssert(cf.valid_host_s);
+ return WRegister(cf.host_s);
+}
+
+vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegT(CompileFlags cf) const
+{
+ DebugAssert(cf.valid_host_t);
+ return WRegister(cf.host_t);
+}
+
+vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegLO(CompileFlags cf) const
+{
+ DebugAssert(cf.valid_host_lo);
+ return WRegister(cf.host_lo);
+}
+
+vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegHI(CompileFlags cf) const
+{
+ DebugAssert(cf.valid_host_hi);
+ return WRegister(cf.host_hi);
+}
+
+void CPU::NewRec::AArch64Compiler::MoveSToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf)
+{
+ if (cf.valid_host_s)
+ {
+ if (cf.host_s != dst.GetCode())
+ armAsm->mov(dst, WRegister(cf.host_s));
+ }
+ else if (cf.const_s)
+ {
+ const u32 cv = GetConstantRegU32(cf.MipsS());
+ if (cv == 0)
+ armAsm->mov(dst, wzr);
+ else
+ EmitMov(dst, cv);
+ }
+ else
+ {
+ Log_WarningPrintf("Hit memory path in MoveSToReg() for %s", GetRegName(cf.MipsS()));
+ armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::MoveTToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf)
+{
+ if (cf.valid_host_t)
+ {
+ if (cf.host_t != dst.GetCode())
+ armAsm->mov(dst, WRegister(cf.host_t));
+ }
+ else if (cf.const_t)
+ {
+ const u32 cv = GetConstantRegU32(cf.MipsT());
+ if (cv == 0)
+ armAsm->mov(dst, wzr);
+ else
+ EmitMov(dst, cv);
+ }
+ else
+ {
+ Log_WarningPrintf("Hit memory path in MoveTToReg() for %s", GetRegName(cf.MipsT()));
+ armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::MoveMIPSRegToReg(const vixl::aarch64::WRegister& dst, Reg reg)
+{
+ DebugAssert(reg < Reg::count);
+  if (const std::optional<u32> hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg))
+ armAsm->mov(dst, WRegister(hreg.value()));
+ else if (HasConstantReg(reg))
+ EmitMov(dst, GetConstantRegU32(reg));
+ else
+ armAsm->ldr(dst, MipsPtr(reg));
+}
+
+void CPU::NewRec::AArch64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val,
+ Reg arg2reg /* = Reg::count */,
+ Reg arg3reg /* = Reg::count */)
+{
+ DebugAssert(g_settings.gpu_pgxp_enable);
+
+ Flush(FLUSH_FOR_C_CALL);
+
+ if (arg2reg != Reg::count)
+ MoveMIPSRegToReg(RWARG2, arg2reg);
+ if (arg3reg != Reg::count)
+ MoveMIPSRegToReg(RWARG3, arg3reg);
+
+ EmitMov(RWARG1, arg1val);
+ EmitCall(func);
+}
+
+void CPU::NewRec::AArch64Compiler::Flush(u32 flags)
+{
+ Compiler::Flush(flags);
+
+ if (flags & FLUSH_PC && m_dirty_pc)
+ {
+ StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
+ m_dirty_pc = false;
+ }
+
+ if (flags & FLUSH_INSTRUCTION_BITS)
+ {
+ // This sucks, but it's only used for fallbacks.
+ Panic("Not implemented");
+ }
+
+ if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
+ {
+ // This sucks :(
+ // TODO: make it a function?
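+ // Store load_delay_value into regs.r[load_delay_reg] via an offset computed from RSTATE, then clear the slot.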
+ armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg));
+ armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value));
+ EmitMov(RWSCRATCH, offsetof(CPU::State, regs.r[0]));
+ armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2));
+ armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1));
+ EmitMov(RWSCRATCH, static_cast<u8>(Reg::count));
+ armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
+ m_load_delay_dirty = false;
+ }
+
+ if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
+ {
+ if (m_load_delay_value_register != NUM_HOST_REGS)
+ FreeHostReg(m_load_delay_value_register);
+
+ EmitMov(RWSCRATCH, static_cast<u8>(m_load_delay_register));
+ armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
+ m_load_delay_register = Reg::count;
+ m_load_delay_dirty = true;
+ }
+
+ if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
+ {
+ // May as well flush cycles while we're here.
+ // GTE spanning blocks is very rare, we _could_ disable this for speed.
+ armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
+ armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick));
+ if (m_cycles > 0)
+ {
+ armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
+ m_cycles = 0;
+ }
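+ // pending_ticks = max(pending_ticks, gte_completion_tick), i.e. stall until the in-flight GTE op completes.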
+ armAsm->cmp(RWARG2, RWARG1);
+ armAsm->csel(RWARG1, RWARG2, RWARG1, hs);
+ armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
+ m_dirty_gte_done_cycle = false;
+ }
+
+ if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
+ {
+ armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
+
+ // update cycles at the same time
+ if (flags & FLUSH_CYCLES && m_cycles > 0)
+ {
+ armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
+ armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
+ m_gte_done_cycle -= m_cycles;
+ m_cycles = 0;
+ }
+
+ armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle));
+ armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick));
+ m_gte_done_cycle = 0;
+ m_dirty_gte_done_cycle = true;
+ }
+
+ if (flags & FLUSH_CYCLES && m_cycles > 0)
+ {
+ armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
+ armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
+ armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
+ m_gte_done_cycle = std::max(m_gte_done_cycle - m_cycles, 0);
+ m_cycles = 0;
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_Fallback()
+{
+ Flush(FLUSH_FOR_INTERPRETER);
+
+#if 0
+ cg->call(&CPU::Recompiler::Thunks::InterpretInstruction);
+
+ // TODO: make me less garbage
+ // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
+ // but nothing should be going through here..
+ Label no_load_delay;
+ cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]);
+ cg->cmp(RWARG1, static_cast<u8>(Reg::count));
+ cg->je(no_load_delay, CodeGenerator::T_SHORT);
+ cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]);
+ cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1);
+ cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2);
+ cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast<u8>(Reg::count));
+ cg->L(no_load_delay);
+
+ m_load_delay_dirty = EMULATE_LOAD_DELAYS;
+#else
+ Panic("Fixme");
+#endif
+}
+
+void CPU::NewRec::AArch64Compiler::CheckBranchTarget(const vixl::aarch64::WRegister& pcreg)
+{
+ if (!g_settings.cpu_recompiler_memory_exceptions)
+ return;
+
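+ // A target with either of the low two bits set takes the far path and raises an address error (AdEL).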
+ armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
+ SwitchToFarCode(true, ne);
+
+ BackupHostState();
+ EndBlockWithException(Exception::AdEL);
+
+ RestoreHostState();
+ SwitchToNearCode(false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_jr(CompileFlags cf)
+{
+ const WRegister pcreg = CFGetRegS(cf);
+ CheckBranchTarget(pcreg);
+
+ armAsm->str(pcreg, PTR(&g_state.pc));
+
+ CompileBranchDelaySlot(false);
+ EndBlock(std::nullopt, true);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_jalr(CompileFlags cf)
+{
+ const WRegister pcreg = CFGetRegS(cf);
+ if (MipsD() != Reg::zero)
+ SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
+
+ CheckBranchTarget(pcreg);
+ armAsm->str(pcreg, PTR(&g_state.pc));
+
+ CompileBranchDelaySlot(false);
+ EndBlock(std::nullopt, true);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
+{
+ AssertRegOrConstS(cf);
+
+ const u32 taken_pc = GetConditionalBranchTarget(cf);
+
+ Flush(FLUSH_FOR_BRANCH);
+
+ DebugAssert(cf.valid_host_s);
+
+ // MipsT() here should equal zero for zero branches.
+ DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
+
+ Label taken;
+ const WRegister rs = CFGetRegS(cf);
+ switch (cond)
+ {
+ case BranchCondition::Equal:
+ case BranchCondition::NotEqual:
+ {
+ AssertRegOrConstT(cf);
+ if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))
+ {
+ (cond == BranchCondition::Equal) ? armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken);
+ }
+ else
+ {
+ if (cf.valid_host_t)
+ armAsm->cmp(rs, CFGetRegT(cf));
+ else if (cf.const_t)
+ armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));
+
+ armAsm->b(&taken, (cond == BranchCondition::Equal) ? eq : ne);
+ }
+ }
+ break;
+
+ case BranchCondition::GreaterThanZero:
+ {
+ armAsm->cmp(rs, 0);
+ armAsm->b(&taken, gt);
+ }
+ break;
+
+ case BranchCondition::GreaterEqualZero:
+ {
+ armAsm->cmp(rs, 0);
+ armAsm->b(&taken, ge);
+ }
+ break;
+
+ case BranchCondition::LessThanZero:
+ {
+ armAsm->cmp(rs, 0);
+ armAsm->b(&taken, lt);
+ }
+ break;
+
+ case BranchCondition::LessEqualZero:
+ {
+ armAsm->cmp(rs, 0);
+ armAsm->b(&taken, le);
+ }
+ break;
+ }
+
+ BackupHostState();
+ if (!cf.delay_slot_swapped)
+ CompileBranchDelaySlot();
+
+ EndBlock(m_compiler_pc, true);
+
+ armAsm->bind(&taken);
+
+ RestoreHostState();
+ if (!cf.delay_slot_swapped)
+ CompileBranchDelaySlot();
+
+ EndBlock(taken_pc, true);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_addi(CompileFlags cf, bool overflow)
+{
+ const WRegister rs = CFGetRegS(cf);
+ const WRegister rt = CFGetRegT(cf);
+ if (const u32 imm = inst->i.imm_sext32(); imm != 0)
+ {
+ if (!overflow)
+ {
+ armAsm->add(rt, rs, armCheckAddSubConstant(imm));
+ }
+ else
+ {
+ armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
+ TestOverflow(rt);
+ }
+ }
+ else if (rt.GetCode() != rs.GetCode())
+ {
+ armAsm->mov(rt, rs);
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_addi(CompileFlags cf)
+{
+ Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_addiu(CompileFlags cf)
+{
+ Compile_addi(cf, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_slti(CompileFlags cf)
+{
+ Compile_slti(cf, true);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_sltiu(CompileFlags cf)
+{
+ Compile_slti(cf, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_slti(CompileFlags cf, bool sign)
+{
+ armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
+ armAsm->cset(CFGetRegT(cf), sign ? lt : lo);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_andi(CompileFlags cf)
+{
+ const WRegister rt = CFGetRegT(cf);
+ if (const u32 imm = inst->i.imm_zext32(); imm != 0)
+ armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
+ else
+ armAsm->mov(rt, wzr);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_ori(CompileFlags cf)
+{
+ const WRegister rt = CFGetRegT(cf);
+ const WRegister rs = CFGetRegS(cf);
+ if (const u32 imm = inst->i.imm_zext32(); imm != 0)
+ armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
+ else if (rt.GetCode() != rs.GetCode())
+ armAsm->mov(rt, rs);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_xori(CompileFlags cf)
+{
+ const WRegister rt = CFGetRegT(cf);
+ const WRegister rs = CFGetRegS(cf);
+ if (const u32 imm = inst->i.imm_zext32(); imm != 0)
+ armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
+ else if (rt.GetCode() != rs.GetCode())
+ armAsm->mov(rt, rs);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_shift(CompileFlags cf,
+ void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
+ const vixl::aarch64::Register&,
+ unsigned))
+{
+ const WRegister rd = CFGetRegD(cf);
+ const WRegister rt = CFGetRegT(cf);
+ if (inst->r.shamt > 0)
+ (armAsm->*op)(rd, rt, inst->r.shamt);
+ else if (rd.GetCode() != rt.GetCode())
+ armAsm->mov(rd, rt);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_sll(CompileFlags cf)
+{
+ Compile_shift(cf, &Assembler::lsl);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_srl(CompileFlags cf)
+{
+ Compile_shift(cf, &Assembler::lsr);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_sra(CompileFlags cf)
+{
+ Compile_shift(cf, &Assembler::asr);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_variable_shift(
+ CompileFlags cf,
+ void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&,
+ const vixl::aarch64::Register&),
+ void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned))
+{
+ const WRegister rd = CFGetRegD(cf);
+
+ AssertRegOrConstS(cf);
+ AssertRegOrConstT(cf);
+
+ const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
+ if (!cf.valid_host_t)
+ MoveTToReg(rt, cf);
+
+ if (cf.const_s)
+ {
+ if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
+ (armAsm->*op_const)(rd, rt, shift);
+ else if (rd.GetCode() != rt.GetCode())
+ armAsm->mov(rd, rt);
+ }
+ else
+ {
+ (armAsm->*op)(rd, rt, CFGetRegS(cf));
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_sllv(CompileFlags cf)
+{
+ Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_srlv(CompileFlags cf)
+{
+ Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_srav(CompileFlags cf)
+{
+ Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_mult(CompileFlags cf, bool sign)
+{
+ const WRegister rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
+ if (!cf.valid_host_s)
+ MoveSToReg(rs, cf);
+
+ const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
+ if (!cf.valid_host_t)
+ MoveTToReg(rt, cf);
+
+ // TODO: if lo/hi gets killed, we can use a 32-bit multiply
+ const WRegister lo = CFGetRegLO(cf);
+ const WRegister hi = CFGetRegHI(cf);
+
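+ // The full 64-bit product is written into LO's X register; the shift below moves the upper half into HI.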
+ (sign) ? armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt);
+ armAsm->lsr(hi.X(), lo.X(), 32);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_mult(CompileFlags cf)
+{
+ Compile_mult(cf, true);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_multu(CompileFlags cf)
+{
+ Compile_mult(cf, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_div(CompileFlags cf)
+{
+ const WRegister rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
+ if (!cf.valid_host_s)
+ MoveSToReg(rs, cf);
+
+ const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
+ if (!cf.valid_host_t)
+ MoveTToReg(rt, cf);
+
+ const WRegister rlo = CFGetRegLO(cf);
+ const WRegister rhi = CFGetRegHI(cf);
+
+ // TODO: This could be slightly more optimal
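+ // Three cases: division by zero, the unrepresentable 0x80000000 / -1 quotient, and the normal sdiv path.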
+ Label done;
+ Label not_divide_by_zero;
+ armAsm->cbnz(rt, &not_divide_by_zero);
+ armAsm->cmp(rs, 0);
+ armAsm->mov(rhi, rs); // hi = num
+ EmitMov(rlo, 1);
+ EmitMov(RWSCRATCH, static_cast<u32>(-1));
+ armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 1
+ armAsm->b(&done);
+
+ armAsm->bind(&not_divide_by_zero);
+ Label not_unrepresentable;
+ armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
+ armAsm->b(&not_unrepresentable, ne);
+ armAsm->cmp(rt, armCheckCompareConstant(-1));
+ armAsm->b(&not_unrepresentable, ne);
+
+ EmitMov(rlo, 0x80000000u);
+ EmitMov(rhi, 0);
+ armAsm->b(&done);
+
+ armAsm->bind(&not_unrepresentable);
+
+ armAsm->sdiv(rlo, rs, rt);
+
+ // TODO: skip when hi is dead
+ armAsm->msub(rhi, rlo, rt, rs);
+
+ armAsm->bind(&done);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_divu(CompileFlags cf)
+{
+ const WRegister rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
+ if (!cf.valid_host_s)
+ MoveSToReg(rs, cf);
+
+ const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
+ if (!cf.valid_host_t)
+ MoveTToReg(rt, cf);
+
+ const WRegister rlo = CFGetRegLO(cf);
+ const WRegister rhi = CFGetRegHI(cf);
+
+ Label done;
+ Label not_divide_by_zero;
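+ // Unsigned division by zero: LO = 0xFFFFFFFF, HI = numerator.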
+ armAsm->cbnz(rt, &not_divide_by_zero);
+ EmitMov(rlo, static_cast<u32>(-1));
+ armAsm->mov(rhi, rs);
+ armAsm->b(&done);
+
+ armAsm->bind(&not_divide_by_zero);
+
+ armAsm->udiv(rlo, rs, rt);
+
+ // TODO: skip when hi is dead
+ armAsm->msub(rhi, rlo, rt, rs);
+
+ armAsm->bind(&done);
+}
+
+void CPU::NewRec::AArch64Compiler::TestOverflow(const vixl::aarch64::WRegister& result)
+{
+ SwitchToFarCode(true, vs);
+
+ BackupHostState();
+
+ // toss the result
+ ClearHostReg(result.GetCode());
+
+ EndBlockWithException(Exception::Ov);
+
+ RestoreHostState();
+
+ SwitchToNearCode(false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_dst_op(CompileFlags cf,
+ void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
+ const vixl::aarch64::Register&,
+ const vixl::aarch64::Operand&),
+ bool commutative, bool logical, bool overflow)
+{
+ AssertRegOrConstS(cf);
+ AssertRegOrConstT(cf);
+
+ const WRegister rd = CFGetRegD(cf);
+ if (cf.valid_host_s && cf.valid_host_t)
+ {
+ (armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
+ }
+ else if (commutative && (cf.const_s || cf.const_t))
+ {
+ const WRegister src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
+ if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
+ {
+ (armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
+ }
+ else
+ {
+ if (rd.GetCode() != src.GetCode())
+ armAsm->mov(rd, src);
+ overflow = false;
+ }
+ }
+ else if (cf.const_s)
+ {
+ // TODO: Check where we can use wzr here
+ EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS()));
+ (armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf));
+ }
+ else if (cf.const_t)
+ {
+ const WRegister rs = CFGetRegS(cf);
+ if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
+ {
+ (armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
+ }
+ else
+ {
+ if (rd.GetCode() != rs.GetCode())
+ armAsm->mov(rd, rs);
+ overflow = false;
+ }
+ }
+
+ if (overflow)
+ TestOverflow(rd);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_add(CompileFlags cf)
+{
+ if (g_settings.cpu_recompiler_memory_exceptions)
+ Compile_dst_op(cf, &Assembler::adds, true, false, true);
+ else
+ Compile_dst_op(cf, &Assembler::add, true, false, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_addu(CompileFlags cf)
+{
+ Compile_dst_op(cf, &Assembler::add, true, false, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_sub(CompileFlags cf)
+{
+ if (g_settings.cpu_recompiler_memory_exceptions)
+ Compile_dst_op(cf, &Assembler::subs, false, false, true);
+ else
+ Compile_dst_op(cf, &Assembler::sub, false, false, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_subu(CompileFlags cf)
+{
+ Compile_dst_op(cf, &Assembler::sub, false, false, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_and(CompileFlags cf)
+{
+ AssertRegOrConstS(cf);
+ AssertRegOrConstT(cf);
+
+ // special cases - and with self -> self, and with 0 -> 0
+ const WRegister regd = CFGetRegD(cf);
+ if (cf.MipsS() == cf.MipsT())
+ {
+ armAsm->mov(regd, CFGetRegS(cf));
+ return;
+ }
+ else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
+ {
+ armAsm->mov(regd, wzr);
+ return;
+ }
+
+ Compile_dst_op(cf, &Assembler::and_, true, true, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_or(CompileFlags cf)
+{
+ AssertRegOrConstS(cf);
+ AssertRegOrConstT(cf);
+
+ // or/nor with 0 -> no effect
+ const WRegister regd = CFGetRegD(cf);
+ if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
+ {
+ cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
+ return;
+ }
+
+ Compile_dst_op(cf, &Assembler::orr, true, true, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_xor(CompileFlags cf)
+{
+ AssertRegOrConstS(cf);
+ AssertRegOrConstT(cf);
+
+ const WRegister regd = CFGetRegD(cf);
+ if (cf.MipsS() == cf.MipsT())
+ {
+ // xor with self -> zero
+ armAsm->mov(regd, wzr);
+ return;
+ }
+ else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
+ {
+ // xor with zero -> no effect
+ cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
+ return;
+ }
+
+ Compile_dst_op(cf, &Assembler::eor, true, true, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_nor(CompileFlags cf)
+{
+ Compile_or(cf);
+ armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_slt(CompileFlags cf)
+{
+ Compile_slt(cf, true);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_sltu(CompileFlags cf)
+{
+ Compile_slt(cf, false);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_slt(CompileFlags cf, bool sign)
+{
+ AssertRegOrConstS(cf);
+ AssertRegOrConstT(cf);
+
+ // TODO: swap and reverse op for constants
+ if (cf.const_s)
+ {
+ EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS()));
+ armAsm->cmp(RWSCRATCH, CFGetRegT(cf));
+ }
+ else if (cf.const_t)
+ {
+ armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
+ }
+ else
+ {
+ armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
+ }
+
+ armAsm->cset(CFGetRegD(cf), sign ? lt : lo);
+}
+
+vixl::aarch64::WRegister
+CPU::NewRec::AArch64Compiler::ComputeLoadStoreAddressArg(CompileFlags cf,
+ const std::optional<VirtualMemoryAddress>& address,
+ const std::optional<const vixl::aarch64::WRegister>& reg)
+{
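+ // Materializes the effective address (rs + imm, or a known constant address) into `reg` when provided,
+ // otherwise into RWARG1; if imm == 0 and rs is already in a host register, that register is returned directly.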
+ const u32 imm = inst->i.imm_sext32();
+ if (cf.valid_host_s && imm == 0 && !reg.has_value())
+ return CFGetRegS(cf);
+
+ const WRegister dst = reg.has_value() ? reg.value() : RWARG1;
+ if (address.has_value())
+ {
+ EmitMov(dst, address.value());
+ }
+ else if (imm == 0)
+ {
+ if (cf.valid_host_s)
+ {
+ if (const WRegister src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
+ armAsm->mov(dst, CFGetRegS(cf));
+ }
+ else
+ {
+ armAsm->ldr(dst, MipsPtr(cf.MipsS()));
+ }
+ }
+ else
+ {
+ if (cf.valid_host_s)
+ {
+ armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
+ }
+ else
+ {
+ armAsm->ldr(dst, MipsPtr(cf.MipsS()));
+ armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
+ }
+ }
+
+ return dst;
+}
+
+template<typename RegAllocFn>
+vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::GenerateLoad(const vixl::aarch64::WRegister& addr_reg,
+ MemoryAccessSize size, bool sign,
+ const RegAllocFn& dst_reg_alloc)
+{
+ const bool checked = g_settings.cpu_recompiler_memory_exceptions;
+ if (!checked && CodeCache::IsUsingFastmem())
+ {
+ m_cycles += Bus::RAM_READ_TICKS;
+
+ const WRegister dst = dst_reg_alloc();
+
+ if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
+ {
+ DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
+ armAsm->lsr(RWARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
+ armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 8));
+ }
+
+ const MemOperand mem =
+ MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
+ u8* start = m_emitter->GetCursorAddress<u8*>();
+ switch (size)
+ {
+ case MemoryAccessSize::Byte:
+ sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
+ break;
+
+ case MemoryAccessSize::HalfWord:
+ sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
+ break;
+
+ case MemoryAccessSize::Word:
+ armAsm->ldr(dst, mem);
+ break;
+ }
+
+ AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
+ return dst;
+ }
+
+ if (addr_reg.GetCode() != RWARG1.GetCode())
+ armAsm->mov(RWARG1, addr_reg);
+
+ switch (size)
+ {
+ case MemoryAccessSize::Byte:
+ {
+ EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryByte) :
+ reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte));
+ }
+ break;
+ case MemoryAccessSize::HalfWord:
+ {
+ EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryHalfWord) :
+ reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord));
+ }
+ break;
+ case MemoryAccessSize::Word:
+ {
+ EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryWord) :
+ reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord));
+ }
+ break;
+ }
+
+ // TODO: turn this into an asm function instead
+ if (checked)
+ {
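+ // Bit 63 set in the 64-bit return value indicates a failed read; the negated result feeds the CAUSE bits below.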
+ SwitchToFarCodeIfBitSet(RXRET, 63);
+ BackupHostState();
+
+ // Need to stash this in a temp because of the flush.
+ const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
+ armAsm->neg(temp.X(), RXRET);
+ armAsm->lsl(temp, temp, 2);
+
+ Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
+
+ // cause_bits = (-result << 2) | BD | cop_n
+ armAsm->orr(RWARG1, temp,
+ armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
+ static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
+ EmitMov(RWARG2, m_current_instruction_pc);
+ EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
+ FreeHostReg(temp.GetCode());
+ EndBlock(std::nullopt, true);
+
+ RestoreHostState();
+ SwitchToNearCode(false);
+ }
+
+ const WRegister dst_reg = dst_reg_alloc();
+ switch (size)
+ {
+ case MemoryAccessSize::Byte:
+ {
+ sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET);
+ }
+ break;
+ case MemoryAccessSize::HalfWord:
+ {
+ sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET);
+ }
+ break;
+ case MemoryAccessSize::Word:
+ {
+ if (dst_reg.GetCode() != RWRET.GetCode())
+ armAsm->mov(dst_reg, RWRET);
+ }
+ break;
+ }
+
+ return dst_reg;
+}
+
+void CPU::NewRec::AArch64Compiler::GenerateStore(const vixl::aarch64::WRegister& addr_reg,
+ const vixl::aarch64::WRegister& value_reg, MemoryAccessSize size)
+{
+ const bool checked = g_settings.cpu_recompiler_memory_exceptions;
+ if (!checked && CodeCache::IsUsingFastmem())
+ {
+ if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
+ {
+ DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
+ armAsm->lsr(RWARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
+ armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 8));
+ }
+
+ const MemOperand mem =
+ MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
+ u8* start = m_emitter->GetCursorAddress<u8*>();
+ switch (size)
+ {
+ case MemoryAccessSize::Byte:
+ armAsm->strb(value_reg, mem);
+ break;
+
+ case MemoryAccessSize::HalfWord:
+ armAsm->strh(value_reg, mem);
+ break;
+
+ case MemoryAccessSize::Word:
+ armAsm->str(value_reg, mem);
+ break;
+ }
+ AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
+ return;
+ }
+
+ if (addr_reg.GetCode() != RWARG1.GetCode())
+ armAsm->mov(RWARG1, addr_reg);
+ if (value_reg.GetCode() != RWARG2.GetCode())
+ armAsm->mov(RWARG2, value_reg);
+
+ switch (size)
+ {
+ case MemoryAccessSize::Byte:
+ {
+ EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryByte) :
+ reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
+ }
+ break;
+ case MemoryAccessSize::HalfWord:
+ {
+ EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryHalfWord) :
+ reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
+ }
+ break;
+ case MemoryAccessSize::Word:
+ {
+ EmitCall(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryWord) :
+ reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
+ }
+ break;
+ }
+
+ // TODO: turn this into an asm function instead
+ if (checked)
+ {
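+ // A non-zero return value from the write thunk indicates a failed access; it feeds the CAUSE bits below.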
+ SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true);
+ BackupHostState();
+
+ // Need to stash this in a temp because of the flush.
+ const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
+ armAsm->lsl(temp, RWRET, 2);
+
+ Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
+
+ // cause_bits = (result << 2) | BD | cop_n
+ armAsm->orr(RWARG1, temp,
+ armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
+ static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
+ EmitMov(RWARG2, m_current_instruction_pc);
+ EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
+ FreeHostReg(temp.GetCode());
+ EndBlock(std::nullopt, true);
+
+ RestoreHostState();
+ SwitchToNearCode(false);
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign,
+ const std::optional<VirtualMemoryAddress>& address)
+{
+ const std::optional<WRegister> addr_reg =
+ g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
+ std::optional<WRegister>();
+ FlushForLoadStore(address, false);
+ const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
+ const WRegister data = GenerateLoad(addr, size, sign, [this, cf]() {
+ if (cf.MipsT() == Reg::zero)
+ return RWRET;
+
+ return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
+ EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,
+ cf.MipsT()));
+ });
+
+ if (g_settings.gpu_pgxp_enable)
+ {
+ Flush(FLUSH_FOR_C_CALL);
+
+ EmitMov(RWARG1, inst->bits);
+ armAsm->mov(RWARG2, addr);
+ armAsm->mov(RWARG3, data);
+ EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
+ FreeHostReg(addr_reg.value().GetCode());
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign,
+ const std::optional<VirtualMemoryAddress>& address)
+{
+ DebugAssert(size == MemoryAccessSize::Word && !sign);
+ FlushForLoadStore(address, false);
+
+ // TODO: if address is constant, this can be simplified..
+
+ // If we're coming from another block, just flush the load delay and hope for the best..
+ if (m_load_delay_dirty)
+ UpdateLoadDelay();
+
+ // We'd need to be careful here if we weren't overwriting it..
+ const WRegister addr = WRegister(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP));
+ ComputeLoadStoreAddressArg(cf, address, addr);
+ armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
+ GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; });
+
+ if (inst->r.rt == Reg::zero)
+ {
+ FreeHostReg(addr.GetCode());
+ return;
+ }
+
+ // lwl/lwr reading a load-delayed value takes the new value, but is itself load-delayed, so the original value is
+ // never written back. NOTE: can't trust T in cf because of the flush.
+ const Reg rt = inst->r.rt;
+ WRegister value;
+ if (m_load_delay_register == rt)
+ {
+ const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
+ AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
+ m_load_delay_value_register;
+ RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
+ value = WRegister(existing_ld_rt);
+ }
+ else
+ {
+ if constexpr (EMULATE_LOAD_DELAYS)
+ {
+ value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
+ if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
+ armAsm->mov(value, WRegister(rtreg.value()));
+ else if (HasConstantReg(rt))
+ EmitMov(value, GetConstantRegU32(rt));
+ else
+ armAsm->ldr(value, MipsPtr(rt));
+ }
+ else
+ {
+ value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
+ }
+ }
+
+ DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
+ armAsm->and_(RWARG2, addr, 3);
+ armAsm->lsl(RWARG2, RWARG2, 3); // *8
+ EmitMov(RWARG3, 24);
+ armAsm->sub(RWARG3, RWARG3, RWARG2);
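+ // RWARG2 = (addr & 3) * 8 (shift amount), RWARG3 = 24 - shift; both merge paths below consume these.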
+
+ if (inst->op == InstructionOp::lwl)
+ {
+ // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
+ // new_value = (value & mask) | (RWRET << (24 - shift));
+ EmitMov(addr, 0xFFFFFFu);
+ armAsm->lsrv(addr, addr, RWARG2);
+ armAsm->and_(value, value, addr);
+ armAsm->lslv(RWRET, RWRET, RWARG3);
+ armAsm->orr(value, value, RWRET);
+ }
+ else
+ {
+ // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
+ // new_value = (value & mask) | (RWRET >> shift);
+ armAsm->lsrv(RWRET, RWRET, RWARG2);
+ EmitMov(addr, 0xFFFFFF00u);
+ armAsm->lslv(addr, addr, RWARG3);
+ armAsm->and_(value, value, addr);
+ armAsm->orr(value, value, RWRET);
+ }
+
+ FreeHostReg(addr.GetCode());
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign,
+ const std::optional<VirtualMemoryAddress>& address)
+{
+ const std::optional<WRegister> addr_reg =
+ g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
+ std::optional<WRegister>();
+ FlushForLoadStore(address, false);
+ const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
+ GenerateLoad(addr, MemoryAccessSize::Word, false, []() { return RWRET; });
+
+ const u32 index = static_cast<u32>(inst->r.rt.GetValue());
+ const auto [ptr, action] = GetGTERegisterPointer(index, true);
+ switch (action)
+ {
+ case GTERegisterAccessAction::Ignore:
+ {
+ break;
+ }
+
+ case GTERegisterAccessAction::Direct:
+ {
+ armAsm->str(RWRET, PTR(ptr));
+ break;
+ }
+
+ case GTERegisterAccessAction::SignExtend16:
+ {
+ armAsm->sxth(RWRET, RWRET);
+ armAsm->str(RWRET, PTR(ptr));
+ break;
+ }
+
+ case GTERegisterAccessAction::ZeroExtend16:
+ {
+ armAsm->uxth(RWRET, RWRET);
+ armAsm->str(RWRET, PTR(ptr));
+ break;
+ }
+
+ case GTERegisterAccessAction::CallHandler:
+ {
+ Flush(FLUSH_FOR_C_CALL);
+ armAsm->mov(RWARG2, RWRET);
+ EmitMov(RWARG1, index);
+ EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
+ break;
+ }
+
+ case GTERegisterAccessAction::PushFIFO:
+ {
+ // SXY0 <- SXY1
+ // SXY1 <- SXY2
+ // SXY2 <- SXYP
+ DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode());
+ armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
+ armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
+ armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
+ armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
+ armAsm->str(RWRET, PTR(&g_state.gte_regs.SXY2[0]));
+ break;
+ }
+
+ default:
+ {
+ Panic("Unknown action");
+ return;
+ }
+ }
+
+ if (g_settings.gpu_pgxp_enable)
+ {
+ Flush(FLUSH_FOR_C_CALL);
+ armAsm->mov(RWARG3, RWRET);
+ armAsm->mov(RWARG2, addr);
+ EmitMov(RWARG1, inst->bits);
+ EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
+ FreeHostReg(addr_reg.value().GetCode());
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign,
+ const std::optional<VirtualMemoryAddress>& address)
+{
+ AssertRegOrConstS(cf);
+ AssertRegOrConstT(cf);
+
+ const std::optional<WRegister> addr_reg =
+ g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
+ std::optional<WRegister>();
+ FlushForLoadStore(address, true);
+ const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
+ const WRegister data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
+ if (!cf.valid_host_t)
+ MoveTToReg(RWARG2, cf);
+
+ GenerateStore(addr, data, size);
+
+ if (g_settings.gpu_pgxp_enable)
+ {
+ Flush(FLUSH_FOR_C_CALL);
+ MoveMIPSRegToReg(RWARG3, cf.MipsT());
+ armAsm->mov(RWARG2, addr);
+ EmitMov(RWARG1, inst->bits);
+ EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
+ FreeHostReg(addr_reg.value().GetCode());
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign,
+ const std::optional<VirtualMemoryAddress>& address)
+{
+ DebugAssert(size == MemoryAccessSize::Word && !sign);
+ FlushForLoadStore(address, true);
+
+ // TODO: if address is constant, this can be simplified..
+ // We'd need to be careful here if we weren't overwriting it..
+ const WRegister addr = WRegister(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP));
+ ComputeLoadStoreAddressArg(cf, address, addr);
+ armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
+ GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; });
+
+ // TODO: this can take over rt's value if it's no longer needed
+ // NOTE: can't trust T in cf because of the flush
+ const Reg rt = inst->r.rt;
+ const WRegister value = RWARG2;
+ if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
+ armAsm->mov(value, WRegister(rtreg.value()));
+ else if (HasConstantReg(rt))
+ EmitMov(value, GetConstantRegU32(rt));
+ else
+ armAsm->ldr(value, MipsPtr(rt));
+
+ armAsm->and_(RWSCRATCH, addr, 3);
+ armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
+
+ if (inst->op == InstructionOp::swl)
+ {
+ // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
+ // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
+ EmitMov(RWARG3, 0xFFFFFF00u);
+ armAsm->lslv(RWARG3, RWARG3, RWSCRATCH);
+ armAsm->and_(RWRET, RWRET, RWARG3);
+
+ EmitMov(RWARG3, 24);
+ armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
+ armAsm->lsrv(value, value, RWARG3);
+ armAsm->orr(value, value, RWRET);
+ }
+ else
+ {
+ // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
+ // new_value = (RWRET & mem_mask) | (value << shift);
+ armAsm->lslv(value, value, RWSCRATCH);
+
+ EmitMov(RWARG3, 24);
+ armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
+ EmitMov(RWSCRATCH, 0x00FFFFFFu);
+ armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
+ armAsm->and_(RWRET, RWRET, RWSCRATCH);
+ armAsm->orr(value, value, RWRET);
+ }
+
+ FreeHostReg(addr.GetCode());
+
+ armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
+ GenerateStore(RWARG1, value, MemoryAccessSize::Word);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign,
+ const std::optional<VirtualMemoryAddress>& address)
+{
+ FlushForLoadStore(address, true);
+
+ const u32 index = static_cast<u32>(inst->r.rt.GetValue());
+ const auto [ptr, action] = GetGTERegisterPointer(index, false);
+ switch (action)
+ {
+ case GTERegisterAccessAction::Direct:
+ {
+ armAsm->ldr(RWARG2, PTR(ptr));
+ }
+ break;
+
+ case GTERegisterAccessAction::CallHandler:
+ {
+ // should already be flushed.. except in fastmem case
+ Flush(FLUSH_FOR_C_CALL);
+ EmitMov(RWARG1, index);
+ EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
+ armAsm->mov(RWARG2, RWRET);
+ }
+ break;
+
+ default:
+ {
+ Panic("Unknown action");
+ }
+ break;
+ }
+
+ // PGXP makes this a giant pain.
+ if (!g_settings.gpu_pgxp_enable)
+ {
+ const WRegister addr = ComputeLoadStoreAddressArg(cf, address);
+ GenerateStore(addr, RWARG2, size);
+ return;
+ }
+
+ // TODO: This can be simplified because we don't need to validate in PGXP..
+ const WRegister addr_reg = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
+ const WRegister data_backup = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
+ FlushForLoadStore(address, true);
+ ComputeLoadStoreAddressArg(cf, address, addr_reg);
+ armAsm->mov(data_backup, RWARG2);
+ GenerateStore(addr_reg, RWARG2, size);
+
+ Flush(FLUSH_FOR_C_CALL);
+ armAsm->mov(RWARG3, data_backup);
+ armAsm->mov(RWARG2, addr_reg);
+ EmitMov(RWARG1, inst->bits);
+ EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
+ FreeHostReg(addr_reg.GetCode());
+ FreeHostReg(data_backup.GetCode());
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_mtc0(CompileFlags cf)
+{
+ // TODO: we need better constant setting here.. which will need backprop
+ AssertRegOrConstT(cf);
+
+ const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
+ const u32* ptr = GetCop0RegPtr(reg);
+ const u32 mask = GetCop0RegWriteMask(reg);
+ if (!ptr)
+ {
+ Compile_Fallback();
+ return;
+ }
+
+ if (mask == 0)
+ {
+ // if it's a read-only register, ignore
+ Log_DebugPrintf("Ignoring write to read-only cop0 reg %u", static_cast(reg));
+ return;
+ }
+
+ // for some registers, we need to test certain bits
+ const bool needs_bit_test = (reg == Cop0Reg::SR);
+ const WRegister new_value = RWARG1;
+ const WRegister old_value = RWARG2;
+ const WRegister changed_bits = RWARG3;
+ const WRegister mask_reg = RWSCRATCH;
+
+ // Load old value
+ armAsm->ldr(old_value, PTR(ptr));
+
+ // No way we fit this in an immediate..
+ EmitMov(mask_reg, mask);
+
+ // update value
+ if (cf.valid_host_t)
+ armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
+ else
+ EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);
+
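+ // Written value is (old & ~mask) | (T & mask). changed_bits only matters for SR, where a flipped bit 16
+ // (cache isolation) forces the memory map update in the far-code path below.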
+ if (needs_bit_test)
+ armAsm->eor(changed_bits, old_value, new_value);
+ armAsm->bic(old_value, old_value, mask_reg);
+ armAsm->orr(new_value, old_value, new_value);
+ armAsm->str(new_value, PTR(ptr));
+
+ if (reg == Cop0Reg::SR)
+ {
+ // TODO: replace with register backup
+ // We could just inline the whole thing..
+ Flush(FLUSH_FOR_C_CALL);
+
+ SwitchToFarCodeIfBitSet(changed_bits, 16);
+ armAsm->sub(sp, sp, 16);
+ armAsm->stp(RWARG1, RWARG2, MemOperand(sp));
+ EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
+ armAsm->ldp(RWARG1, RWARG2, MemOperand(sp));
+ armAsm->add(sp, sp, 16);
+ armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
+ SwitchToNearCode(true);
+ }
+
+ if (reg == Cop0Reg::SR || reg == Cop0Reg::CAUSE)
+ {
+ const WRegister sr = (reg == Cop0Reg::SR) ? RWARG2 : (armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits)), RWARG1);
+ TestInterrupts(sr);
+ }
+
+ if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions)
+ {
+ // TODO: DCIC handling for debug breakpoints
+ Log_WarningPrintf("TODO: DCIC handling for debug breakpoints");
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_rfe(CompileFlags cf)
+{
+ // shift mode bits right two, preserving upper bits
+ armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
+ armAsm->bfxil(RWARG1, RWARG1, 2, 4);
+ armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
+
+ TestInterrupts(RWARG1);
+}
+
+void CPU::NewRec::AArch64Compiler::TestInterrupts(const vixl::aarch64::WRegister& sr)
+{
+ // if Iec == 0 then goto no_interrupt
+ Label no_interrupt;
+ armAsm->tbz(sr, 0, &no_interrupt);
+
+ // sr & cause
+ armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
+ armAsm->and_(sr, sr, RWSCRATCH);
+
+ // ((sr & cause) & 0xff00) == 0 goto no_interrupt
+ armAsm->tst(sr, 0xFF00);
+
+ SwitchToFarCode(true, ne);
+ BackupHostState();
+ Flush(FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
+ EmitCall(reinterpret_cast<const void*>(&DispatchInterrupt));
+ EndBlock(std::nullopt, true);
+ RestoreHostState();
+ SwitchToNearCode(false);
+
+ armAsm->bind(&no_interrupt);
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_mfc2(CompileFlags cf)
+{
+ const u32 index = inst->cop.Cop2Index();
+ const Reg rt = inst->r.rt;
+
+ const auto [ptr, action] = GetGTERegisterPointer(index, false);
+ if (action == GTERegisterAccessAction::Ignore)
+ return;
+
+ u32 hreg;
+ if (action == GTERegisterAccessAction::Direct)
+ {
+ hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
+ EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
+ armAsm->ldr(WRegister(hreg), PTR(ptr));
+ }
+ else if (action == GTERegisterAccessAction::CallHandler)
+ {
+ Flush(FLUSH_FOR_C_CALL);
+ EmitMov(RWARG1, index);
+ EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
+
+ hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
+ EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
+ armAsm->mov(WRegister(hreg), RWRET);
+ }
+ else
+ {
+ Panic("Unknown action");
+ return;
+ }
+
+ if (g_settings.gpu_pgxp_enable)
+ {
+ Flush(FLUSH_FOR_C_CALL);
+ EmitMov(RWARG1, inst->bits);
+ armAsm->mov(RWARG2, WRegister(hreg));
+ EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_mtc2(CompileFlags cf)
+{
+ const u32 index = inst->cop.Cop2Index();
+ const auto [ptr, action] = GetGTERegisterPointer(index, true);
+ if (action == GTERegisterAccessAction::Ignore)
+ return;
+
+ if (action == GTERegisterAccessAction::Direct)
+ {
+ if (cf.const_t)
+ StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
+ else
+ armAsm->str(CFGetRegT(cf), PTR(ptr));
+ }
+ else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
+ {
+ const bool sign = (action == GTERegisterAccessAction::SignExtend16);
+ if (cf.valid_host_t)
+ {
+ sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf));
+ armAsm->str(RWARG1, PTR(ptr));
+ }
+ else if (cf.const_t)
+ {
+ const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
+ StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
+ }
+ else
+ {
+ Panic("Unsupported setup");
+ }
+ }
+ else if (action == GTERegisterAccessAction::CallHandler)
+ {
+ Flush(FLUSH_FOR_C_CALL);
+ EmitMov(RWARG1, index);
+ MoveTToReg(RWARG2, cf);
+ EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
+ }
+ else if (action == GTERegisterAccessAction::PushFIFO)
+ {
+ // SXY0 <- SXY1
+ // SXY1 <- SXY2
+ // SXY2 <- SXYP
+ DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode());
+ armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
+ armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
+ armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
+ armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
+ if (cf.valid_host_t)
+ armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
+ else if (cf.const_t)
+ StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
+ else
+ Panic("Unsupported setup");
+ }
+ else
+ {
+ Panic("Unknown action");
+ }
+}
+
+void CPU::NewRec::AArch64Compiler::Compile_cop2(CompileFlags cf)
+{
+ TickCount func_ticks;
+ GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);
+
+ Flush(FLUSH_FOR_C_CALL);
+ EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
+ EmitCall(reinterpret_cast<const void*>(func));
+
+ AddGTETicks(func_ticks);
+}
+
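+// Emits the slow-path thunk used when a fastmem access is backpatched: live caller-saved registers are
+// spilled to the stack, pending cycles are re-applied, and the unchecked memory thunk is invoked.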
+u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
+ TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
+ u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
+ bool is_load)
+{
+ Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
+ Assembler* armAsm = &arm_asm;
+
+#ifdef VIXL_DEBUG
+ vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
+#endif
+
+ static constexpr u32 GPR_SIZE = 8;
+
+ // save regs
+ u32 num_gprs = 0;
+
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
+ num_gprs++;
+ }
+
+ const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);
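+ // Round the saved-register count up to an even number so the stack pointer stays 16-byte aligned.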
+
+ // TODO: use stp+ldp, vixl helper?
+
+ if (stack_size > 0)
+ {
+ armAsm->sub(sp, sp, stack_size);
+
+ u32 stack_offset = 0;
+ for (u32 i = 0; i < NUM_HOST_REGS; i++)
+ {
+ if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
+ {
+ armAsm->str(XRegister(i), MemOperand(sp, stack_offset));
+ stack_offset += GPR_SIZE;
+ }
+ }
+ }
+
+ if (cycles_to_add != 0)
+ {
+ // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
+ Assert(Assembler::IsImmAddSub(cycles_to_add));
+ armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
+ armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add);
+ armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
+ }
+
+ if (address_register != static_cast<u8>(RWARG1.GetCode()))
+ armAsm->mov(RWARG1, WRegister(address_register));
+
+ if (!is_load)
+ {
+ if (data_register != static_cast<u8>(RWARG2.GetCode()))
+ armAsm->mov(RWARG2, WRegister(data_register));
+ }
+
+ switch (size)
+ {
+ case MemoryAccessSize::Byte:
+ {
+ armEmitCall(armAsm,
+ is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte) :
+ reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte),
+ false);
+ }
+ break;
+ case MemoryAccessSize::HalfWord:
+ {
+ armEmitCall(armAsm,
+ is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) :
+ reinterpret_cast