Add AArch64 recompiler backend (#1556)

SSimco authored on 2025-05-09 13:47:22 +03:00, committed by GitHub
parent d13dab0fd8
commit 081ebead5f
9 changed files with 1766 additions and 7 deletions

.gitmodules

@@ -18,3 +18,6 @@
path = dependencies/imgui
url = https://github.com/ocornut/imgui
shallow = true
[submodule "dependencies/xbyak_aarch64"]
path = dependencies/xbyak_aarch64
url = https://github.com/fujitsu/xbyak_aarch64

CMakeLists.txt

@@ -222,6 +222,10 @@ endif()
add_subdirectory("dependencies/ih264d" EXCLUDE_FROM_ALL)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(AARCH64)")
add_subdirectory("dependencies/xbyak_aarch64" EXCLUDE_FROM_ALL)
endif()
find_package(ZArchive)
if (NOT ZArchive_FOUND)
add_subdirectory("dependencies/ZArchive" EXCLUDE_FROM_ALL)

dependencies/xbyak_aarch64 (new submodule)

@@ -0,0 +1 @@
Subproject commit 904b8923457f3ec0d6f82ea2d6832a792851194d

src/Cafe/CMakeLists.txt

@@ -537,6 +537,14 @@ if(APPLE)
target_sources(CemuCafe PRIVATE "HW/Latte/Renderer/Vulkan/CocoaSurface.mm")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(AARCH64)")
target_sources(CemuCafe PRIVATE
HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.cpp
HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.h
)
target_link_libraries(CemuCafe PRIVATE xbyak_aarch64)
endif()
set_property(TARGET CemuCafe PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
target_include_directories(CemuCafe PUBLIC "../")
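
xbyak_aarch64, pulled in above, is the JIT assembler library the new backend links against: like the x86 xbyak already used by the x64 backend, code is emitted by deriving from Xbyak_aarch64::CodeGenerator and calling instruction-named methods into an executable buffer. Below is a minimal standalone sketch of that usage pattern, not taken from the backend itself; it assumes the library's xbyak-style API (CodeGenerator, mov/ret, ready(), getCode()), uses a made-up class name, and only runs on an AArch64 host.

#include <cstdint>
#include <xbyak_aarch64/xbyak_aarch64.h>

// Emits a tiny function equivalent to: uint32_t f() { return value; }
struct ReturnConstGen : Xbyak_aarch64::CodeGenerator
{
	explicit ReturnConstGen(uint32_t value)
	{
		mov(w0, value); // AAPCS64: a 32-bit return value travels in w0 (assumes value fits the MOV immediate alias)
		ret();
	}
};

int main()
{
	ReturnConstGen gen(42);
	gen.ready();                             // finalize the buffer: make it executable and flush the instruction cache
	auto fn = gen.getCode<uint32_t (*)()>(); // pointer to the freshly emitted code
	return fn() == 42 ? 0 : 1;
}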

src/Cafe/HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.cpp (new file; diff suppressed because it is too large)

src/Cafe/HW/Espresso/Recompiler/BackendAArch64/BackendAArch64.h (new file)

@@ -0,0 +1,18 @@
#pragma once
#include "HW/Espresso/Recompiler/IML/IMLInstruction.h"
#include "../PPCRecompiler.h"
bool PPCRecompiler_generateAArch64Code(struct PPCRecFunction_t* PPCRecFunction, struct ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_cleanupAArch64Code(void* code, size_t size);
void PPCRecompilerAArch64Gen_generateRecompilerInterfaceFunctions();
// architecture specific constants
namespace IMLArchAArch64
{
static constexpr int PHYSREG_GPR_BASE = 0;
static constexpr int PHYSREG_GPR_COUNT = 25;
static constexpr int PHYSREG_FPR_BASE = PHYSREG_GPR_COUNT;
static constexpr int PHYSREG_FPR_COUNT = 31;
}; // namespace IMLArchAArch64
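
The constants above define one flat index space for the register allocator: indices 0-24 name general-purpose registers and indices 25-55 name SIMD/FP registers. The following hypothetical helpers (not part of this commit, constants duplicated so the sketch is self-contained) show how such a flat index splits back into a host register number.

namespace IMLArchAArch64Sketch
{
	static constexpr int PHYSREG_GPR_BASE = 0;
	static constexpr int PHYSREG_GPR_COUNT = 25;
	static constexpr int PHYSREG_FPR_BASE = PHYSREG_GPR_COUNT;
	static constexpr int PHYSREG_FPR_COUNT = 31;
}

// A flat physical register index either names general-purpose register x0..x24
// or SIMD/FP register v0..v30.
constexpr bool PhysRegIsGPR(int physReg)
{
	return physReg >= IMLArchAArch64Sketch::PHYSREG_GPR_BASE &&
	       physReg < IMLArchAArch64Sketch::PHYSREG_GPR_BASE + IMLArchAArch64Sketch::PHYSREG_GPR_COUNT;
}

constexpr int PhysRegToHostIndex(int physReg)
{
	return PhysRegIsGPR(physReg) ? physReg - IMLArchAArch64Sketch::PHYSREG_GPR_BASE
	                             : physReg - IMLArchAArch64Sketch::PHYSREG_FPR_BASE;
}

static_assert(PhysRegToHostIndex(24) == 24); // last GPR index -> x24
static_assert(PhysRegToHostIndex(25) == 0);  // first FPR index -> v0
static_assert(PhysRegToHostIndex(55) == 30); // last FPR index -> v30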

src/Cafe/HW/Espresso/Recompiler/IML/IMLOptimizer.cpp

@@ -702,8 +702,10 @@ void IMLOptimizer_StandardOptimizationPassForSegment(IMLOptimizerRegIOAnalysis&
{
IMLOptimizer_RemoveDeadCodeFromSegment(regIoAnalysis, seg);
#ifdef ARCH_X86_64
// x86 specific optimizations
IMLOptimizerX86_SubstituteCJumpForEflagsJump(regIoAnalysis, seg); // this pass should be applied late since it creates invisible eflags dependencies (which would break further register dependency analysis)
#endif
}
void IMLOptimizer_StandardOptimizationPass(ppcImlGenContext_t& ppcImlGenContext)

src/Cafe/HW/Espresso/Recompiler/IML/IMLRegisterAllocator.cpp

@@ -6,6 +6,9 @@
#include "IMLRegisterAllocatorRanges.h"
#include "../BackendX64/BackendX64.h"
#ifdef __aarch64__
#include "../BackendAArch64/BackendAArch64.h"
#endif
#include <boost/container/static_vector.hpp>
#include <boost/container/small_vector.hpp>
@@ -127,23 +130,22 @@ static void GetInstructionFixedRegisters(IMLInstruction* instruction, IMLFixedRe
fixedRegs.listInput.clear();
fixedRegs.listOutput.clear();
// code below for aarch64 has not been tested
// The purpose of GetInstructionFixedRegisters() is to constrain virtual registers to specific physical registers for instructions which need it
// on x86 this is used for instructions like SHL <reg>, CL where the CL register is hardwired. On aarch it's probably only necessary for setting up the calling convention
cemu_assert_unimplemented();
#if 0
if (instruction->type == PPCREC_IML_TYPE_CALL_IMM)
{
const IMLPhysReg intParamToPhysReg[3] = {IMLArchAArch64::PHYSREG_GPR_BASE + 0, IMLArchAArch64::PHYSREG_GPR_BASE + 1, IMLArchAArch64::PHYSREG_GPR_BASE + 2};
const IMLPhysReg floatParamToPhysReg[3] = {IMLArchAArch64::PHYSREG_FPR_BASE + 0, IMLArchAArch64::PHYSREG_FPR_BASE + 1, IMLArchAArch64::PHYSREG_FPR_BASE + 2};
IMLPhysRegisterSet volatileRegs;
for (int i=0; i<19; i++) // x0 to x18 are volatile
for (int i = 0; i <= 17; i++) // x0 to x17 are volatile
volatileRegs.SetAvailable(IMLArchAArch64::PHYSREG_GPR_BASE + i);
for (int i = 0; i <= 31; i++) // which float registers are volatile?
// v0-v7 & v16-v31 are volatile. For v8-v15 only the high 64 bits are volatile.
for (int i = 0; i <= 7; i++)
volatileRegs.SetAvailable(IMLArchAArch64::PHYSREG_FPR_BASE + i);
for (int i = 16; i <= 31; i++)
volatileRegs.SetAvailable(IMLArchAArch64::PHYSREG_FPR_BASE + i);
SetupCallingConvention(instruction, fixedRegs, intParamToPhysReg, floatParamToPhysReg, IMLArchAArch64::PHYSREG_GPR_BASE + 0, IMLArchAArch64::PHYSREG_FPR_BASE + 0, volatileRegs);
}
#endif
}
#else
// x86-64
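
The register sets built for the aarch64 branch of this function (the volatileRegs loops a few lines up) follow the AArch64 procedure call standard (AAPCS64): x0-x17 are caller-saved, x18 is the reserved platform register, x19-x30 are callee-saved, v0-v7 and v16-v31 are caller-saved, and for v8-v15 only the low 64 bits survive a call, so those are treated as preserved. The first three integer/float arguments land in x0-x2 / v0-v2, which is what the intParamToPhysReg / floatParamToPhysReg tables hand to SetupCallingConvention. A compile-time sketch of that classification follows; the helper names are made up and not from this commit.

// AAPCS64 caller-saved ("volatile") classification as encoded by the loops above.
constexpr bool IsVolatileGpr(int n)  // n = 0..30 for x0..x30
{
	return n <= 17;                  // x0-x17 caller-saved; x18 platform register; x19+ callee-saved
}

constexpr bool IsVolatileFpr(int n)  // n = 0..31 for v0..v31
{
	return n <= 7 || n >= 16;        // v8-v15 treated as preserved (their low 64 bits are callee-saved)
}

static_assert(IsVolatileGpr(17) && !IsVolatileGpr(18) && !IsVolatileGpr(19));
static_assert(IsVolatileFpr(7) && !IsVolatileFpr(8) && !IsVolatileFpr(15) && IsVolatileFpr(16));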

src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp

@@ -16,6 +16,9 @@
#include "IML/IML.h"
#include "IML/IMLRegisterAllocator.h"
#include "BackendX64/BackendX64.h"
#ifdef __aarch64__
#include "BackendAArch64/BackendAArch64.h"
#endif
#include "util/highresolutiontimer/HighResolutionTimer.h"
#define PPCREC_FORCE_SYNCHRONOUS_COMPILATION 0 // if 1, then function recompilation will block and execute on the thread that called PPCRecompiler_visitAddressNoBlock
@@ -220,12 +223,20 @@ PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PP
return nullptr;
}
#if defined(ARCH_X86_64)
// emit x64 code
bool x64GenerationSuccess = PPCRecompiler_generateX64Code(ppcRecFunc, &ppcImlGenContext);
if (x64GenerationSuccess == false)
{
return nullptr;
}
#elif defined(__aarch64__)
bool aarch64GenerationSuccess = PPCRecompiler_generateAArch64Code(ppcRecFunc, &ppcImlGenContext);
if (aarch64GenerationSuccess == false)
{
return nullptr;
}
#endif
if (ActiveSettings::DumpRecompilerFunctionsEnabled())
{
FileStream* fs = FileStream::createFile2(ActiveSettings::GetUserDataPath(fmt::format("dump/recompiler/ppc_{:08x}.bin", ppcRecFunc->ppcAddress)));
@@ -270,6 +281,7 @@ void PPCRecompiler_NativeRegisterAllocatorPass(ppcImlGenContext_t& ppcImlGenCont
for (auto& it : ppcImlGenContext.mappedRegs)
raParam.regIdToName.try_emplace(it.second.GetRegID(), it.first);
#if defined(ARCH_X86_64)
auto& gprPhysPool = raParam.GetPhysRegPool(IMLRegFormat::I64);
gprPhysPool.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_RAX);
gprPhysPool.SetAvailable(IMLArchX86::PHYSREG_GPR_BASE + X86_REG_RDX);
@@ -301,6 +313,19 @@ void PPCRecompiler_NativeRegisterAllocatorPass(ppcImlGenContext_t& ppcImlGenCont
fprPhysPool.SetAvailable(IMLArchX86::PHYSREG_FPR_BASE + 12);
fprPhysPool.SetAvailable(IMLArchX86::PHYSREG_FPR_BASE + 13);
fprPhysPool.SetAvailable(IMLArchX86::PHYSREG_FPR_BASE + 14);
#elif defined(__aarch64__)
auto& gprPhysPool = raParam.GetPhysRegPool(IMLRegFormat::I64);
for (auto i = IMLArchAArch64::PHYSREG_GPR_BASE; i < IMLArchAArch64::PHYSREG_GPR_BASE + IMLArchAArch64::PHYSREG_GPR_COUNT; i++)
{
if (i == IMLArchAArch64::PHYSREG_GPR_BASE + 18)
continue; // Skip reserved platform register
gprPhysPool.SetAvailable(i);
}
auto& fprPhysPool = raParam.GetPhysRegPool(IMLRegFormat::F64);
for (auto i = IMLArchAArch64::PHYSREG_FPR_BASE; i < IMLArchAArch64::PHYSREG_FPR_BASE + IMLArchAArch64::PHYSREG_FPR_COUNT; i++)
fprPhysPool.SetAvailable(i);
#endif
IMLRegisterAllocator_AllocateRegisters(&ppcImlGenContext, raParam);
}
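
For the aarch64 pools set up above this works out to 24 allocatable general-purpose registers (x0-x24 with x18 excluded as the platform register) and 31 allocatable FP/SIMD registers (v0-v30; v31 stays out of the pool, presumably reserved by the backend as a scratch register). A small self-contained check of that arithmetic, using the same constants assumed from BackendAArch64.h:

#include <bitset>
#include <cassert>

int main()
{
	constexpr int GPR_BASE = 0, GPR_COUNT = 25;   // assumed IMLArchAArch64 values
	constexpr int FPR_BASE = 25, FPR_COUNT = 31;
	std::bitset<64> pool;
	for (int i = GPR_BASE; i < GPR_BASE + GPR_COUNT; i++)
	{
		if (i == GPR_BASE + 18)
			continue; // x18 is the reserved platform register on e.g. Windows and Apple ABIs
		pool.set(i);
	}
	for (int i = FPR_BASE; i < FPR_BASE + FPR_COUNT; i++)
		pool.set(i);
	assert(pool.count() == 24u + 31u); // 24 GPRs + 31 FPRs end up allocatable
	return 0;
}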
@@ -679,8 +704,11 @@ void PPCRecompiler_init()
debug_printf("Allocating %dMB for recompiler instance data...\n", (sint32)(sizeof(PPCRecompilerInstanceData_t) / 1024 / 1024));
ppcRecompilerInstanceData = (PPCRecompilerInstanceData_t*)MemMapper::ReserveMemory(nullptr, sizeof(PPCRecompilerInstanceData_t), MemMapper::PAGE_PERMISSION::P_RW);
MemMapper::AllocateMemory(&(ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom), sizeof(PPCRecompilerInstanceData_t) - offsetof(PPCRecompilerInstanceData_t, _x64XMM_xorNegateMaskBottom), MemMapper::PAGE_PERMISSION::P_RW, true);
#ifdef ARCH_X86_64
PPCRecompilerX64Gen_generateRecompilerInterfaceFunctions();
#elif defined(__aarch64__)
PPCRecompilerAArch64Gen_generateRecompilerInterfaceFunctions();
#endif
PPCRecompiler_allocateRange(0, 0x1000); // the first entry is used for fallback to interpreter
PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize());
PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize());