Initial PPU LLVM implementation for aarch64

kd-11 2024-08-04 05:09:06 +03:00 committed by kd-11
parent a5f9256ac6
commit 56cc5d9355
6 changed files with 223 additions and 78 deletions

View file

@@ -26,6 +26,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/InlineAsm.h"

 #ifdef _MSC_VER
 #pragma warning(pop)
@@ -3898,4 +3899,39 @@ struct fmt_unveil<llvm::TypeSize, void>
     }
 };

+// Inline assembly wrappers.
+// TODO: Move these to proper location and replace macros with templates
+static inline
+llvm::InlineAsm* compile_inline_asm(
+    llvm::Type* returnType,
+    llvm::ArrayRef<llvm::Type*> argTypes,
+    const std::string& code,
+    const std::string& constraints)
+{
+    const auto callSig = llvm::FunctionType::get(returnType, argTypes, false);
+    return llvm::InlineAsm::get(callSig, code, constraints, true, false);
+}
+
+// Helper for ASM generation with dynamic number of arguments
+#define LLVM_ASM(asm_, args, constraints, irb, ctx)\
+    do {\
+        std::vector<llvm::Type*> _argTypes;\
+        _argTypes.reserve(args.size());\
+        for (const auto& _arg : args) _argTypes.push_back(_arg->getType());\
+        auto _returnType = llvm::Type::getVoidTy(ctx); \
+        llvm::FunctionCallee _callee = compile_inline_asm(_returnType, _argTypes, asm_, constraints); \
+        auto _c = irb->CreateCall(_callee, args); \
+        _c->addFnAttr(llvm::Attribute::AlwaysInline); \
+    } while(0)
+
+// Helper for ASM generation with 0 args
+#define LLVM_ASM_0(asm_, irb, ctx)\
+    do {\
+        const auto _voidTy = llvm::Type::getVoidTy(ctx); \
+        auto _callee = compile_inline_asm(_voidTy, std::nullopt, asm_, ""); \
+        auto _c = irb->CreateCall(_callee); \
+        _c->setTailCall(); \
+        _c->addFnAttr(llvm::Attribute::AlwaysInline); \
+    } while(0)
+
 #endif
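For context, a minimal usage sketch of the wrappers above. This is illustrative and not part of the commit; the function name and the `irb` (llvm::IRBuilder<>*), `ctx` (llvm::LLVMContext&) and `thread_ptr` (llvm::Value* holding a pointer) parameters are assumptions standing in for whatever the calling translator already has in scope.

    #include <array> // for the argument pack in this sketch

    // Illustrative only: drive the helpers above with a zero-operand barrier and a
    // one-operand prefetch. Constraint "r" binds thread_ptr to operand $0 in the template.
    void emit_inline_asm_examples(llvm::IRBuilder<>* irb, llvm::LLVMContext& ctx, llvm::Value* thread_ptr)
    {
        // Zero arguments, empty constraint string
        LLVM_ASM_0("isb", irb, ctx);

        // One register input referenced as $0
        std::array<llvm::Value*, 1> args{ thread_ptr };
        LLVM_ASM("prfm pldl1keep, [$0]", args, "r", irb, ctx);
    }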

View file

@@ -0,0 +1,40 @@
+#pragma once
+
+#include <util/types.hpp>
+
+namespace rpcs3
+{
+    union alignas(16) hypervisor_context_t
+    {
+        u64 regs[16];
+
+        struct
+        {
+            u64 pc;
+            u64 sp;
+            u64 x18;
+            u64 x19;
+            u64 x20;
+            u64 x21;
+            u64 x22;
+            u64 x23;
+            u64 x24;
+            u64 x25;
+            u64 x26;
+            u64 x27;
+            u64 x28;
+            u64 x29;
+            u64 x30;
+            // x0-x17 unused
+        } aarch64;
+
+        struct
+        {
+            u64 sp;
+            // Other regs unused
+        } x86;
+    };
+}
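The gateway and escape code below index this union as a flat array: regs[0] holds the resume pc, regs[1] the native sp, and regs[2..14] hold x18..x30. A few illustrative compile-time checks, not part of the commit, pin those assumptions down (offsetof with a nested member designator is a widely supported extension rather than strictly standard):

    #include <cstddef> // offsetof, illustration only

    static_assert(sizeof(rpcs3::hypervisor_context_t) == 16 * sizeof(u64));
    static_assert(offsetof(rpcs3::hypervisor_context_t, aarch64.pc)  == 0);
    static_assert(offsetof(rpcs3::hypervisor_context_t, aarch64.sp)  == 8);
    static_assert(offsetof(rpcs3::hypervisor_context_t, aarch64.x18) == 16);
    static_assert(offsetof(rpcs3::hypervisor_context_t, aarch64.x30) == 112); // matches the str at [x14, 112] in ppu_gateway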

View file

@@ -222,7 +222,7 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
 #endif

     // Save native stack pointer for longjmp emulation
-    c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)), x86::rsp);
+    c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp);

     // Initialize args
     c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
@@ -291,37 +291,48 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
     // and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers
     // for AArch64 calling convention

-    // Save sp for native longjmp emulation
-    Label native_sp_offset = c.newLabel();
-    c.ldr(a64::x10, arm::Mem(native_sp_offset));
-
-    // sp not allowed to be used in load/stores directly
-    c.mov(a64::x15, a64::sp);
-    c.str(a64::x15, arm::Mem(args[0], a64::x10));
-
-    // Push callee saved registers to the stack
-    // We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
-    c.sub(a64::sp, a64::sp, Imm(112));
-    c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
-    c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
-    c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
-    c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
-    c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
-    c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
-    c.str(a64::x30, arm::Mem(a64::sp, 96));
+    // Push callee saved registers to the hv context
+    // Assume our LLVM compiled code is unsafe and can clobber our stack. GHC on aarch64 treats stack as scratch.
+    // We also want to store the register context at a fixed place so we can read the hypervisor state from any location.
+    // We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
+
+    // Pre-context save
+    // Layout:
+    // pc, sp
+    // x18, x19...x30
+    // NOTE: Do not touch x19..x30 before saving the registers!
+    const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
+    Label hv_ctx_pc = c.newLabel(); // Used to hold the far jump return address
+
+    // Sanity
+    ensure(hv_register_array_offset < 4096); // Imm10
+
+    c.mov(a64::x15, args[0]);
+    c.add(a64::x14, a64::x15, Imm(hv_register_array_offset)); // Per-thread context save
+    c.adr(a64::x15, hv_ctx_pc); // x15 = pc
+    c.mov(a64::x13, a64::sp);   // x13 = sp
+    c.stp(a64::x15, a64::x13, arm::Mem(a64::x14));
+    c.stp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
+    c.stp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
+    c.stp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
+    c.stp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
+    c.stp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
+    c.stp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
+    c.str(a64::x30, arm::Mem(a64::x14, 112));

     // Load REG_Base - use absolute jump target to bypass rel jmp range limits
-    Label exec_addr = c.newLabel();
-    c.ldr(a64::x19, arm::Mem(exec_addr));
+    c.mov(a64::x19, Imm(reinterpret_cast<u64>(&vm::g_exec_addr)));
     c.ldr(a64::x19, arm::Mem(a64::x19));

     // Load PPUThread struct base -> REG_Sp
     const arm::GpX ppu_t_base = a64::x20;
     c.mov(ppu_t_base, args[0]);

     // Load PC
     const arm::GpX pc = a64::x15;
-    Label cia_offset = c.newLabel();
     const arm::GpX cia_addr_reg = a64::x11;

     // Load offset value
-    c.ldr(cia_addr_reg, arm::Mem(cia_offset));
+    c.mov(cia_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::cia))));

     // Load cia
     c.ldr(a64::w15, arm::Mem(ppu_t_base, cia_addr_reg));

     // Multiply by 2 to index into ptr table
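In other words, the save sequence above writes the resume address, the native stack pointer and x18..x30 into ppu_thread::hv_ctx at a fixed offset, so the hypervisor state can be read from anywhere. A rough C-level picture of the store layout, illustration only, with field names taken from Hypervisor.h:

    // Not part of the commit: what the stp/str sequence above leaves behind.
    void gateway_context_save_sketch(ppu_thread* ppu, u64 resume_pc, u64 native_sp)
    {
        auto& hv = ppu->hv_ctx.aarch64;
        hv.pc = resume_pc;   // adr x15, hv_ctx_pc; stored at regs[0]
        hv.sp = native_sp;   // mov x13, sp;       stored at regs[1]
        // x18..x30 follow at regs[2..14] (stp pairs at offsets 16..112)
    }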
@@ -343,44 +354,45 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
     c.lsr(call_target, call_target, Imm(16));

     // Load registers
-    Label base_addr = c.newLabel();
-    c.ldr(a64::x22, arm::Mem(base_addr));
+    c.mov(a64::x22, Imm(reinterpret_cast<u64>(&vm::g_base_addr)));
     c.ldr(a64::x22, arm::Mem(a64::x22));

-    Label gpr_addr_offset = c.newLabel();
     const arm::GpX gpr_addr_reg = a64::x9;
-    c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset));
+    c.mov(gpr_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::gpr))));
     c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base);
     c.ldr(a64::x23, arm::Mem(gpr_addr_reg));
     c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
     c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));

+    // GHC frame for the guest. This seems dodgy but the only thing stored on stack is actually registers before making calls to C++ code.
+    // Injected stack frames also work, but are not free and are completely unnecessary.
+    c.sub(a64::sp, a64::sp, Imm(4096));
+
     // Execute LLE call
     c.blr(call_target);

-    // Restore registers from the stack
-    c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
-    c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
-    c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
-    c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
-    c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
-    c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
-    c.ldr(a64::x30, arm::Mem(a64::sp, 96));
-
-    // Restore stack ptr
-    c.add(a64::sp, a64::sp, Imm(112));
-
-    // Return
-    c.ret(a64::x30);
-
-    c.bind(exec_addr);
-    c.embedUInt64(reinterpret_cast<u64>(&vm::g_exec_addr));
-    c.bind(base_addr);
-    c.embedUInt64(reinterpret_cast<u64>(&vm::g_base_addr));
-    c.bind(cia_offset);
-    c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::cia)));
-    c.bind(gpr_addr_offset);
-    c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::gpr)));
-    c.bind(native_sp_offset);
-    c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::saved_native_sp)));
+    // Return address after far jump. Reset sp and start unwinding...
+    c.bind(hv_ctx_pc);
+
+    // Execution guard undo (unneeded since we're going to hard-reset the SP)
+    //c.add(a64::sp, a64::sp, Imm(4096));
+
+    // We either got here through normal "ret" which keeps our x20 intact, or we jumped here and the escape reset our x20 reg
+    // Either way, x20 contains our thread base and we forcefully reset the stack pointer
+    c.add(a64::x14, a64::x20, Imm(hv_register_array_offset)); // Per-thread context save
+    c.ldr(a64::x15, arm::Mem(a64::x14, 8));
+    c.ldp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
+    c.ldp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
+    c.ldp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
+    c.ldp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
+    c.ldp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
+    c.ldp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
+    c.ldr(a64::x30, arm::Mem(a64::x14, 112));
+
+    // Return
+    c.mov(a64::sp, a64::x15);
+    c.ret(a64::x30);

 #endif
 });
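The epilogue above is shared by two paths: a plain ret out of the JIT'd GHC code, and the far jump taken by the escape path, which lands on hv_ctx_pc with x20 still holding the ppu_thread pointer. Below is a standalone asmjit sketch of that far-jump pattern. It is illustrative, not RPCS3 code, and reuses only calls that already appear in this diff:

    #include <asmjit/a64.h>
    using namespace asmjit;

    // Capture a resume label in a context block, then "return" to it later by
    // reloading the stored address into x30 and ret'ing; sp may be trashed in between.
    void far_return_sketch(a64::Assembler& c, const arm::GpX& ctx_base)
    {
        Label resume = c.newLabel();
        c.adr(a64::x15, resume);              // x15 = address of the resume point
        c.str(a64::x15, arm::Mem(ctx_base));  // stash it at ctx_base[0]
        // ... foreign code runs here and may clobber sp ...
        c.ldr(a64::x30, arm::Mem(ctx_base));  // escape: reload the resume address
        c.ret(a64::x30);                      // far jump back
        c.bind(resume);                       // execution resumes here
    }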
@@ -390,11 +402,20 @@ const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_esc
 #if defined(ARCH_X64)
     // Restore native stack pointer (longjmp emulation)
-    c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)));
+    c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)));

     // Return to the return location
     c.sub(x86::rsp, 8);
     c.ret();
+#else
+    // We really shouldn't be using this, but an implementation shouldn't hurt
+    // Far jump return. Only clobbers x30.
+    const arm::GpX ppu_t_base = a64::x20;
+    const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
+
+    c.mov(ppu_t_base, args[0]);
+    c.mov(a64::x30, Imm(hv_register_array_offset));
+    c.ldr(a64::x30, arm::Mem(ppu_t_base, a64::x30));
+    c.ret(a64::x30);
 #endif
 });
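On aarch64 the escape therefore acts like a longjmp: it reloads the address the gateway stored in hv_ctx.regs[0] into the link register and rets, resuming at hv_ctx_pc, where sp and the callee-saved registers are restored from the same context. A minimal sketch of the one value it relies on, illustration only and not part of the commit:

    // Where the aarch64 escape path jumps to.
    static u64 escape_resume_address_sketch(const ppu_thread& ppu)
    {
        return ppu.hv_ctx.regs[0]; // address of ppu_gateway's hv_ctx_pc label, saved on entry
    }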
@@ -2265,6 +2286,9 @@ void ppu_thread::exec_task()
 {
     if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static)
     {
+        // HVContext push to allow recursion. This happens with guest callback invocations.
+        const auto old_hv_ctx = hv_ctx;
+
         while (true)
         {
             if (state) [[unlikely]]
@@ -2276,6 +2300,8 @@
             ppu_gateway(this);
         }

+        // HVContext pop
+        hv_ctx = old_hv_ctx;
         return;
     }
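Since hypervisor_context_t is trivially copyable, the push/pop above is a plain value copy around the gateway loop. The same effect could also be expressed as a scope guard, which would additionally cover any exit path between push and pop; a sketch under that assumption, not what the commit does:

    struct hv_ctx_backup
    {
        rpcs3::hypervisor_context_t& live;
        rpcs3::hypervisor_context_t saved;

        explicit hv_ctx_backup(rpcs3::hypervisor_context_t& ctx) : live(ctx), saved(ctx) {}
        ~hv_ctx_backup() { live = saved; } // restore on scope exit
    };

    // Hypothetical usage inside exec_task(): hv_ctx_backup guard{hv_ctx};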
@@ -2314,6 +2340,8 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
 {
     prio.raw().prio = _prio;

+    memset(&hv_ctx, 0, sizeof(hv_ctx));
+
     gpr[1] = stack_addr + stack_size - ppu_stack_start_offset;
     gpr[13] = param.tls_addr;
@@ -3502,7 +3530,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
     if (notify)
     {
         bool notified = false;

         if (ppu.res_notify_time == (vm::reservation_acquire(notify) & -128))
         {
@@ -5277,12 +5305,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
             // Translate
             if (const auto func = translator.Translate(module_part.funcs[fi]))
             {
+#ifdef ARCH_X64 // TODO
                 // Run optimization passes
 #if LLVM_VERSION_MAJOR < 17
                 pm.run(*func);
 #else
                 fpm.run(*func, fam);
 #endif
+#endif // ARCH_X64
             }
             else
             {
@@ -5297,12 +5327,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
     {
         if (const auto func = translator.GetSymbolResolver(whole_module))
         {
+#ifdef ARCH_X64 // TODO
             // Run optimization passes
 #if LLVM_VERSION_MAJOR < 17
             pm.run(*func);
 #else
             fpm.run(*func, fam);
 #endif
+#endif // ARCH_X64
         }
         else
         {

View file

@@ -1,6 +1,7 @@
 #pragma once

 #include "../CPU/CPUThread.h"
+#include "../CPU/Hypervisor.h"
 #include "../Memory/vm_ptr.h"
 #include "Utilities/lockless.h"
 #include "Utilities/BitField.h"
@@ -163,6 +164,9 @@ public:
     using cpu_thread::operator=;

+    // Hypervisor context data
+    alignas(16) rpcs3::hypervisor_context_t hv_ctx; // HV context for gate enter exit. Keep at a low struct offset.
+
     u64 gpr[32] = {}; // General-Purpose Registers
     f64 fpr[32] = {}; // Floating Point Registers
     v128 vr[32] = {}; // Vector Registers
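The "low struct offset" requirement exists because ppu_gateway folds offset32(&ppu_thread::hv_ctx, ...) into an add-immediate (it checks the value is below 4096 at JIT-build time via ensure()) and the x86 path uses it directly in an addressing mode. An illustrative compile-time guard with the same intent, not part of the commit, and noting that offsetof on a non-standard-layout type is only conditionally supported:

    static_assert(offsetof(ppu_thread, hv_ctx) + offsetof(rpcs3::hypervisor_context_t, regs) < 4096,
        "hv_ctx must remain addressable by the gateway's add-immediate");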

View file

@@ -208,8 +208,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
         m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Or, ptr, m_ir->getInt32((+cpu_flag::wait).operator u32()), llvm::MaybeAlign{4}, llvm::AtomicOrdering::AcquireRelease);

         // Create tail call to the check function
-        Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
-        m_ir->CreateRetVoid();
+        VMEscape(Call(GetType<void>(), "__check", m_thread, GetAddr()));
     }
     else
     {
@@ -321,7 +320,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
     if (vec_addrs.empty())
     {
         // Possible special case for no functions (allowing the do-while optimization)
-        m_ir->CreateRetVoid();
+        m_ir->CreateRetVoid(); // FIXME: Aarch64. It should work fine as long as there is no callchain beyond this function with a ret path.
         replace_intrinsics(*m_function);
         return m_function;
     }
@@ -378,7 +377,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
     // Set insertion point to afterloop_block
     m_ir->SetInsertPoint(after_loop);
-    m_ir->CreateRetVoid();
+    m_ir->CreateRetVoid(); // FIXME: Aarch64 - Should be ok as long as no ret-based callchain proceeds from here

     replace_intrinsics(*m_function);
     return m_function;
@@ -482,8 +481,8 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
     if (_target >= u32{umax})
     {
-        Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
-        m_ir->CreateRetVoid();
+        auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
+        VMEscape(c);
         return;
     }
     else if (_target >= caddr && _target <= cend)
@@ -565,7 +564,7 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
     const auto c = m_ir->CreateCall(callee, {m_exec, m_thread, seg0, m_base, GetGpr(0), GetGpr(1), GetGpr(2)});
     c->setTailCallKind(llvm::CallInst::TCK_Tail);
     c->setCallingConv(CallingConv::GHC);
-    m_ir->CreateRetVoid();
+    VMEscape(c);
 }

 Value* PPUTranslator::RegInit(Value*& local)
@@ -779,8 +778,8 @@ void PPUTranslator::TestAborted()
     m_ir->SetInsertPoint(vcheck);

     // Create tail call to the check function
-    Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
-    m_ir->CreateRetVoid();
+    auto c = Call(GetType<void>(), "__check", m_thread, GetAddr());
+    VMEscape(c);
     m_ir->SetInsertPoint(body);
 }
@@ -2206,16 +2205,14 @@ void PPUTranslator::SC(ppu_opcode_t op)
         if (index < 1024)
         {
-            Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
-            //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-            m_ir->CreateRetVoid();
+            auto c = Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
+            VMEscape(c, true);
             return;
         }
     }

-    Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
-    //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-    m_ir->CreateRetVoid();
+    auto c = Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
+    VMEscape(c, true);
 }

 void PPUTranslator::B(ppu_opcode_t op)
@@ -2776,9 +2773,9 @@ void PPUTranslator::LWARX(ppu_opcode_t op)
     {
         RegStore(Trunc(GetAddr()), m_cia);
         FlushRegisters();
-        Call(GetType<void>(), "__resinterp", m_thread);
-        //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-        m_ir->CreateRetVoid();
+
+        auto inst = Call(GetType<void>(), "__resinterp", m_thread);
+        VMEscape(inst, true);
         return;
     }
@@ -2928,9 +2925,9 @@ void PPUTranslator::LDARX(ppu_opcode_t op)
     {
         RegStore(Trunc(GetAddr()), m_cia);
         FlushRegisters();
-        Call(GetType<void>(), "__resinterp", m_thread);
-        //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-        m_ir->CreateRetVoid();
+
+        auto inst = Call(GetType<void>(), "__resinterp", m_thread);
+        VMEscape(inst, true);
         return;
     }
@@ -4998,9 +4995,8 @@ void PPUTranslator::FCFID(ppu_opcode_t op)

 void PPUTranslator::UNK(ppu_opcode_t op)
 {
     FlushRegisters();
-    Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
-    //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-    m_ir->CreateRetVoid();
+    auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
+    VMEscape(c, true);
 }
@@ -5279,9 +5275,8 @@ Value* PPUTranslator::CheckTrapCondition(u32 to, Value* left, Value* right)

 void PPUTranslator::Trap()
 {
-    Call(GetType<void>(), "__trap", m_thread, GetAddr());
-    //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-    m_ir->CreateRetVoid();
+    auto c = Call(GetType<void>(), "__trap", m_thread, GetAddr());
+    VMEscape(c);
 }

 Value* PPUTranslator::CheckBranchCondition(u32 bo, u32 bi)
@@ -5328,6 +5323,42 @@ MDNode* PPUTranslator::CheckBranchProbability(u32 bo)
     return nullptr;
 }

+void PPUTranslator::VMEscape([[maybe_unused]] llvm::CallInst* tail_call, [[maybe_unused]] bool skip_flush)
+{
+    //if (!skip_flush)
+    {
+        // Flush
+        FlushRegisters();
+    }
+
+#ifdef ARCH_X64
+    // Optionally flag last call as a tail
+    if (tail_call)
+    {
+        tail_call->setTailCall();
+    }
+
+    // This is actually AMD64 specific but good enough for now
+    m_ir->CreateRetVoid();
+#else
+    // Validation. Make sure we're escaping from a correct context. Only guest JIT should ever go through the "escape" gate.
+    const auto bb = m_ir->GetInsertPoint();
+    const auto arg = llvm::dyn_cast<llvm::Argument>(m_thread);
+    ensure(bb->getParent()->getName().str() == arg->getParent()->getName().str());
+
+    const u32 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
+    const std::string asm_ = fmt::format(
+        "ldr x20, $0;\n"
+        "ldr x30, [x20, #%u];\n",
+        hv_register_array_offset);
+
+    LLVM_ASM(asm_, std::array{ m_thread }, "m", m_ir, m_function->getContext());
+    m_ir->CreateRetVoid();
+#endif
+}
+
 void PPUTranslator::build_interpreter()
 {
 #define BUILD_VEC_INST(i) { \
@@ -5343,8 +5374,7 @@ void PPUTranslator::build_interpreter()
     op.vb = 2; \
     op.vc = 3; \
     this->i(op); \
-    FlushRegisters(); \
-    m_ir->CreateRetVoid(); \
+    VMEscape(); \
     replace_intrinsics(*m_function); \
 }
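For a concrete picture of what VMEscape emits on aarch64: the formatted template reloads the thread pointer from the memory operand bound to m_thread, then overwrites x30 with hv_ctx.regs[0], the resume address ppu_gateway saved on entry, so the ret emitted right after it leaves the GHC call chain and lands in the gateway epilogue. With a hypothetical offset of 64 the string expands as sketched below; this is illustrative only and not part of the commit:

    // Expanded inline-asm template for an assumed offset of 64:
    //   "ldr x20, $0;\n"         -> x20 = ppu_thread* (memory operand $0, constraint "m")
    //   "ldr x30, [x20, #64];\n" -> x30 = hv_ctx.regs[0] = resume address inside ppu_gateway
    const std::string expanded = fmt::format(
        "ldr x20, $0;\n"
        "ldr x30, [x20, #%u];\n", 64u);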

View file

@@ -150,6 +150,9 @@
     // Emit function call
     void CallFunction(u64 target, llvm::Value* indirect = nullptr);

+    // Emit escape sequence back to hypervisor
+    void VMEscape(llvm::CallInst* tail_call = nullptr, bool skip_flush = false);
+
     // Emit state check mid-block
     void TestAborted();