Initial PPU LLVM implementation for aarch64

kd-11 2024-08-04 05:09:06 +03:00 committed by kd-11
parent a5f9256ac6
commit 56cc5d9355
6 changed files with 223 additions and 78 deletions

View file

@@ -26,6 +26,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/InlineAsm.h"

 #ifdef _MSC_VER
 #pragma warning(pop)
@@ -3898,4 +3899,39 @@ struct fmt_unveil<llvm::TypeSize, void>
     }
 };

+// Inline assembly wrappers.
+// TODO: Move these to proper location and replace macros with templates
+static inline
+llvm::InlineAsm* compile_inline_asm(
+    llvm::Type* returnType,
+    llvm::ArrayRef<llvm::Type*> argTypes,
+    const std::string& code,
+    const std::string& constraints)
+{
+    const auto callSig = llvm::FunctionType::get(returnType, argTypes, false);
+    return llvm::InlineAsm::get(callSig, code, constraints, true, false);
+}
+
+// Helper for ASM generation with dynamic number of arguments
+#define LLVM_ASM(asm_, args, constraints, irb, ctx)\
+    do {\
+        std::vector<llvm::Type*> _argTypes;\
+        _argTypes.reserve(args.size());\
+        for (const auto& _arg : args) _argTypes.push_back(_arg->getType());\
+        auto _returnType = llvm::Type::getVoidTy(ctx); \
+        llvm::FunctionCallee _callee = compile_inline_asm(_returnType, _argTypes, asm_, constraints); \
+        auto _c = irb->CreateCall(_callee, args); \
+        _c->addFnAttr(llvm::Attribute::AlwaysInline); \
+    } while(0)
+
+// Helper for ASM generation with 0 args
+#define LLVM_ASM_0(asm_, irb, ctx)\
+    do {\
+        const auto _voidTy = llvm::Type::getVoidTy(ctx); \
+        auto _callee = compile_inline_asm(_voidTy, std::nullopt, asm_, ""); \
+        auto _c = irb->CreateCall(_callee); \
+        _c->setTailCall(); \
+        _c->addFnAttr(llvm::Attribute::AlwaysInline); \
+    } while(0)
+
 #endif
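For context, a minimal usage sketch of the wrappers above. This is illustrative and not part of the commit; the function name and the `irb` (llvm::IRBuilder<>*), `ctx` (llvm::LLVMContext&) and `thread_ptr` (llvm::Value* holding a pointer) parameters are assumptions standing in for whatever the calling translator already has in scope.

    #include <array> // for the argument pack in this sketch

    // Illustrative only: drive the helpers above with a zero-operand barrier and a
    // one-operand prefetch. Constraint "r" binds thread_ptr to operand $0 in the template.
    void emit_inline_asm_examples(llvm::IRBuilder<>* irb, llvm::LLVMContext& ctx, llvm::Value* thread_ptr)
    {
        // Zero arguments, empty constraint string
        LLVM_ASM_0("isb", irb, ctx);

        // One register input referenced as $0
        std::array<llvm::Value*, 1> args{ thread_ptr };
        LLVM_ASM("prfm pldl1keep, [$0]", args, "r", irb, ctx);
    }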

View file

@@ -0,0 +1,40 @@
+#pragma once
+
+#include <util/types.hpp>
+
+namespace rpcs3
+{
+    union alignas(16) hypervisor_context_t
+    {
+        u64 regs[16];
+
+        struct
+        {
+            u64 pc;
+            u64 sp;
+            u64 x18;
+            u64 x19;
+            u64 x20;
+            u64 x21;
+            u64 x22;
+            u64 x23;
+            u64 x24;
+            u64 x25;
+            u64 x26;
+            u64 x27;
+            u64 x28;
+            u64 x29;
+            u64 x30;
+            // x0-x17 unused
+        } aarch64;
+
+        struct
+        {
+            u64 sp;
+            // Other regs unused
+        } x86;
+    };
+}
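The gateway and escape code below index this union as a flat array: regs[0] holds the resume pc, regs[1] the native sp, and regs[2..14] hold x18..x30. A few illustrative compile-time checks, not part of the commit, pin those assumptions down (offsetof with a nested member designator is a widely supported extension rather than strictly standard):

    #include <cstddef> // offsetof, illustration only

    static_assert(sizeof(rpcs3::hypervisor_context_t) == 16 * sizeof(u64));
    static_assert(offsetof(rpcs3::hypervisor_context_t, aarch64.pc)  == 0);
    static_assert(offsetof(rpcs3::hypervisor_context_t, aarch64.sp)  == 8);
    static_assert(offsetof(rpcs3::hypervisor_context_t, aarch64.x18) == 16);
    static_assert(offsetof(rpcs3::hypervisor_context_t, aarch64.x30) == 112); // matches the str at [x14, 112] in ppu_gateway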

View file

@@ -222,7 +222,7 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
 #endif

     // Save native stack pointer for longjmp emulation
-    c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)), x86::rsp);
+    c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp);

     // Initialize args
     c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
@@ -291,37 +291,48 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
     // and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers
     // for AArch64 calling convention

-    // Save sp for native longjmp emulation
-    Label native_sp_offset = c.newLabel();
-    c.ldr(a64::x10, arm::Mem(native_sp_offset));
-
-    // sp not allowed to be used in load/stores directly
-    c.mov(a64::x15, a64::sp);
-    c.str(a64::x15, arm::Mem(args[0], a64::x10));
-
-    // Push callee saved registers to the stack
-    // We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
-    c.sub(a64::sp, a64::sp, Imm(112));
-    c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
-    c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
-    c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
-    c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
-    c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
-    c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
-    c.str(a64::x30, arm::Mem(a64::sp, 96));
+    // Push callee saved registers to the hv context
+    // Assume our LLVM compiled code is unsafe and can clobber our stack. GHC on aarch64 treats stack as scratch.
+    // We also want to store the register context at a fixed place so we can read the hypervisor state from any location.
+    // We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
+
+    // Pre-context save
+    // Layout:
+    // pc, sp
+    // x18, x19...x30
+    // NOTE: Do not touch x19..x30 before saving the registers!
+    const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
+    Label hv_ctx_pc = c.newLabel(); // Used to hold the far jump return address
+
+    // Sanity
+    ensure(hv_register_array_offset < 4096); // Imm10
+
+    c.mov(a64::x15, args[0]);
+    c.add(a64::x14, a64::x15, Imm(hv_register_array_offset)); // Per-thread context save
+    c.adr(a64::x15, hv_ctx_pc); // x15 = pc
+    c.mov(a64::x13, a64::sp);   // x13 = sp
+    c.stp(a64::x15, a64::x13, arm::Mem(a64::x14));
+    c.stp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
+    c.stp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
+    c.stp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
+    c.stp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
+    c.stp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
+    c.stp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
+    c.str(a64::x30, arm::Mem(a64::x14, 112));

     // Load REG_Base - use absolute jump target to bypass rel jmp range limits
-    Label exec_addr = c.newLabel();
-    c.ldr(a64::x19, arm::Mem(exec_addr));
+    c.mov(a64::x19, Imm(reinterpret_cast<u64>(&vm::g_exec_addr)));
     c.ldr(a64::x19, arm::Mem(a64::x19));

     // Load PPUThread struct base -> REG_Sp
     const arm::GpX ppu_t_base = a64::x20;
     c.mov(ppu_t_base, args[0]);

     // Load PC
     const arm::GpX pc = a64::x15;
-    Label cia_offset = c.newLabel();
     const arm::GpX cia_addr_reg = a64::x11;

     // Load offset value
-    c.ldr(cia_addr_reg, arm::Mem(cia_offset));
+    c.mov(cia_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::cia))));

     // Load cia
     c.ldr(a64::w15, arm::Mem(ppu_t_base, cia_addr_reg));

     // Multiply by 2 to index into ptr table
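In other words, the save sequence above writes the resume address, the native stack pointer and x18..x30 into ppu_thread::hv_ctx at a fixed offset, so the hypervisor state can be read from anywhere. A rough C-level picture of the store layout, illustration only, with field names taken from Hypervisor.h:

    // Not part of the commit: what the stp/str sequence above leaves behind.
    void gateway_context_save_sketch(ppu_thread* ppu, u64 resume_pc, u64 native_sp)
    {
        auto& hv = ppu->hv_ctx.aarch64;
        hv.pc = resume_pc;   // adr x15, hv_ctx_pc; stored at regs[0]
        hv.sp = native_sp;   // mov x13, sp;       stored at regs[1]
        // x18..x30 follow at regs[2..14] (stp pairs at offsets 16..112)
    }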
@@ -343,44 +354,45 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
     c.lsr(call_target, call_target, Imm(16));

     // Load registers
-    Label base_addr = c.newLabel();
-    c.ldr(a64::x22, arm::Mem(base_addr));
+    c.mov(a64::x22, Imm(reinterpret_cast<u64>(&vm::g_base_addr)));
     c.ldr(a64::x22, arm::Mem(a64::x22));

-    Label gpr_addr_offset = c.newLabel();
     const arm::GpX gpr_addr_reg = a64::x9;
-    c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset));
+    c.mov(gpr_addr_reg, Imm(static_cast<u64>(::offset32(&ppu_thread::gpr))));
     c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base);
     c.ldr(a64::x23, arm::Mem(gpr_addr_reg));
     c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
     c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));

+    // GHC frame for the guest. This seems dodgy but the only thing stored on stack is actually registers before making calls to C++ code.
+    // Injected stack frames also work, but are not free and are completely unnecessary.
+    c.sub(a64::sp, a64::sp, Imm(4096));
+
     // Execute LLE call
     c.blr(call_target);

-    // Restore registers from the stack
-    c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
-    c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
-    c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
-    c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
-    c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
-    c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
-    c.ldr(a64::x30, arm::Mem(a64::sp, 96));
-
-    // Restore stack ptr
-    c.add(a64::sp, a64::sp, Imm(112));
-
-    // Return
-    c.ret(a64::x30);
-
-    c.bind(exec_addr);
-    c.embedUInt64(reinterpret_cast<u64>(&vm::g_exec_addr));
-    c.bind(base_addr);
-    c.embedUInt64(reinterpret_cast<u64>(&vm::g_base_addr));
-    c.bind(cia_offset);
-    c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::cia)));
-    c.bind(gpr_addr_offset);
-    c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::gpr)));
-    c.bind(native_sp_offset);
-    c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::saved_native_sp)));
+    // Return address after far jump. Reset sp and start unwinding...
+    c.bind(hv_ctx_pc);
+
+    // Execution guard undo (unneeded since we're going to hard-reset the SP)
+    //c.add(a64::sp, a64::sp, Imm(4096));
+
+    // We either got here through normal "ret" which keeps our x20 intact, or we jumped here and the escape reset our x20 reg
+    // Either way, x20 contains our thread base and we forcefully reset the stack pointer
+    c.add(a64::x14, a64::x20, Imm(hv_register_array_offset)); // Per-thread context save
+    c.ldr(a64::x15, arm::Mem(a64::x14, 8));
+    c.ldp(a64::x18, a64::x19, arm::Mem(a64::x14, 16));
+    c.ldp(a64::x20, a64::x21, arm::Mem(a64::x14, 32));
+    c.ldp(a64::x22, a64::x23, arm::Mem(a64::x14, 48));
+    c.ldp(a64::x24, a64::x25, arm::Mem(a64::x14, 64));
+    c.ldp(a64::x26, a64::x27, arm::Mem(a64::x14, 80));
+    c.ldp(a64::x28, a64::x29, arm::Mem(a64::x14, 96));
+    c.ldr(a64::x30, arm::Mem(a64::x14, 112));
+
+    // Return
+    c.mov(a64::sp, a64::x15);
+    c.ret(a64::x30);

 #endif
 });
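The epilogue above is shared by two paths: a plain ret out of the JIT'd GHC code, and the far jump taken by the escape path, which lands on hv_ctx_pc with x20 still holding the ppu_thread pointer. Below is a standalone asmjit sketch of that far-jump pattern. It is illustrative, not RPCS3 code, and reuses only calls that already appear in this diff:

    #include <asmjit/a64.h>
    using namespace asmjit;

    // Capture a resume label in a context block, then "return" to it later by
    // reloading the stored address into x30 and ret'ing; sp may be trashed in between.
    void far_return_sketch(a64::Assembler& c, const arm::GpX& ctx_base)
    {
        Label resume = c.newLabel();
        c.adr(a64::x15, resume);              // x15 = address of the resume point
        c.str(a64::x15, arm::Mem(ctx_base));  // stash it at ctx_base[0]
        // ... foreign code runs here and may clobber sp ...
        c.ldr(a64::x30, arm::Mem(ctx_base));  // escape: reload the resume address
        c.ret(a64::x30);                      // far jump back
        c.bind(resume);                       // execution resumes here
    }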
@@ -390,11 +402,20 @@ const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_esc
 #if defined(ARCH_X64)
     // Restore native stack pointer (longjmp emulation)
-    c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)));
+    c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)));

     // Return to the return location
     c.sub(x86::rsp, 8);
     c.ret();
+#else
+    // We really shouldn't be using this, but an implementation shouldn't hurt
+    // Far jump return. Only clobbers x30.
+    const arm::GpX ppu_t_base = a64::x20;
+    const u64 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
+
+    c.mov(ppu_t_base, args[0]);
+    c.mov(a64::x30, Imm(hv_register_array_offset));
+    c.ldr(a64::x30, arm::Mem(ppu_t_base, a64::x30));
+    c.ret(a64::x30);
 #endif
 });
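On aarch64 the escape therefore acts like a longjmp: it reloads the address the gateway stored in hv_ctx.regs[0] into the link register and rets, resuming at hv_ctx_pc, where sp and the callee-saved registers are restored from the same context. A minimal sketch of the one value it relies on, illustration only and not part of the commit:

    // Where the aarch64 escape path jumps to.
    static u64 escape_resume_address_sketch(const ppu_thread& ppu)
    {
        return ppu.hv_ctx.regs[0]; // address of ppu_gateway's hv_ctx_pc label, saved on entry
    }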
@@ -2265,6 +2286,9 @@ void ppu_thread::exec_task()
 {
     if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static)
     {
+        // HVContext push to allow recursion. This happens with guest callback invocations.
+        const auto old_hv_ctx = hv_ctx;
+
         while (true)
         {
             if (state) [[unlikely]]
@@ -2276,6 +2300,8 @@
             ppu_gateway(this);
         }

+        // HVContext pop
+        hv_ctx = old_hv_ctx;
         return;
     }
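Since hypervisor_context_t is trivially copyable, the push/pop above is a plain value copy around the gateway loop. The same effect could also be expressed as a scope guard, which would additionally cover any exit path between push and pop; a sketch under that assumption, not what the commit does:

    struct hv_ctx_backup
    {
        rpcs3::hypervisor_context_t& live;
        rpcs3::hypervisor_context_t saved;

        explicit hv_ctx_backup(rpcs3::hypervisor_context_t& ctx) : live(ctx), saved(ctx) {}
        ~hv_ctx_backup() { live = saved; } // restore on scope exit
    };

    // Hypothetical usage inside exec_task(): hv_ctx_backup guard{hv_ctx};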
@@ -2314,6 +2340,8 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
 {
     prio.raw().prio = _prio;

+    memset(&hv_ctx, 0, sizeof(hv_ctx));
+
     gpr[1] = stack_addr + stack_size - ppu_stack_start_offset;
     gpr[13] = param.tls_addr;
@@ -3502,7 +3530,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
     if (notify)
     {
         bool notified = false;

         if (ppu.res_notify_time == (vm::reservation_acquire(notify) & -128))
         {
@@ -5277,12 +5305,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
             // Translate
             if (const auto func = translator.Translate(module_part.funcs[fi]))
             {
+#ifdef ARCH_X64 // TODO
                 // Run optimization passes
 #if LLVM_VERSION_MAJOR < 17
                 pm.run(*func);
 #else
                 fpm.run(*func, fam);
 #endif
+#endif // ARCH_X64
             }
             else
             {
@@ -5297,12 +5327,14 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
     {
         if (const auto func = translator.GetSymbolResolver(whole_module))
         {
+#ifdef ARCH_X64 // TODO
             // Run optimization passes
 #if LLVM_VERSION_MAJOR < 17
             pm.run(*func);
 #else
             fpm.run(*func, fam);
 #endif
+#endif // ARCH_X64
         }
         else
         {

View file

@@ -1,6 +1,7 @@
 #pragma once

 #include "../CPU/CPUThread.h"
+#include "../CPU/Hypervisor.h"
 #include "../Memory/vm_ptr.h"
 #include "Utilities/lockless.h"
 #include "Utilities/BitField.h"
@@ -163,6 +164,9 @@ public:
     using cpu_thread::operator=;

+    // Hypervisor context data
+    alignas(16) rpcs3::hypervisor_context_t hv_ctx; // HV context for gate enter exit. Keep at a low struct offset.
+
     u64 gpr[32] = {}; // General-Purpose Registers
     f64 fpr[32] = {}; // Floating Point Registers
     v128 vr[32] = {}; // Vector Registers
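The "low struct offset" requirement exists because ppu_gateway folds offset32(&ppu_thread::hv_ctx, ...) into an add-immediate (it checks the value is below 4096 at JIT-build time via ensure()) and the x86 path uses it directly in an addressing mode. An illustrative compile-time guard with the same intent, not part of the commit, and noting that offsetof on a non-standard-layout type is only conditionally supported:

    static_assert(offsetof(ppu_thread, hv_ctx) + offsetof(rpcs3::hypervisor_context_t, regs) < 4096,
        "hv_ctx must remain addressable by the gateway's add-immediate");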

View file

@@ -208,8 +208,7 @@ Function* PPUTranslator::Translate(const ppu_function& info)
         m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Or, ptr, m_ir->getInt32((+cpu_flag::wait).operator u32()), llvm::MaybeAlign{4}, llvm::AtomicOrdering::AcquireRelease);

         // Create tail call to the check function
-        Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
-        m_ir->CreateRetVoid();
+        VMEscape(Call(GetType<void>(), "__check", m_thread, GetAddr()));
     }
     else
     {
@@ -321,7 +320,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
     if (vec_addrs.empty())
     {
         // Possible special case for no functions (allowing the do-while optimization)
-        m_ir->CreateRetVoid();
+        m_ir->CreateRetVoid(); // FIXME: Aarch64. It should work fine as long as there is no callchain beyond this function with a ret path.
         replace_intrinsics(*m_function);
         return m_function;
     }
@@ -378,7 +377,7 @@ Function* PPUTranslator::GetSymbolResolver(const ppu_module& info)
     // Set insertion point to afterloop_block
     m_ir->SetInsertPoint(after_loop);
-    m_ir->CreateRetVoid();
+    m_ir->CreateRetVoid(); // FIXME: Aarch64 - Should be ok as long as no ret-based callchain proceeds from here

     replace_intrinsics(*m_function);
     return m_function;
@@ -482,8 +481,8 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
     if (_target >= u32{umax})
     {
-        Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
-        m_ir->CreateRetVoid();
+        auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(*ensure(m_info.get_ptr<u32>(::narrow<u32>(m_addr + base)))));
+        VMEscape(c);
         return;
     }
     else if (_target >= caddr && _target <= cend)
@@ -565,7 +564,7 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
     const auto c = m_ir->CreateCall(callee, {m_exec, m_thread, seg0, m_base, GetGpr(0), GetGpr(1), GetGpr(2)});
     c->setTailCallKind(llvm::CallInst::TCK_Tail);
     c->setCallingConv(CallingConv::GHC);
-    m_ir->CreateRetVoid();
+    VMEscape(c);
 }

 Value* PPUTranslator::RegInit(Value*& local)
@@ -779,8 +778,8 @@ void PPUTranslator::TestAborted()
     m_ir->SetInsertPoint(vcheck);

     // Create tail call to the check function
-    Call(GetType<void>(), "__check", m_thread, GetAddr())->setTailCall();
-    m_ir->CreateRetVoid();
+    auto c = Call(GetType<void>(), "__check", m_thread, GetAddr());
+    VMEscape(c);
     m_ir->SetInsertPoint(body);
 }
@@ -2206,16 +2205,14 @@ void PPUTranslator::SC(ppu_opcode_t op)
         if (index < 1024)
         {
-            Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
-            //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-            m_ir->CreateRetVoid();
+            auto c = Call(GetType<void>(), fmt::format("%s", ppu_syscall_code(index)), m_thread);
+            VMEscape(c, true);
             return;
         }
     }

-    Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
-    //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-    m_ir->CreateRetVoid();
+    auto c = Call(GetType<void>(), op.lev ? "__lv1call" : "__syscall", m_thread, num);
+    VMEscape(c, true);
 }

 void PPUTranslator::B(ppu_opcode_t op)
@@ -2776,9 +2773,9 @@ void PPUTranslator::LWARX(ppu_opcode_t op)
     {
         RegStore(Trunc(GetAddr()), m_cia);
         FlushRegisters();
-        Call(GetType<void>(), "__resinterp", m_thread);
-        //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-        m_ir->CreateRetVoid();
+
+        auto inst = Call(GetType<void>(), "__resinterp", m_thread);
+        VMEscape(inst, true);
         return;
     }
@@ -2928,9 +2925,9 @@ void PPUTranslator::LDARX(ppu_opcode_t op)
     {
         RegStore(Trunc(GetAddr()), m_cia);
         FlushRegisters();
-        Call(GetType<void>(), "__resinterp", m_thread);
-        //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-        m_ir->CreateRetVoid();
+
+        auto inst = Call(GetType<void>(), "__resinterp", m_thread);
+        VMEscape(inst, true);
         return;
     }
@@ -4998,9 +4995,8 @@ void PPUTranslator::FCFID(ppu_opcode_t op)

 void PPUTranslator::UNK(ppu_opcode_t op)
 {
     FlushRegisters();
-    Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
-    //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-    m_ir->CreateRetVoid();
+    auto c = Call(GetType<void>(), "__error", m_thread, GetAddr(), m_ir->getInt32(op.opcode));
+    VMEscape(c, true);
 }
@@ -5279,9 +5275,8 @@ Value* PPUTranslator::CheckTrapCondition(u32 to, Value* left, Value* right)

 void PPUTranslator::Trap()
 {
-    Call(GetType<void>(), "__trap", m_thread, GetAddr());
-    //Call(GetType<void>(), "__escape", m_thread)->setTailCall();
-    m_ir->CreateRetVoid();
+    auto c = Call(GetType<void>(), "__trap", m_thread, GetAddr());
+    VMEscape(c);
 }

 Value* PPUTranslator::CheckBranchCondition(u32 bo, u32 bi)
@@ -5328,6 +5323,42 @@ MDNode* PPUTranslator::CheckBranchProbability(u32 bo)
     return nullptr;
 }

+void PPUTranslator::VMEscape([[maybe_unused]] llvm::CallInst* tail_call, [[maybe_unused]] bool skip_flush)
+{
+    //if (!skip_flush)
+    {
+        // Flush
+        FlushRegisters();
+    }
+
+#ifdef ARCH_X64
+    // Optionally flag last call as a tail
+    if (tail_call)
+    {
+        tail_call->setTailCall();
+    }
+
+    // This is actually AMD64 specific but good enough for now
+    m_ir->CreateRetVoid();
+#else
+    // Validation. Make sure we're escaping from a correct context. Only guest JIT should ever go through the "escape" gate.
+    const auto bb = m_ir->GetInsertPoint();
+    const auto arg = llvm::dyn_cast<llvm::Argument>(m_thread);
+    ensure(bb->getParent()->getName().str() == arg->getParent()->getName().str());
+
+    const u32 hv_register_array_offset = ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs);
+    const std::string asm_ = fmt::format(
+        "ldr x20, $0;\n"
+        "ldr x30, [x20, #%u];\n",
+        hv_register_array_offset);
+
+    LLVM_ASM(asm_, std::array{ m_thread }, "m", m_ir, m_function->getContext());
+    m_ir->CreateRetVoid();
+#endif
+}
+
 void PPUTranslator::build_interpreter()
 {
 #define BUILD_VEC_INST(i) { \
@@ -5343,8 +5374,7 @@ void PPUTranslator::build_interpreter()
     op.vb = 2; \
     op.vc = 3; \
     this->i(op); \
-    FlushRegisters(); \
-    m_ir->CreateRetVoid(); \
+    VMEscape(); \
     replace_intrinsics(*m_function); \
 }
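For a concrete picture of what VMEscape emits on aarch64: the formatted template reloads the thread pointer from the memory operand bound to m_thread, then overwrites x30 with hv_ctx.regs[0], the resume address ppu_gateway saved on entry, so the ret emitted right after it leaves the GHC call chain and lands in the gateway epilogue. With a hypothetical offset of 64 the string expands as sketched below; this is illustrative only and not part of the commit:

    // Expanded inline-asm template for an assumed offset of 64:
    //   "ldr x20, $0;\n"         -> x20 = ppu_thread* (memory operand $0, constraint "m")
    //   "ldr x30, [x20, #64];\n" -> x30 = hv_ctx.regs[0] = resume address inside ppu_gateway
    const std::string expanded = fmt::format(
        "ldr x20, $0;\n"
        "ldr x30, [x20, #%u];\n", 64u);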

View file

@@ -150,6 +150,9 @@
     // Emit function call
     void CallFunction(u64 target, llvm::Value* indirect = nullptr);

+    // Emit escape sequence back to hypervisor
+    void VMEscape(llvm::CallInst* tail_call = nullptr, bool skip_flush = false);
+
     // Emit state check mid-block
     void TestAborted();