SPU recompiler: route fixed branches through per-target patchpoints.

Each fixed branch now jumps to a 16-byte stub built by
spu_runtime::make_branch_patchpoint(target). The stub holds a lea that passes
the address of its own jmp in the third-argument register (r8 on Windows,
rdx elsewhere), a jmp rel32 to a trampoline for spu_recompiler_base::branch,
and the branch target stored compressed (target >> 2) in its last two bytes.
branch() and dispatch() read the target back from those two bytes.

diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
index 59b9c69177..e4f64f2074 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@@ -947,35 +947,11 @@ void spu_recompiler::branch_fixed(u32 target)
 		return;
 	}
 
-	c->mov(x86::rax, imm_ptr(spu_runtime::g_dispatcher + target / 4));
-	c->mov(x86::rax, x86::qword_ptr(x86::rax));
-
 	c->mov(SPU_OFF_32(pc), target);
+	c->xor_(qw0->r32(), qw0->r32());
 	c->cmp(SPU_OFF_32(state), 0);
 	c->jnz(label_stop);
-
-	if (false)
-	{
-		// Don't generate patch points (TODO)
-		c->xor_(qw0->r32(), qw0->r32());
-		c->jmp(x86::rax);
-		return;
-	}
-
-	// Set patch address as a third argument and fallback to it
-	Label patch_point = c->newLabel();
-	c->lea(*qw0, x86::qword_ptr(patch_point));
-
-	// Need to emit exactly one executable instruction within 8 bytes
-	c->align(kAlignCode, 8);
-	c->bind(patch_point);
-	//c->dq(0x841f0f);
-	c->jmp(imm_ptr(&spu_recompiler_base::branch));
-
-	// Fallback to the branch via dispatcher
-	c->align(kAlignCode, 8);
-	c->xor_(qw0->r32(), qw0->r32());
-	c->jmp(x86::rax);
+	c->jmp(imm_ptr(m_spurt->make_branch_patchpoint(target)));
 }
 
 void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index c8b9e226cb..91a04f21e9 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -259,6 +259,15 @@ spu_runtime::spu_runtime()
 
 	workload.reserve(250);
 
+	// Generate a trampoline to spu_recompiler_base::branch
+	u8* const trptr = jit_runtime::alloc(16, 16);
+	trptr[0] = 0xff; // jmp [rip]
+	trptr[1] = 0x25;
+	std::memset(trptr + 2, 0, 4);
+	const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::branch);
+	std::memcpy(trptr + 6, &target, 8);
+	tr_branch = reinterpret_cast<spu_function_t>(trptr);
+
 	LOG_SUCCESS(SPU, "SPU Recompiler Runtime initialized...");
 }
 
@@ -539,6 +548,40 @@ void spu_runtime::add(std::pair<const std::vector<u32>, spu_function_t>& where, 
 	m_cond.notify_all();
 }
 
+spu_function_t spu_runtime::make_branch_patchpoint(u32 target) const
+{
+	u8* const raw = jit_runtime::alloc(16, 16);
+
+	// Save address of the following jmp
+#ifdef _WIN32
+	raw[0] = 0x4c; // lea r8, [rip+1]
+	raw[1] = 0x8d;
+	raw[2] = 0x05;
+#else
+	raw[0] = 0x48; // lea rdx, [rip+1]
+	raw[1] = 0x8d;
+	raw[2] = 0x15;
+#endif
+	raw[3] = 0x01;
+	raw[4] = 0x00;
+	raw[5] = 0x00;
+	raw[6] = 0x00;
+	raw[7] = 0x90; // nop
+
+	// Jump to spu_recompiler_base::branch
+	raw[8] = 0xe9;
+	// Compute the distance
+	const s64 rel = reinterpret_cast<u64>(tr_branch) - reinterpret_cast<u64>(raw + 8) - 5;
+	std::memcpy(raw + 9, &rel, 4);
+	raw[13] = 0xcc;
+
+	// Write compressed target address
+	raw[14] = target >> 2;
+	raw[15] = target >> 10;
+
+	return reinterpret_cast<spu_function_t>(raw);
+}
+
 spu_recompiler_base::spu_recompiler_base()
 {
 }
@@ -549,10 +592,25 @@ spu_recompiler_base::~spu_recompiler_base()
 
 void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 {
-	// If code verification failed from a patched patchpoint, clear it with a single NOP
+	// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
 	if (rip)
 	{
-		atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), 0x841f0f);
+		const u32 target = *(u16*)(rip + 6) * 4;
+		const s64 rel = reinterpret_cast<u64>(spu_runtime::g_dispatcher) + 2 * target - reinterpret_cast<u64>(rip - 8) - 6;
+
+		union
+		{
+			u8 bytes[8];
+			u64 result;
+		};
+
+		bytes[0] = 0xff; // jmp [rip + 0x...]
+		bytes[1] = 0x25;
+		std::memcpy(bytes + 2, &rel, 4);
+		bytes[6] = 0x90;
+		bytes[7] = 0x90;
+
+		atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip - 8), result);
 	}
 
 	// Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
@@ -580,7 +638,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
 {
 	// Compile (TODO: optimize search of the existing functions)
-	const auto func = verify(HERE, spu.jit->compile(spu.jit->block(spu._ptr<u32>(0), spu.pc)));
+	const auto func = verify(HERE, spu.jit->compile(spu.jit->block(spu._ptr<u32>(0), *(u16*)(rip + 6) * 4)));
 
 	// Overwrite jump to this function with jump to the compiled function
 	const s64 rel = reinterpret_cast<u64>(func) - reinterpret_cast<u64>(rip) - 5;
@@ -599,23 +657,22 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
 		{
 			bytes[0] = 0xeb; // jmp rel8
 			bytes[1] = static_cast<s8>(rel8);
-			std::memset(bytes + 2, 0x90, 6);
+			std::memset(bytes + 2, 0xcc, 4);
 		}
 		else
 		{
 			bytes[0] = 0xe9; // jmp rel32
 			std::memcpy(bytes + 1, &rel, 4);
-			std::memset(bytes + 5, 0x90, 3);
+			bytes[5] = 0xcc;
 		}
+
+		// Preserve target address
+		bytes[6] = rip[6];
+		bytes[7] = rip[7];
 	}
 	else
 	{
-		// Far jumps: extremely rare and disabled due to implementation complexity
-		LOG_ERROR(SPU, "Impossible far jump");
-		bytes[0] = 0x0f; // nop (8-byte form)
-		bytes[1] = 0x1f;
-		bytes[2] = 0x84;
-		std::memset(bytes + 3, 0x00, 5);
+		fmt::throw_exception("Impossible far jump: %p -> %p", rip, func);
 	}
 
 	atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), result);
@@ -1985,13 +2042,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 				LOG_ERROR(SPU, "[0x%x] Predecessor not found for target 0x%x (chunk=0x%x, entry=0x%x, size=%u)", m_pos, target, m_entry, m_function_queue[0], m_size / 4);
 			}
 
-			// Generate external indirect tail call
+			// Generate a patchpoint for fixed location
 			const auto cblock = m_ir->GetInsertBlock();
 			const auto result = llvm::BasicBlock::Create(m_context, "", m_function);
 			m_ir->SetInsertPoint(result);
 			m_ir->CreateStore(m_ir->getInt32(target), spu_ptr<u32>(&spu_thread::pc));
-			const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u8*>(), get_type<u8*>(), get_type<u8*>()}, false)->getPointerTo()->getPointerTo();
-			tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->getInt64((u64)(spu_runtime::g_dispatcher + target / 4)), type)));
+			const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u8*>(), get_type<u8*>(), get_type<u8*>()}, false)->getPointerTo();
+			tail(m_ir->CreateIntToPtr(m_ir->getInt64((u64)m_spurt->make_branch_patchpoint(target)), type));
 			m_ir->SetInsertPoint(cblock);
 			return result;
 		}
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index 97f5d15636..9f9f233d5d 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -67,12 +67,18 @@ private:
 	// Trampoline to spu_recompiler_base::dispatch
 	spu_function_t tr_dispatch = nullptr;
 
+	// Trampoline to spu_recompiler_base::branch
+	spu_function_t tr_branch = nullptr;
+
 public:
 	spu_runtime();
 
 	// Add compiled function and generate trampoline if necessary
 	void add(std::pair<const std::vector<u32>, spu_function_t>& where, spu_function_t compiled);
 
+	// Generate a patchable trampoline to spu_recompiler_base::branch
+	spu_function_t make_branch_patchpoint(u32 target) const;
+
 	// All dispatchers (array allocated in jit memory)
 	static atomic_t<spu_function_t>* const g_dispatcher;
 };