From 239f53568c572aea517e1e9310ddb2da2e17e360 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Thu, 3 Oct 2019 19:57:32 +0300
Subject: [PATCH] SPU: optimize function dispatch in trampolines

Add a top-level hashtable
---
 rpcs3/Emu/Cell/SPURecompiler.cpp | 119 ++++++++++++++++++++-----------
 rpcs3/Emu/Cell/SPURecompiler.h   |   7 +-
 2 files changed, 83 insertions(+), 43 deletions(-)

diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 1c92e1b888..aac42d8ebc 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -91,11 +91,60 @@ DECLARE(spu_runtime::tr_interpreter) = []
 
 DECLARE(spu_runtime::g_dispatcher) = []
 {
-	const auto ptr = reinterpret_cast<atomic_t<spu_function_t>*>(jit_runtime::alloc(sizeof(spu_function_t), 8, false));
-	ptr->raw() = tr_dispatch;
+	// Allocate 2^20 positions in data area
+	const auto ptr = reinterpret_cast<std::array<atomic_t<spu_function_t>, (1 << 20)>*>(jit_runtime::alloc(sizeof(*g_dispatcher), 64, false));
+
+	for (auto& x : *ptr)
+	{
+		x.raw() = tr_dispatch;
+	}
+
 	return ptr;
 }();
 
+DECLARE(spu_runtime::tr_all) = []
+{
+	u8* const trptr = jit_runtime::alloc(32, 16);
+	u8* raw = trptr;
+
+	// Load PC: mov eax, [r13 + spu_thread::pc]
+	*raw++ = 0x41;
+	*raw++ = 0x8b;
+	*raw++ = 0x45;
+	*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
+
+	// Get LS address starting from PC: lea rcx, [rbp + rax]
+	*raw++ = 0x48;
+	*raw++ = 0x8d;
+	*raw++ = 0x4c;
+	*raw++ = 0x05;
+	*raw++ = 0x00;
+
+	// mov eax, [rcx]
+	*raw++ = 0x8b;
+	*raw++ = 0x01;
+
+	// shr eax, (32 - 20)
+	*raw++ = 0xc1;
+	*raw++ = 0xe8;
+	*raw++ = 0x0c;
+
+	// Load g_dispatcher to rdx
+	*raw++ = 0x48;
+	*raw++ = 0x8d;
+	*raw++ = 0x15;
+	const s32 r32 = ::narrow<s32>(reinterpret_cast<u64>(g_dispatcher) - reinterpret_cast<u64>(raw) - 4, HERE);
+	std::memcpy(raw, &r32, 4);
+	raw += 4;
+
+	// jmp [rdx + rax * 8]
+	*raw++ = 0xff;
+	*raw++ = 0x24;
+	*raw++ = 0xc2;
+
+	return reinterpret_cast<spu_function_t>(trptr);
+}();
+
 DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
 {
 	// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
@@ -131,9 +180,8 @@ DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
 	c.push(x86::rax);
 #endif
 
-	// Load g_dispatcher pointer to call g_dispatcher[0]
-	c.mov(x86::rax, asmjit::imm_ptr(spu_runtime::g_dispatcher));
-	c.mov(x86::rax, x86::qword_ptr(x86::rax));
+	// Load tr_all function pointer to call actual compiled function
+	c.mov(x86::rax, asmjit::imm_ptr(spu_runtime::tr_all));
 
 	// Save native stack pointer for longjmp emulation
 	c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp);
@@ -300,7 +348,10 @@ void spu_cache::initialize()
 
 	if (g_cfg.core.spu_decoder == spu_decoder_type::precise || g_cfg.core.spu_decoder == spu_decoder_type::fast)
 	{
-		*spu_runtime::g_dispatcher = spu_runtime::tr_interpreter;
+		for (auto& x : *spu_runtime::g_dispatcher)
+		{
+			x.raw() = spu_runtime::tr_interpreter;
+		}
 	}
 
 	const std::string ppu_cache = Emu.PPUCache();
@@ -463,9 +514,6 @@ void spu_cache::initialize()
 
 	if (compilers.size() && !func_list.empty())
 	{
-		LOG_NOTICE(SPU, "SPU Runtime: Building trampoline...");
-		spu_runtime::g_dispatcher[0] = compilers[0]->get_runtime().rebuild_ubertrampoline();
-
 		LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
 	}
 
@@ -568,12 +616,12 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compiled)
 	// Register function in PIC map
 	m_pic_map[{func.data() + _off, func.size() - _off}] = compiled;
 
-	if (g_fxo->get())
+	if (func.size() > 1)
 	{
 		// Rebuild trampolines if necessary
-		if (const auto new_tr = rebuild_ubertrampoline())
+		if (const auto new_tr = rebuild_ubertrampoline(func[1]))
 		{
-			g_dispatcher[0] = new_tr;
+			g_dispatcher->at(func[1] >> 12) = new_tr;
 		}
 		else
 		{
@@ -586,11 +634,17 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compiled)
 	return true;
 }
 
-spu_function_t spu_runtime::rebuild_ubertrampoline()
+spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 {
 	// Prepare sorted list
 	m_flat_list.clear();
-	m_flat_list.assign(m_pic_map.cbegin(), m_pic_map.cend());
+
+	{
+		// Select required subrange (fixed 20 bits for single pos in g_dispatcher table)
+		const u32 id_lower = id_inst & ~0xfff;
+		const u32 id_upper = id_inst | 0xfff;
+
+		m_flat_list.assign(m_pic_map.lower_bound({&id_lower, 1}), m_pic_map.upper_bound({&id_upper, 1}));
+	}
 
 	struct work
 	{
@@ -661,18 +715,7 @@ spu_function_t spu_runtime::rebuild_ubertrampoline()
 	workload.back().beg = beg;
 	workload.back().end = _end;
 
-	// Load PC: mov eax, [r13 + spu_thread::pc]
-	*raw++ = 0x41;
-	*raw++ = 0x8b;
-	*raw++ = 0x45;
-	*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
-
-	// Get LS address starting from PC: lea rcx, [rbp + rax]
-	*raw++ = 0x48;
-	*raw++ = 0x8d;
-	*raw++ = 0x4c;
-	*raw++ = 0x05;
-	*raw++ = 0x00;
+	// LS address starting from PC is already loaded into rcx (see spu_runtime::tr_all)
 
 	for (std::size_t i = 0; i < workload.size(); i++)
 	{
@@ -1098,7 +1141,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 	// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
 	if (rip)
 	{
-		const s64 rel = reinterpret_cast<u64>(spu_runtime::g_dispatcher) - reinterpret_cast<u64>(rip - 8) - 6;
+		const s64 rel = reinterpret_cast<u64>(spu_runtime::tr_all) - reinterpret_cast<u64>(rip - 8) - 5;
 
 		union
 		{
@@ -1106,9 +1149,9 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 			u64 result;
 		};
 
-		bytes[0] = 0xff; // jmp [rip + 0x...]
-		bytes[1] = 0x25;
-		std::memcpy(bytes + 2, &rel, 4);
+		bytes[0] = 0xe9; // jmp rel32
+		std::memcpy(bytes + 1, &rel, 4);
+		bytes[5] = 0x90;
 		bytes[6] = 0x90;
 		bytes[7] = 0x90;
 
@@ -1116,7 +1159,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 	}
 
 	// Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
-	if (spu.block_counter != spu.block_recover && &dispatch != spu_runtime::g_dispatcher[0])
+	if (spu.block_counter != spu.block_recover && &dispatch != spu_runtime::g_dispatcher->at(spu._ref<nse_t<u32>>(spu.pc) >> 12))
 	{
 		spu.block_recover = spu.block_counter;
 		return;
@@ -4388,13 +4431,8 @@ public:
 		const auto entry_call = m_ir->CreateCall(entry_chunk->chunk, {m_thread, m_lsptr, m_base_pc});
 		entry_call->setCallingConv(entry_chunk->chunk->getCallingConv());
 
-#ifdef _WIN32
-		// TODO: fix this mess
-		const auto dispatcher = m_ir->CreateIntToPtr(m_ir->getInt64((u64)+spu_runtime::g_dispatcher), get_type());
-#else
-		const auto dispatcher = new llvm::GlobalVariable(*m_module, get_type(), true, GlobalValue::ExternalLinkage, nullptr, "spu_dispatcher");
-		m_engine->addGlobalMapping("spu_dispatcher", (u64)+spu_runtime::g_dispatcher);
-#endif
+		const auto dispatcher = llvm::cast<llvm::Function>(m_module->getOrInsertFunction("spu_dispatcher", main_func->getType()).getCallee());
+		m_engine->addGlobalMapping("spu_dispatcher", reinterpret_cast<u64>(spu_runtime::tr_all));
 		// Proceed to the next code
 		if (entry_chunk->chunk->getReturnType() != get_type())
 		{
@@ -4436,15 +4474,14 @@ public:
 
 		if (entry_chunk->chunk->getReturnType() == get_type())
 		{
-			const auto next_func = m_ir->CreateLoad(dispatcher);
-			const auto next_call = m_ir->CreateCall(m_ir->CreateBitCast(next_func, main_func->getType()), {m_thread, m_lsptr, m_ir->getInt64(0)});
+			const auto next_call = m_ir->CreateCall(m_ir->CreateBitCast(dispatcher, main_func->getType()), {m_thread, m_lsptr, m_ir->getInt64(0)});
 			next_call->setCallingConv(main_func->getCallingConv());
 			next_call->setTailCall();
 			m_ir->CreateRetVoid();
 		}
 		else
 		{
-			m_ir->CreateRet(m_ir->CreateLoad(dispatcher));
+			m_ir->CreateRet(m_ir->CreateBitCast(dispatcher, get_type()));
 		}
 
 		// Function that executes check_state and escapes if necessary
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index 2e329a8d1e..8b73fa9903 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -77,6 +77,9 @@ public:
 	// Trampoline to legacy interpreter
 	static const spu_function_t tr_interpreter;
 
+	// Detect and call any recompiled function
+	static const spu_function_t tr_all;
+
 public:
 	spu_runtime();
 
@@ -93,7 +96,7 @@ public:
 	bool add(u64 last_reset_count, void* where, spu_function_t compiled);
 
 private:
-	spu_function_t rebuild_ubertrampoline();
+	spu_function_t rebuild_ubertrampoline(u32 id_inst);
 
 	friend class spu_cache;
 public:
@@ -120,7 +123,7 @@ public:
 	void handle_return(spu_thread* _spu);
 
 	// All dispatchers (array allocated in jit memory)
-	static atomic_t<spu_function_t>* const g_dispatcher;
+	static std::array<atomic_t<spu_function_t>, (1 << 20)>* const g_dispatcher;
 
 	// Recompiler entry point
 	static const spu_function_t g_gateway;
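
Note (editorial, not part of the patch): the sketch below is a plain-C++ model of the lookup that tr_all performs in hand-written machine code above, and of what spu_runtime::add now does per table slot. The names handler_t, g_table, select_target, slow_dispatch, bucket_trampoline and the instruction word value are hypothetical; the real table is spu_runtime::g_dispatcher, a 2^20-entry array of atomic spu_function_t pointers indexed by the upper 20 bits of the first instruction word found at the current SPU PC in local storage.

    #include <array>
    #include <atomic>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    using handler_t = void (*)();

    void slow_dispatch()     { std::puts("not compiled yet: generic dispatch/compile path"); }
    void bucket_trampoline() { std::puts("enter the trampoline built for this 20-bit bucket"); }

    // Top-level table: one slot per possible upper-20-bit prefix of the first instruction word.
    std::array<std::atomic<handler_t>, 1u << 20> g_table;

    handler_t select_target(const std::uint8_t* ls, std::uint32_t pc)
    {
        // Same effect as tr_all's "lea rcx, [rbp + rax]; mov eax, [rcx]": raw 32-bit load at LS + PC.
        std::uint32_t first_inst;
        std::memcpy(&first_inst, ls + pc, sizeof(first_inst));

        // "shr eax, 12; jmp [rdx + rax * 8]": index the table with the top 20 bits and jump there.
        return g_table[first_inst >> 12].load();
    }

    int main()
    {
        // Fill every slot with the fallback, like g_dispatcher being filled with tr_dispatch.
        for (auto& slot : g_table)
            slot.store(slow_dispatch, std::memory_order_relaxed);

        std::uint8_t ls[64]{};
        const std::uint32_t inst = 0x12345678u; // made-up first instruction word at PC = 16
        std::memcpy(ls + 16, &inst, sizeof(inst));

        // spu_runtime::add now updates only the slot for this bucket (func[1] >> 12).
        g_table[inst >> 12] = bucket_trampoline;

        select_target(ls, 16)(); // resolves to bucket_trampoline
    }

In the real patch a slot holds tr_dispatch, tr_interpreter or an ubertrampoline rebuilt only from the m_pic_map subrange sharing the same 20-bit prefix, so each rebuild covers far fewer functions than the previous single global trampoline in g_dispatcher[0].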