mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-10 17:01:24 +12:00
SPU: optimize function dispatch in trampolines
Add a top-level hashtable
This commit is contained in:
parent
8031180373
commit
239f53568c
2 changed files with 83 additions and 43 deletions
|
@ -91,11 +91,60 @@ DECLARE(spu_runtime::tr_interpreter) = []
|
||||||
|
|
||||||
DECLARE(spu_runtime::g_dispatcher) = []
|
DECLARE(spu_runtime::g_dispatcher) = []
|
||||||
{
|
{
|
||||||
const auto ptr = reinterpret_cast<decltype(spu_runtime::g_dispatcher)>(jit_runtime::alloc(sizeof(spu_function_t), 8, false));
|
// Allocate 2^20 positions in data area
|
||||||
ptr->raw() = tr_dispatch;
|
const auto ptr = reinterpret_cast<decltype(g_dispatcher)>(jit_runtime::alloc(sizeof(*g_dispatcher), 64, false));
|
||||||
|
|
||||||
|
for (auto& x : *ptr)
|
||||||
|
{
|
||||||
|
x.raw() = tr_dispatch;
|
||||||
|
}
|
||||||
|
|
||||||
return ptr;
|
return ptr;
|
||||||
}();
|
}();
|
||||||
|
|
||||||
|
DECLARE(spu_runtime::tr_all) = []
|
||||||
|
{
|
||||||
|
u8* const trptr = jit_runtime::alloc(32, 16);
|
||||||
|
u8* raw = trptr;
|
||||||
|
|
||||||
|
// Load PC: mov eax, [r13 + spu_thread::pc]
|
||||||
|
*raw++ = 0x41;
|
||||||
|
*raw++ = 0x8b;
|
||||||
|
*raw++ = 0x45;
|
||||||
|
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
|
||||||
|
|
||||||
|
// Get LS address starting from PC: lea rcx, [rbp + rax]
|
||||||
|
*raw++ = 0x48;
|
||||||
|
*raw++ = 0x8d;
|
||||||
|
*raw++ = 0x4c;
|
||||||
|
*raw++ = 0x05;
|
||||||
|
*raw++ = 0x00;
|
||||||
|
|
||||||
|
// mov eax, [rcx]
|
||||||
|
*raw++ = 0x8b;
|
||||||
|
*raw++ = 0x01;
|
||||||
|
|
||||||
|
// shr eax, (32 - 20)
|
||||||
|
*raw++ = 0xc1;
|
||||||
|
*raw++ = 0xe8;
|
||||||
|
*raw++ = 0x0c;
|
||||||
|
|
||||||
|
// Load g_dispatcher to rdx
|
||||||
|
*raw++ = 0x48;
|
||||||
|
*raw++ = 0x8d;
|
||||||
|
*raw++ = 0x15;
|
||||||
|
const s32 r32 = ::narrow<s32>(reinterpret_cast<u64>(g_dispatcher) - reinterpret_cast<u64>(raw) - 4, HERE);
|
||||||
|
std::memcpy(raw, &r32, 4);
|
||||||
|
raw += 4;
|
||||||
|
|
||||||
|
// jmp [rdx + rax * 8]
|
||||||
|
*raw++ = 0xff;
|
||||||
|
*raw++ = 0x24;
|
||||||
|
*raw++ = 0xc2;
|
||||||
|
|
||||||
|
return reinterpret_cast<spu_function_t>(trptr);
|
||||||
|
}();
|
||||||
|
|
||||||
DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
|
DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
|
||||||
{
|
{
|
||||||
// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
|
// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
|
||||||
|
@ -131,9 +180,8 @@ DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>([](asmjit::
|
||||||
c.push(x86::rax);
|
c.push(x86::rax);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Load g_dispatcher pointer to call g_dispatcher[0]
|
// Load tr_all function pointer to call actual compiled function
|
||||||
c.mov(x86::rax, asmjit::imm_ptr(spu_runtime::g_dispatcher));
|
c.mov(x86::rax, asmjit::imm_ptr(spu_runtime::tr_all));
|
||||||
c.mov(x86::rax, x86::qword_ptr(x86::rax));
|
|
||||||
|
|
||||||
// Save native stack pointer for longjmp emulation
|
// Save native stack pointer for longjmp emulation
|
||||||
c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp);
|
c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp);
|
||||||
|
@ -300,7 +348,10 @@ void spu_cache::initialize()
|
||||||
|
|
||||||
if (g_cfg.core.spu_decoder == spu_decoder_type::precise || g_cfg.core.spu_decoder == spu_decoder_type::fast)
|
if (g_cfg.core.spu_decoder == spu_decoder_type::precise || g_cfg.core.spu_decoder == spu_decoder_type::fast)
|
||||||
{
|
{
|
||||||
*spu_runtime::g_dispatcher = spu_runtime::tr_interpreter;
|
for (auto& x : *spu_runtime::g_dispatcher)
|
||||||
|
{
|
||||||
|
x.raw() = spu_runtime::tr_interpreter;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string ppu_cache = Emu.PPUCache();
|
const std::string ppu_cache = Emu.PPUCache();
|
||||||
|
@ -463,9 +514,6 @@ void spu_cache::initialize()
|
||||||
|
|
||||||
if (compilers.size() && !func_list.empty())
|
if (compilers.size() && !func_list.empty())
|
||||||
{
|
{
|
||||||
LOG_NOTICE(SPU, "SPU Runtime: Building trampoline...");
|
|
||||||
spu_runtime::g_dispatcher[0] = compilers[0]->get_runtime().rebuild_ubertrampoline();
|
|
||||||
|
|
||||||
LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
|
LOG_SUCCESS(SPU, "SPU Runtime: Built %u functions.", func_list.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -568,12 +616,12 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
|
||||||
// Register function in PIC map
|
// Register function in PIC map
|
||||||
m_pic_map[{func.data() + _off, func.size() - _off}] = compiled;
|
m_pic_map[{func.data() + _off, func.size() - _off}] = compiled;
|
||||||
|
|
||||||
if (g_fxo->get<spu_cache>())
|
if (func.size() > 1)
|
||||||
{
|
{
|
||||||
// Rebuild trampolines if necessary
|
// Rebuild trampolines if necessary
|
||||||
if (const auto new_tr = rebuild_ubertrampoline())
|
if (const auto new_tr = rebuild_ubertrampoline(func[1]))
|
||||||
{
|
{
|
||||||
g_dispatcher[0] = new_tr;
|
g_dispatcher->at(func[1] >> 12) = new_tr;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -586,11 +634,17 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
spu_function_t spu_runtime::rebuild_ubertrampoline()
|
spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
|
||||||
{
|
{
|
||||||
// Prepare sorted list
|
// Prepare sorted list
|
||||||
m_flat_list.clear();
|
m_flat_list.clear();
|
||||||
m_flat_list.assign(m_pic_map.cbegin(), m_pic_map.cend());
|
{
|
||||||
|
// Select required subrange (fixed 20 bits for single pos in g_dispatcher table)
|
||||||
|
const u32 id_lower = id_inst & ~0xfff;
|
||||||
|
const u32 id_upper = id_inst | 0xfff;
|
||||||
|
|
||||||
|
m_flat_list.assign(m_pic_map.lower_bound({&id_lower, 1}), m_pic_map.upper_bound({&id_upper, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
struct work
|
struct work
|
||||||
{
|
{
|
||||||
|
@ -661,18 +715,7 @@ spu_function_t spu_runtime::rebuild_ubertrampoline()
|
||||||
workload.back().beg = beg;
|
workload.back().beg = beg;
|
||||||
workload.back().end = _end;
|
workload.back().end = _end;
|
||||||
|
|
||||||
// Load PC: mov eax, [r13 + spu_thread::pc]
|
// LS address starting from PC is already loaded into rcx (see spu_runtime::tr_all)
|
||||||
*raw++ = 0x41;
|
|
||||||
*raw++ = 0x8b;
|
|
||||||
*raw++ = 0x45;
|
|
||||||
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
|
|
||||||
|
|
||||||
// Get LS address starting from PC: lea rcx, [rbp + rax]
|
|
||||||
*raw++ = 0x48;
|
|
||||||
*raw++ = 0x8d;
|
|
||||||
*raw++ = 0x4c;
|
|
||||||
*raw++ = 0x05;
|
|
||||||
*raw++ = 0x00;
|
|
||||||
|
|
||||||
for (std::size_t i = 0; i < workload.size(); i++)
|
for (std::size_t i = 0; i < workload.size(); i++)
|
||||||
{
|
{
|
||||||
|
@ -1098,7 +1141,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
|
||||||
// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
|
// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
|
||||||
if (rip)
|
if (rip)
|
||||||
{
|
{
|
||||||
const s64 rel = reinterpret_cast<u64>(spu_runtime::g_dispatcher) - reinterpret_cast<u64>(rip - 8) - 6;
|
const s64 rel = reinterpret_cast<u64>(spu_runtime::tr_all) - reinterpret_cast<u64>(rip - 8) - 5;
|
||||||
|
|
||||||
union
|
union
|
||||||
{
|
{
|
||||||
|
@ -1106,9 +1149,9 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
|
||||||
u64 result;
|
u64 result;
|
||||||
};
|
};
|
||||||
|
|
||||||
bytes[0] = 0xff; // jmp [rip + 0x...]
|
bytes[0] = 0xe9; // jmp rel32
|
||||||
bytes[1] = 0x25;
|
std::memcpy(bytes + 1, &rel, 4);
|
||||||
std::memcpy(bytes + 2, &rel, 4);
|
bytes[5] = 0x90;
|
||||||
bytes[6] = 0x90;
|
bytes[6] = 0x90;
|
||||||
bytes[7] = 0x90;
|
bytes[7] = 0x90;
|
||||||
|
|
||||||
|
@ -1116,7 +1159,7 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
|
// Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
|
||||||
if (spu.block_counter != spu.block_recover && &dispatch != spu_runtime::g_dispatcher[0])
|
if (spu.block_counter != spu.block_recover && &dispatch != spu_runtime::g_dispatcher->at(spu._ref<nse_t<u32>>(spu.pc) >> 12))
|
||||||
{
|
{
|
||||||
spu.block_recover = spu.block_counter;
|
spu.block_recover = spu.block_counter;
|
||||||
return;
|
return;
|
||||||
|
@ -4388,13 +4431,8 @@ public:
|
||||||
const auto entry_call = m_ir->CreateCall(entry_chunk->chunk, {m_thread, m_lsptr, m_base_pc});
|
const auto entry_call = m_ir->CreateCall(entry_chunk->chunk, {m_thread, m_lsptr, m_base_pc});
|
||||||
entry_call->setCallingConv(entry_chunk->chunk->getCallingConv());
|
entry_call->setCallingConv(entry_chunk->chunk->getCallingConv());
|
||||||
|
|
||||||
#ifdef _WIN32
|
const auto dispatcher = llvm::cast<llvm::Function>(m_module->getOrInsertFunction("spu_dispatcher", main_func->getType()).getCallee());
|
||||||
// TODO: fix this mess
|
m_engine->addGlobalMapping("spu_dispatcher", reinterpret_cast<u64>(spu_runtime::tr_all));
|
||||||
const auto dispatcher = m_ir->CreateIntToPtr(m_ir->getInt64((u64)+spu_runtime::g_dispatcher), get_type<u8**>());
|
|
||||||
#else
|
|
||||||
const auto dispatcher = new llvm::GlobalVariable(*m_module, get_type<u8*>(), true, GlobalValue::ExternalLinkage, nullptr, "spu_dispatcher");
|
|
||||||
m_engine->addGlobalMapping("spu_dispatcher", (u64)+spu_runtime::g_dispatcher);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Proceed to the next code
|
// Proceed to the next code
|
||||||
if (entry_chunk->chunk->getReturnType() != get_type<void>())
|
if (entry_chunk->chunk->getReturnType() != get_type<void>())
|
||||||
|
@ -4436,15 +4474,14 @@ public:
|
||||||
|
|
||||||
if (entry_chunk->chunk->getReturnType() == get_type<void>())
|
if (entry_chunk->chunk->getReturnType() == get_type<void>())
|
||||||
{
|
{
|
||||||
const auto next_func = m_ir->CreateLoad(dispatcher);
|
const auto next_call = m_ir->CreateCall(m_ir->CreateBitCast(dispatcher, main_func->getType()), {m_thread, m_lsptr, m_ir->getInt64(0)});
|
||||||
const auto next_call = m_ir->CreateCall(m_ir->CreateBitCast(next_func, main_func->getType()), {m_thread, m_lsptr, m_ir->getInt64(0)});
|
|
||||||
next_call->setCallingConv(main_func->getCallingConv());
|
next_call->setCallingConv(main_func->getCallingConv());
|
||||||
next_call->setTailCall();
|
next_call->setTailCall();
|
||||||
m_ir->CreateRetVoid();
|
m_ir->CreateRetVoid();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
m_ir->CreateRet(m_ir->CreateLoad(dispatcher));
|
m_ir->CreateRet(m_ir->CreateBitCast(dispatcher, get_type<u8*>()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function that executes check_state and escapes if necessary
|
// Function that executes check_state and escapes if necessary
|
||||||
|
|
|
@ -77,6 +77,9 @@ public:
|
||||||
// Trampoline to legacy interpreter
|
// Trampoline to legacy interpreter
|
||||||
static const spu_function_t tr_interpreter;
|
static const spu_function_t tr_interpreter;
|
||||||
|
|
||||||
|
// Detect and call any recompiled function
|
||||||
|
static const spu_function_t tr_all;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
spu_runtime();
|
spu_runtime();
|
||||||
|
|
||||||
|
@ -93,7 +96,7 @@ public:
|
||||||
bool add(u64 last_reset_count, void* where, spu_function_t compiled);
|
bool add(u64 last_reset_count, void* where, spu_function_t compiled);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
spu_function_t rebuild_ubertrampoline();
|
spu_function_t rebuild_ubertrampoline(u32 id_inst);
|
||||||
|
|
||||||
friend class spu_cache;
|
friend class spu_cache;
|
||||||
public:
|
public:
|
||||||
|
@ -120,7 +123,7 @@ public:
|
||||||
void handle_return(spu_thread* _spu);
|
void handle_return(spu_thread* _spu);
|
||||||
|
|
||||||
// All dispatchers (array allocated in jit memory)
|
// All dispatchers (array allocated in jit memory)
|
||||||
static atomic_t<spu_function_t>* const g_dispatcher;
|
static std::array<atomic_t<spu_function_t>, (1 << 20)>* const g_dispatcher;
|
||||||
|
|
||||||
// Recompiler entry point
|
// Recompiler entry point
|
||||||
static const spu_function_t g_gateway;
|
static const spu_function_t g_gateway;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue