Prefetch byteswapped opcodes in ppu interpreter

This commit is contained in:
eladash 2019-03-22 09:58:04 +02:00 committed by Ivan
parent 1c462abc37
commit b307aff9eb
3 changed files with 52 additions and 61 deletions

View file

@ -169,13 +169,13 @@ static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_pa
extern void ppu_execute_syscall(ppu_thread& ppu, u64 code); extern void ppu_execute_syscall(ppu_thread& ppu, u64 code);
// Get pointer to executable cache // Get pointer to executable cache
static u32& ppu_ref(u32 addr) static u64& ppu_ref(u32 addr)
{ {
return *reinterpret_cast<u32*>(vm::g_exec_addr + addr); return *reinterpret_cast<u64*>(vm::g_exec_addr + (u64)addr * 2);
} }
// Get interpreter cache value // Get interpreter cache value
static u32 ppu_cache(u32 addr) static u64 ppu_cache(u32 addr)
{ {
// Select opcode table // Select opcode table
const auto& table = *( const auto& table = *(
@ -183,7 +183,8 @@ static u32 ppu_cache(u32 addr)
g_cfg.core.ppu_decoder == ppu_decoder_type::fast ? &g_ppu_interpreter_fast.get_table() : g_cfg.core.ppu_decoder == ppu_decoder_type::fast ? &g_ppu_interpreter_fast.get_table() :
(fmt::throw_exception<std::logic_error>("Invalid PPU decoder"), nullptr)); (fmt::throw_exception<std::logic_error>("Invalid PPU decoder"), nullptr));
return ::narrow<u32>(reinterpret_cast<std::uintptr_t>(table[ppu_decode(vm::read32(addr))])); const u32 value = vm::read32(addr);
return (u64)value << 32 | ::narrow<u32>(reinterpret_cast<std::uintptr_t>(table[ppu_decode(value)]));
} }
static bool ppu_fallback(ppu_thread& ppu, ppu_opcode_t op) static bool ppu_fallback(ppu_thread& ppu, ppu_opcode_t op)
@ -207,20 +208,19 @@ void ppu_recompiler_fallback(ppu_thread& ppu)
} }
const auto& table = g_ppu_interpreter_fast.get_table(); const auto& table = g_ppu_interpreter_fast.get_table();
const auto base = vm::g_base_addr;
const auto cache = vm::g_exec_addr; const auto cache = vm::g_exec_addr;
while (true) while (true)
{ {
// Run instructions in interpreter // Run instructions in interpreter
if (const u32 op = *reinterpret_cast<be_t<u32>*>(base + ppu.cia); if (const u32 op = *reinterpret_cast<u32*>(cache + (u64)ppu.cia * 2 + 4);
LIKELY(table[ppu_decode(op)](ppu, { op }))) LIKELY(table[ppu_decode(op)](ppu, { op })))
{ {
ppu.cia += 4; ppu.cia += 4;
continue; continue;
} }
if (uptr func = *reinterpret_cast<u32*>(cache + ppu.cia); if (uptr func = *reinterpret_cast<u32*>(cache + (u64)ppu.cia * 2);
func != reinterpret_cast<uptr>(ppu_recompiler_fallback)) func != reinterpret_cast<uptr>(ppu_recompiler_fallback))
{ {
// We found a recompiler function at cia, return // We found a recompiler function at cia, return
@ -269,7 +269,7 @@ extern void ppu_register_range(u32 addr, u32 size)
} }
// Register executable range at // Register executable range at
utils::memory_commit(&ppu_ref(addr), size, utils::protection::rw); utils::memory_commit(&ppu_ref(addr), size * 2, utils::protection::rw);
const u32 fallback = ::narrow<u32>(g_cfg.core.ppu_decoder == ppu_decoder_type::llvm ? const u32 fallback = ::narrow<u32>(g_cfg.core.ppu_decoder == ppu_decoder_type::llvm ?
reinterpret_cast<uptr>(ppu_recompiler_fallback) : reinterpret_cast<uptr>(ppu_fallback)); reinterpret_cast<uptr>(ppu_recompiler_fallback) : reinterpret_cast<uptr>(ppu_fallback));
@ -277,7 +277,7 @@ extern void ppu_register_range(u32 addr, u32 size)
size &= ~3; // Loop assumes `size = n * 4`, enforce that by rounding down size &= ~3; // Loop assumes `size = n * 4`, enforce that by rounding down
while (size) while (size)
{ {
ppu_ref(addr) = fallback; ppu_ref(addr) = (u64)vm::read32(addr) << 32 | fallback;
addr += 4; addr += 4;
size -= 4; size -= 4;
} }
@ -288,7 +288,7 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr)
// Initialize specific function // Initialize specific function
if (ptr) if (ptr)
{ {
ppu_ref(addr) = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(ptr)); *reinterpret_cast<u32*>(&ppu_ref(addr)) = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(ptr));
return; return;
} }
@ -312,7 +312,7 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr)
while (size) while (size)
{ {
if (ppu_ref(addr) == fallback) if ((u32)ppu_ref(addr) == fallback)
{ {
ppu_ref(addr) = ppu_cache(addr); ppu_ref(addr) = ppu_cache(addr);
} }
@ -357,7 +357,7 @@ extern void ppu_breakpoint(u32 addr, bool isAdding)
if (isAdding) if (isAdding)
{ {
// Set breakpoint // Set breakpoint
ppu_ref(addr) = _break; *reinterpret_cast<u32*>(&ppu_ref(addr)) = _break;
} }
else else
{ {
@ -376,9 +376,9 @@ extern void ppu_set_breakpoint(u32 addr)
const auto _break = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_break)); const auto _break = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_break));
if (ppu_ref(addr) != _break) if ((u32)ppu_ref(addr) != _break)
{ {
ppu_ref(addr) = _break; *reinterpret_cast<u32*>(&ppu_ref(addr)) = _break;
} }
} }
@ -392,7 +392,7 @@ extern void ppu_remove_breakpoint(u32 addr)
const auto _break = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_break)); const auto _break = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_break));
if (ppu_ref(addr) == _break) if ((u32)ppu_ref(addr) == _break)
{ {
ppu_ref(addr) = ppu_cache(addr); ppu_ref(addr) = ppu_cache(addr);
} }
@ -420,7 +420,7 @@ extern bool ppu_patch(u32 addr, u32 value)
const u32 _break = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_break)); const u32 _break = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_break));
const u32 fallback = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_fallback)); const u32 fallback = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_fallback));
if (ppu_ref(addr) != _break && ppu_ref(addr) != fallback) if ((u32)ppu_ref(addr) != _break && (u32)ppu_ref(addr) != fallback)
{ {
ppu_ref(addr) = ppu_cache(addr); ppu_ref(addr) = ppu_cache(addr);
} }
@ -622,81 +622,72 @@ void ppu_thread::exec_task()
{ {
while (!(state & (cpu_flag::ret + cpu_flag::exit + cpu_flag::stop + cpu_flag::dbg_global_stop))) while (!(state & (cpu_flag::ret + cpu_flag::exit + cpu_flag::stop + cpu_flag::dbg_global_stop)))
{ {
reinterpret_cast<ppu_function_t>(static_cast<std::uintptr_t>(ppu_ref(cia)))(*this); reinterpret_cast<ppu_function_t>(static_cast<std::uintptr_t>((u32)ppu_ref(cia)))(*this);
} }
return; return;
} }
const auto base = vm::_ptr<const u8>(0);
const auto cache = vm::g_exec_addr; const auto cache = vm::g_exec_addr;
const auto bswap4 = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
v128 _op;
using func_t = decltype(&ppu_interpreter::UNK); using func_t = decltype(&ppu_interpreter::UNK);
func_t func0, func1, func2, func3, func4, func5;
while (true) while (true)
{ {
if (UNLIKELY(state)) const auto exec_op = [this](u64 op)
{ {
if (check_state()) return; return reinterpret_cast<func_t>((uptr)(u32)op)(*this, {u32(op >> 32)});
};
if (cia % 8 || !s_use_ssse3 || UNLIKELY(state))
{
if (test_stopped()) return;
// Decode single instruction (may be step) // Decode single instruction (may be step)
const u32 op = *reinterpret_cast<const be_t<u32>*>(base + cia); if (exec_op(*reinterpret_cast<u64*>(cache + (u64)cia * 2))) { cia += 4; }
if (reinterpret_cast<func_t>((std::uintptr_t)ppu_ref(cia))(*this, {op})) { cia += 4; }
continue; continue;
} }
if (cia % 16 || !s_use_ssse3) u64 op0, op1, op2, op3;
{ u64 _pos = (u64)cia * 2;
// Unaligned
const u32 op = *reinterpret_cast<const be_t<u32>*>(base + cia);
if (reinterpret_cast<func_t>((std::uintptr_t)ppu_ref(cia))(*this, {op})) { cia += 4; }
continue;
}
// Reinitialize // Reinitialize
{ {
const v128 x = v128::fromV(_mm_load_si128(reinterpret_cast<const __m128i*>(cache + cia))); const v128 _op0 = *reinterpret_cast<const v128*>(cache + _pos);
func0 = reinterpret_cast<func_t>((std::uintptr_t)x._u32[0]); const v128 _op1 = *reinterpret_cast<const v128*>(cache + _pos + 16);
func1 = reinterpret_cast<func_t>((std::uintptr_t)x._u32[1]); op0 = _op0._u64[0];
func2 = reinterpret_cast<func_t>((std::uintptr_t)x._u32[2]); op1 = _op0._u64[1];
func3 = reinterpret_cast<func_t>((std::uintptr_t)x._u32[3]); op2 = _op1._u64[0];
_op.vi = _mm_shuffle_epi8(_mm_load_si128(reinterpret_cast<const __m128i*>(base + cia)), bswap4); op3 = _op1._u64[1];
} }
while (LIKELY(func0(*this, {_op._u32[0]}))) while (LIKELY(exec_op(op0)))
{ {
cia += 4; cia += 4;
if (LIKELY(func1(*this, {_op._u32[1]}))) if (LIKELY(exec_op(op1)))
{ {
cia += 4; cia += 4;
const v128 x = v128::fromV(_mm_load_si128(reinterpret_cast<const __m128i*>(cache + cia + 8))); if (LIKELY(exec_op(op2)))
func0 = reinterpret_cast<func_t>((std::uintptr_t)x._u32[0]);
func1 = reinterpret_cast<func_t>((std::uintptr_t)x._u32[1]);
func4 = reinterpret_cast<func_t>((std::uintptr_t)x._u32[2]);
func5 = reinterpret_cast<func_t>((std::uintptr_t)x._u32[3]);
if (LIKELY(func2(*this, {_op._u32[2]})))
{ {
cia += 4; cia += 4;
if (LIKELY(func3(*this, {_op._u32[3]}))) if (LIKELY(exec_op(op3)))
{ {
cia += 4; cia += 4;
func2 = func4;
func3 = func5;
if (UNLIKELY(state)) if (UNLIKELY(state))
{ {
break; break;
} }
_op.vi = _mm_shuffle_epi8(_mm_load_si128(reinterpret_cast<const __m128i*>(base + cia)), bswap4); _pos += 32;
const v128 _op0 = *reinterpret_cast<const v128*>(cache + _pos);
const v128 _op1 = *reinterpret_cast<const v128*>(cache + _pos + 16);
op0 = _op0._u64[0];
op1 = _op0._u64[1];
op2 = _op1._u64[0];
op3 = _op1._u64[1];
continue; continue;
} }
break; break;
@ -1296,7 +1287,7 @@ extern void ppu_initialize(const ppu_module& info)
if (g_cfg.core.ppu_debug && func.size && func.toc != -1) if (g_cfg.core.ppu_debug && func.size && func.toc != -1)
{ {
s_ppu_toc->emplace(func.addr, func.toc); s_ppu_toc->emplace(func.addr, func.toc);
ppu_ref(func.addr) = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_check_toc)); *reinterpret_cast<u32*>(&ppu_ref(func.addr)) = ::narrow<u32>(reinterpret_cast<std::uintptr_t>(&ppu_check_toc));
} }
} }
@ -1553,7 +1544,7 @@ extern void ppu_initialize(const ppu_module& info)
#endif #endif
// Write version, hash, CPU, settings // Write version, hash, CPU, settings
fmt::append(obj_name, "v2-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu)); fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
} }
if (Emu.IsStopped()) if (Emu.IsStopped())
@ -1652,7 +1643,7 @@ extern void ppu_initialize(const ppu_module& info)
{ {
const u64 addr = jit->get(fmt::format("__0x%x", block.first - reloc)); const u64 addr = jit->get(fmt::format("__0x%x", block.first - reloc));
jit_mod.funcs.emplace_back(reinterpret_cast<ppu_function_t>(addr)); jit_mod.funcs.emplace_back(reinterpret_cast<ppu_function_t>(addr));
ppu_ref(block.first) = ::narrow<u32>(addr); *reinterpret_cast<u32*>(&ppu_ref(block.first)) = ::narrow<u32>(addr);
} }
} }
} }
@ -1683,7 +1674,7 @@ extern void ppu_initialize(const ppu_module& info)
{ {
if (block.second) if (block.second)
{ {
ppu_ref(block.first) = ::narrow<u32>(reinterpret_cast<uptr>(jit_mod.funcs[index++])); *reinterpret_cast<u32*>(&ppu_ref(block.first)) = ::narrow<u32>(reinterpret_cast<uptr>(jit_mod.funcs[index++]));
} }
} }
} }

View file

@ -49,7 +49,7 @@ PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_mod
m_thread_type = StructType::create(m_context, thread_struct, "context_t"); m_thread_type = StructType::create(m_context, thread_struct, "context_t");
// Callable // Callable
m_call = new GlobalVariable(*module, ArrayType::get(GetType<u32>(), 0x40000000)->getPointerTo(), true, GlobalValue::ExternalLinkage, 0, fmt::format("__cptr%x", gsuffix)); m_call = new GlobalVariable(*module, ArrayType::get(GetType<u32>(), 0x80000000)->getPointerTo(), true, GlobalValue::ExternalLinkage, 0, fmt::format("__cptr%x", gsuffix));
m_call->setInitializer(ConstantPointerNull::get(cast<PointerType>(m_call->getType()->getPointerElementType()))); m_call->setInitializer(ConstantPointerNull::get(cast<PointerType>(m_call->getType()->getPointerElementType())));
m_call->setExternallyInitialized(true); m_call->setExternallyInitialized(true);
@ -282,7 +282,7 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
} }
} }
const auto pos = m_ir->CreateLShr(indirect, 2, "", true); const auto pos = m_ir->CreateShl(m_ir->CreateLShr(indirect, 2, "", true), 1, "", true);
const auto ptr = m_ir->CreateGEP(m_ir->CreateLoad(m_call), {m_ir->getInt64(0), pos}); const auto ptr = m_ir->CreateGEP(m_ir->CreateLoad(m_call), {m_ir->getInt64(0), pos});
indirect = m_ir->CreateIntToPtr(m_ir->CreateLoad(ptr), type->getPointerTo()); indirect = m_ir->CreateIntToPtr(m_ir->CreateLoad(ptr), type->getPointerTo());
} }

View file

@ -364,7 +364,7 @@ namespace vm
if (flags & page_executable) if (flags & page_executable)
{ {
utils::memory_commit(g_exec_addr + addr, size); utils::memory_commit(g_exec_addr + addr * 2, size * 2);
} }
if (g_cfg.core.ppu_debug) if (g_cfg.core.ppu_debug)
@ -494,7 +494,7 @@ namespace vm
if (is_exec) if (is_exec)
{ {
utils::memory_decommit(g_exec_addr + addr, size); utils::memory_decommit(g_exec_addr + addr * 2, size * 2);
} }
if (g_cfg.core.ppu_debug) if (g_cfg.core.ppu_debug)