#include "stdafx.h" #include "Utilities/JIT.h" #include "Utilities/StrUtil.h" #include "util/serialization.hpp" #include "Crypto/sha1.h" #include "Crypto/unself.h" #include "Loader/ELF.h" #include "Loader/mself.hpp" #include "Emu/perf_meter.hpp" #include "Emu/Memory/vm_reservation.h" #include "Emu/Memory/vm_locking.h" #include "Emu/RSX/RSXThread.h" #include "Emu/VFS.h" #include "Emu/system_progress.hpp" #include "Emu/system_utils.hpp" #include "PPUThread.h" #include "PPUInterpreter.h" #include "PPUAnalyser.h" #include "PPUModule.h" #include "PPUDisAsm.h" #include "SPURecompiler.h" #include "timers.hpp" #include "lv2/sys_sync.h" #include "lv2/sys_prx.h" #include "lv2/sys_overlay.h" #include "lv2/sys_process.h" #include "lv2/sys_spu.h" #ifdef LLVM_AVAILABLE #ifdef _MSC_VER #pragma warning(push, 0) #else #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wall" #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wold-style-cast" #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Weffc++" #pragma GCC diagnostic ignored "-Wmissing-noreturn" #endif #include "llvm/Support/FormattedStream.h" #include "llvm/Support/Host.h" #include "llvm/Object/ObjectFile.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/Verifier.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Scalar.h" #ifdef _MSC_VER #pragma warning(pop) #else #pragma GCC diagnostic pop #endif #include "PPUTranslator.h" #endif #include #include #include #include #include "util/asm.hpp" #include "util/vm.hpp" #include "util/v128.hpp" #include "util/simd.hpp" #include "util/sysinfo.hpp" #ifdef __APPLE__ #include #endif extern atomic_t g_watchdog_hold_ctr; // Should be of the same type using spu_rdata_t = decltype(ppu_thread::rdata); extern void mov_rdata(spu_rdata_t& _dst, const spu_rdata_t& _src); extern void mov_rdata_nt(spu_rdata_t& _dst, const spu_rdata_t& _src); extern bool cmp_rdata(const spu_rdata_t& _lhs, const spu_rdata_t& _rhs); // Verify AVX availability for TSX transactions static const bool s_tsx_avx = utils::has_avx(); template <> void fmt_class_string::format(std::string& out, u64 arg) { format_enum(out, arg, [](ppu_join_status js) { switch (js) { case ppu_join_status::joinable: return "none"; case ppu_join_status::detached: return "detached"; case ppu_join_status::zombie: return "zombie"; case ppu_join_status::exited: return "exited"; case ppu_join_status::max: break; } return unknown; }); } template <> void fmt_class_string::format(std::string& out, u64 arg) { format_enum(out, arg, [](ppu_thread_status s) { switch (s) { case PPU_THREAD_STATUS_IDLE: return "IDLE"; case PPU_THREAD_STATUS_RUNNABLE: return "RUN"; case PPU_THREAD_STATUS_ONPROC: return "ONPROC"; case PPU_THREAD_STATUS_SLEEP: return "SLEEP"; case PPU_THREAD_STATUS_STOP: return "STOP"; case PPU_THREAD_STATUS_ZOMBIE: return "Zombie"; case PPU_THREAD_STATUS_DELETED: return "Deleted"; case PPU_THREAD_STATUS_UNKNOWN: break; } return unknown; }); } template <> void fmt_class_string::format(std::string& out, u64 arg) { const auto& history = get_object(arg); PPUDisAsm dis_asm(cpu_disasm_mode::normal, vm::g_sudo_addr); for (u64 count = 0, idx = history.index - 1; idx != umax && count < ppu_thread::call_history_max_size; count++, idx--) { const u32 pc = history.data[idx % ppu_thread::call_history_max_size]; dis_asm.disasm(pc); fmt::append(out, "\n(%u) 0x%08x: %s", 
count, pc, dis_asm.last_opcode); } } extern const ppu_decoder g_ppu_itype{}; extern const ppu_decoder g_ppu_iname{}; template <> bool serialize(utils::serial& ar, typename ppu_thread::cr_bits& o) { if (ar.is_writing()) { ar(o.pack()); } else { o.unpack(ar); } return true; } extern void ppu_initialize(); extern void ppu_finalize(const ppu_module& info); extern bool ppu_initialize(const ppu_module& info, bool = false); static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name); extern std::pair, CellError> ppu_load_overlay(const ppu_exec_object&, const std::string& path, s64 file_offset, utils::serial* = nullptr); extern void ppu_unload_prx(const lv2_prx&); extern std::shared_ptr ppu_load_prx(const ppu_prx_object&, const std::string&, s64 file_offset, utils::serial* = nullptr); extern void ppu_execute_syscall(ppu_thread& ppu, u64 code); static void ppu_break(ppu_thread&, ppu_opcode_t, be_t*, ppu_intrp_func*); extern void do_cell_atomic_128_store(u32 addr, const void* to_write); const auto ppu_gateway = build_function_asm("ppu_gateway", [](native_asm& c, auto& args) { // Gateway for PPU, converts from native to GHC calling convention, also saves RSP value for escape using namespace asmjit; #if defined(ARCH_X64) #ifdef _WIN32 c.push(x86::r15); c.push(x86::r14); c.push(x86::r13); c.push(x86::r12); c.push(x86::rsi); c.push(x86::rdi); c.push(x86::rbp); c.push(x86::rbx); c.sub(x86::rsp, 0xa8); c.movaps(x86::oword_ptr(x86::rsp, 0x90), x86::xmm15); c.movaps(x86::oword_ptr(x86::rsp, 0x80), x86::xmm14); c.movaps(x86::oword_ptr(x86::rsp, 0x70), x86::xmm13); c.movaps(x86::oword_ptr(x86::rsp, 0x60), x86::xmm12); c.movaps(x86::oword_ptr(x86::rsp, 0x50), x86::xmm11); c.movaps(x86::oword_ptr(x86::rsp, 0x40), x86::xmm10); c.movaps(x86::oword_ptr(x86::rsp, 0x30), x86::xmm9); c.movaps(x86::oword_ptr(x86::rsp, 0x20), x86::xmm8); c.movaps(x86::oword_ptr(x86::rsp, 0x10), x86::xmm7); c.movaps(x86::oword_ptr(x86::rsp, 0), x86::xmm6); #else c.push(x86::rbp); c.push(x86::r15); c.push(x86::r14); c.push(x86::r13); c.push(x86::r12); c.push(x86::rbx); c.push(x86::rax); #endif // Save native stack pointer for longjmp emulation c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)), x86::rsp); // Initialize args c.mov(x86::r13, x86::qword_ptr(reinterpret_cast(&vm::g_exec_addr))); c.mov(x86::rbp, args[0]); c.mov(x86::edx, x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia))); // Load PC c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::edx, 1, 0)); // Load call target c.mov(x86::rdx, x86::rax); c.shl(x86::rax, 16); c.shr(x86::rax, 16); c.shr(x86::rdx, 48); c.shl(x86::edx, 13); c.mov(x86::r12d, x86::edx); // Load relocation base c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast(&vm::g_base_addr))); c.mov(x86::r14, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 0))); // Load some registers c.mov(x86::rsi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 1))); c.mov(x86::rdi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 2))); if (utils::has_avx()) { c.vzeroupper(); } c.call(x86::rax); if (utils::has_avx()) { c.vzeroupper(); } #ifdef _WIN32 c.movaps(x86::xmm6, x86::oword_ptr(x86::rsp, 0)); c.movaps(x86::xmm7, x86::oword_ptr(x86::rsp, 0x10)); c.movaps(x86::xmm8, x86::oword_ptr(x86::rsp, 0x20)); c.movaps(x86::xmm9, x86::oword_ptr(x86::rsp, 0x30)); c.movaps(x86::xmm10, x86::oword_ptr(x86::rsp, 0x40)); c.movaps(x86::xmm11, x86::oword_ptr(x86::rsp, 0x50)); c.movaps(x86::xmm12, x86::oword_ptr(x86::rsp, 0x60)); 
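// (Note: xmm6-xmm15 are non-volatile under the Windows x64 ABI, which is why the prologue above
// spills them and this epilogue restores them before popping the saved GPRs; the System V path
// below only needs to preserve the integer callee-saved registers.)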
c.movaps(x86::xmm13, x86::oword_ptr(x86::rsp, 0x70)); c.movaps(x86::xmm14, x86::oword_ptr(x86::rsp, 0x80)); c.movaps(x86::xmm15, x86::oword_ptr(x86::rsp, 0x90)); c.add(x86::rsp, 0xa8); c.pop(x86::rbx); c.pop(x86::rbp); c.pop(x86::rdi); c.pop(x86::rsi); c.pop(x86::r12); c.pop(x86::r13); c.pop(x86::r14); c.pop(x86::r15); #else c.add(x86::rsp, +8); c.pop(x86::rbx); c.pop(x86::r12); c.pop(x86::r13); c.pop(x86::r14); c.pop(x86::r15); c.pop(x86::rbp); #endif c.ret(); #else // See https://github.com/ghc/ghc/blob/master/rts/include/stg/MachRegs.h // for GHC calling convention definitions on Aarch64 // and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers // for AArch64 calling convention // Save sp for native longjmp emulation Label native_sp_offset = c.newLabel(); c.ldr(a64::x10, arm::Mem(native_sp_offset)); // sp not allowed to be used in load/stores directly c.mov(a64::x15, a64::sp); c.str(a64::x15, arm::Mem(args[0], a64::x10)); // Push callee saved registers to the stack // We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B c.sub(a64::sp, a64::sp, Imm(112)); c.stp(a64::x18, a64::x19, arm::Mem(a64::sp)); c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16)); c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32)); c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48)); c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64)); c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80)); c.str(a64::x30, arm::Mem(a64::sp, 96)); // Load REG_Base - use absolute jump target to bypass rel jmp range limits Label exec_addr = c.newLabel(); c.ldr(a64::x19, arm::Mem(exec_addr)); c.ldr(a64::x19, arm::Mem(a64::x19)); // Load PPUThread struct base -> REG_Sp const arm::GpX ppu_t_base = a64::x20; c.mov(ppu_t_base, args[0]); // Load PC const arm::GpX pc = a64::x15; Label cia_offset = c.newLabel(); const arm::GpX cia_addr_reg = a64::x11; // Load offset value c.ldr(cia_addr_reg, arm::Mem(cia_offset)); // Load cia c.ldr(a64::w15, arm::Mem(ppu_t_base, cia_addr_reg)); // Multiply by 2 to index into ptr table const arm::GpX index_shift = a64::x12; c.mov(index_shift, Imm(2)); c.mul(pc, pc, index_shift); // Load call target const arm::GpX call_target = a64::x13; c.ldr(call_target, arm::Mem(a64::x19, pc)); // Compute REG_Hp const arm::GpX reg_hp = a64::x21; c.mov(reg_hp, call_target); c.lsr(reg_hp, reg_hp, 48); c.lsl(a64::w21, a64::w21, 13); // Zero top 16 bits of call target c.lsl(call_target, call_target, Imm(16)); c.lsr(call_target, call_target, Imm(16)); // Load registers Label base_addr = c.newLabel(); c.ldr(a64::x22, arm::Mem(base_addr)); c.ldr(a64::x22, arm::Mem(a64::x22)); Label gpr_addr_offset = c.newLabel(); const arm::GpX gpr_addr_reg = a64::x9; c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset)); c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base); c.ldr(a64::x23, arm::Mem(gpr_addr_reg)); c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8)); c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16)); // Execute LLE call c.blr(call_target); // Restore registers from the stack c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp)); c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16)); c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32)); c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48)); c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64)); c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80)); c.ldr(a64::x30, arm::Mem(a64::sp, 96)); // Restore stack ptr c.add(a64::sp, a64::sp, Imm(112)); // Return c.ret(a64::x30); c.bind(exec_addr); 
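// The labels bound below form a small literal pool: 64-bit constants (addresses of
// vm::g_exec_addr / vm::g_base_addr and ppu_thread member offsets) that the AArch64 code above
// loads PC-relative via ldr, since they cannot be encoded as immediates.
//
// For reference, both the x64 and AArch64 paths decode the pointer-table entry layout written by
// ppu_register_range()/ppu_register_function_at(), roughly equivalent to this C++ sketch
// (illustrative only, not emitted code):
//
//   const u64 entry    = *reinterpret_cast<u64*>(vm::g_exec_addr + u64{cia} * 2);
//   const u64 target   = entry & 0xffff'ffff'ffffu;  // lower 48 bits: call target
//   const u32 seg_base = u32(entry >> 48) << 13;     // upper 16 bits: relocation/segment base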
c.embedUInt64(reinterpret_cast(&vm::g_exec_addr)); c.bind(base_addr); c.embedUInt64(reinterpret_cast(&vm::g_base_addr)); c.bind(cia_offset); c.embedUInt64(static_cast(::offset32(&ppu_thread::cia))); c.bind(gpr_addr_offset); c.embedUInt64(static_cast(::offset32(&ppu_thread::gpr))); c.bind(native_sp_offset); c.embedUInt64(static_cast(::offset32(&ppu_thread::saved_native_sp))); #endif }); const extern auto ppu_escape = build_function_asm("ppu_escape", [](native_asm& c, auto& args) { using namespace asmjit; #if defined(ARCH_X64) // Restore native stack pointer (longjmp emulation) c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp))); // Return to the return location c.sub(x86::rsp, 8); c.ret(); #endif }); void ppu_recompiler_fallback(ppu_thread& ppu); #if defined(ARCH_X64) const auto ppu_recompiler_fallback_ghc = build_function_asm("", [](native_asm& c, auto& args) { using namespace asmjit; c.mov(args[0], x86::rbp); c.jmp(ppu_recompiler_fallback); }); #elif defined(ARCH_ARM64) const auto ppu_recompiler_fallback_ghc = &ppu_recompiler_fallback; #endif // Get pointer to executable cache static ppu_intrp_func_t& ppu_ref(u32 addr) { return *reinterpret_cast(vm::g_exec_addr + u64{addr} * 2); } // Get interpreter cache value static ppu_intrp_func_t ppu_cache(u32 addr) { if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) { fmt::throw_exception("Invalid PPU decoder"); } return g_fxo->get().decode(vm::read32(addr)); } static ppu_intrp_func ppu_ret = {[](ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func*) { // Fix PC and return (step execution) ppu.cia = vm::get_addr(this_op); return; }}; static void ppu_fallback(ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) { const auto _pc = vm::get_addr(this_op); const auto _fn = ppu_cache(_pc); ppu_ref(_pc) = _fn; return _fn(ppu, op, this_op, next_fn); } // TODO: Make this a dispatch call void ppu_recompiler_fallback(ppu_thread& ppu) { perf_meter<"PPUFALL1"_u64> perf0; if (g_cfg.core.ppu_debug) { ppu_log.error("Unregistered PPU Function (LR=0x%x)", ppu.lr); } const auto& table = g_fxo->get(); while (true) { if (uptr func = uptr(ppu_ref(ppu.cia)); (func << 16 >> 16) != reinterpret_cast(ppu_recompiler_fallback_ghc)) { // We found a recompiler function at cia, return break; } // Run one instruction in interpreter (TODO) const u32 op = vm::read32(ppu.cia); table.decode(op)(ppu, {op}, vm::_ptr(ppu.cia), &ppu_ret); if (ppu.test_stopped()) { break; } } } void ppu_reservation_fallback(ppu_thread& ppu) { perf_meter<"PPUFALL2"_u64> perf0; const auto& table = g_fxo->get(); while (true) { // Run one instruction in interpreter (TODO) const u32 op = vm::read32(ppu.cia); table.decode(op)(ppu, {op}, vm::_ptr(ppu.cia), &ppu_ret); if (!ppu.raddr || !ppu.use_full_rdata) { // We've escaped from reservation, return. 
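// ppu.raddr is cleared by the store-conditional path (ppu_store_reservation), so reaching this
// branch means the guest's load-reserve/store-conditional sequence has finished (or 128-byte
// reservations are not in use) and compiled code can resume.
//
// A typical guest sequence that keeps this loop stepping looks like (illustrative PPC assembly):
//   retry: lwarx  r0, 0, r3     ; load word and set reservation
//          addi   r0, r0, 1
//          stwcx. r0, 0, r3     ; store conditionally, clears the reservation
//          bne-   retry         ; branch back if the reservation was lost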
return; } if (ppu.test_stopped()) { return; } } } static std::unordered_map* s_ppu_toc; static void ppu_check_toc(ppu_thread& ppu, ppu_opcode_t op, be_t* this_op, ppu_intrp_func* next_fn) { // Compare TOC with expected value const auto found = s_ppu_toc->find(ppu.cia); if (ppu.gpr[2] != found->second) { ppu_log.error("Unexpected TOC (0x%x, expected 0x%x)", ppu.gpr[2], found->second); if (!ppu.state.test_and_set(cpu_flag::dbg_pause) && ppu.check_state()) { return; } } // Fallback to the interpreter function return ppu_cache(ppu.cia)(ppu, op, this_op, next_fn); } extern void ppu_register_range(u32 addr, u32 size) { if (!size) { ppu_log.error("ppu_register_range(0x%x): empty range", addr); return; } size = utils::align(size + addr % 0x10000, 0x10000); addr &= -0x10000; // Register executable range at utils::memory_commit(&ppu_ref(addr), u64{size} * 2, utils::protection::rw); ensure(vm::page_protect(addr, size, 0, vm::page_executable)); if (g_cfg.core.ppu_debug) { utils::memory_commit(vm::g_stat_addr + addr, size); } const u64 seg_base = addr; while (size) { if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) { // Assume addr is the start of first segment of PRX ppu_ref(addr) = reinterpret_cast(reinterpret_cast(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3))); } else { ppu_ref(addr) = ppu_fallback; } addr += 4; size -= 4; } } static void ppu_far_jump(ppu_thread&, ppu_opcode_t, be_t*, ppu_intrp_func*); extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr = nullptr) { // Initialize specific function if (ptr) { ppu_ref(addr) = reinterpret_cast((reinterpret_cast(ptr) & 0xffff'ffff'ffffu) | (uptr(ppu_ref(addr)) & ~0xffff'ffff'ffffu)); return; } if (!size) { if (g_cfg.core.ppu_debug) { ppu_log.error("ppu_register_function_at(0x%x): empty range", addr); } return; } if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) { return; } // Initialize interpreter cache while (size) { if (ppu_ref(addr) != ppu_break && ppu_ref(addr) != ppu_far_jump) { ppu_ref(addr) = ppu_cache(addr); } addr += 4; size -= 4; } } extern void ppu_register_function_at(u32 addr, u32 size, u64 ptr) { return ppu_register_function_at(addr, size, reinterpret_cast(ptr)); } u32 ppu_get_exported_func_addr(u32 fnid, const std::string& module_name); void ppu_return_from_far_jump(ppu_thread& ppu, ppu_opcode_t, be_t*, ppu_intrp_func*) { auto& calls_info = ppu.hle_func_calls_with_toc_info; ensure(!calls_info.empty()); // Branch to next instruction after far jump call entry with restored R2 and LR const auto restore_info = &calls_info.back(); ppu.cia = restore_info->cia + 4; ppu.lr = restore_info->saved_lr; ppu.gpr[2] = restore_info->saved_r2; calls_info.pop_back(); } static const bool s_init_return_far_jump_func = [] { REG_HIDDEN_FUNC_PURE(ppu_return_from_far_jump); return true; }(); struct ppu_far_jumps_t { struct all_info_t { u32 target; bool link; bool with_toc; std::string module_name; ppu_intrp_func_t func; }; ppu_far_jumps_t(int) noexcept {} std::unordered_map vals; ::jit_runtime rt; mutable shared_mutex mutex; // Get target address, 'ppu' is used in ppu_far_jump in order to modify registers u32 get_target(const u32 pc, ppu_thread* ppu = nullptr) { reader_lock lock(mutex); if (auto it = vals.find(pc); it != vals.end()) { all_info_t& all_info = it->second; u32 target = all_info.target; bool link = all_info.link; bool from_opd = all_info.with_toc; if (!all_info.module_name.empty()) { target = ppu_get_exported_func_addr(target, all_info.module_name); } if (from_opd && !vm::check_addr(target)) { // Avoid 
reading unmapped memory under mutex from_opd = false; } if (from_opd) { auto& opd = vm::_ref(target); target = opd.addr; // We modify LR to custom values here link = false; if (ppu) { auto& calls_info = ppu->hle_func_calls_with_toc_info; // Save LR and R2 // Set LR to the this ppu_return_from_far_jump branch for restoration of registers // NOTE: In order to clean up this information all calls must return in order auto& saved_info = calls_info.emplace_back(); saved_info.cia = pc; saved_info.saved_lr = std::exchange(ppu->lr, FIND_FUNC(ppu_return_from_far_jump)); saved_info.saved_r2 = std::exchange(ppu->gpr[2], opd.rtoc); } } if (link && ppu) { ppu->lr = pc + 4; } return target; } return {}; } template ppu_intrp_func_t gen_jump(u32 pc) { [[maybe_unused]] std::conditional_t, const shared_mutex&> lock(mutex); auto it = vals.find(pc); if (it == vals.end()) { return nullptr; } if (!it->second.func) { it->second.func = build_function_asm("", [&](native_asm& c, auto& args) { using namespace asmjit; #ifdef ARCH_X64 c.mov(args[0], x86::rbp); c.mov(x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia)), pc); c.jmp(ppu_far_jump); #else Label jmp_address = c.newLabel(); Label imm_address = c.newLabel(); c.ldr(args[1].w(), arm::ptr(imm_address)); c.str(args[1].w(), arm::Mem(args[0], ::offset32(&ppu_thread::cia))); c.ldr(args[1], arm::ptr(jmp_address)); c.br(args[1]); c.align(AlignMode::kCode, 16); c.bind(jmp_address); c.embedUInt64(reinterpret_cast(ppu_far_jump)); c.bind(imm_address); c.embedUInt32(pc); #endif }, &rt); } return it->second.func; } }; u32 ppu_get_far_jump(u32 pc) { if (!g_fxo->is_init()) { return 0; } return g_fxo->get().get_target(pc); } static void ppu_far_jump(ppu_thread& ppu, ppu_opcode_t, be_t*, ppu_intrp_func*) { const u32 cia = g_fxo->get().get_target(ppu.cia, &ppu); if (!vm::check_addr(cia, vm::page_executable)) { fmt::throw_exception("PPU far jump failed! (returned cia = 0x%08x)", cia); } ppu.cia = cia; } bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc, std::string module_name) { // Force align entry and target entry &= -4; // Exported functions are using target as FNID, must not be changed if (module_name.empty()) { target &= -4; u32 cia_target = target; if (with_toc) { ppu_func_opd_t opd{}; if (!vm::try_access(target, &opd, sizeof(opd), false)) { // Cannot access function descriptor return false; } // For now allow situations where OPD is changed later by patches or by the program itself //cia_target = opd.addr; // So force a valid target (executable, yet not equal to entry) cia_target = entry ^ 8; } // Target CIA must be aligned, executable and not equal with if (cia_target % 4 || entry == cia_target || !vm::check_addr(cia_target, vm::page_executable)) { return false; } } // Entry must be executable if (!vm::check_addr(entry, vm::page_executable)) { return false; } g_fxo->init(0); if (!module_name.empty()) { // Always use function descriptor for exported functions with_toc = true; } if (with_toc) { // Always link for calls with function descriptor link = true; } // Register branch target in host memory, not guest memory auto& jumps = g_fxo->get(); std::lock_guard lock(jumps.mutex); jumps.vals.insert_or_assign(entry, ppu_far_jumps_t::all_info_t{target, link, with_toc, std::move(module_name)}); ppu_register_function_at(entry, 4, g_cfg.core.ppu_decoder == ppu_decoder_type::_static ? 
&ppu_far_jump : ensure(g_fxo->get().gen_jump(entry))); return true; } bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc) { return ppu_form_branch_to_code(entry, target, link, with_toc, std::string{}); } bool ppu_form_branch_to_code(u32 entry, u32 target, bool link) { return ppu_form_branch_to_code(entry, target, link, false); } bool ppu_form_branch_to_code(u32 entry, u32 target) { return ppu_form_branch_to_code(entry, target, false); } void ppu_remove_hle_instructions(u32 addr, u32 size) { if (Emu.IsStopped() || !g_fxo->is_init()) { return; } auto& jumps = g_fxo->get(); std::lock_guard lock(jumps.mutex); for (auto it = jumps.vals.begin(); it != jumps.vals.end();) { if (it->first >= addr && it->first <= addr + size - 1 && size) { it = jumps.vals.erase(it); continue; } it++; } } atomic_t g_debugger_pause_all_threads_on_bp = true; // Breakpoint entry point static void ppu_break(ppu_thread& ppu, ppu_opcode_t, be_t* this_op, ppu_intrp_func* next_fn) { const bool pause_all = g_debugger_pause_all_threads_on_bp; const u32 old_cia = vm::get_addr(this_op); ppu.cia = old_cia; // Pause ppu.state.atomic_op([&](bs_t& state) { if (pause_all) state += cpu_flag::dbg_global_pause; if (pause_all || !(state & cpu_flag::dbg_step)) state += cpu_flag::dbg_pause; }); if (pause_all) { // Pause all other threads Emu.CallFromMainThread([]() { Emu.Pause(); }); } if (ppu.check_state() || old_cia != atomic_storage::load(ppu.cia)) { // Do not execute if PC changed return; } // Fallback to the interpreter function return ppu_cache(ppu.cia)(ppu, {*this_op}, this_op, ppu.state ? &ppu_ret : next_fn); } // Set or remove breakpoint extern bool ppu_breakpoint(u32 addr, bool is_adding) { if (addr % 4 || !vm::check_addr(addr, vm::page_executable) || g_cfg.core.ppu_decoder == ppu_decoder_type::llvm) { return false; } // Remove breakpoint parameters ppu_intrp_func_t to_set = 0; ppu_intrp_func_t expected = &ppu_break; if (u32 hle_addr{}; g_fxo->is_init() && (hle_addr = g_fxo->get().addr)) { // HLE function index const u32 index = (addr - hle_addr) / 8; if (addr % 8 == 4 && index < ppu_function_manager::get().size()) { // HLE function placement to_set = ppu_function_manager::get()[index]; } } if (!to_set) { // If not an HLE function use regular instruction function to_set = ppu_cache(addr); } ppu_intrp_func_t& _ref = ppu_ref(addr); if (is_adding) { // Swap if adding std::swap(to_set, expected); if (_ref == &ppu_fallback) { ppu_log.error("Unregistered instruction replaced with a breakpoint at 0x%08x", addr); expected = ppu_fallback; } } return atomic_storage::compare_exchange(_ref, expected, to_set); } extern bool ppu_patch(u32 addr, u32 value) { if (addr % 4) { ppu_log.fatal("Patch failed at 0x%x: unaligned memory address.", addr); return false; } vm::writer_lock rlock; if (!vm::check_addr(addr)) { ppu_log.fatal("Patch failed at 0x%x: invalid memory address.", addr); return false; } const bool is_exec = vm::check_addr(addr, vm::page_executable); if (is_exec && g_cfg.core.ppu_decoder == ppu_decoder_type::llvm && !Emu.IsReady()) { // TODO: support recompilers ppu_log.fatal("Patch failed at 0x%x: LLVM recompiler is used.", addr); return false; } *vm::get_super_ptr(addr) = value; if (is_exec) { if (ppu_ref(addr) != ppu_break && ppu_ref(addr) != ppu_fallback) { ppu_ref(addr) = ppu_cache(addr); } } return true; } std::array op_branch_targets(u32 pc, ppu_opcode_t op) { std::array res{pc + 4, umax}; g_fxo->need(); if (u32 target = g_fxo->get().get_target(pc)) { res[0] = target; return res; } switch (const auto
type = g_ppu_itype.decode(op.opcode)) { case ppu_itype::B: case ppu_itype::BC: { res[type == ppu_itype::BC ? 1 : 0] = ((op.aa ? 0 : pc) + (type == ppu_itype::B ? +op.bt24 : +op.bt14)); break; } case ppu_itype::BCCTR: case ppu_itype::BCLR: case ppu_itype::UNK: { res[0] = umax; break; } default: break; } return res; } void ppu_thread::dump_regs(std::string& ret) const { PPUDisAsm dis_asm(cpu_disasm_mode::normal, vm::g_sudo_addr); for (uint i = 0; i < 32; ++i) { auto reg = gpr[i]; // Fixup for syscall arguments if (current_function && i >= 3 && i <= 10) reg = syscall_args[i - 3]; auto [is_const, const_value] = dis_asm.try_get_const_gpr_value(i, cia); if (const_value != reg) { // Expectation of pretictable code path has not been met (such as a branch directly to the instruction) is_const = false; } fmt::append(ret, "r%d%s%s 0x%-8llx", i, i <= 9 ? " " : "", is_const ? "©" : ":", reg); constexpr u32 max_str_len = 32; constexpr u32 hex_count = 8; if (reg <= u32{umax} && vm::check_addr(static_cast(reg))) { bool is_function = false; u32 toc = 0; auto is_exec_code = [&](u32 addr) { return addr % 4 == 0 && vm::check_addr(addr, vm::page_executable) && g_ppu_itype.decode(*vm::get_super_ptr(addr)) != ppu_itype::UNK; }; if (const u32 reg_ptr = *vm::get_super_ptr(static_cast(reg)); vm::check_addr<8>(reg_ptr) && !vm::check_addr(toc, vm::page_executable)) { // Check executability and alignment if (reg % 4 == 0 && is_exec_code(reg_ptr)) { toc = *vm::get_super_ptr(static_cast(reg + 4)); if (toc % 4 == 0 && (toc >> 29) == (reg_ptr >> 29) && vm::check_addr(toc) && !vm::check_addr(toc, vm::page_executable)) { is_function = true; reg = reg_ptr; } } } else if (is_exec_code(reg)) { is_function = true; } const auto gpr_buf = vm::get_super_ptr(reg); std::string buf_tmp(gpr_buf, gpr_buf + max_str_len); std::string_view sv(buf_tmp.data(), std::min(buf_tmp.size(), buf_tmp.find_first_of("\0\n"sv))); if (is_function) { if (toc) { fmt::append(ret, " -> func(at=0x%x, toc=0x%x)", reg, toc); } else { dis_asm.disasm(reg); fmt::append(ret, " -> %s", dis_asm.last_opcode); } } // NTS: size of 3 and above is required // If ends with a newline, only one character is required else if ((sv.size() == buf_tmp.size() || (sv.size() >= (buf_tmp[sv.size()] == '\n' ? 1 : 3))) && std::all_of(sv.begin(), sv.end(), [](u8 c){ return std::isprint(c); })) { fmt::append(ret, " -> \"%s\"", sv); } else { fmt::append(ret, " -> "); for (u32 j = 0; j < hex_count; ++j) { fmt::append(ret, "%02x ", buf_tmp[j]); } } } fmt::append(ret, "\n"); } for (uint i = 0; i < 32; ++i) { const f64 r = fpr[i]; if (!std::bit_cast(r)) { fmt::append(ret, "f%d%s: %-12.6G [%-18s] (f32=0x%x)\n", i, i <= 9 ? " " : "", r, "", std::bit_cast(f32(r))); continue; } fmt::append(ret, "f%d%s: %-12.6G [0x%016x] (f32=0x%x)\n", i, i <= 9 ? " " : "", r, std::bit_cast(r), std::bit_cast(f32(r))); } for (uint i = 0; i < 32; ++i, ret += '\n') { fmt::append(ret, "v%d%s: ", i, i <= 9 ? 
" " : ""); const auto r = vr[i]; const u32 i3 = r.u32r[0]; if (v128::from32p(i3) == r) { // Shortand formatting fmt::append(ret, "%08x", i3); fmt::append(ret, " [x: %g]", r.fr[0]); } else { fmt::append(ret, "%08x %08x %08x %08x", r.u32r[0], r.u32r[1], r.u32r[2], r.u32r[3]); fmt::append(ret, " [x: %g y: %g z: %g w: %g]", r.fr[0], r.fr[1], r.fr[2], r.fr[3]); } } fmt::append(ret, "CR: 0x%08x\n", cr.pack()); fmt::append(ret, "LR: 0x%llx\n", lr); fmt::append(ret, "CTR: 0x%llx\n", ctr); fmt::append(ret, "VRSAVE: 0x%08x\n", vrsave); fmt::append(ret, "XER: [CA=%u | OV=%u | SO=%u | CNT=%u]\n", xer.ca, xer.ov, xer.so, xer.cnt); fmt::append(ret, "VSCR: [SAT=%u | NJ=%u]\n", sat, nj); fmt::append(ret, "FPSCR: [FL=%u | FG=%u | FE=%u | FU=%u]\n", fpscr.fl, fpscr.fg, fpscr.fe, fpscr.fu); const u32 addr = raddr; if (addr) fmt::append(ret, "Reservation Addr: 0x%x", addr); else fmt::append(ret, "Reservation Addr: none"); fmt::append(ret, "\nReservation Data (entire cache line):\n"); be_t data[32]{}; std::memcpy(data, rdata, sizeof(rdata)); // Show the data even if the reservation was lost inside the atomic loop if (addr && !use_full_rdata) { const u32 offset = addr & 0x78; fmt::append(ret, "[0x%02x] %08x %08x\n", offset, data[offset / sizeof(u32)], data[offset / sizeof(u32) + 1]); // Asterisk marks the offset of data that had been given to the guest PPU code *(&ret.back() - (addr & 4 ? 9 : 18)) = '*'; } else { for (usz i = 0; i < std::size(data); i += 4) { fmt::append(ret, "[0x%02x] %08x %08x %08x %08x\n", i * sizeof(data[0]) , data[i + 0], data[i + 1], data[i + 2], data[i + 3]); } if (addr) { // See the note above *(&ret.back() - (4 - (addr % 16 / 4)) * 9 - (8 - (addr % 128 / 16)) * std::size("[0x00]"sv)) = '*'; } } } std::string ppu_thread::dump_callstack() const { std::string ret; fmt::append(ret, "Call stack:\n=========\n0x%08x (0x0) called\n", cia); for (const auto& sp : dump_callstack_list()) { // TODO: function addresses too fmt::append(ret, "> from 0x%08x (sp=0x%08x)\n", sp.first, sp.second); } return ret; } std::vector> ppu_thread::dump_callstack_list() const { //std::shared_lock rlock(vm::g_mutex); // Needs optimizations // Determine stack range const u64 r1 = gpr[1]; if (r1 > u32{umax} || r1 % 0x10) { return {}; } const u32 stack_ptr = static_cast(r1); if (!vm::check_addr(stack_ptr, vm::page_writable)) { // Normally impossible unless the code does not follow ABI rules return {}; } u32 stack_min = stack_ptr & ~0xfff; u32 stack_max = stack_min + 4096; while (stack_min && vm::check_addr(stack_min - 4096, vm::page_writable)) { stack_min -= 4096; } while (stack_max + 4096 && vm::check_addr(stack_max, vm::page_writable)) { stack_max += 4096; } std::vector> call_stack_list; bool first = true; for ( u64 sp = r1; sp % 0x10 == 0u && sp >= stack_min && sp <= stack_max - ppu_stack_start_offset; sp = *vm::get_super_ptr(static_cast(sp)), first = false ) { u64 addr = *vm::get_super_ptr(static_cast(sp + 16)); auto is_invalid = [](u64 addr) { if (addr > u32{umax} || addr % 4 || !vm::check_addr(static_cast(addr), vm::page_executable)) { return true; } // Ignore HLE stop address return addr == g_fxo->get().func_addr(1) + 4; }; if (is_invalid(addr)) { if (first) { // Function hasn't saved LR, could be because it's a leaf function // Use LR directly instead addr = lr; if (is_invalid(addr)) { // Skip it, workaround continue; } } else { break; } } // TODO: function addresses too call_stack_list.emplace_back(static_cast(addr), static_cast(sp)); } return call_stack_list; } std::string ppu_thread::dump_misc() const { 
std::string ret = cpu_thread::dump_misc(); if (ack_suspend) { if (ret.ends_with("\n")) { ret.pop_back(); } fmt::append(ret, " (LV2 suspended)\n"); } fmt::append(ret, "Priority: %d\n", +prio); fmt::append(ret, "Stack: 0x%x..0x%x\n", stack_addr, stack_addr + stack_size - 1); fmt::append(ret, "Joiner: %s\n", joiner.load()); if (const auto size = cmd_queue.size()) fmt::append(ret, "Commands: %u\n", size); const char* _func = current_function; if (_func) { ret += "In function: "; ret += _func; ret += '\n'; for (u32 i = 3; i <= 10; i++) if (u64 v = gpr[i]; v != syscall_args[i - 3]) fmt::append(ret, " ** r%d: 0x%llx\n", i, v); } else if (is_paused() || is_stopped()) { if (const auto last_func = last_function) { _func = last_func; ret += "Last function: "; ret += _func; ret += '\n'; } } if (const auto _time = start_time) { fmt::append(ret, "Waiting: %fs\n", (get_guest_system_time() - _time) / 1000000.); } else { ret += '\n'; } if (!_func) { ret += '\n'; } return ret; } void ppu_thread::dump_all(std::string& ret) const { cpu_thread::dump_all(ret); if (!call_history.data.empty()) { ret += "\nCalling History:" "\n================"; fmt::append(ret, "%s", call_history); } } extern thread_local std::string(*g_tls_log_prefix)(); void ppu_thread::cpu_task() { std::fesetround(FE_TONEAREST); if (g_cfg.core.set_daz_and_ftz) { gv_set_zeroing_denormals(); } else { gv_unset_zeroing_denormals(); } // Execute cmd_queue while (cmd64 cmd = cmd_wait()) { const u32 arg = cmd.arg2(); // 32-bit arg extracted switch (auto type = cmd.arg1()) { case ppu_cmd::opcode: { cmd_pop(), g_fxo->get().decode(arg)(*this, {arg}, vm::_ptr(cia - 4), &ppu_ret); break; } case ppu_cmd::set_gpr: { if (arg >= 32) { fmt::throw_exception("Invalid ppu_cmd::set_gpr arg (0x%x)", arg); } gpr[arg % 32] = cmd_get(1).as(); cmd_pop(1); break; } case ppu_cmd::set_args: { if (arg > 8) { fmt::throw_exception("Unsupported ppu_cmd::set_args size (0x%x)", arg); } for (u32 i = 0; i < arg; i++) { gpr[i + 3] = cmd_get(1 + i).as(); } cmd_pop(arg); break; } case ppu_cmd::lle_call: { #ifdef __APPLE__ pthread_jit_write_protect_np(true); #endif const vm::ptr opd(arg < 32 ? vm::cast(gpr[arg]) : vm::cast(arg)); cmd_pop(), fast_call(opd[0], opd[1]); break; } case ppu_cmd::hle_call: { cmd_pop(), ::at32(ppu_function_manager::get(), arg)(*this, {arg}, vm::_ptr(cia - 4), &ppu_ret); break; } case ppu_cmd::opd_call: { #ifdef __APPLE__ pthread_jit_write_protect_np(true); #endif const ppu_func_opd_t opd = cmd_get(1).as(); cmd_pop(1), fast_call(opd.addr, opd.rtoc); break; } case ppu_cmd::ptr_call: { const ppu_intrp_func_t func = cmd_get(1).as(); cmd_pop(1), func(*this, {}, vm::_ptr(cia - 4), &ppu_ret); break; } case ppu_cmd::cia_call: { loaded_from_savestate = true; cmd_pop(), fast_call(std::exchange(cia, 0), gpr[2]); break; } case ppu_cmd::initialize: { #ifdef __APPLE__ pthread_jit_write_protect_np(false); #endif cmd_pop(); ppu_initialize(), spu_cache::initialize(); #ifdef __APPLE__ pthread_jit_write_protect_np(true); #endif #ifdef ARCH_ARM64 // Flush all cache lines after potentially writing executable code asm("ISB"); asm("DSB ISH"); #endif // Wait until the progress dialog is closed. // We don't want to open a cell dialog while a native progress dialog is still open. 
thread_ctrl::wait_on(g_progr_ptotal, 0); g_fxo->get().skip_the_progress_dialog = true; // Sadly we can't postpone initializing guest time because we need to run PPU threads // (the further it could be postponed, the less guest-time accuracy would be lost) Emu.FixGuestTime(); // Run SPUs waiting on a syscall (savestates related) idm::select>([&](u32, named_thread& spu) { if (spu.group && spu.index == spu.group->waiter_spu_index) { if (std::exchange(spu.stop_flag_removal_protection, false)) { return; } ensure(spu.state.test_and_reset(cpu_flag::stop)); spu.state.notify_one(cpu_flag::stop); } }); // Check if this is the only PPU left to initialize (savestates related) if (lv2_obj::is_scheduler_ready()) { if (Emu.IsStarting()) { Emu.FinalizeRunRequest(); } } break; } case ppu_cmd::sleep: { cmd_pop(), lv2_obj::sleep(*this); break; } case ppu_cmd::reset_stack: { cmd_pop(), gpr[1] = stack_addr + stack_size - ppu_stack_start_offset; break; } default: { fmt::throw_exception("Unknown ppu_cmd(0x%x)", static_cast(type)); } } } } void ppu_thread::cpu_sleep() { // Clear reservation raddr = 0; // Setup wait flag and memory flags to relock itself state += g_use_rtm ? cpu_flag::wait : cpu_flag::wait + cpu_flag::memory; if (auto ptr = vm::g_tls_locked) { ptr->compare_and_swap(this, nullptr); } lv2_obj::awake(this); } void ppu_thread::cpu_on_stop() { if (current_function) { if (start_time) { ppu_log.warning("'%s' aborted (%fs)", current_function, (get_guest_system_time() - start_time) / 1000000.); } else { ppu_log.warning("'%s' aborted", current_function); } current_function = {}; } // TODO: More conditions if (Emu.IsStopped() && g_cfg.core.spu_debug) { std::string ret; dump_all(ret); ppu_log.notice("thread context: %s", ret); } } void ppu_thread::exec_task() { if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static) { while (true) { if (state) [[unlikely]] { if (check_state()) break; } ppu_gateway(this); } return; } const auto cache = vm::g_exec_addr; const auto mem_ = vm::g_base_addr; while (true) { if (test_stopped()) [[unlikely]] { return; } gv_zeroupper(); // Execute instruction (may be step; execute only one instruction if state) const auto op = reinterpret_cast*>(mem_ + u64{cia}); const auto fn = reinterpret_cast(cache + u64{cia} * 2); fn->fn(*this, {*op}, op, state ? &ppu_ret : fn + 1); } } ppu_thread::~ppu_thread() { perf_log.notice("Perf stats for STCX reload: success %u, failure %u", last_succ, last_fail); perf_log.notice("Perf stats for instructions: total %u", exec_bytes / 4); } ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u32 prio, int detached) : cpu_thread(idm::last_id()) , prio(prio) , stack_size(param.stack_size) , stack_addr(param.stack_addr) , joiner(detached != 0 ?
ppu_join_status::detached : ppu_join_status::joinable) , entry_func(param.entry) , start_time(get_guest_system_time()) , is_interrupt_thread(detached < 0) , ppu_tname(make_single(name)) { gpr[1] = stack_addr + stack_size - ppu_stack_start_offset; gpr[13] = param.tls_addr; if (detached >= 0) { // Initialize thread args gpr[3] = param.arg0; gpr[4] = param.arg1; } optional_savestate_state = std::make_shared(); // Trigger the scheduler state += cpu_flag::suspend; if (!g_use_rtm) { state += cpu_flag::memory; } if (g_cfg.core.ppu_call_history) { call_history.data.resize(call_history_max_size); } #ifdef __APPLE__ pthread_jit_write_protect_np(true); #endif #ifdef ARCH_ARM64 // Flush all cache lines after potentially writing executable code asm("ISB"); asm("DSB ISH"); #endif } struct disable_precomp_t { atomic_t disable = false; }; void vdecEntry(ppu_thread& ppu, u32 vid); bool ppu_thread::savable() const { if (joiner == ppu_join_status::exited) { return false; } if (cia == g_fxo->get().func_addr(FIND_FUNC(vdecEntry))) { // Do not attempt to save the state of HLE VDEC threads return false; } return true; } void ppu_thread::serialize_common(utils::serial& ar) { ar(gpr, fpr, cr, fpscr.bits, lr, ctr, vrsave, cia, xer, sat, nj, prio, optional_savestate_state, vr); if (optional_savestate_state->data.empty()) { optional_savestate_state->clear(); } } ppu_thread::ppu_thread(utils::serial& ar) : cpu_thread(idm::last_id()) // last_id() is showed to constructor on serialization , stack_size(ar) , stack_addr(ar) , joiner(ar.operator ppu_join_status()) , entry_func(std::bit_cast(ar)) , is_interrupt_thread(ar) { struct init_pushed { bool pushed = false; atomic_t inited = false; }; serialize_common(ar); // Restore jm_mask jm_mask = nj ? 0x7F800000 : 0x7fff'ffff; auto queue_intr_entry = [&]() { if (is_interrupt_thread) { void ppu_interrupt_thread_entry(ppu_thread&, ppu_opcode_t, be_t*, struct ppu_intrp_func*); cmd_list ({ { ppu_cmd::ptr_call, 0 }, std::bit_cast(&ppu_interrupt_thread_entry) }); } }; switch (const u32 status = ar.operator u32()) { case PPU_THREAD_STATUS_IDLE: { stop_flag_removal_protection = true; break; } case PPU_THREAD_STATUS_RUNNABLE: case PPU_THREAD_STATUS_ONPROC: { lv2_obj::awake(this); [[fallthrough]]; } case PPU_THREAD_STATUS_SLEEP: { if (std::exchange(g_fxo->get().pushed, true)) { cmd_list ({ {ppu_cmd::ptr_call, 0}, +[](ppu_thread&) -> bool { while (!Emu.IsStopped() && !g_fxo->get().inited) { thread_ctrl::wait_on(g_fxo->get().inited, false); } return false; } }); } else { g_fxo->init(); g_fxo->get().disable = true; cmd_push({ppu_cmd::initialize, 0}); cmd_list ({ {ppu_cmd::ptr_call, 0}, +[](ppu_thread&) -> bool { auto& inited = g_fxo->get().inited; inited = true; inited.notify_all(); return true; } }); } if (status == PPU_THREAD_STATUS_SLEEP) { cmd_list ({ {ppu_cmd::ptr_call, 0}, +[](ppu_thread& ppu) -> bool { const u32 op = vm::read32(ppu.cia); const auto& table = g_fxo->get(); ppu.loaded_from_savestate = true; table.decode(op)(ppu, {op}, vm::_ptr(ppu.cia), &ppu_ret); ppu.optional_savestate_state->clear(); // Reset to writing state ppu.loaded_from_savestate = false; return true; } }); lv2_obj::set_future_sleep(this); } queue_intr_entry(); cmd_push({ppu_cmd::cia_call, 0}); break; } case PPU_THREAD_STATUS_ZOMBIE: { state += cpu_flag::exit; break; } case PPU_THREAD_STATUS_STOP: { queue_intr_entry(); break; } } // Trigger the scheduler state += cpu_flag::suspend; if (!g_use_rtm) { state += cpu_flag::memory; } ppu_tname = make_single(ar.operator std::string()); } void 
ppu_thread::save(utils::serial& ar) { USING_SERIALIZATION_VERSION(ppu); const u64 entry = std::bit_cast(entry_func); ppu_join_status _joiner = joiner; if (_joiner >= ppu_join_status::max) { // Joining thread should recover this member properly _joiner = ppu_join_status::joinable; } if (state & cpu_flag::again) { std::memcpy(&gpr[3], syscall_args, sizeof(syscall_args)); cia -= 4; } ar(stack_size, stack_addr, _joiner, entry, is_interrupt_thread); serialize_common(ar); ppu_thread_status status = lv2_obj::ppu_state(this, false); if (status == PPU_THREAD_STATUS_SLEEP && cpu_flag::again - state) { // Hack for sys_fs status = PPU_THREAD_STATUS_RUNNABLE; } ar(status); ar(*ppu_tname.load()); } ppu_thread::thread_name_t::operator std::string() const { std::string thread_name = fmt::format("PPU[0x%x]", _this->id); if (const std::string name = *_this->ppu_tname.load(); !name.empty()) { fmt::append(thread_name, " %s", name); } return thread_name; } void ppu_thread::cmd_push(cmd64 cmd) { // Reserve queue space const u32 pos = cmd_queue.push_begin(); // Write single command cmd_queue[pos] = cmd; } void ppu_thread::cmd_list(std::initializer_list list) { // Reserve queue space const u32 pos = cmd_queue.push_begin(static_cast(list.size())); // Write command tail in relaxed manner for (u32 i = 1; i < list.size(); i++) { cmd_queue[pos + i].raw() = list.begin()[i]; } // Write command head after all cmd_queue[pos] = *list.begin(); } void ppu_thread::cmd_pop(u32 count) { // Get current position const u32 pos = cmd_queue.peek(); // Clean command buffer for command tail for (u32 i = 1; i <= count; i++) { cmd_queue[pos + i].raw() = cmd64{}; } // Free cmd_queue.pop_end(count + 1); } cmd64 ppu_thread::cmd_wait() { while (true) { if (cmd64 result = cmd_queue[cmd_queue.peek()].exchange(cmd64{})) { return result; } if (is_stopped()) { return {}; } thread_ctrl::wait_on(cmd_notify, 0); cmd_notify = 0; } } be_t* ppu_thread::get_stack_arg(s32 i, u64 align) { if (align != 1 && align != 2 && align != 4 && align != 8 && align != 16) fmt::throw_exception("Unsupported alignment: 0x%llx", align); return vm::_ptr(vm::cast((gpr[1] + 0x30 + 0x8 * (i - 1)) & (0 - align))); } void ppu_thread::fast_call(u32 addr, u64 rtoc) { const auto old_cia = cia; const auto old_rtoc = gpr[2]; const auto old_lr = lr; const auto old_func = current_function; const auto old_fmt = g_tls_log_prefix; interrupt_thread_executing = true; cia = addr; gpr[2] = rtoc; lr = g_fxo->get().func_addr(1) + 4; // HLE stop address current_function = nullptr; if (std::exchange(loaded_from_savestate, false)) { lr = old_lr; } g_tls_log_prefix = [] { const auto _this = static_cast(get_current_cpu_thread()); static thread_local shared_ptr name_cache; if (!_this->ppu_tname.is_equal(name_cache)) [[unlikely]] { _this->ppu_tname.peek_op([&](const shared_ptr& ptr) { if (ptr != name_cache) { name_cache = ptr; } }); } const auto cia = _this->cia; if (_this->current_function && vm::read32(cia) != ppu_instructions::SC(0)) { return fmt::format("PPU[0x%x] Thread (%s) [HLE:0x%08x, LR:0x%08x]", _this->id, *name_cache.get(), cia, _this->lr); } extern const char* get_prx_name_by_cia(u32 addr); if (auto name = get_prx_name_by_cia(cia)) { return fmt::format("PPU[0x%x] Thread (%s) [%s: 0x%08x]", _this->id, *name_cache.get(), name, cia); } return fmt::format("PPU[0x%x] Thread (%s) [0x%08x]", _this->id, *name_cache.get(), cia); }; auto at_ret = [&]() { if (std::uncaught_exceptions()) { cpu_on_stop(); current_function = old_func; } else if (old_cia) { if (state & cpu_flag::again) { 
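// cpu_flag::again at this point most likely means a savestate was requested while this nested
// fast_call() frame (an HLE call) was still on the stack; serializing such a callstack is not
// supported, hence the error below.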
ppu_log.error("HLE callstack savestate is not implemented!"); } cia = old_cia; gpr[2] = old_rtoc; lr = old_lr; } current_function = old_func; g_tls_log_prefix = old_fmt; state -= cpu_flag::ret; }; exec_task(); at_ret(); } std::pair ppu_thread::stack_push(u32 size, u32 align_v) { if (auto cpu = get_current_cpu_thread()) { ppu_thread& context = static_cast(*cpu); const u32 old_pos = vm::cast(context.gpr[1]); context.gpr[1] -= size; // reserve the minimal possible room context.gpr[1] &= ~(u64{align_v} - 1); // fix stack alignment auto is_stack = [&](u64 addr) { return addr >= context.stack_addr && addr < context.stack_addr + context.stack_size; }; // TODO: This check does not care about custom stack memory if (is_stack(old_pos) != is_stack(context.gpr[1])) { fmt::throw_exception("Stack overflow (size=0x%x, align=0x%x, SP=0x%llx, stack=*0x%x)", size, align_v, old_pos, context.stack_addr); } else { const u32 addr = static_cast(context.gpr[1]); std::memset(vm::base(addr), 0, size); return {vm::cast(addr), old_pos - addr}; } } fmt::throw_exception("Invalid thread"); } void ppu_thread::stack_pop_verbose(u32 addr, u32 size) noexcept { if (auto cpu = get_current_cpu_thread()) { ppu_thread& context = static_cast(*cpu); if (context.gpr[1] != addr) { ppu_log.error("Stack inconsistency (addr=0x%x, SP=0x%llx, size=0x%x)", addr, context.gpr[1], size); return; } context.gpr[1] += size; return; } ppu_log.error("Invalid thread"); } extern ppu_intrp_func_t ppu_get_syscall(u64 code); void ppu_trap(ppu_thread& ppu, u64 addr) { ensure((addr & (~u64{0xffff'ffff} | 0x3)) == 0); ppu.cia = static_cast(addr); u32 add = static_cast(g_cfg.core.stub_ppu_traps) * 4; // If stubbing is enabled, check current instruction and the following if (!add || !vm::check_addr(ppu.cia, vm::page_executable) || !vm::check_addr(ppu.cia + add, vm::page_executable)) { fmt::throw_exception("PPU Trap! Sometimes tweaking the setting \"Stub PPU Traps\" can be a workaround to this crash.\nBest values depend on game code, if unsure try 1."); } ppu_log.error("PPU Trap: Stubbing %d instructions %s.", std::abs(static_cast(add) / 4), add >> 31 ? "backwards" : "forwards"); ppu.cia += add; // Skip instructions, hope for valid code (interpreter may be invoked temporarily) } static void ppu_error(ppu_thread& ppu, u64 addr, u32 /*op*/) { ppu.cia = ::narrow(addr); ppu_recompiler_fallback(ppu); } static void ppu_check(ppu_thread& ppu, u64 addr) { ppu.cia = ::narrow(addr); if (ppu.test_stopped()) { return; } } static void ppu_trace(u64 addr) { ppu_log.notice("Trace: 0x%llx", addr); } template static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr) { perf_meter<"LARX"_u32> perf0; // Do not allow stores accessed from the same cache line to move past the reservation load atomic_fence_seq_cst(); if (addr % sizeof(T)) { fmt::throw_exception("PPU %s: Unaligned address: 0x%08x", sizeof(T) == 4 ?
"LWARX" : "LDARX", addr); } // Always load aligned 64-bit value auto& data = vm::_ref>(addr & -8); const u64 size_off = (sizeof(T) * 8) & 63; const u64 data_off = (addr & 7) * 8; ppu.raddr = addr; u32 addr_mask = -1; if (const s32 max = g_cfg.core.ppu_128_reservations_loop_max_length) { // If we use it in HLE it means we want the accurate version ppu.use_full_rdata = max < 0 || ppu.current_function || [&]() { const u32 cia = ppu.cia; if ((cia & 0xffff) >= 0x10000u - max * 4) { // Do not cross 64k boundary return false; } const auto inst = vm::_ptr>(cia); // Search for STWCX or STDCX nearby (LDARX-STWCX and LWARX-STDCX loops will use accurate 128-byte reservations) constexpr u32 store_cond = stx::se_storage::swap(sizeof(T) == 8 ? 0x7C00012D : 0x7C0001AD); constexpr u32 mask = stx::se_storage::swap(0xFC0007FF); const auto store_vec = v128::from32p(store_cond); const auto mask_vec = v128::from32p(mask); s32 i = 2; for (const s32 _max = max - 3; i < _max; i += 4) { const auto _inst = v128::loadu(inst + i) & mask_vec; if (!gv_testz(gv_eq32(_inst, store_vec))) { return false; } } for (; i < max; i++) { const u32 val = inst[i] & mask; if (val == store_cond) { return false; } } return true; }(); if (ppu.use_full_rdata) { addr_mask = -128; } } else { ppu.use_full_rdata = false; } if ((addr & addr_mask) == (ppu.last_faddr & addr_mask)) { ppu_log.trace(u8"LARX after fail: addr=0x%x, faddr=0x%x, time=%u c", addr, ppu.last_faddr, (perf0.get() - ppu.last_ftsc)); } if ((addr & addr_mask) == (ppu.last_faddr & addr_mask) && (perf0.get() - ppu.last_ftsc) < 600 && (vm::reservation_acquire(addr) & -128) == ppu.last_ftime) { be_t rdata; std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8); if (rdata == data.load()) { ppu.rtime = ppu.last_ftime; ppu.raddr = ppu.last_faddr; ppu.last_ftime = 0; return static_cast(rdata << data_off >> size_off); } ppu.last_fail++; ppu.last_faddr = 0; } else { // Silent failure ppu.last_faddr = 0; } ppu.rtime = vm::reservation_acquire(addr) & -128; be_t rdata; if (!ppu.use_full_rdata) { rdata = data.load(); // Store only 64 bits of reservation data std::memcpy(&ppu.rdata[addr & 0x78], &rdata, 8); } else { mov_rdata(ppu.rdata, vm::_ref(addr & -128)); atomic_fence_acquire(); // Load relevant 64 bits of reservation data std::memcpy(&rdata, &ppu.rdata[addr & 0x78], 8); } return static_cast(rdata << data_off >> size_off); } extern u32 ppu_lwarx(ppu_thread& ppu, u32 addr) { return ppu_load_acquire_reservation(ppu, addr); } extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr) { return ppu_load_acquire_reservation(ppu, addr); } const auto ppu_stcx_accurate_tx = build_function_asm("ppu_stcx_accurate_tx", [](native_asm& c, auto& args) { using namespace asmjit; #if defined(ARCH_X64) Label fall = c.newLabel(); Label fail = c.newLabel(); Label _ret = c.newLabel(); Label load = c.newLabel(); //if (utils::has_avx() && !s_tsx_avx) //{ // c.vzeroupper(); //} // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers) c.push(x86::rbp); c.push(x86::r14); c.sub(x86::rsp, 40); #ifdef _WIN32 if (!s_tsx_avx) { c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6); c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7); } #endif // Prepare registers build_swap_rdx_with(c, args, x86::r10); c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast(&vm::g_sudo_addr))); c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); c.and_(x86::rbp, -128); c.prefetchw(x86::byte_ptr(x86::rbp, 0)); c.prefetchw(x86::byte_ptr(x86::rbp, 64)); c.movzx(args[0].r32(), args[0].r16()); c.shr(args[0].r32(), 1); c.lea(x86::r11, 
x86::qword_ptr(reinterpret_cast(+vm::g_reservations), args[0])); c.and_(x86::r11, -128 / 2); c.and_(args[0].r32(), 63); // Prepare data if (s_tsx_avx) { c.vmovups(x86::ymm0, x86::ymmword_ptr(args[2], 0)); c.vmovups(x86::ymm1, x86::ymmword_ptr(args[2], 32)); c.vmovups(x86::ymm2, x86::ymmword_ptr(args[2], 64)); c.vmovups(x86::ymm3, x86::ymmword_ptr(args[2], 96)); } else { c.movaps(x86::xmm0, x86::oword_ptr(args[2], 0)); c.movaps(x86::xmm1, x86::oword_ptr(args[2], 16)); c.movaps(x86::xmm2, x86::oword_ptr(args[2], 32)); c.movaps(x86::xmm3, x86::oword_ptr(args[2], 48)); c.movaps(x86::xmm4, x86::oword_ptr(args[2], 64)); c.movaps(x86::xmm5, x86::oword_ptr(args[2], 80)); c.movaps(x86::xmm6, x86::oword_ptr(args[2], 96)); c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112)); } // Alloc r14 to stamp0 const auto stamp0 = x86::r14; build_get_tsc(c, stamp0); Label fail2 = c.newLabel(); Label tx1 = build_transaction_enter(c, fall, [&]() { build_get_tsc(c); c.sub(x86::rax, stamp0); c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast(&g_rtm_tx_limit2))); c.jae(fall); }); // Check pause flag c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast(cpu_flag::pause)); c.jc(fall); c.xbegin(tx1); if (s_tsx_avx) { c.vxorps(x86::ymm0, x86::ymm0, x86::ymmword_ptr(x86::rbp, 0)); c.vxorps(x86::ymm1, x86::ymm1, x86::ymmword_ptr(x86::rbp, 32)); c.vxorps(x86::ymm2, x86::ymm2, x86::ymmword_ptr(x86::rbp, 64)); c.vxorps(x86::ymm3, x86::ymm3, x86::ymmword_ptr(x86::rbp, 96)); c.vorps(x86::ymm0, x86::ymm0, x86::ymm1); c.vorps(x86::ymm1, x86::ymm2, x86::ymm3); c.vorps(x86::ymm0, x86::ymm1, x86::ymm0); c.vptest(x86::ymm0, x86::ymm0); } else { c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0)); c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16)); c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32)); c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48)); c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64)); c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80)); c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96)); c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112)); c.orps(x86::xmm0, x86::xmm1); c.orps(x86::xmm2, x86::xmm3); c.orps(x86::xmm4, x86::xmm5); c.orps(x86::xmm6, x86::xmm7); c.orps(x86::xmm0, x86::xmm2); c.orps(x86::xmm4, x86::xmm6); c.orps(x86::xmm0, x86::xmm4); c.ptest(x86::xmm0, x86::xmm0); } c.jnz(fail); // Store 8 bytes c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]); c.xend(); c.lock().add(x86::qword_ptr(x86::r11), 64); build_get_tsc(c); c.sub(x86::rax, stamp0); c.jmp(_ret); // XABORT is expensive so try to finish with xend instead c.bind(fail); // Load old data to store back in rdata if (s_tsx_avx) { c.vmovaps(x86::ymm0, x86::ymmword_ptr(x86::rbp, 0)); c.vmovaps(x86::ymm1, x86::ymmword_ptr(x86::rbp, 32)); c.vmovaps(x86::ymm2, x86::ymmword_ptr(x86::rbp, 64)); c.vmovaps(x86::ymm3, x86::ymmword_ptr(x86::rbp, 96)); } else { c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0)); c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16)); c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32)); c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48)); c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64)); c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80)); c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96)); c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112)); } c.xend(); c.jmp(fail2); c.bind(fall); c.mov(x86::rax, -1); c.jmp(_ret); c.bind(fail2); c.lock().sub(x86::qword_ptr(x86::r11), 64); c.bind(load); // Store previous data back to rdata if (s_tsx_avx) { c.vmovaps(x86::ymmword_ptr(args[2], 0), x86::ymm0); 
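// args[2] points at ppu_thread::rdata (the caller passes ppu.rdata), so writing the freshly
// observed cache line back here lets the fast retry path in ppu_load_acquire_reservation compare
// against what is actually in memory on the next LARX.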
c.vmovaps(x86::ymmword_ptr(args[2], 32), x86::ymm1); c.vmovaps(x86::ymmword_ptr(args[2], 64), x86::ymm2); c.vmovaps(x86::ymmword_ptr(args[2], 96), x86::ymm3); } else { c.movaps(x86::oword_ptr(args[2], 0), x86::xmm0); c.movaps(x86::oword_ptr(args[2], 16), x86::xmm1); c.movaps(x86::oword_ptr(args[2], 32), x86::xmm2); c.movaps(x86::oword_ptr(args[2], 48), x86::xmm3); c.movaps(x86::oword_ptr(args[2], 64), x86::xmm4); c.movaps(x86::oword_ptr(args[2], 80), x86::xmm5); c.movaps(x86::oword_ptr(args[2], 96), x86::xmm6); c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7); } c.mov(x86::rax, -1); c.mov(x86::qword_ptr(args[2], ::offset32(&ppu_thread::last_ftime) - ::offset32(&ppu_thread::rdata)), x86::rax); c.xor_(x86::eax, x86::eax); //c.jmp(_ret); c.bind(_ret); #ifdef _WIN32 if (!s_tsx_avx) { c.vmovups(x86::xmm6, x86::oword_ptr(x86::rsp, 0)); c.vmovups(x86::xmm7, x86::oword_ptr(x86::rsp, 16)); } #endif if (s_tsx_avx) { c.vzeroupper(); } c.add(x86::rsp, 40); c.pop(x86::r14); c.pop(x86::rbp); maybe_flush_lbr(c); c.ret(); #else // Unimplemented should fail. c.brk(Imm(0x42)); c.ret(a64::x30); #endif }); template static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) { perf_meter<"STCX"_u32> perf0; if (addr % sizeof(T)) { fmt::throw_exception("PPU %s: Unaligned address: 0x%08x", sizeof(T) == 4 ? "STWCX" : "STDCX", addr); } auto& data = vm::_ref>(addr & -8); auto& res = vm::reservation_acquire(addr); const u64 rtime = ppu.rtime; be_t old_data = 0; std::memcpy(&old_data, &ppu.rdata[addr & 0x78], sizeof(old_data)); be_t new_data = old_data; if constexpr (sizeof(T) == sizeof(u32)) { // Rebuild reg_value to be 32-bits of new data and 32-bits of old data const be_t reg32 = static_cast(reg_value); std::memcpy(reinterpret_cast(&new_data) + (addr & 4), ®32, sizeof(u32)); } else { new_data = reg_value; } // Test if store address is on the same aligned 8-bytes memory as load if (const u32 raddr = std::exchange(ppu.raddr, 0); raddr / 8 != addr / 8) { // If not and it is on the same aligned 128-byte memory, proceed only if 128-byte reservations are enabled // In realhw the store address can be at any address of the 128-byte cache line if (raddr / 128 != addr / 128 || !ppu.use_full_rdata) { // Even when the reservation address does not match the target address must be valid if (!vm::check_addr(addr, vm::page_writable)) { // Access violate data += 0; } return false; } } if (old_data != data || rtime != (res & -128)) { return false; } if ([&]() { if (ppu.use_full_rdata) [[unlikely]] { auto [_oldd, _ok] = res.fetch_op([&](u64& r) { if ((r & -128) != rtime || (r & 127)) { return false; } r += vm::rsrv_unique_lock; return true; }); if (!_ok) { // Already locked or updated: give up return false; } if (g_use_rtm) [[likely]] { switch (u64 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast(new_data))) { case umax: { auto& all_data = *vm::get_super_ptr(addr & -128); auto& sdata = *vm::get_super_ptr>(addr & -8); const bool ok = cpu_thread::suspend_all<+3>(&ppu, {all_data, all_data + 64, &res}, [&] { if ((res & -128) == rtime && cmp_rdata(ppu.rdata, all_data)) { sdata.release(new_data); res += 64; return true; } mov_rdata_nt(ppu.rdata, all_data); res -= 64; return false; }); if (ok) { break; } ppu.last_ftime = -1; [[fallthrough]]; } case 0: { if (ppu.last_faddr == addr) { ppu.last_fail++; } if (ppu.last_ftime != umax) { ppu.last_faddr = 0; return false; } utils::prefetch_read(ppu.rdata); utils::prefetch_read(ppu.rdata + 64); ppu.last_faddr = addr; ppu.last_ftime = res.load() & -128; 
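// This bookkeeping (last_faddr / last_ftime / last_ftsc) feeds the fast-fail path in
// ppu_load_acquire_reservation: a LARX hitting the same address shortly afterwards can reuse the
// recorded reservation data instead of hammering the contended cache line again.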
				default:
				{
					if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
					{
						perf_log.warning(u8"STCX: took too long: %.3fµs (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
					}

					break;
				}
				}

				if (ppu.last_faddr == addr)
				{
					ppu.last_succ++;
				}

				ppu.last_faddr = 0;
				return true;
			}

			// Align address: we do not need the lower 7 bits anymore
			addr &= -128;

			// Cache line data
			//auto& cline_data = vm::_ref<spu_rdata_t>(addr);

			data += 0;
			rsx::reservation_lock rsx_lock(addr, 128);

			auto& super_data = *vm::get_super_ptr<spu_rdata_t>(addr);
			const bool success = [&]()
			{
				// Full lock (heavyweight)
				// TODO: vm::check_addr
				vm::writer_lock lock(addr);

				if (cmp_rdata(ppu.rdata, super_data))
				{
					data.release(new_data);
					res += 64;
					return true;
				}

				res -= 64;
				return false;
			}();

			return success;
		}

		if (new_data == old_data)
		{
			ppu.last_faddr = 0;
			return res.compare_and_swap_test(rtime, rtime + 128);
		}

		// Aligned 8-byte reservations will be used here
		addr &= -8;

		const u64 lock_bits = vm::rsrv_unique_lock;

		auto [_oldd, _ok] = res.fetch_op([&](u64& r)
		{
			if ((r & -128) != rtime || (r & 127))
			{
				return false;
			}

			r += lock_bits;
			return true;
		});

		// Give up if reservation has been locked or updated
		if (!_ok)
		{
			ppu.last_faddr = 0;
			return false;
		}

		// Store previous value in old_data on failure
		if (data.compare_exchange(old_data, new_data))
		{
			res += 128 - lock_bits;
			return true;
		}

		const u64 old_rtime = res.fetch_sub(lock_bits);

		// TODO: disabled with this setting on, since it's dangerous to mix
		if (!g_cfg.core.ppu_128_reservations_loop_max_length)
		{
			// Store old_data on failure
			if (ppu.last_faddr == addr)
			{
				ppu.last_fail++;
			}

			ppu.last_faddr = addr;
			ppu.last_ftime = old_rtime & -128;
			ppu.last_ftsc = utils::get_tsc();
			std::memcpy(&ppu.rdata[addr & 0x78], &old_data, 8);
		}

		return false;
	}())
	{
		// Test a common pattern in lwmutex
		extern atomic_t<u32> liblv2_begin, liblv2_end;

		if (ppu.cia < liblv2_begin || ppu.cia >= liblv2_end)
		{
			res.notify_all(-128);
		}

		if (addr == ppu.last_faddr)
		{
			ppu.last_succ++;
		}

		ppu.last_faddr = 0;
		return true;
	}

	return false;
}

extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
{
	return ppu_store_reservation<u32>(ppu, addr, reg_value);
}

extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
{
	return ppu_store_reservation<u64>(ppu, addr, reg_value);
}

#ifdef LLVM_AVAILABLE
namespace
{
	// Compiled PPU module info
	struct jit_module
	{
		std::vector<ppu_intrp_func_t> funcs;
		std::shared_ptr<jit_compiler> pjit;
		bool init = false;
	};

	struct jit_module_manager
	{
		shared_mutex mutex;
		std::unordered_map<std::string, jit_module> map;

		jit_module& get(const std::string& name)
		{
			std::lock_guard lock(mutex);
			return map.emplace(name, jit_module{}).first->second;
		}

		void remove(const std::string& name) noexcept
		{
			std::lock_guard lock(mutex);

			const auto found = map.find(name);

			if (found == map.end()) [[unlikely]]
			{
				ppu_log.error("Failed to remove module %s", name);
				return;
			}

			map.erase(found);
		}
	};
}
#endif

namespace
{
	// Read-only file view starting with specified offset (for MSELF)
	struct file_view : fs::file_base
	{
		const fs::file m_file;
		const u64 m_off;
		u64 m_pos;

		explicit file_view(fs::file&& _file, u64 offset)
			: m_file(std::move(_file))
			, m_off(offset)
			, m_pos(0)
		{
		}

		~file_view() override
		{
		}

		fs::stat_t stat() override
		{
			return m_file.stat();
		}

		bool trunc(u64) override
		{
			return false;
		}

		u64 read(void* buffer, u64 size) override
		{
			const u64 old_pos = m_file.pos();
			m_file.seek(m_off + m_pos);
			const u64 result = m_file.read(buffer, size);
			ensure(old_pos == m_file.seek(old_pos));

			m_pos += result;
			return result;
		}

		u64 write(const void*, u64) override
		{
			return 0;
		}
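		// Note: read() above temporarily repositions the underlying file and then restores its
		// previous position; seek() below tracks an m_off-relative position, while size() (and
		// therefore seek_end) reports the size of the whole underlying container rather than the
		// size of the window.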
		u64 seek(s64 offset, fs::seek_mode whence) override
		{
			const s64 new_pos =
				whence == fs::seek_set ? offset :
				whence == fs::seek_cur ? offset + m_pos :
				whence == fs::seek_end ? offset + size() : -1;

			if (new_pos < 0)
			{
				fs::g_tls_error = fs::error::inval;
				return -1;
			}

			m_pos = new_pos;
			return m_pos;
		}

		u64 size() override
		{
			return m_file.size();
		}
	};
}

extern fs::file make_file_view(fs::file&& _file, u64 offset)
{
	fs::file file;
	file.reset(std::make_unique<file_view>(std::move(_file), offset));
	return file;
}

extern void ppu_finalize(const ppu_module& info)
{
	// Get cache path for this executable
	std::string cache_path;

	if (info.name.empty())
	{
		// Don't remove main module from memory
		return;
	}
	else
	{
		// Get PPU cache location
		cache_path = fs::get_cache_dir() + "cache/";

		const std::string dev_flash = vfs::get("/dev_flash/sys/");

		if (info.path.starts_with(dev_flash) || Emu.GetCat() == "1P")
		{
			// Don't remove dev_flash prx from memory
			return;
		}
		else if (!Emu.GetTitleID().empty())
		{
			cache_path += Emu.GetTitleID();
			cache_path += '/';
		}

		// Add PPU hash and filename
		fmt::append(cache_path, "ppu-%s-%s/", fmt::base57(info.sha1), info.path.substr(info.path.find_last_of('/') + 1));
	}

#ifdef LLVM_AVAILABLE
	g_fxo->get<jit_module_manager>().remove(cache_path + info.name);
#endif
}

extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_module*>* loaded_modules)
{
	if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm)
	{
		return;
	}

	if (auto dis = g_fxo->try_get(); dis && dis->disable)
	{
		return;
	}

	// Make sure we only have one '/' at the end and remove duplicates.
	for (std::string& dir : dir_queue)
	{
		while (dir.back() == '/' || dir.back() == '\\')
			dir.pop_back();
		dir += '/';
	}

	std::sort(dir_queue.begin(), dir_queue.end());
	dir_queue.erase(std::unique(dir_queue.begin(), dir_queue.end()), dir_queue.end());

	const std::string firmware_sprx_path = vfs::get("/dev_flash/sys/external/");

	// Map fixed address executables area, fake overlay support
	const bool had_ovl = !vm::map(0x3000'0000, 0x1000'0000, 0x202).operator bool();
	const u32 ppc_seg = std::exchange(g_ps3_process_info.ppc_seg, 0x3);

	std::vector<std::pair<std::string, u64>> file_queue;
	file_queue.reserve(2000);

	// Find all .sprx files recursively
	for (usz i = 0; i < dir_queue.size(); i++)
	{
		if (Emu.IsStopped())
		{
			file_queue.clear();
			break;
		}

		ppu_log.notice("Scanning directory: %s", dir_queue[i]);

		for (auto&& entry : fs::dir(dir_queue[i]))
		{
			if (Emu.IsStopped())
			{
				file_queue.clear();
				break;
			}

			if (entry.is_directory)
			{
				if (entry.name != "." && entry.name != "..")
				{
					dir_queue.emplace_back(dir_queue[i] + entry.name + '/');
				}

				continue;
			}
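			// Note on the checks below: firmware modules under /dev_flash/sys/external/ are skipped
			// when they are already loaded or when the HLE configuration (libraries_control or
			// g_prx_list) indicates they will not be LLE'd, so only modules that could actually be
			// executed are queued for precompilation.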
&& entry.name != "..") { dir_queue.emplace_back(dir_queue[i] + entry.name + '/'); } continue; } std::string upper = fmt::to_upper(entry.name); // Skip already loaded modules or HLEd ones auto is_ignored = [&](s64 /*offset*/) -> bool { if (dir_queue[i] != firmware_sprx_path) { return false; } if (loaded_modules) { if (std::any_of(loaded_modules->begin(), loaded_modules->end(), [&](ppu_module* obj) { return obj->name == entry.name; })) { return true; } } if (g_cfg.core.libraries_control.get_set().count(entry.name + ":lle")) { // Force LLE return false; } else if (g_cfg.core.libraries_control.get_set().count(entry.name + ":hle")) { // Force HLE return true; } extern const std::map g_prx_list; // Use list return g_prx_list.count(entry.name) && ::at32(g_prx_list, entry.name) != 0; }; // Check .sprx filename if (upper.ends_with(".SPRX") && entry.name != "libfs_utility_init.sprx"sv) { if (is_ignored(0)) { continue; } // Get full path file_queue.emplace_back(dir_queue[i] + entry.name, 0); continue; } // Check .self filename if (upper.ends_with(".SELF")) { // Get full path file_queue.emplace_back(dir_queue[i] + entry.name, 0); continue; } // Check .mself filename if (upper.ends_with(".MSELF")) { if (fs::file mself{dir_queue[i] + entry.name}) { mself_header hdr{}; if (mself.read(hdr) && hdr.get_count(mself.size())) { for (u32 j = 0; j < hdr.count; j++) { mself_record rec{}; if (mself.read(rec) && rec.get_pos(mself.size())) { std::string name = rec.name; upper = fmt::to_upper(name); if (upper.ends_with(".SPRX")) { // .sprx inside .mself found file_queue.emplace_back(dir_queue[i] + entry.name, rec.off); continue; } if (upper.ends_with(".SELF")) { // .self inside .mself found file_queue.emplace_back(dir_queue[i] + entry.name, rec.off); continue; } } else { ppu_log.error("MSELF file is possibly truncated"); break; } } } } } } } g_progr_ftotal += file_queue.size(); scoped_progress_dialog progr = "Compiling PPU modules..."; atomic_t fnext = 0; shared_mutex sprx_mtx, ovl_mtx; named_thread_group workers("SPRX Worker ", std::min(utils::get_thread_count(), ::size32(file_queue)), [&] { #ifdef __APPLE__ pthread_jit_write_protect_np(false); #endif // Set low priority thread_ctrl::scoped_priority low_prio(-1); for (usz func_i = fnext++; func_i < file_queue.size(); func_i = fnext++, g_progr_fdone++) { if (Emu.IsStopped()) { continue; } auto [path, offset] = std::as_const(file_queue)[func_i]; ppu_log.notice("Trying to load: %s", path); // Load MSELF, SPRX or SELF fs::file src{path}; if (!src) { ppu_log.error("Failed to open '%s' (%s)", path, fs::g_tls_error); continue; } if (u64 off = offset) { // Adjust offset for MSELF src.reset(std::make_unique(std::move(src), off)); // Adjust path for MSELF too fmt::append(path, "_x%x", off); } // Some files may fail to decrypt due to the lack of klic src = decrypt_self(std::move(src)); if (!src) { ppu_log.notice("Failed to decrypt '%s'", path); continue; } elf_error prx_err{}, ovl_err{}; if (ppu_prx_object obj = src; (prx_err = obj, obj == elf_error::ok)) { std::unique_lock lock(sprx_mtx); if (auto prx = ppu_load_prx(obj, path, offset)) { lock.unlock(); obj.clear(), src.close(); // Clear decrypted file and elf object memory ppu_initialize(*prx); idm::remove(idm::last_id()); lock.lock(); ppu_unload_prx(*prx); lock.unlock(); ppu_finalize(*prx); continue; } // Log error prx_err = elf_error::header_type; } if (ppu_exec_object obj = src; (ovl_err = obj, obj == elf_error::ok)) { while (ovl_err == elf_error::ok) { // Only one thread compiles OVL atm, other can compile PRX 
					std::unique_lock lock(ovl_mtx);

					auto [ovlm, error] = ppu_load_overlay(obj, path, offset);

					if (error)
					{
						// Abort
						ovl_err = elf_error::header_type;
						break;
					}

					obj.clear(), src.close(); // Clear decrypted file and elf object memory

					ppu_initialize(*ovlm);

					for (auto& seg : ovlm->segs)
					{
						vm::dealloc(seg.addr);
					}

					lock.unlock();
					idm::remove<lv2_obj, lv2_overlay>(idm::last_id());
					ppu_finalize(*ovlm);
					break;
				}

				if (ovl_err == elf_error::ok)
				{
					continue;
				}
			}

			ppu_log.notice("Failed to precompile '%s' (prx: %s, ovl: %s)", path, prx_err, ovl_err);
			continue;
		}
	});

	// Join every thread
	workers.join();

	// Revert changes
	if (!had_ovl)
	{
		ensure(vm::unmap(0x3000'0000).second);
	}

	g_ps3_process_info.ppc_seg = ppc_seg;
}

extern void ppu_initialize()
{
	if (!g_fxo->is_init<main_ppu_module>())
	{
		return;
	}

	if (Emu.IsStopped())
	{
		return;
	}

	auto& _main = g_fxo->get<main_ppu_module>();

	scoped_progress_dialog progr = "Scanning PPU modules...";

	bool compile_main = false;

	// Check main module cache
	if (!_main.segs.empty())
	{
		compile_main = ppu_initialize(_main, true);
	}

	std::vector<ppu_module*> module_list;

	const std::string firmware_sprx_path = vfs::get("/dev_flash/sys/external/");

	// If empty we have no indication for firmware cache state, check everything
	bool compile_fw = true;

	idm::select<lv2_obj, lv2_prx>([&](u32, lv2_prx& _module)
	{
		if (_module.path.starts_with(firmware_sprx_path))
		{
			// Postpone testing
			compile_fw = false;
		}

		module_list.emplace_back(&_module);
	});

	idm::select<lv2_obj, lv2_overlay>([&](u32, lv2_overlay& _module)
	{
		module_list.emplace_back(&_module);
	});

	// Check preloaded libraries cache
	if (!compile_fw)
	{
		for (auto ptr : module_list)
		{
			if (ptr->path.starts_with(firmware_sprx_path))
			{
				compile_fw |= ppu_initialize(*ptr, true);
			}
		}
	}

	std::vector<std::string> dir_queue;

	const std::string mount_point = vfs::get("/dev_flash/");

	bool dev_flash_located = Emu.GetCat().back() != 'P' && Emu.IsPathInsideDir(Emu.GetBoot(), mount_point);

	if (compile_fw || dev_flash_located)
	{
		if (dev_flash_located)
		{
			const std::string eseibrd = mount_point + "/vsh/module/eseibrd.sprx";

			if (auto prx = ppu_load_prx(ppu_prx_object{decrypt_self(fs::file{eseibrd})}, eseibrd, 0))
			{
				// Check if cache exists for this infinitesimally small prx
				dev_flash_located = ppu_initialize(*prx, true);
				idm::remove<lv2_obj, lv2_prx>(idm::last_id());
				ppu_unload_prx(*prx);
			}
		}

		const std::string firmware_sprx_path = vfs::get(dev_flash_located ? "/dev_flash/"sv : "/dev_flash/sys/"sv);
"/dev_flash/"sv : "/dev_flash/sys/"sv); dir_queue.emplace_back(firmware_sprx_path); } // Avoid compilation if main's cache exists or it is a standalone SELF with no PARAM.SFO if (compile_main && g_cfg.core.ppu_llvm_precompilation && !Emu.GetTitleID().empty()) { // Try to add all related directories const std::set dirs = Emu.GetGameDirs(); dir_queue.insert(std::end(dir_queue), std::begin(dirs), std::end(dirs)); } ppu_precompile(dir_queue, &module_list); if (Emu.IsStopped()) { return; } // Initialize main module cache if (!_main.segs.empty()) { ppu_initialize(_main); } // Initialize preloaded libraries for (auto ptr : module_list) { if (Emu.IsStopped()) { return; } ppu_initialize(*ptr); } } struct ppu_toc_manager { std::unordered_map toc_map; shared_mutex mutex; }; bool ppu_initialize(const ppu_module& info, bool check_only) { if (g_cfg.core.ppu_decoder != ppu_decoder_type::llvm) { if (check_only) { return false; } // Temporarily s_ppu_toc = &g_fxo->get().toc_map; for (const auto& func : info.funcs) { for (auto& block : func.blocks) { ppu_register_function_at(block.first, block.second); } if (g_cfg.core.ppu_debug && func.size && func.toc != umax) { s_ppu_toc->emplace(func.addr, func.toc); ppu_ref(func.addr) = &ppu_check_toc; } } return false; } // Link table static const std::unordered_map s_link_table = []() { std::unordered_map link_table { { "sys_game_watchdog_start", reinterpret_cast(ppu_execute_syscall) }, { "sys_game_watchdog_stop", reinterpret_cast(ppu_execute_syscall) }, { "sys_game_watchdog_clear", reinterpret_cast(ppu_execute_syscall) }, { "sys_game_get_system_sw_version", reinterpret_cast(ppu_execute_syscall) }, { "sys_game_board_storage_read", reinterpret_cast(ppu_execute_syscall) }, { "__trap", reinterpret_cast(&ppu_trap) }, { "__error", reinterpret_cast(&ppu_error) }, { "__check", reinterpret_cast(&ppu_check) }, { "__trace", reinterpret_cast(&ppu_trace) }, { "__syscall", reinterpret_cast(ppu_execute_syscall) }, { "__get_tb", reinterpret_cast(get_timebased_time) }, { "__lwarx", reinterpret_cast(ppu_lwarx) }, { "__ldarx", reinterpret_cast(ppu_ldarx) }, { "__stwcx", reinterpret_cast(ppu_stwcx) }, { "__stdcx", reinterpret_cast(ppu_stdcx) }, { "__dcbz", reinterpret_cast(+[](u32 addr){ alignas(64) static constexpr u8 z[128]{}; do_cell_atomic_128_store(addr, z); }) }, { "__resupdate", reinterpret_cast(vm::reservation_update) }, { "__resinterp", reinterpret_cast(ppu_reservation_fallback) }, }; for (u64 index = 0; index < 1024; index++) { if (ppu_get_syscall(index)) { link_table.emplace(fmt::format("%s", ppu_syscall_code(index)), reinterpret_cast(ppu_execute_syscall)); link_table.emplace(fmt::format("syscall_%u", index), reinterpret_cast(ppu_execute_syscall)); } } return link_table; }(); // Get cache path for this executable std::string cache_path; if (info.name.empty()) { cache_path = info.cache; } else { // New PPU cache location cache_path = fs::get_cache_dir() + "cache/"; const std::string dev_flash = vfs::get("/dev_flash/"); if (!info.path.starts_with(dev_flash) && !Emu.GetTitleID().empty() && Emu.GetCat() != "1P") { // Add prefix for anything except dev_flash files, standalone elfs or PS1 classics cache_path += Emu.GetTitleID(); cache_path += '/'; } // Add PPU hash and filename fmt::append(cache_path, "ppu-%s-%s/", fmt::base57(info.sha1), info.path.substr(info.path.find_last_of('/') + 1)); if (!fs::create_path(cache_path)) { fmt::throw_exception("Failed to create cache directory: %s (%s)", cache_path, fs::g_tls_error); } } #ifdef LLVM_AVAILABLE std::optional progr; if 
	struct jit_core_allocator
	{
		const s32 thread_count = g_cfg.core.llvm_threads ? std::min(g_cfg.core.llvm_threads, limit()) : limit();

		// Initialize global semaphore with the max number of threads
		::semaphore<0x7fffffff> sem{std::max(thread_count, 1)};

		static s32 limit()
		{
			return static_cast<s32>(utils::get_thread_count());
		}
	};

	// Permanently loaded compiled PPU modules (name -> data)
	jit_module& jit_mod = g_fxo->get<jit_module_manager>().get(cache_path + info.name);

	// Compiler instance (deferred initialization)
	std::shared_ptr<jit_compiler>& jit = jit_mod.pjit;

	// Split module into fragments <= 1 MiB
	usz fpos = 0;

	// Difference between function name and current location
	const u32 reloc = info.relocs.empty() ? 0 : ::at32(info.segs, 0).addr;

	// Info sent to threads
	std::vector<std::pair<std::string, ppu_module>> workload;

	// Info to load to main JIT instance (true - compiled)
	std::vector<std::pair<std::string, bool>> link_workload;

	// Sync variable to acquire workloads
	atomic_t<u32> work_cv = 0;

	bool compiled_new = false;

	bool has_mfvscr = false;

	for (auto& func : info.funcs)
	{
		if (func.size == 0)
		{
			continue;
		}

		for (const auto& [addr, size] : func.blocks)
		{
			if (size == 0)
			{
				continue;
			}

			for (u32 i = addr; i < addr + size; i += 4)
			{
				if (g_ppu_itype.decode(vm::read32(i)) == ppu_itype::MFVSCR)
				{
					ppu_log.warning("MFVSCR found");
					has_mfvscr = true;
					break;
				}
			}

			if (has_mfvscr)
			{
				break;
			}
		}

		if (has_mfvscr)
		{
			break;
		}
	}

	while (!jit_mod.init && fpos < info.funcs.size())
	{
		// Initialize compiler instance
		if (!jit && get_current_cpu_thread())
		{
			jit = std::make_shared<jit_compiler>(s_link_table, g_cfg.core.llvm_cpu);
		}

		// Copy module information (TODO: optimize)
		ppu_module part;
		part.copy_part(info);
		part.funcs.reserve(16000);

		// Overall block size in bytes
		usz bsize = 0;
		usz bcount = 0;

		while (fpos < info.funcs.size())
		{
			auto& func = info.funcs[fpos];

			if (!func.size)
			{
				fpos++;
				continue;
			}

			if (bsize + func.size > 100 * 1024 && bsize)
			{
				if (bcount >= 1000)
				{
					break;
				}
			}
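			// Note: functions patched with a far jump are skipped here; their "__0x%x" symbol is
			// mapped directly to the generated stub, so they never receive a compiled body of
			// their own.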
			if (jit)
			{
				const auto far_jump = ppu_get_far_jump(func.addr) ? g_fxo->get().gen_jump(func.addr) : nullptr;

				if (far_jump)
				{
					// Replace the function with ppu_far_jump
					jit->update_global_mapping(fmt::format("__0x%x", func.addr - reloc), reinterpret_cast<u64>(far_jump));
					fpos++;
					continue;
				}
			}

			// Copy block or function entry
			ppu_function& entry = part.funcs.emplace_back(func);

			// Fixup some information
			entry.name = fmt::format("__0x%x", entry.addr - reloc);

			if (has_mfvscr && g_cfg.core.ppu_set_sat_bit)
			{
				// TODO
				entry.attr += ppu_attr::has_mfvscr;
			}

			if (entry.blocks.empty())
			{
				entry.blocks.emplace(func.addr, func.size);
			}

			bsize += func.size;

			fpos++;
			bcount++;
		}

		// Compute module hash to generate (hopefully) unique object name
		std::string obj_name;
		{
			sha1_context ctx;
			u8 output[20];
			sha1_starts(&ctx);

			int has_dcbz = !!g_cfg.core.accurate_cache_line_stores;

			for (const auto& func : part.funcs)
			{
				if (func.size == 0)
				{
					continue;
				}

				const be_t<u32> addr = func.addr - reloc;
				const be_t<u32> size = func.size;
				sha1_update(&ctx, reinterpret_cast<const u8*>(&addr), sizeof(addr));
				sha1_update(&ctx, reinterpret_cast<const u8*>(&size), sizeof(size));

				for (const auto& block : func.blocks)
				{
					if (block.second == 0 || reloc)
					{
						continue;
					}

					// Find relevant relocations
					auto low = std::lower_bound(part.relocs.cbegin(), part.relocs.cend(), block.first);
					auto high = std::lower_bound(low, part.relocs.cend(), block.first + block.second);
					auto addr = block.first;

					for (; low != high; ++low)
					{
						// Aligned relocation address
						const u32 roff = low->addr & ~3;

						if (roff > addr)
						{
							// Hash from addr to the beginning of the relocation
							sha1_update(&ctx, vm::_ptr<const u8>(addr), roff - addr);
						}

						// Hash relocation type instead
						const be_t<u32> type = low->type;
						sha1_update(&ctx, reinterpret_cast<const u8*>(&type), sizeof(type));

						// Set the next addr
						addr = roff + 4;
					}

					if (has_dcbz == 1)
					{
						for (u32 i = addr, end = block.second + block.first - 1; i <= end; i += 4)
						{
							if (g_ppu_itype.decode(vm::read32(i)) == ppu_itype::DCBZ)
							{
								has_dcbz = 2;
								break;
							}
						}
					}

					// Hash from addr to the end of the block
					sha1_update(&ctx, vm::_ptr<const u8>(addr), block.second - (addr - block.first));
				}

				if (reloc)
				{
					continue;
				}

				if (has_dcbz == 1)
				{
					for (u32 i = func.addr, end = func.addr + func.size - 1; i <= end; i += 4)
					{
						if (g_ppu_itype.decode(vm::read32(i)) == ppu_itype::DCBZ)
						{
							has_dcbz = 2;
							break;
						}
					}
				}

				sha1_update(&ctx, vm::_ptr<const u8>(func.addr), func.size);
			}

			if (false)
			{
				const be_t forced_upd = 3;
				sha1_update(&ctx, reinterpret_cast<const u8*>(&forced_upd), sizeof(forced_upd));
			}

			sha1_finish(&ctx, output);

			// Settings: should be populated by settings which affect codegen (TODO)
			enum class ppu_settings : u32
			{
				non_win32,
				accurate_dfma,
				fixup_vnan,
				fixup_nj_denormals,
				accurate_cache_line_stores,
				reservations_128_byte,
				greedy_mode,
				accurate_sat,
				accurate_fpcc,
				accurate_vnan,
				accurate_nj_mode,

				__bitset_enum_max
			};

			be_t<bs_t<ppu_settings>> settings{};

#ifndef _WIN32
			settings += ppu_settings::non_win32;
#endif
			if (g_cfg.core.use_accurate_dfma) settings += ppu_settings::accurate_dfma;
			if (g_cfg.core.ppu_fix_vnan) settings += ppu_settings::fixup_vnan;
			if (g_cfg.core.ppu_llvm_nj_fixup) settings += ppu_settings::fixup_nj_denormals;
			if (has_dcbz == 2) settings += ppu_settings::accurate_cache_line_stores;
			if (g_cfg.core.ppu_128_reservations_loop_max_length) settings += ppu_settings::reservations_128_byte;
			if (g_cfg.core.ppu_llvm_greedy_mode) settings += ppu_settings::greedy_mode;
			if (has_mfvscr && g_cfg.core.ppu_set_sat_bit) settings += ppu_settings::accurate_sat;
			if (g_cfg.core.ppu_set_fpcc) settings += ppu_settings::accurate_fpcc, fmt::throw_exception("FPCC Not implemented");
			if (g_cfg.core.ppu_set_vnan) settings += ppu_settings::accurate_vnan, settings -= ppu_settings::fixup_vnan, fmt::throw_exception("VNAN Not implemented");
			if (g_cfg.core.ppu_use_nj_bit) settings += ppu_settings::accurate_nj_mode, settings -= ppu_settings::fixup_nj_denormals, fmt::throw_exception("NJ Not implemented");

			// Write version, hash, CPU, settings
			fmt::append(obj_name, "v5-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
		}

		if (Emu.IsStopped())
		{
			break;
		}

		if (!check_only)
		{
			// Update progress dialog
			g_progr_ptotal++;

			link_workload.emplace_back(obj_name, false);
		}

		// Check object file
		if (jit_compiler::check(cache_path + obj_name))
		{
			if (!jit && !check_only)
			{
				ppu_log.success("LLVM: Module exists: %s", obj_name);

				// Update progress dialog
				g_progr_pdone++;
			}

			continue;
		}

		if (check_only)
		{
			return true;
		}

		// Remember, used in ppu_initialize(void)
		compiled_new = true;

		// Adjust information (is_compiled)
		link_workload.back().second = true;

		// Fill workload list for compilation
		workload.emplace_back(std::move(obj_name), std::move(part));
	}

	if (check_only)
	{
		return false;
	}

	if (!workload.empty())
	{
		g_progr = "Compiling PPU modules...";
	}

	// Create worker threads for compilation (TODO: how many threads)
	{
		u32 thread_count = rpcs3::utils::get_max_threads();

		if (workload.size() < thread_count)
		{
			thread_count = ::size32(workload);
		}

		struct thread_index_allocator
		{
			atomic_t<u64> index = 0;
		};

		// Prevent watchdog thread from terminating
		g_watchdog_hold_ctr++;

		named_thread_group threads(fmt::format("PPUW.%u.", ++g_fxo->get<thread_index_allocator>().index), thread_count, [&]()
		{
			// Set low priority
			thread_ctrl::scoped_priority low_prio(-1);

#ifdef __APPLE__
			pthread_jit_write_protect_np(false);
#endif
			for (u32 i = work_cv++; i < workload.size(); i = work_cv++, g_progr_pdone++)
			{
				if (Emu.IsStopped())
				{
					continue;
				}

				// Keep allocating workload
				const auto& [obj_name, part] = std::as_const(workload)[i];

				// Allocate "core"
				std::lock_guard jlock(g_fxo->get<jit_core_allocator>().sem);

				if (Emu.IsStopped())
				{
					continue;
				}

				ppu_log.warning("LLVM: Compiling module %s%s", cache_path, obj_name);

				// Use another JIT instance
				jit_compiler jit2({}, g_cfg.core.llvm_cpu, 0x1);
				ppu_initialize2(jit2, part, cache_path, obj_name);

				ppu_log.success("LLVM: Compiled module %s", obj_name);
			}
		});

		threads.join();

		g_watchdog_hold_ctr--;

		if (Emu.IsStopped() || !get_current_cpu_thread())
		{
			return compiled_new;
		}

		if (workload.size() < link_workload.size())
		{
			// Only show this message if this task is relevant
			g_progr = "Linking PPU modules...";
		}

		for (auto [obj_name, is_compiled] : link_workload)
		{
			if (Emu.IsStopped())
			{
				break;
			}

			jit->add(cache_path + obj_name);

			if (!is_compiled)
			{
				ppu_log.success("LLVM: Loaded module %s", obj_name);
				g_progr_pdone++;
			}
		}
	}

	if (Emu.IsStopped() || !get_current_cpu_thread())
	{
		return compiled_new;
	}
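	// Note: the code below installs the compiled (or cache-loaded) functions into the PPU
	// dispatch table; every function was emitted under the name "__0x%x" (its address minus the
	// segment relocation), which is how jit->get() locates it again here.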
	// Jit can be null if the loop doesn't ever enter.
#ifdef __APPLE__
	pthread_jit_write_protect_np(false);
#endif
	if (jit && !jit_mod.init)
	{
		jit->fin();

		// Get and install function addresses
		for (const auto& func : info.funcs)
		{
			if (!func.size)
				continue;

			const auto name = fmt::format("__0x%x", func.addr - reloc);
			const auto addr = ensure(reinterpret_cast<ppu_intrp_func_t>(jit->get(name)));
			jit_mod.funcs.emplace_back(addr);

			ppu_register_function_at(func.addr, 4, addr);

			if (g_cfg.core.ppu_debug)
				ppu_log.notice("Installing function %s at 0x%x: %p (reloc = 0x%x)", name, func.addr, ppu_ref(func.addr), reloc);
		}

		jit_mod.init = true;
	}
	else
	{
		usz index = 0;

		// Locate existing functions
		for (const auto& func : info.funcs)
		{
			if (!func.size)
				continue;

			const u64 addr = reinterpret_cast<u64>(ensure(jit_mod.funcs[index++]));

			ppu_register_function_at(func.addr, 4, addr);

			if (g_cfg.core.ppu_debug)
				ppu_log.notice("Reinstalling function at 0x%x: %p (reloc=0x%x)", func.addr, ppu_ref(func.addr), reloc);
		}

		index = 0;
	}

	return compiled_new;
#else
	fmt::throw_exception("LLVM is not available in this build.");
#endif
}

static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, const std::string& cache_path, const std::string& obj_name)
{
#ifdef LLVM_AVAILABLE
	using namespace llvm;

	// Create LLVM module
	std::unique_ptr<Module> _module = std::make_unique<Module>(obj_name, jit.get_context());

	// Initialize target
#if defined(__APPLE__) && defined(ARCH_ARM64)
	// Force target linux on macOS arm64 to bypass some 64-bit address space linking issues
	_module->setTargetTriple(Triple::normalize(utils::c_llvm_default_triple));
#else
	_module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
#endif
	_module->setDataLayout(jit.get_engine().getTargetMachine()->createDataLayout());

	// Initialize translator
	PPUTranslator translator(jit.get_context(), _module.get(), module_part, jit.get_engine());

	// Define some types
	const auto _func = FunctionType::get(translator.get_type<void>(), {
		translator.get_type<u8*>(), // Exec base
		translator.GetContextType()->getPointerTo(), // PPU context
		translator.get_type<u64>(), // Segment address (for PRX)
		translator.get_type<u8*>(), // Memory base
		translator.get_type<u64>(), // r0
		translator.get_type<u64>(), // r1
		translator.get_type<u64>(), // r2
	}, false);

	// Initialize function list
	for (const auto& func : module_part.funcs)
	{
		if (func.size)
		{
			const auto f = cast<Function>(_module->getOrInsertFunction(func.name, _func).getCallee());
			f->setCallingConv(CallingConv::GHC);
			f->addAttribute(2, Attribute::NoAlias);
			f->addFnAttr(Attribute::NoUnwind);
		}
	}

	{
		legacy::FunctionPassManager pm(_module.get());

		// Basic optimizations
		//pm.add(createCFGSimplificationPass());
		//pm.add(createPromoteMemoryToRegisterPass());
		pm.add(createEarlyCSEPass());
		//pm.add(createTailCallEliminationPass());
		//pm.add(createInstructionCombiningPass());
		//pm.add(createBasicAAWrapperPass());
		//pm.add(new MemoryDependenceAnalysis());
		//pm.add(createLICMPass());
		//pm.add(createLoopInstSimplifyPass());
		//pm.add(createNewGVNPass());
		pm.add(createDeadStoreEliminationPass());
		//pm.add(createSCCPPass());
		//pm.add(createReassociatePass());
		//pm.add(createInstructionCombiningPass());
		//pm.add(createInstructionSimplifierPass());
		//pm.add(createAggressiveDCEPass());
		//pm.add(createCFGSimplificationPass());
		//pm.add(createLintPass()); // Check

		// Translate functions
		for (usz fi = 0, fmax = module_part.funcs.size(); fi < fmax; fi++)
		{
			if (Emu.IsStopped())
			{
				ppu_log.success("LLVM: Translation cancelled");
				return;
			}

			if (module_part.funcs[fi].size)
			{
				// Translate
				if (const auto func = translator.Translate(module_part.funcs[fi]))
				{
					// Run optimization passes
					pm.run(*func);
				}
				else
				{
					Emu.Pause();
					return;
				}
			}
		}

		//legacy::PassManager mpm;

		// Remove unused functions, structs, global variables, etc
		//mpm.add(createStripDeadPrototypesPass());
		//mpm.add(createFunctionInliningPass());
		//mpm.add(createDeadInstEliminationPass());
		//mpm.run(*module);

		std::string result;
		raw_string_ostream out(result);

		if (g_cfg.core.llvm_logs)
		{
			out << *_module; // print IR
			fs::file(cache_path + obj_name + ".log", fs::rewrite).write(out.str());
			result.clear();
		}

		if (verifyModule(*_module, &out))
		{
			out.flush();
			ppu_log.error("LLVM: Verification failed for %s:\n%s", obj_name, result);
			Emu.CallFromMainThread([]{ Emu.GracefulShutdown(false, true); });
			return;
		}

		ppu_log.notice("LLVM: %zu functions generated", _module->getFunctionList().size());
	}

	// Load or compile module
	jit.add(std::move(_module), cache_path);
#endif // LLVM_AVAILABLE
}