#include "stdafx.h" #include "SPURecompiler.h" #include "Emu/System.h" #include "Emu/system_config.h" #include "Emu/IdManager.h" #include "Emu/Cell/timers.hpp" #include "Crypto/sha1.h" #include "Utilities/JIT.h" #include "SPUThread.h" #include "SPUAnalyser.h" #include "SPUInterpreter.h" #include #include #include "util/v128.hpp" #include "util/simd.hpp" #include "util/sysinfo.hpp" const extern spu_decoder g_spu_itype; const extern spu_decoder g_spu_iname; const extern spu_decoder g_spu_iflag; #ifdef LLVM_AVAILABLE #include "Emu/CPU/CPUTranslator.h" #ifdef _MSC_VER #pragma warning(push, 0) #else #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wall" #pragma GCC diagnostic ignored "-Wextra" #pragma GCC diagnostic ignored "-Wold-style-cast" #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Weffc++" #pragma GCC diagnostic ignored "-Wmissing-noreturn" #endif #if LLVM_VERSION_MAJOR < 17 #include "llvm/ADT/Triple.h" #endif #include "llvm/TargetParser/Host.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/IR/InlineAsm.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/ADT/PostOrderIterator.h" #ifdef _MSC_VER #pragma warning(pop) #else #pragma GCC diagnostic pop #endif class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { // JIT Instance jit_compiler m_jit{{}, jit_compiler::cpu(g_cfg.core.llvm_cpu)}; // Interpreter table size power const u8 m_interp_magn; // Constant opcode bits u32 m_op_const_mask = -1; // Current function chunk entry point u32 m_entry; // Main entry point offset u32 m_base; // Module name std::string m_hash; // Patchpoint unique id u32 m_pp_id = 0; // Next opcode u32 m_next_op = 0; // Current function (chunk) llvm::Function* m_function; llvm::Value* m_thread; llvm::Value* m_lsptr; llvm::Value* m_interp_op; llvm::Value* m_interp_pc; llvm::Value* m_interp_table; llvm::Value* m_interp_7f0; llvm::Value* m_interp_regs; // Helpers llvm::Value* m_base_pc; llvm::Value* m_interp_pc_next; llvm::BasicBlock* m_interp_bblock; // i8*, contains constant vm::g_base_addr value llvm::Value* m_memptr; // Pointers to registers in the thread context std::array m_reg_addr; // Global variable (function table) llvm::GlobalVariable* m_function_table{}; // Global LUTs llvm::GlobalVariable* m_spu_frest_fraction_lut{}; llvm::GlobalVariable* m_spu_frest_exponent_lut{}; llvm::GlobalVariable* m_spu_frsqest_fraction_lut{}; llvm::GlobalVariable* m_spu_frsqest_exponent_lut{}; // Helpers (interpreter) llvm::GlobalVariable* m_scale_float_to{}; llvm::GlobalVariable* m_scale_to_float{}; // Function for check_state execution llvm::Function* m_test_state{}; // Chunk for external tail call (dispatch) llvm::Function* m_dispatch{}; llvm::MDNode* m_md_unlikely; llvm::MDNode* m_md_likely; struct block_info { // Pointer to the analyser spu_recompiler_base::block_info* bb{}; // Current block's entry block llvm::BasicBlock* block; // Final block (for PHI nodes, set after completion) llvm::BasicBlock* block_end{}; // Additional blocks for sinking instructions after block_end: std::unordered_map> block_edges; // Current register values std::array reg{}; // PHI nodes created for this block (if any) std::array phi{}; // Store instructions std::array store{}; // Store reordering/elimination protection std::array store_context_last_id = 
fill_array(0); // Protects against illegal forward ordering std::array store_context_first_id = fill_array(usz{umax}); // Protects against illegal past store elimination (backwards ordering is not implemented) std::array store_context_ctr = fill_array(1); // Store barrier cointer bool does_gpr_barrier_proceed_last_store(u32 i) const noexcept { const usz counter = store_context_ctr[i]; return counter != 1 && counter > store_context_last_id[i]; } bool does_gpr_barrier_preceed_first_store(u32 i) const noexcept { const usz counter = store_context_ctr[i]; const usz first_id = store_context_first_id[i]; return counter != 1 && first_id != umax && counter < first_id; } }; struct function_info { // Standard callable chunk llvm::Function* chunk{}; // Callable function llvm::Function* fn{}; // Registers possibly loaded in the entry block std::array load{}; }; // Current block block_info* m_block; // Current function or chunk function_info* m_finfo; // All blocks in the current function chunk std::unordered_map> m_blocks; // Block list for processing std::vector m_block_queue; // All function chunks in current SPU compile unit std::unordered_map> m_functions; // Function chunk list for processing std::vector m_function_queue; // Add or get the function chunk function_info* add_function(u32 addr) { // Enqueue if necessary const auto empl = m_functions.try_emplace(addr); if (!empl.second) { return &empl.first->second; } // Chunk function type // 0. Result (tail call target) // 1. Thread context // 2. Local storage pointer // 3. #if 0 const auto chunk_type = get_ftype(); #else const auto chunk_type = get_ftype(); #endif // Get function chunk name const std::string name = fmt::format("__spu-cx%05x-%s", addr, fmt::base57(be_t{m_hash_start})); llvm::Function* result = llvm::cast(m_module->getOrInsertFunction(name, chunk_type).getCallee()); // Set parameters result->setLinkage(llvm::GlobalValue::InternalLinkage); result->addParamAttr(0, llvm::Attribute::NoAlias); result->addParamAttr(1, llvm::Attribute::NoAlias); #if 1 result->setCallingConv(llvm::CallingConv::GHC); #endif empl.first->second.chunk = result; if (g_cfg.core.spu_block_size == spu_block_size_type::giga) { // Find good real function const auto ffound = m_funcs.find(addr); if (ffound != m_funcs.end() && ffound->second.good) { // Real function type (not equal to chunk type) // 4. $SP // 5. 
$3 const auto func_type = get_ftype(); const std::string fname = fmt::format("__spu-fx%05x-%s", addr, fmt::base57(be_t{m_hash_start})); llvm::Function* fn = llvm::cast(m_module->getOrInsertFunction(fname, func_type).getCallee()); fn->setLinkage(llvm::GlobalValue::InternalLinkage); fn->addParamAttr(0, llvm::Attribute::NoAlias); fn->addParamAttr(1, llvm::Attribute::NoAlias); #if 1 fn->setCallingConv(llvm::CallingConv::GHC); #endif empl.first->second.fn = fn; } } // Enqueue m_function_queue.push_back(addr); return &empl.first->second; } // Create tail call to the function chunk (non-tail calls are just out of question) void tail_chunk(llvm::FunctionCallee callee, llvm::Value* base_pc = nullptr) { if (!callee && !g_cfg.core.spu_verification) { // Disable patchpoints if verification is disabled callee = m_dispatch; } else if (!callee) { // Create branch patchpoint if chunk == nullptr ensure(m_finfo && (!m_finfo->fn || m_function == m_finfo->chunk)); // Register under a unique linkable name const std::string ppname = fmt::format("%s-pp-%u", m_hash, m_pp_id++); m_engine->updateGlobalMapping(ppname, reinterpret_cast(m_spurt->make_branch_patchpoint())); // Create function with not exactly correct type const auto ppfunc = llvm::cast(m_module->getOrInsertFunction(ppname, m_finfo->chunk->getFunctionType()).getCallee()); ppfunc->setCallingConv(m_finfo->chunk->getCallingConv()); if (m_finfo->chunk->getReturnType() != get_type()) { m_ir->CreateRet(ppfunc); return; } callee = ppfunc; base_pc = m_ir->getInt32(0); } ensure(callee); auto call = m_ir->CreateCall(callee, {m_thread, m_lsptr, base_pc ? base_pc : m_base_pc}); auto func = m_finfo ? m_finfo->chunk : llvm::dyn_cast(callee.getCallee()); call->setCallingConv(func->getCallingConv()); call->setTailCall(); if (func->getReturnType() == get_type()) { m_ir->CreateRetVoid(); } else { m_ir->CreateRet(call); } } // Call the real function void call_function(llvm::Function* fn, bool tail = false) { llvm::Value* lr{}; llvm::Value* sp{}; llvm::Value* r3{}; if (!m_finfo->fn && !m_block) { lr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, +s_reg_lr, &v128::_u32, 3)); sp = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, +s_reg_sp)); r3 = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, 3)); } else { lr = m_ir->CreateExtractElement(get_reg_fixed(s_reg_lr).value, 3); sp = get_reg_fixed(s_reg_sp).value; r3 = get_reg_fixed(3).value; } const auto _call = m_ir->CreateCall(ensure(fn), {m_thread, m_lsptr, m_base_pc, sp, r3}); _call->setCallingConv(fn->getCallingConv()); // Tail call using loaded LR value (gateway from a chunk) if (!m_finfo->fn) { lr = m_ir->CreateAnd(lr, 0x3fffc); m_ir->CreateStore(lr, spu_ptr(&spu_thread::pc)); m_ir->CreateStore(_call, spu_ptr(&spu_thread::gpr, 3)); m_ir->CreateBr(add_block_indirect({}, value(lr))); } else if (tail) { _call->setTailCall(); m_ir->CreateRet(_call); } else { // TODO: initialize $LR with a constant for (u32 i = 0; i < s_reg_max; i++) { if (i != s_reg_lr && i != s_reg_sp && (i < s_reg_80 || i > s_reg_127)) { m_block->reg[i] = m_ir->CreateLoad(get_reg_type(i), init_reg_fixed(i)); } } // Set result m_block->reg[3] = _call; } } // Emit return from the real function void ret_function() { m_ir->CreateRet(get_reg_fixed(3).value); } void set_function(llvm::Function* func) { m_function = func; m_thread = func->getArg(0); m_lsptr = func->getArg(1); m_base_pc = func->getArg(2); m_reg_addr.fill(nullptr); m_block = nullptr; m_finfo = nullptr; m_blocks.clear(); m_block_queue.clear(); 
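	// Open a fresh entry block for the new function and load the memory base pointer (vm::g_base_addr) from the thread context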
m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", m_function)); m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); } // Add block with current block as a predecessor llvm::BasicBlock* add_block(u32 target, bool absolute = false) { // Check the predecessor const bool pred_found = m_block_info[target / 4] && m_preds[target].find_first_of(m_pos) + 1; if (m_blocks.empty()) { // Special case: first block, proceed normally if (auto fn = std::exchange(m_finfo->fn, nullptr)) { // Create a gateway call_function(fn, true); m_finfo->fn = fn; m_function = fn; m_thread = fn->getArg(0); m_lsptr = fn->getArg(1); m_base_pc = fn->getArg(2); m_ir->SetInsertPoint(llvm::BasicBlock::Create(m_context, "", fn)); m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); // Load registers at the entry chunk for (u32 i = 0; i < s_reg_max; i++) { if (i >= s_reg_80 && i <= s_reg_127) { // TODO //m_finfo->load[i] = llvm::UndefValue::get(get_reg_type(i)); } m_finfo->load[i] = m_ir->CreateLoad(get_reg_type(i), init_reg_fixed(i)); } // Load $SP m_finfo->load[s_reg_sp] = fn->getArg(3); // Load first args m_finfo->load[3] = fn->getArg(4); } } else if (m_block_info[target / 4] && m_entry_info[target / 4] && !(pred_found && m_entry == target) && (!m_finfo->fn || !m_ret_info[target / 4])) { // Generate a tail call to the function chunk const auto cblock = m_ir->GetInsertBlock(); const auto result = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); const auto pfinfo = add_function(target); if (absolute) { ensure(!m_finfo->fn); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_base_pc, m_ir->getInt32(m_base)), next, fail); m_ir->SetInsertPoint(fail); m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); tail_chunk(nullptr); m_ir->SetInsertPoint(next); } if (pfinfo->fn) { // Tail call to the real function call_function(pfinfo->fn, true); if (!result->getTerminator()) ret_function(); } else { // Just a boring tail call to another chunk update_pc(target); tail_chunk(pfinfo->chunk); } m_ir->SetInsertPoint(cblock); return result; } else if (!pred_found || !m_block_info[target / 4]) { if (m_block_info[target / 4]) { spu_log.error("[%s] [0x%x] Predecessor not found for target 0x%x (chunk=0x%x, entry=0x%x, size=%u)", m_hash, m_pos, target, m_entry, m_function_queue[0], m_size / 4); } const auto cblock = m_ir->GetInsertBlock(); const auto result = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); if (absolute) { ensure(!m_finfo->fn); m_ir->CreateStore(m_ir->getInt32(target), spu_ptr(&spu_thread::pc)); } else { update_pc(target); } tail_chunk(nullptr); m_ir->SetInsertPoint(cblock); return result; } ensure(!absolute); auto& result = m_blocks[target].block; if (!result) { result = llvm::BasicBlock::Create(m_context, fmt::format("b-0x%x", target), m_function); // Add the block to the queue m_block_queue.push_back(target); } else if (m_block && m_blocks[target].block_end) { // Connect PHI nodes if necessary for (u32 i = 0; i < s_reg_max; i++) { if (const auto phi = m_blocks[target].phi[i]) { const auto typ = phi->getType() == get_type() ? 
get_type() : get_reg_type(i); phi->addIncoming(get_reg_fixed(i, typ), m_block->block_end); } } } return result; } template llvm::Value* _ptr(llvm::Value* base, u32 offset) { return m_ir->CreateGEP(get_type(), base, m_ir->getInt64(offset)); } template llvm::Value* spu_ptr(Args... offset_args) { return _ptr(m_thread, ::offset32(offset_args...)); } template llvm::Value* spu_ptr(value_t add, Args... offset_args) { const auto off = m_ir->CreateGEP(get_type(), m_thread, m_ir->getInt64(::offset32(offset_args...))); return m_ir->CreateAdd(off, add.value); } // Return default register type llvm::Type* get_reg_type(u32 index) { if (index < 128) { return get_type(); } switch (index) { case s_reg_mfc_eal: case s_reg_mfc_lsa: return get_type(); case s_reg_mfc_tag: return get_type(); case s_reg_mfc_size: return get_type(); default: fmt::throw_exception("get_reg_type(%u): invalid register index", index); } } u32 get_reg_offset(u32 index) { if (index < 128) { return ::offset32(&spu_thread::gpr, index); } switch (index) { case s_reg_mfc_eal: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::eal); case s_reg_mfc_lsa: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::lsa); case s_reg_mfc_tag: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::tag); case s_reg_mfc_size: return ::offset32(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::size); default: fmt::throw_exception("get_reg_offset(%u): invalid register index", index); } } llvm::Value* init_reg_fixed(u32 index) { if (!m_block) { return _ptr(m_thread, get_reg_offset(index)); } auto& ptr = ::at32(m_reg_addr, index); if (!ptr) { // Save and restore current insert point if necessary const auto block_cur = m_ir->GetInsertBlock(); // Emit register pointer at the beginning of the function chunk m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); ptr = _ptr(m_thread, get_reg_offset(index)); m_ir->SetInsertPoint(block_cur); } return ptr; } // Get pointer to the vector register (interpreter only) template llvm::Value* init_vr(const bf_t&) { if (!m_interp_magn) { m_interp_7f0 = m_ir->getInt32(0x7f0); m_interp_regs = _ptr(m_thread, get_reg_offset(0)); } // Extract reg index const auto isl = I >= 4 ? m_interp_op : m_ir->CreateShl(m_interp_op, u64{4 - I}); const auto isr = I <= 4 ? m_interp_op : m_ir->CreateLShr(m_interp_op, u64{I - 4}); const auto idx = m_ir->CreateAnd(I > 4 ? 
isr : isl, m_interp_7f0); // Pointer to the register return m_ir->CreateGEP(get_type(), m_interp_regs, m_ir->CreateZExt(idx, get_type())); } llvm::Value* double_as_uint64(llvm::Value* val) { return bitcast(val); } llvm::Value* uint64_as_double(llvm::Value* val) { return bitcast(val); } llvm::Value* double_to_xfloat(llvm::Value* val) { ensure(val && val->getType() == get_type()); const auto d = double_as_uint64(val); const auto s = m_ir->CreateAnd(m_ir->CreateLShr(d, 32), 0x80000000); const auto m = m_ir->CreateXor(m_ir->CreateLShr(d, 29), 0x40000000); const auto r = m_ir->CreateOr(m_ir->CreateAnd(m, 0x7fffffff), s); return m_ir->CreateTrunc(m_ir->CreateSelect(m_ir->CreateIsNotNull(d), r, splat(0).eval(m_ir)), get_type()); } llvm::Value* xfloat_to_double(llvm::Value* val) { ensure(val && val->getType() == get_type()); const auto x = m_ir->CreateZExt(val, get_type()); const auto s = m_ir->CreateShl(m_ir->CreateAnd(x, 0x80000000), 32); const auto a = m_ir->CreateAnd(x, 0x7fffffff); const auto m = m_ir->CreateShl(m_ir->CreateAdd(a, splat(0x1c0000000).eval(m_ir)), 29); const auto r = m_ir->CreateSelect(m_ir->CreateICmpSGT(a, splat(0x7fffff).eval(m_ir)), m, splat(0).eval(m_ir)); const auto f = m_ir->CreateOr(s, r); return uint64_as_double(f); } // Clamp double values to ±Smax, flush values smaller than ±Smin to positive zero llvm::Value* xfloat_in_double(llvm::Value* val) { ensure(val && val->getType() == get_type()); const auto smax = uint64_as_double(splat(0x47ffffffe0000000).eval(m_ir)); const auto smin = uint64_as_double(splat(0x3810000000000000).eval(m_ir)); const auto d = double_as_uint64(val); const auto s = m_ir->CreateAnd(d, 0x8000000000000000); const auto a = uint64_as_double(m_ir->CreateAnd(d, 0x7fffffffe0000000)); const auto n = m_ir->CreateFCmpOLT(a, smax); const auto z = m_ir->CreateFCmpOLT(a, smin); const auto c = double_as_uint64(m_ir->CreateSelect(n, a, smax)); return m_ir->CreateSelect(z, fsplat(0.).eval(m_ir), uint64_as_double(m_ir->CreateOr(c, s))); } // Expand 32-bit mask for xfloat values to 64-bit, 29 least significant bits are always zero llvm::Value* conv_xfloat_mask(llvm::Value* val) { const auto d = m_ir->CreateZExt(val, get_type()); const auto s = m_ir->CreateShl(m_ir->CreateAnd(d, 0x80000000), 32); const auto e = m_ir->CreateLShr(m_ir->CreateAShr(m_ir->CreateShl(d, 33), 4), 1); return m_ir->CreateOr(s, e); } llvm::Value* get_reg_raw(u32 index) { if (!m_block || index >= m_block->reg.size()) { return nullptr; } return m_block->reg[index]; } llvm::Value* get_reg_fixed(u32 index, llvm::Type* type) { llvm::Value* dummy{}; auto& reg = *(m_block ? &::at32(m_block->reg, index) : &dummy); if (!reg) { // Load register value if necessary reg = m_finfo && m_finfo->load[index] ? 
m_finfo->load[index] : m_ir->CreateLoad(get_reg_type(index), init_reg_fixed(index)); } if (reg->getType() == get_type()) { if (type == reg->getType()) { return reg; } return bitcast(double_to_xfloat(reg), type); } if (type == get_type()) { return xfloat_to_double(bitcast(reg)); } return bitcast(reg, type); } template value_t get_reg_fixed(u32 index) { value_t r; r.value = get_reg_fixed(index, get_type()); return r; } template value_t get_vr(const bf_t& index) { value_t r; if ((m_op_const_mask & index.data_mask()) != index.data_mask()) { // Update const mask if necessary if (I >= (32u - m_interp_magn)) { m_op_const_mask |= index.data_mask(); } // Load reg if (get_type() == get_type()) { r.value = xfloat_to_double(m_ir->CreateLoad(get_type(), init_vr(index))); } else { r.value = m_ir->CreateLoad(get_type(), init_vr(index)); } } else { r.value = get_reg_fixed(index, get_type()); } return r; } template auto get_vr_as(U&&, const bf_t& index) { return get_vr::type>(index); } template std::tuple>...> get_vrs(const Args&... args) { return {get_vr(args)...}; } template llvm_match_t match_vr(const bf_t& index) { llvm_match_t r; if (m_block) { auto v = ::at32(m_block->reg, index); if (v && v->getType() == get_type()) { r.value = v; return r; } } return r; } template auto match_vr_as(U&&, const bf_t& index) { return match_vr::type>(index); } template bool match_vr(const bf_t& index, F&& pred) { return (( match_vr(index) ? pred(match_vr(index), match()) : false ) || ...); } template std::tuple>...> match_vrs(const Args&... args) { return {match_vr(args)...}; } // Extract scalar value from the preferred slot template auto get_scalar(value_t value) { using e_type = std::remove_extent_t; static_assert(sizeof(T) == 16 || std::is_same_v, "Unknown vector type"); if (auto [ok, v] = match_expr(value, vsplat(match())); ok) { return eval(v); } if constexpr (sizeof(e_type) == 1) { return eval(extract(value, 12)); } else if constexpr (sizeof(e_type) == 2) { return eval(extract(value, 6)); } else if constexpr (sizeof(e_type) == 4 || sizeof(T) == 32) { return eval(extract(value, 3)); } else { return eval(extract(value, 1)); } } // Splat scalar value from the preferred slot template auto splat_scalar(T&& arg) { using VT = std::remove_extent_t::type>; if constexpr (sizeof(VT) == 1) { return zshuffle(std::forward(arg), 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12); } else if constexpr (sizeof(VT) == 2) { return zshuffle(std::forward(arg), 6, 6, 6, 6, 6, 6, 6, 6); } else if constexpr (sizeof(VT) == 4) { return zshuffle(std::forward(arg), 3, 3, 3, 3); } else if constexpr (sizeof(VT) == 8) { return zshuffle(std::forward(arg), 1, 1); } else { static_assert(sizeof(VT) == 16); return std::forward(arg); } } void set_reg_fixed(u32 index, llvm::Value* value, bool fixup = true) { llvm::StoreInst* dummy{}; // Check ensure(!m_block || m_regmod[m_pos / 4] == index); // Test for special case const bool is_xfloat = value->getType() == get_type(); // Clamp value if necessary const auto saved_value = is_xfloat && fixup ? xfloat_in_double(value) : value; // Set register value if (m_block) { #ifndef _WIN32 if (g_cfg.core.spu_debug) value->setName(fmt::format("result_0x%05x", m_pos)); #endif ::at32(m_block->reg, index) = saved_value; } // Get register location const auto addr = init_reg_fixed(index); auto& _store = *(m_block ? 
&m_block->store[index] : &dummy); // Erase previous dead store instruction if necessary if (_store) { if (m_block->store_context_last_id[index] == m_block->store_context_ctr[index]) { // Erase store of it is not preserved by ensure_gpr_stores() _store->eraseFromParent(); } } if (m_block) { // Keep the store's location in history of gpr preservaions m_block->store_context_last_id[index] = m_block->store_context_ctr[index]; m_block->store_context_first_id[index] = std::min(m_block->store_context_first_id[index], m_block->store_context_ctr[index]); } if (m_finfo && m_finfo->fn) { if (index <= 3 || (index >= s_reg_80 && index <= s_reg_127)) { // Don't save some registers in true functions return; } } // Write register to the context _store = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, get_reg_type(index)), addr); } template void set_vr(const bf_t& index, T expr, std::function vr_assume = nullptr, bool fixup = true) { // Process expression const auto value = expr.eval(m_ir); // Test for special case const bool is_xfloat = value->getType() == get_type(); if ((m_op_const_mask & index.data_mask()) != index.data_mask()) { // Update const mask if necessary if (I >= (32u - m_interp_magn)) { m_op_const_mask |= index.data_mask(); } // Clamp value if necessary const auto saved_value = is_xfloat && fixup ? xfloat_in_double(value) : value; // Store value m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, get_type()), init_vr(index)); return; } if (vr_assume) { } set_reg_fixed(index, value, fixup); } template value_t get_imm(const bf_t& imm, bool mask = true) { if ((m_op_const_mask & imm.data_mask()) != imm.data_mask()) { // Update const mask if necessary if (I >= (32u - m_interp_magn)) { m_op_const_mask |= imm.data_mask(); } // Extract unsigned immediate (skip AND if mask == false or truncated anyway) value_t r; r.value = m_interp_op; r.value = I == 0 ? r.value : m_ir->CreateLShr(r.value, u64{I}); r.value = !mask || N >= r.esize ? r.value : m_ir->CreateAnd(r.value, imm.data_mask() >> I); if constexpr (r.esize != 32) { r.value = m_ir->CreateZExtOrTrunc(r.value, get_type()->getScalarType()); } if (r.is_vector) { r.value = m_ir->CreateVectorSplat(r.is_vector, r.value); } return r; } return eval(splat(imm)); } template value_t get_imm(const bf_t& imm) { if ((m_op_const_mask & imm.data_mask()) != imm.data_mask()) { // Update const mask if necessary if (I >= (32u - m_interp_magn)) { m_op_const_mask |= imm.data_mask(); } // Extract signed immediate (skip sign ext if truncated anyway) value_t r; r.value = m_interp_op; r.value = I + N == 32 || N >= r.esize ? r.value : m_ir->CreateShl(r.value, u64{32u - I - N}); r.value = N == 32 || N >= r.esize ? r.value : m_ir->CreateAShr(r.value, u64{32u - N}); r.value = I == 0 || N < r.esize ? r.value : m_ir->CreateLShr(r.value, u64{I}); if constexpr (r.esize != 32) { r.value = m_ir->CreateSExtOrTrunc(r.value, get_type()->getScalarType()); } if (r.is_vector) { r.value = m_ir->CreateVectorSplat(r.is_vector, r.value); } return r; } return eval(splat(imm)); } // Get PC for given instruction address llvm::Value* get_pc(u32 addr) { return m_ir->CreateAdd(m_base_pc, m_ir->getInt32(addr - m_base)); } // Update PC for current or explicitly specified instruction address void update_pc(u32 target = -1) { m_ir->CreateStore(m_ir->CreateAnd(get_pc(target + 1 ? 
target : m_pos), 0x3fffc), spu_ptr(&spu_thread::pc))->setVolatile(true); } // Call cpu_thread::check_state if necessary and return or continue (full check) void check_state(u32 addr, bool may_be_unsafe_for_savestate = true) { const auto pstate = spu_ptr(&spu_thread::state); const auto _body = llvm::BasicBlock::Create(m_context, "", m_function); const auto check = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpEQ(m_ir->CreateLoad(get_type(), pstate, true), m_ir->getInt32(0)), _body, check, m_md_likely); m_ir->SetInsertPoint(check); update_pc(addr); if (may_be_unsafe_for_savestate && std::none_of(std::begin(m_block->phi), std::end(m_block->phi), FN(!!x))) { may_be_unsafe_for_savestate = false; } if (may_be_unsafe_for_savestate) { m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable))->setVolatile(true); } m_ir->CreateCall(m_test_state, {m_thread}); if (may_be_unsafe_for_savestate) { m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable))->setVolatile(true); } m_ir->CreateBr(_body); m_ir->SetInsertPoint(_body); } public: spu_llvm_recompiler(u8 interp_magn = 0) : spu_recompiler_base() , cpu_translator(nullptr, false) , m_interp_magn(interp_magn) { } virtual void init() override { // Initialize if necessary if (!m_spurt) { m_spurt = &g_fxo->get(); cpu_translator::initialize(m_jit.get_context(), m_jit.get_engine()); const auto md_name = llvm::MDString::get(m_context, "branch_weights"); const auto md_low = llvm::ValueAsMetadata::get(llvm::ConstantInt::get(GetType(), 1)); const auto md_high = llvm::ValueAsMetadata::get(llvm::ConstantInt::get(GetType(), 999)); // Metadata for branch weights m_md_likely = llvm::MDTuple::get(m_context, {md_name, md_high, md_low}); m_md_unlikely = llvm::MDTuple::get(m_context, {md_name, md_low, md_high}); } } void init_luts() { // LUTs for some instructions m_spu_frest_fraction_lut = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(GetType(), 32), true, llvm::GlobalValue::PrivateLinkage, llvm::ConstantDataArray::get(m_context, spu_frest_fraction_lut)); m_spu_frest_exponent_lut = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(GetType(), 256), true, llvm::GlobalValue::PrivateLinkage, llvm::ConstantDataArray::get(m_context, spu_frest_exponent_lut)); m_spu_frsqest_fraction_lut = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(GetType(), 64), true, llvm::GlobalValue::PrivateLinkage, llvm::ConstantDataArray::get(m_context, spu_frsqest_fraction_lut)); m_spu_frsqest_exponent_lut = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(GetType(), 256), true, llvm::GlobalValue::PrivateLinkage, llvm::ConstantDataArray::get(m_context, spu_frsqest_exponent_lut)); } virtual spu_function_t compile(spu_program&& _func) override { if (_func.data.empty() && m_interp_magn) { return compile_interpreter(); } const u32 start0 = _func.entry_point; const auto add_loc = m_spurt->add_empty(std::move(_func)); if (!add_loc) { return nullptr; } const spu_program& func = add_loc->data; if (func.entry_point != start0) { // Wait for the duplicate while (!add_loc->compiled) { add_loc->compiled.wait(nullptr); } return add_loc->compiled; } std::string log; if (auto& cache = g_fxo->get(); cache && g_cfg.core.spu_cache && !add_loc->cached.exchange(1)) { cache.add(func); } { sha1_context ctx; u8 output[20]; sha1_starts(&ctx); sha1_update(&ctx, reinterpret_cast(func.data.data()), func.data.size() * 4); sha1_finish(&ctx, output); m_hash.clear(); fmt::append(m_hash, "__spu-0x%05x-%s", func.entry_point, 
fmt::base57(output)); be_t hash_start; std::memcpy(&hash_start, output, sizeof(hash_start)); m_hash_start = hash_start; } spu_log.notice("Building function 0x%x... (size %u, %s)", func.entry_point, func.data.size(), m_hash); m_pos = func.lower_bound; m_base = func.entry_point; m_size = ::size32(func.data) * 4; const u32 start = m_pos; const u32 end = start + m_size; m_pp_id = 0; if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1)) { this->dump(func, log); fs::file(m_spurt->get_cache_path() + "spu.log", fs::write + fs::append).write(log); } using namespace llvm; m_engine->clearAllGlobalMappings(); // Create LLVM module std::unique_ptr _module = std::make_unique(m_hash + ".obj", m_context); _module->setTargetTriple(jit_compiler::triple2()); _module->setDataLayout(m_jit.get_engine().getTargetMachine()->createDataLayout()); m_module = _module.get(); // Initialize IR Builder IRBuilder<> irb(m_context); m_ir = &irb; // Add entry function (contains only state/code check) const auto main_func = llvm::cast(m_module->getOrInsertFunction(m_hash, get_ftype()).getCallee()); const auto main_arg2 = main_func->getArg(2); main_func->setCallingConv(CallingConv::GHC); set_function(main_func); init_luts(); // Start compilation const auto label_test = BasicBlock::Create(m_context, "", m_function); const auto label_diff = BasicBlock::Create(m_context, "", m_function); const auto label_body = BasicBlock::Create(m_context, "", m_function); const auto label_stop = BasicBlock::Create(m_context, "", m_function); // Load PC, which will be the actual value of 'm_base' m_base_pc = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::pc)); // Emit state check const auto pstate = spu_ptr(&spu_thread::state); m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pstate), m_ir->getInt32(0)), label_stop, label_test, m_md_unlikely); // Emit code check u32 check_iterations = 0; m_ir->SetInsertPoint(label_test); // Set block hash for profiling (if enabled) if (g_cfg.core.spu_prof && g_cfg.core.spu_verification) m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536)), spu_ptr(&spu_thread::block_hash)); if (!g_cfg.core.spu_verification) { // Disable check (unsafe) m_ir->CreateBr(label_body); } else if (func.data.size() == 1) { const auto pu32 = m_ir->CreateGEP(get_type(), m_lsptr, m_base_pc); const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pu32), m_ir->getInt32(func.data[0])); m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } else if (func.data.size() == 2) { const auto pu64 = m_ir->CreateGEP(get_type(), m_lsptr, m_base_pc); const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), pu64), m_ir->getInt64(static_cast(func.data[1]) << 32 | func.data[0])); m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } else { u32 starta = start; // Skip holes at the beginning (giga only) for (u32 j = start; j < end; j += 4) { if (!func.data[(j - start) / 4]) { starta += 4; } else { break; } } u32 stride; u32 elements; u32 dwords; if (m_use_avx512 && g_cfg.core.full_width_avx512) { stride = 64; elements = 16; dwords = 8; } else if (m_use_avx) { stride = 32; elements = 8; dwords = 4; } else { stride = 16; elements = 4; dwords = 2; } // Get actual pc corresponding to the found beginning of the data llvm::Value* starta_pc = m_ir->CreateAnd(get_pc(starta), 0x3fffc); llvm::Value* data_addr = m_ir->CreateGEP(get_type(), m_lsptr, starta_pc); llvm::Value* acc = nullptr; for (u32 j = starta; j < end; j += stride) { int indices[16]; bool holes = false; bool data = false; for (u32 i 
= 0; i < elements; i++) { const u32 k = j + i * 4; if (k < start || k >= end || !func.data[(k - start) / 4]) { indices[i] = elements; holes = true; } else { indices[i] = i; data = true; } } if (!data) { // Skip full-sized holes continue; } llvm::Value* vls = nullptr; // Load unaligned code block from LS if (m_use_avx512 && g_cfg.core.full_width_avx512) { vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); } else if (m_use_avx) { vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); } else { vls = m_ir->CreateAlignedLoad(get_type(), _ptr(data_addr, j - starta), llvm::MaybeAlign{4}); } // Mask if necessary if (holes) { vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements)); } // Perform bitwise comparison and accumulate u32 words[16]; for (u32 i = 0; i < elements; i++) { const u32 k = j + i * 4; words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0; } vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements))); acc = acc ? m_ir->CreateOr(acc, vls) : vls; check_iterations++; } // Pattern for PTEST if (m_use_avx512 && g_cfg.core.full_width_avx512) { acc = m_ir->CreateBitCast(acc, get_type()); } else if (m_use_avx) { acc = m_ir->CreateBitCast(acc, get_type()); } else { acc = m_ir->CreateBitCast(acc, get_type()); } llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); for (u32 i = 1; i < dwords; i++) { elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); } // Compare result with zero const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0)); m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely); } // Increase block counter with statistics m_ir->SetInsertPoint(label_body); const auto pbcount = spu_ptr(&spu_thread::block_counter); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(get_type(), pbcount), m_ir->getInt64(check_iterations)), pbcount); // Call the entry function chunk const auto entry_chunk = add_function(m_pos); const auto entry_call = m_ir->CreateCall(entry_chunk->chunk, {m_thread, m_lsptr, m_base_pc}); entry_call->setCallingConv(entry_chunk->chunk->getCallingConv()); const auto dispatcher = llvm::cast(m_module->getOrInsertFunction("spu_dispatcher", main_func->getType()).getCallee()); m_engine->updateGlobalMapping("spu_dispatcher", reinterpret_cast(spu_runtime::tr_all)); dispatcher->setCallingConv(main_func->getCallingConv()); // Proceed to the next code if (entry_chunk->chunk->getReturnType() != get_type()) { const auto next_call = m_ir->CreateCall(main_func->getFunctionType(), entry_call, {m_thread, m_lsptr, m_ir->getInt64(0)}); next_call->setCallingConv(main_func->getCallingConv()); next_call->setTailCall(); } else { entry_call->setTailCall(); } m_ir->CreateRetVoid(); m_ir->SetInsertPoint(label_stop); call("spu_escape", spu_runtime::g_escape, m_thread)->setTailCall(); m_ir->CreateRetVoid(); m_ir->SetInsertPoint(label_diff); if (g_cfg.core.spu_verification) { const auto pbfail = spu_ptr(&spu_thread::block_failure); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(get_type(), pbfail), m_ir->getInt64(1)), pbfail); const auto dispci = call("spu_dispatch", spu_runtime::tr_dispatch, m_thread, m_lsptr, main_arg2); dispci->setCallingConv(CallingConv::GHC); dispci->setTailCall(); m_ir->CreateRetVoid(); } else { m_ir->CreateUnreachable(); } m_dispatch = cast(_module->getOrInsertFunction("__spu-null", entry_chunk->chunk->getFunctionType()).getCallee()); 
m_dispatch->setLinkage(llvm::GlobalValue::InternalLinkage); m_dispatch->setCallingConv(entry_chunk->chunk->getCallingConv()); set_function(m_dispatch); if (entry_chunk->chunk->getReturnType() == get_type()) { const auto next_call = m_ir->CreateCall(main_func->getFunctionType(), dispatcher, {m_thread, m_lsptr, m_ir->getInt64(0)}); next_call->setCallingConv(main_func->getCallingConv()); next_call->setTailCall(); m_ir->CreateRetVoid(); } else { m_ir->CreateRet(dispatcher); } // Function that executes check_state and escapes if necessary m_test_state = llvm::cast(m_module->getOrInsertFunction("spu_test_state", get_ftype()).getCallee()); m_test_state->setLinkage(GlobalValue::InternalLinkage); #ifdef ARCH_ARM64 // LLVM doesn't support PreserveAll on arm64. m_test_state->setCallingConv(CallingConv::PreserveMost); #else m_test_state->setCallingConv(CallingConv::PreserveAll); #endif m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", m_test_state)); const auto escape_yes = BasicBlock::Create(m_context, "", m_test_state); const auto escape_no = BasicBlock::Create(m_context, "", m_test_state); m_ir->CreateCondBr(call("spu_exec_check_state", &exec_check_state, m_test_state->getArg(0)), escape_yes, escape_no); m_ir->SetInsertPoint(escape_yes); call("spu_escape", spu_runtime::g_escape, m_test_state->getArg(0)); m_ir->CreateRetVoid(); m_ir->SetInsertPoint(escape_no); m_ir->CreateRetVoid(); // Create function table (uninitialized) m_function_table = new llvm::GlobalVariable(*m_module, llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), true, llvm::GlobalValue::InternalLinkage, nullptr); // Create function chunks for (usz fi = 0; fi < m_function_queue.size(); fi++) { // Initialize function info m_entry = m_function_queue[fi]; set_function(m_functions[m_entry].chunk); // Set block hash for profiling (if enabled) if (g_cfg.core.spu_prof) m_ir->CreateStore(m_ir->getInt64((m_hash_start & -65536) | (m_entry >> 2)), spu_ptr(&spu_thread::block_hash)); m_finfo = &m_functions[m_entry]; m_ir->CreateBr(add_block(m_entry)); // Emit instructions for basic blocks for (usz bi = 0; bi < m_block_queue.size(); bi++) { // Initialize basic block info const u32 baddr = m_block_queue[bi]; m_block = &m_blocks[baddr]; m_ir->SetInsertPoint(m_block->block); auto& bb = ::at32(m_bbs, baddr); bool need_check = false; m_block->bb = &bb; if (!bb.preds.empty()) { // Initialize registers and build PHI nodes if necessary for (u32 i = 0; i < s_reg_max; i++) { const u32 src = m_finfo->fn ? bb.reg_origin_abs[i] : bb.reg_origin[i]; if (src > 0x40000) { // Use the xfloat hint to create 256-bit (4x double) PHI llvm::Type* type = g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate && bb.reg_maybe_xf[i] ? get_type() : get_reg_type(i); const auto _phi = m_ir->CreatePHI(type, ::size32(bb.preds), fmt::format("phi0x%05x_r%u", baddr, i)); m_block->phi[i] = _phi; m_block->reg[i] = _phi; for (u32 pred : bb.preds) { const auto bfound = m_blocks.find(pred); if (bfound != m_blocks.end() && bfound->second.block_end) { auto& value = bfound->second.reg[i]; if (!value || value->getType() != _phi->getType()) { const auto regptr = init_reg_fixed(i); const auto cblock = m_ir->GetInsertBlock(); m_ir->SetInsertPoint(bfound->second.block_end->getTerminator()); if (!value) { // Value hasn't been loaded yet value = m_finfo && m_finfo->load[i] ? 
m_finfo->load[i] : m_ir->CreateLoad(get_reg_type(i), regptr); } if (value->getType() == get_type() && type != get_type()) { value = double_to_xfloat(value); } else if (value->getType() != get_type() && type == get_type()) { value = xfloat_to_double(bitcast(value)); } else { value = bitcast(value, _phi->getType()); } m_ir->SetInsertPoint(cblock); ensure(bfound->second.block_end->getTerminator()); } _phi->addIncoming(value, bfound->second.block_end); } } if (baddr == m_entry) { // Load value at the function chunk's entry block if necessary const auto regptr = init_reg_fixed(i); const auto cblock = m_ir->GetInsertBlock(); m_ir->SetInsertPoint(m_function->getEntryBlock().getTerminator()); const auto value = m_finfo && m_finfo->load[i] ? m_finfo->load[i] : m_ir->CreateLoad(get_reg_type(i), regptr); m_ir->SetInsertPoint(cblock); _phi->addIncoming(value, &m_function->getEntryBlock()); } } else if (src < 0x40000) { // Passthrough register value const auto bfound = m_blocks.find(src); if (bfound != m_blocks.end()) { m_block->reg[i] = bfound->second.reg[i]; } else { spu_log.error("[0x%05x] Value not found ($%u from 0x%05x)", baddr, i, src); } } else { m_block->reg[i] = m_finfo->load[i]; } } // Emit state check if necessary (TODO: more conditions) for (u32 pred : bb.preds) { if (pred >= baddr) { // If this block is a target of a backward branch (possibly loop), emit a check need_check = true; break; } } } // State check at the beginning of the chunk if (need_check || (bi == 0 && g_cfg.core.spu_block_size != spu_block_size_type::safe)) { check_state(baddr); } // Emit instructions for (m_pos = baddr; m_pos >= start && m_pos < end && !m_ir->GetInsertBlock()->getTerminator(); m_pos += 4) { if (m_pos != baddr && m_block_info[m_pos / 4]) { break; } const u32 op = std::bit_cast>(func.data[(m_pos - start) / 4]); if (!op) { spu_log.error("[%s] Unexpected fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_hash, m_pos, m_entry, m_function_queue[0]); break; } // Set variable for set_link() if (m_pos + 4 >= end) m_next_op = 0; else m_next_op = func.data[(m_pos - start) / 4 + 1]; // Execute recompiler function (TODO) (this->*decode(op))({op}); } // Finalize block with fallthrough if necessary if (!m_ir->GetInsertBlock()->getTerminator()) { const u32 target = m_pos == baddr ? baddr : m_pos & 0x3fffc; if (m_pos != baddr) { m_pos -= 4; if (target >= start && target < end) { const auto tfound = m_targets.find(m_pos); if (tfound == m_targets.end() || tfound->second.find_first_of(target) + 1 == 0) { spu_log.error("[%s] Unregistered fallthrough to 0x%x (chunk=0x%x, entry=0x%x)", m_hash, target, m_entry, m_function_queue[0]); } } } m_block->block_end = m_ir->GetInsertBlock(); m_ir->CreateBr(add_block(target)); } ensure(m_block->block_end); } // Work on register stores. // 1. Remove stores which are overwritten later. // 2. Sink stores to post-dominating blocks. 
llvm::PostDominatorTree pdt(*m_function); llvm::DominatorTree dt(*m_function); // Post-order indices std::unordered_map pois; { usz i = 0; for (auto* bb : llvm::post_order(m_function)) pois[bb] = i++; } // Basic block to block_info std::unordered_map bb_to_info; std::vector block_q; block_q.reserve(m_blocks.size()); for (auto& [a, b] : m_blocks) { block_q.emplace_back(&b); bb_to_info[b.block] = &b; } for (usz bi = 0; bi < block_q.size();) { auto bqbi = block_q[bi++]; // TODO: process all registers up to s_reg_max for (u32 i = 0; i < 128; i++) { // Check if the store is beyond the last barrier if (auto& bs = bqbi->store[i]; bs && !bqbi->does_gpr_barrier_proceed_last_store(i)) { for (auto& [a, b] : m_blocks) { // Check if the store occurs before any barrier in the block if (b.store[i] && b.store[i] != bs && b.store_context_first_id[i] == 1) { if (pdt.dominates(b.store[i], bs)) { bs->eraseFromParent(); bs = nullptr; break; } } } if (!bs) continue; // Set of store instructions which overwrite bs std::vector killers; for (auto& [a, b] : m_blocks) { const auto si = b.store[i]; if (si && si != bs) { if (pois[bs->getParent()] > pois[si->getParent()]) { killers.emplace_back(si->getParent()); } else { // Reset: store is not the first in the set killers.clear(); break; } } } if (killers.empty()) continue; // Find nearest common post-dominator llvm::BasicBlock* common_pdom = killers[0]; for (auto* bbb : llvm::drop_begin(killers)) { if (!common_pdom) break; common_pdom = pdt.findNearestCommonDominator(common_pdom, bbb); } // Shortcut if (!pdt.dominates(common_pdom, bs->getParent())) common_pdom = nullptr; // Look for possibly-dead store in CFG starting from the exit nodes llvm::SetVector work_list; std::unordered_map worked_on; if (!common_pdom || std::count(killers.begin(), killers.end(), common_pdom) == 0) { if (common_pdom) { // Shortcut work_list.insert(common_pdom); worked_on[common_pdom] = true; } else { // Check all exits for (auto* r : pdt.roots()) { worked_on[r] = true; work_list.insert(r); } } } // bool flag indicates the presence of a memory barrier before the killer store std::vector> work2_list; for (usz wi = 0; wi < work_list.size(); wi++) { auto* cur = work_list[wi]; if (std::count(killers.begin(), killers.end(), cur)) { work2_list.emplace_back(cur, bb_to_info[cur] && bb_to_info[cur]->does_gpr_barrier_preceed_first_store(i)); continue; } if (cur == bs->getParent()) { // Reset: store is not dead killers.clear(); break; } for (auto* p : llvm::predecessors(cur)) { if (!worked_on[p]) { worked_on[p] = true; work_list.insert(p); } } } if (killers.empty()) continue; worked_on.clear(); for (usz wi = 0; wi < work2_list.size(); wi++) { worked_on[work2_list[wi].first] = true; } // Need to treat tails differently: do not require checking barrier (checked before in a suitable manner) const usz work_list_tail_blocks_max_index = work2_list.size(); for (usz wi = 0; wi < work2_list.size(); wi++) { auto [cur, found_user] = work2_list[wi]; ensure(cur != bs->getParent()); if (!found_user && wi >= work_list_tail_blocks_max_index) { if (auto info = bb_to_info[cur]) { if (info->store_context_ctr[i] != 1) { found_user = true; } } } for (auto* p : llvm::predecessors(cur)) { if (p == bs->getParent()) { if (found_user) { // Reset: store is being used and preserved by ensure_gpr_stores() killers.clear(); break; } continue; } if (!worked_on[p]) { worked_on[p] = true; work2_list.push_back(std::make_pair(p, found_user)); } // Enqueue a second iteration for found_user=true if only found with found_user=false else if 
(found_user && !std::find_if(work2_list.rbegin(), work2_list.rend(), [&](auto& it){ return it.first == p; })->second) { work2_list.push_back(std::make_pair(p, true)); } } if (killers.empty()) { break; } } // Finally erase the dead store if (!killers.empty()) { bs->eraseFromParent(); bs = nullptr; // Run the loop from the start bi = 0; } } } } block_q.clear(); for (auto& [a, b] : m_blocks) { block_q.emplace_back(&b); } for (usz bi = 0; bi < block_q.size(); bi++) { for (u32 i = 0; i < 128; i++) { // If store isn't erased, try to sink it if (auto& bs = block_q[bi]->store[i]; bs && block_q[bi]->bb->targets.size() > 1 && !block_q[bi]->does_gpr_barrier_proceed_last_store(i)) { std::map> sucs; for (u32 tj : block_q[bi]->bb->targets) { auto b2it = m_blocks.find(tj); if (b2it != m_blocks.end()) { sucs.emplace(tj, &b2it->second); } } for (auto [a2, b2] : sucs) { if (b2 != block_q[bi]) { auto ins = b2->block->getFirstNonPHI(); if (b2->bb->preds.size() == 1) { if (!dt.dominates(bs->getOperand(0), ins)) continue; if (!pdt.dominates(ins, bs)) continue; m_ir->SetInsertPoint(ins); auto si = llvm::cast(m_ir->Insert(bs->clone())); if (b2->store[i] == nullptr) { b2->store[i] = si; b2->store_context_last_id[i] = 0; if (!std::count(block_q.begin() + bi, block_q.end(), b2)) { // Sunk store can be checked again block_q.push_back(b2); } } } else { // Initialize additional block between two basic blocks auto& edge = block_q[bi]->block_edges[a2]; if (!edge) { const auto succ_range = llvm::successors(block_q[bi]->block_end); auto succ = b2->block; llvm::SmallSetVector succ_q; succ_q.insert(b2->block); for (usz j = 0; j < 32 && j < succ_q.size(); j++) { if (!llvm::count(succ_range, (succ = succ_q[j]))) { for (auto pred : llvm::predecessors(succ)) { succ_q.insert(pred); } } else { break; } } if (!llvm::count(succ_range, succ)) { // TODO: figure this out spu_log.notice("[%s] Failed successor to 0x%05x", fmt::base57(be_t{m_hash_start}), a2); continue; } edge = llvm::SplitEdge(block_q[bi]->block_end, succ); pdt.recalculate(*m_function); dt.recalculate(*m_function); } ins = edge->getTerminator(); if (!dt.dominates(bs->getOperand(0), ins)) continue; if (!pdt.dominates(ins, bs)) continue; m_ir->SetInsertPoint(ins); m_ir->Insert(bs->clone()); } bs->eraseFromParent(); bs = nullptr; pdt.recalculate(*m_function); dt.recalculate(*m_function); break; } } } } } } // Create function table if necessary if (m_function_table->getNumUses()) { std::vector chunks; chunks.reserve(m_size / 4); for (u32 i = start; i < end; i += 4) { const auto found = m_functions.find(i); if (found == m_functions.end()) { if (false && g_cfg.core.spu_verification) { const std::string ppname = fmt::format("%s-chunkpp-0x%05x", m_hash, i); m_engine->updateGlobalMapping(ppname, reinterpret_cast(m_spurt->make_branch_patchpoint(i / 4))); const auto ppfunc = llvm::cast(m_module->getOrInsertFunction(ppname, m_finfo->chunk->getFunctionType()).getCallee()); ppfunc->setCallingConv(m_finfo->chunk->getCallingConv()); chunks.push_back(ppfunc); continue; } chunks.push_back(m_dispatch); continue; } chunks.push_back(found->second.chunk); } m_function_table->setInitializer(llvm::ConstantArray::get(llvm::ArrayType::get(entry_chunk->chunk->getType(), m_size / 4), chunks)); } else { m_function_table->eraseFromParent(); } // Initialize pass manager legacy::FunctionPassManager pm(_module.get()); // Basic optimizations pm.add(createEarlyCSEPass()); pm.add(createCFGSimplificationPass()); //pm.add(createNewGVNPass()); #if LLVM_VERSION_MAJOR < 17 
pm.add(createDeadStoreEliminationPass()); #endif pm.add(createLICMPass()); #if LLVM_VERSION_MAJOR < 17 pm.add(createAggressiveDCEPass()); #else pm.add(createDeadCodeEliminationPass()); #endif //pm.add(createLintPass()); // Check for (auto& f : *m_module) { replace_intrinsics(f); } for (const auto& func : m_functions) { const auto f = func.second.fn ? func.second.fn : func.second.chunk; pm.run(*f); } // Clear context (TODO) m_blocks.clear(); m_block_queue.clear(); m_functions.clear(); m_function_queue.clear(); m_function_table = nullptr; raw_string_ostream out(log); if (g_cfg.core.spu_debug) { fmt::append(log, "LLVM IR at 0x%x:\n", func.entry_point); out << *_module; // print IR out << "\n\n"; } if (verifyModule(*_module, &out)) { out.flush(); spu_log.error("LLVM: Verification failed at 0x%x:\n%s", func.entry_point, log); if (g_cfg.core.spu_debug) { fs::file(m_spurt->get_cache_path() + "spu-ir.log", fs::write + fs::append).write(log); } fmt::throw_exception("Compilation failed"); } #if defined(__APPLE__) pthread_jit_write_protect_np(false); #endif if (g_cfg.core.spu_debug) { // Testing only m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/"); } else { m_jit.add(std::move(_module)); } m_jit.fin(); // Register function pointer const spu_function_t fn = reinterpret_cast(m_jit.get_engine().getPointerToFunction(main_func)); // Install unconditionally, possibly replacing existing one from spu_fast add_loc->compiled = fn; // Rebuild trampoline if necessary if (!m_spurt->rebuild_ubertrampoline(func.data[0])) { return nullptr; } add_loc->compiled.notify_all(); if (g_cfg.core.spu_debug) { out.flush(); fs::write_file(m_spurt->get_cache_path() + "spu-ir.log", fs::create + fs::write + fs::append, log); } #if defined(__APPLE__) pthread_jit_write_protect_np(true); #endif #if defined(ARCH_ARM64) // Flush all cache lines after potentially writing executable code asm("ISB"); asm("DSB ISH"); #endif if (g_fxo->get().operator bool()) { spu_log.success("New block compiled successfully"); } return fn; } static void interp_check(spu_thread* _spu, bool after) { static thread_local std::array s_gpr; if (!after) { // Preserve reg state s_gpr = _spu->gpr; // Execute interpreter instruction const u32 op = *reinterpret_cast*>(_spu->_ptr(0) + _spu->pc); if (!g_fxo->get().decode(op)(*_spu, {op})) spu_log.fatal("Bad instruction"); // Swap state for (u32 i = 0; i < s_gpr.size(); ++i) std::swap(_spu->gpr[i], s_gpr[i]); } else { // Check saved state for (u32 i = 0; i < s_gpr.size(); ++i) { if (_spu->gpr[i] != s_gpr[i]) { spu_log.fatal("Register mismatch: $%u\n%s\n%s", i, _spu->gpr[i], s_gpr[i]); _spu->state += cpu_flag::dbg_pause; } } } } spu_function_t compile_interpreter() { using namespace llvm; m_engine->clearAllGlobalMappings(); // Create LLVM module std::unique_ptr _module = std::make_unique("spu_interpreter.obj", m_context); _module->setTargetTriple(jit_compiler::triple2()); _module->setDataLayout(m_jit.get_engine().getTargetMachine()->createDataLayout()); m_module = _module.get(); // Initialize IR Builder IRBuilder<> irb(m_context); m_ir = &irb; // Create interpreter table const auto if_type = get_ftype(); m_function_table = new GlobalVariable(*m_module, ArrayType::get(if_type->getPointerTo(), 1ull << m_interp_magn), true, GlobalValue::InternalLinkage, nullptr); init_luts(); // Add return function const auto ret_func = cast(_module->getOrInsertFunction("spu_ret", if_type).getCallee()); ret_func->setCallingConv(CallingConv::GHC); ret_func->setLinkage(GlobalValue::InternalLinkage); 
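	// The return stub's emitted body does nothing but return, terminating the GHC tail-call chain of interpreter handlers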
m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", ret_func)); m_thread = ret_func->getArg(1); m_interp_pc = ret_func->getArg(2); m_ir->CreateRetVoid(); // Add entry function, serves as a trampoline const auto main_func = llvm::cast(m_module->getOrInsertFunction("spu_interpreter", get_ftype()).getCallee()); #ifdef _WIN32 main_func->setCallingConv(CallingConv::Win64); #endif set_function(main_func); // Load pc and opcode m_interp_pc = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::pc)); m_interp_op = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, m_ir->CreateZExt(m_interp_pc, get_type()))); m_interp_op = m_ir->CreateCall(get_intrinsic(Intrinsic::bswap), {m_interp_op}); // Pinned constant, address of interpreter table m_interp_table = m_ir->CreateGEP(m_function_table->getValueType(), m_function_table, {m_ir->getInt64(0), m_ir->getInt64(0)}); // Pinned constant, mask for shifted register index m_interp_7f0 = m_ir->getInt32(0x7f0); // Pinned constant, address of first register m_interp_regs = _ptr(m_thread, get_reg_offset(0)); // Save host thread's stack pointer const auto native_sp = spu_ptr(&spu_thread::saved_native_sp); #if defined(ARCH_X64) const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")})); #elif defined(ARCH_ARM64) const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "sp")})); #endif m_ir->CreateStore(m_ir->CreateCall(get_intrinsic(Intrinsic::read_register), {rsp_name}), native_sp); // Decode (shift) and load function pointer const auto first = m_ir->CreateLoad(if_type->getPointerTo(), m_ir->CreateGEP(if_type->getPointerTo(), m_interp_table, m_ir->CreateLShr(m_interp_op, 32u - m_interp_magn))); const auto call0 = m_ir->CreateCall(if_type, first, {m_lsptr, m_thread, m_interp_pc, m_interp_op, m_interp_table, m_interp_7f0, m_interp_regs}); call0->setCallingConv(CallingConv::GHC); m_ir->CreateRetVoid(); // Create helper globals { std::vector float_to; std::vector to_float; float_to.reserve(256); to_float.reserve(256); for (int i = 0; i < 256; ++i) { float_to.push_back(ConstantFP::get(get_type(), std::exp2(173 - i))); to_float.push_back(ConstantFP::get(get_type(), std::exp2(i - 155))); } const auto atype = ArrayType::get(get_type(), 256); m_scale_float_to = new GlobalVariable(*m_module, atype, true, GlobalValue::InternalLinkage, ConstantArray::get(atype, float_to)); m_scale_to_float = new GlobalVariable(*m_module, atype, true, GlobalValue::InternalLinkage, ConstantArray::get(atype, to_float)); } // Fill interpreter table std::array ifuncs{}; std::vector iptrs; iptrs.reserve(1ull << m_interp_magn); m_block = nullptr; auto last_itype = spu_itype::type{255}; for (u32 i = 0; i < 1u << m_interp_magn;) { // Fake opcode const u32 op = i << (32u - m_interp_magn); // Instruction type const auto itype = g_spu_itype.decode(op); // Function name std::string fname = fmt::format("spu_%s", g_spu_iname.decode(op)); if (last_itype != itype) { // Trigger automatic information collection (probing) m_op_const_mask = 0; } else { // Inject const mask into function name fmt::append(fname, "_%X", (i & (m_op_const_mask >> (32u - m_interp_magn))) | (1u << m_interp_magn)); } // Decode instruction name, access function const auto f = cast(_module->getOrInsertFunction(fname, if_type).getCallee()); // Build if necessary if (f->empty()) { if (last_itype != itype) { ifuncs[static_cast(itype)] = f; } f->setCallingConv(CallingConv::GHC); m_function = f; m_lsptr = f->getArg(0); m_thread = 
f->getArg(1); m_interp_pc = f->getArg(2); m_interp_op = f->getArg(3); m_interp_table = f->getArg(4); m_interp_7f0 = f->getArg(5); m_interp_regs = f->getArg(6); m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", f)); m_memptr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::memory_base_addr)); switch (itype) { case spu_itype::UNK: case spu_itype::DFCEQ: case spu_itype::DFCMEQ: case spu_itype::DFCGT: case spu_itype::DFCMGT: case spu_itype::DFTSV: case spu_itype::STOP: case spu_itype::STOPD: case spu_itype::RDCH: case spu_itype::WRCH: { // Invalid or abortable instruction. Save current address. m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); [[fallthrough]]; } default: { break; } } { m_interp_bblock = nullptr; // Next instruction (no wraparound at the end of LS) m_interp_pc_next = m_ir->CreateAdd(m_interp_pc, m_ir->getInt32(4)); bool check = false; if (itype == spu_itype::WRCH || itype == spu_itype::RDCH || itype == spu_itype::RCHCNT || itype == spu_itype::STOP || itype == spu_itype::STOPD || itype & spu_itype::floating || itype & spu_itype::branch) { check = false; } if (itype & spu_itype::branch) { // Instruction changes pc - change order. (this->*decode(op))({op}); if (m_interp_bblock) { m_ir->SetInsertPoint(m_interp_bblock); m_interp_bblock = nullptr; } } if (!m_ir->GetInsertBlock()->getTerminator()) { if (check) { m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); } // Decode next instruction. const auto next_pc = itype & spu_itype::branch ? m_interp_pc : m_interp_pc_next; const auto be32_op = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, m_ir->CreateZExt(next_pc, get_type()))); const auto next_op = m_ir->CreateCall(get_intrinsic(Intrinsic::bswap), {be32_op}); const auto next_if = m_ir->CreateLoad(if_type->getPointerTo(), m_ir->CreateGEP(if_type->getPointerTo(), m_interp_table, m_ir->CreateLShr(next_op, 32u - m_interp_magn))); llvm::cast(next_if)->setVolatile(true); if (!(itype & spu_itype::branch)) { if (check) { call("spu_interp_check", &interp_check, m_thread, m_ir->getFalse()); } // Normal instruction. 
(this->*decode(op))({op}); if (check && !m_ir->GetInsertBlock()->getTerminator()) { call("spu_interp_check", &interp_check, m_thread, m_ir->getTrue()); } m_interp_pc = m_interp_pc_next; } if (last_itype != itype) { // Reset to discard dead code llvm::cast(next_if)->setVolatile(false); if (itype & spu_itype::branch) { const auto _stop = BasicBlock::Create(m_context, "", f); const auto _next = BasicBlock::Create(m_context, "", f); m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::state))), _stop, _next, m_md_unlikely); m_ir->SetInsertPoint(_stop); m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); const auto escape_yes = BasicBlock::Create(m_context, "", f); const auto escape_no = BasicBlock::Create(m_context, "", f); m_ir->CreateCondBr(call("spu_exec_check_state", &exec_check_state, m_thread), escape_yes, escape_no); m_ir->SetInsertPoint(escape_yes); call("spu_escape", spu_runtime::g_escape, m_thread); m_ir->CreateBr(_next); m_ir->SetInsertPoint(escape_no); m_ir->CreateBr(_next); m_ir->SetInsertPoint(_next); } llvm::Value* fret = m_interp_table; if (itype == spu_itype::WRCH || itype == spu_itype::RDCH || itype == spu_itype::RCHCNT || itype == spu_itype::STOP || itype == spu_itype::STOPD || itype == spu_itype::UNK || itype == spu_itype::DFCMEQ || itype == spu_itype::DFCMGT || itype == spu_itype::DFCGT || itype == spu_itype::DFCEQ || itype == spu_itype::DFTSV) { m_interp_7f0 = m_ir->getInt32(0x7f0); m_interp_regs = _ptr(m_thread, get_reg_offset(0)); fret = ret_func; } else if (!(itype & spu_itype::branch)) { // Hack: inline ret instruction before final jmp; this is not reliable. #ifdef ARCH_X64 m_ir->CreateCall(InlineAsm::get(get_ftype(), "ret", "", true, false, InlineAsm::AD_Intel)); #else m_ir->CreateCall(InlineAsm::get(get_ftype(), "ret", "", true, false)); #endif fret = ret_func; } const auto arg3 = UndefValue::get(get_type()); const auto _ret = m_ir->CreateCall(if_type, fret, {m_lsptr, m_thread, m_interp_pc, arg3, m_interp_table, m_interp_7f0, m_interp_regs}); _ret->setCallingConv(CallingConv::GHC); _ret->setTailCall(); m_ir->CreateRetVoid(); } if (!m_ir->GetInsertBlock()->getTerminator()) { // Call next instruction. 
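// If the handler body fell through without a terminator, chain to the pre-decoded handler:
// spu_thread::state is tested first (the unlikely path stores pc and escapes back to the caller),
// otherwise next_if is tail-called with the GHC calling convention so the handler chain does not
// grow the host stack.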
const auto _stop = BasicBlock::Create(m_context, "", f); const auto _next = BasicBlock::Create(m_context, "", f); m_ir->CreateCondBr(m_ir->CreateIsNotNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::state))), _stop, _next, m_md_unlikely); m_ir->SetInsertPoint(_next); if (itype == spu_itype::WRCH || itype == spu_itype::RDCH || itype == spu_itype::RCHCNT || itype == spu_itype::STOP || itype == spu_itype::STOPD) { m_interp_7f0 = m_ir->getInt32(0x7f0); m_interp_regs = _ptr(m_thread, get_reg_offset(0)); } const auto ncall = m_ir->CreateCall(if_type, next_if, {m_lsptr, m_thread, m_interp_pc, next_op, m_interp_table, m_interp_7f0, m_interp_regs}); ncall->setCallingConv(CallingConv::GHC); ncall->setTailCall(); m_ir->CreateRetVoid(); m_ir->SetInsertPoint(_stop); m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); call("spu_escape", spu_runtime::g_escape, m_thread)->setTailCall(); m_ir->CreateRetVoid(); } } } } if (last_itype != itype && g_cfg.core.spu_decoder != spu_decoder_type::llvm) { // Repeat after probing last_itype = itype; } else { // Add to the table iptrs.push_back(f); i++; } } m_function_table->setInitializer(ConstantArray::get(ArrayType::get(if_type->getPointerTo(), 1ull << m_interp_magn), iptrs)); m_function_table = nullptr; // Initialize pass manager legacy::FunctionPassManager pm(_module.get()); // Basic optimizations pm.add(createEarlyCSEPass()); pm.add(createCFGSimplificationPass()); #if LLVM_VERSION_MAJOR < 17 pm.add(createDeadStoreEliminationPass()); pm.add(createAggressiveDCEPass()); #else pm.add(createDeadCodeEliminationPass()); #endif //pm.add(createLintPass()); for (auto& f : *_module) { replace_intrinsics(f); //pm.run(f); } std::string log; raw_string_ostream out(log); if (g_cfg.core.spu_debug) { fmt::append(log, "LLVM IR (interpreter):\n"); out << *_module; // print IR out << "\n\n"; } if (verifyModule(*_module, &out)) { out.flush(); spu_log.error("LLVM: Verification failed:\n%s", log); if (g_cfg.core.spu_debug) { fs::write_file(m_spurt->get_cache_path() + "spu-ir.log", fs::create + fs::write + fs::append, log); } fmt::throw_exception("Compilation failed"); } if (g_cfg.core.spu_debug) { // Testing only m_jit.add(std::move(_module), m_spurt->get_cache_path() + "llvm/"); } else { m_jit.add(std::move(_module)); } m_jit.fin(); // Register interpreter entry point spu_runtime::g_interpreter = reinterpret_cast(m_jit.get_engine().getPointerToFunction(main_func)); for (u32 i = 0; i < spu_runtime::g_interpreter_table.size(); i++) { // Fill exported interpreter table spu_runtime::g_interpreter_table[i] = ifuncs[i] ? 
reinterpret_cast(m_jit.get_engine().getPointerToFunction(ifuncs[i])) : 0; } if (!spu_runtime::g_interpreter) { return nullptr; } if (g_cfg.core.spu_debug) { out.flush(); fs::write_file(m_spurt->get_cache_path() + "spu-ir.log", fs::create + fs::write + fs::append, log); } return spu_runtime::g_interpreter; } static bool exec_check_state(spu_thread* _spu) { return _spu->check_state(); } template static void exec_fall(spu_thread* _spu, spu_opcode_t op) { if (F(*_spu, op)) { _spu->pc += 4; } } template void fall(spu_opcode_t op) { std::string name = fmt::format("spu_%s", g_spu_iname.decode(op.opcode)); if (m_interp_magn) { call(name, F, m_thread, m_interp_op); return; } update_pc(); call(name, &exec_fall, m_thread, m_ir->getInt32(op.opcode)); } [[noreturn]] static void exec_unk(spu_thread*, u32 op) { fmt::throw_exception("Unknown/Illegal instruction (0x%08x)", op); } void UNK(spu_opcode_t op_unk) { if (m_interp_magn) { m_ir->CreateStore(m_interp_pc, spu_ptr(&spu_thread::pc)); call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); return; } m_block->block_end = m_ir->GetInsertBlock(); update_pc(); call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); } static void exec_stop(spu_thread* _spu, u32 code) { if (!_spu->stop_and_signal(code) || _spu->state & cpu_flag::again) { spu_runtime::g_escape(_spu); } if (_spu->test_stopped()) { _spu->pc += 4; spu_runtime::g_escape(_spu); } } void STOP(spu_opcode_t op) // { if (m_interp_magn) { call("spu_syscall", &exec_stop, m_thread, m_ir->CreateAnd(m_interp_op, m_ir->getInt32(0x3fff))); return; } update_pc(); call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(op.opcode & 0x3fff)); if (g_cfg.core.spu_block_size == spu_block_size_type::safe) { m_block->block_end = m_ir->GetInsertBlock(); update_pc(m_pos + 4); ensure_gpr_stores(); tail_chunk(m_dispatch); return; } } void STOPD(spu_opcode_t) // { if (m_interp_magn) { call("spu_syscall", &exec_stop, m_thread, m_ir->getInt32(0x3fff)); return; } STOP(spu_opcode_t{0x3fff}); } static u32 exec_rdch(spu_thread* _spu, u32 ch) { const s64 result = _spu->get_ch_value(ch); if (result < 0 || _spu->state & cpu_flag::again) { spu_runtime::g_escape(_spu); } static_cast(_spu->test_stopped()); return static_cast(result & 0xffffffff); } static u32 exec_read_in_mbox(spu_thread* _spu) { // TODO return exec_rdch(_spu, SPU_RdInMbox); } static u32 exec_read_dec(spu_thread* _spu) { const u32 res = _spu->read_dec().first; if (res > 1500 && g_cfg.core.spu_loop_detection) { _spu->state += cpu_flag::wait; std::this_thread::yield(); static_cast(_spu->test_stopped()); } return res; } static u32 exec_read_events(spu_thread* _spu) { // TODO return exec_rdch(_spu, SPU_RdEventStat); } void ensure_gpr_stores() { if (m_block) { // Make previous stores not able to be reordered beyond this point or be deleted std::for_each(m_block->store_context_ctr.begin(), m_block->store_context_ctr.end(), FN(x++)); } } llvm::Value* get_rdch(spu_opcode_t op, u32 off, bool atomic) { const auto ptr = _ptr(m_thread, off); llvm::Value* val0; if (atomic) { const auto val = m_ir->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, ptr, m_ir->getInt64(0), llvm::MaybeAlign{8}, llvm::AtomicOrdering::Acquire); val0 = val; } else { const auto val = m_ir->CreateLoad(get_type(), ptr); val->setAtomic(llvm::AtomicOrdering::Acquire); m_ir->CreateStore(m_ir->getInt64(0), ptr)->setAtomic(llvm::AtomicOrdering::Release); val0 = val; } const auto _cur = m_ir->GetInsertBlock(); const auto done = llvm::BasicBlock::Create(m_context, "", m_function); 
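// Fast path sketch: the 64-bit channel word carries the count flag in its high half, so a negative
// raw value means the channel already holds data and its low 32 bits are returned directly in the
// "done" block; otherwise control falls into the "wait" block created below, which calls exec_rdch
// and may block or escape.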
const auto wait = llvm::BasicBlock::Create(m_context, "", m_function); const auto cond = m_ir->CreateICmpSLT(val0, m_ir->getInt64(0)); val0 = m_ir->CreateTrunc(val0, get_type()); m_ir->CreateCondBr(cond, done, wait); m_ir->SetInsertPoint(wait); update_pc(); const auto val1 = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra)); m_ir->CreateBr(done); m_ir->SetInsertPoint(done); const auto rval = m_ir->CreatePHI(get_type(), 2); rval->addIncoming(val0, _cur); rval->addIncoming(val1, wait); return rval; } void RDCH(spu_opcode_t op) // { value_t res; if (m_interp_magn) { res.value = call("spu_read_channel", &exec_rdch, m_thread, get_imm(op.ra).value); set_vr(op.rt, insert(splat(0), 3, res)); return; } switch (op.ra) { case SPU_RdSRR0: { res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::srr0)); break; } case SPU_RdInMbox: { update_pc(); ensure_gpr_stores(); res.value = call("spu_read_in_mbox", &exec_read_in_mbox, m_thread); break; } case MFC_RdTagStat: { res.value = get_rdch(op, ::offset32(&spu_thread::ch_tag_stat), false); break; } case MFC_RdTagMask: { res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_mask)); break; } case SPU_RdSigNotify1: { update_pc(); ensure_gpr_stores(); res.value = get_rdch(op, ::offset32(&spu_thread::ch_snr1), true); break; } case SPU_RdSigNotify2: { update_pc(); ensure_gpr_stores(); res.value = get_rdch(op, ::offset32(&spu_thread::ch_snr2), true); break; } case MFC_RdAtomicStat: { res.value = get_rdch(op, ::offset32(&spu_thread::ch_atomic_stat), false); break; } case MFC_RdListStallStat: { res.value = get_rdch(op, ::offset32(&spu_thread::ch_stall_stat), false); break; } case SPU_RdDec: { #if defined(ARCH_X64) if (utils::get_tsc_freq() && !(g_cfg.core.spu_loop_detection) && (g_cfg.core.clocks_scale == 100)) { const auto timestamp = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_dec_start_timestamp)); const auto dec_value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_dec_value)); const auto tsc = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_rdtsc)); const auto tscx = m_ir->CreateMul(m_ir->CreateUDiv(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)); const auto tscm = m_ir->CreateUDiv(m_ir->CreateMul(m_ir->CreateURem(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)), m_ir->getInt64(utils::get_tsc_freq())); const auto tsctb = m_ir->CreateAdd(tscx, tscm); const auto frz = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::is_dec_frozen)); const auto frzev = m_ir->CreateICmpEQ(frz, m_ir->getInt8(0)); const auto delta = m_ir->CreateTrunc(m_ir->CreateSub(tsctb, timestamp), get_type()); const auto deltax = m_ir->CreateSelect(frzev, delta, m_ir->getInt32(0)); res.value = m_ir->CreateSub(dec_value, deltax); break; } #endif res.value = call("spu_read_decrementer", &exec_read_dec, m_thread); break; } case SPU_RdEventMask: { const auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events)); value->setAtomic(llvm::AtomicOrdering::Acquire); res.value = m_ir->CreateTrunc(m_ir->CreateLShr(value, 32), get_type()); break; } case SPU_RdEventStat: { update_pc(); if (g_cfg.savestate.compatible_mode) { ensure_gpr_stores(); } else { m_ir->CreateStore(m_ir->getInt8(1), spu_ptr(&spu_thread::unsavable)); } res.value = call("spu_read_events", &exec_read_events, m_thread); if (!g_cfg.savestate.compatible_mode) { m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::unsavable)); } break; } case SPU_RdMachStat: { res.value = m_ir->CreateZExt(m_ir->CreateLoad(get_type(), 
spu_ptr(&spu_thread::interrupts_enabled)), get_type()); res.value = m_ir->CreateOr(res.value, m_ir->CreateAnd(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::thread_type)), m_ir->getInt32(2))); break; } default: { update_pc(); ensure_gpr_stores(); res.value = call("spu_read_channel", &exec_rdch, m_thread, m_ir->getInt32(op.ra)); break; } } set_vr(op.rt, insert(splat(0), 3, res)); } static u32 exec_rchcnt(spu_thread* _spu, u32 ch) { return _spu->get_ch_count(ch); } static u32 exec_get_events(spu_thread* _spu, u32 mask) { return _spu->get_events(mask).count; } llvm::Value* get_rchcnt(u32 off, u64 inv = 0) { const auto val = m_ir->CreateLoad(get_type(), _ptr(m_thread, off)); val->setAtomic(llvm::AtomicOrdering::Acquire); const auto shv = m_ir->CreateLShr(val, spu_channel::off_count); return m_ir->CreateTrunc(m_ir->CreateXor(shv, u64{inv}), get_type()); } void RCHCNT(spu_opcode_t op) // { value_t res; if (m_interp_magn) { res.value = call("spu_read_channel_count", &exec_rchcnt, m_thread, get_imm(op.ra).value); set_vr(op.rt, insert(splat(0), 3, res)); return; } switch (op.ra) { case SPU_WrOutMbox: { res.value = get_rchcnt(::offset32(&spu_thread::ch_out_mbox), true); break; } case SPU_WrOutIntrMbox: { res.value = get_rchcnt(::offset32(&spu_thread::ch_out_intr_mbox), true); break; } case MFC_RdTagStat: { res.value = get_rchcnt(::offset32(&spu_thread::ch_tag_stat)); break; } case MFC_RdListStallStat: { res.value = get_rchcnt(::offset32(&spu_thread::ch_stall_stat)); break; } case SPU_RdSigNotify1: { res.value = get_rchcnt(::offset32(&spu_thread::ch_snr1)); break; } case SPU_RdSigNotify2: { res.value = get_rchcnt(::offset32(&spu_thread::ch_snr2)); break; } case MFC_RdAtomicStat: { res.value = get_rchcnt(::offset32(&spu_thread::ch_atomic_stat)); break; } case MFC_WrTagUpdate: { res.value = m_ir->getInt32(1); break; } case MFC_Cmd: { res.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size)); res.value = m_ir->CreateSub(m_ir->getInt32(16), res.value); break; } case SPU_RdInMbox: { const auto value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_in_mbox)); value->setAtomic(llvm::AtomicOrdering::Acquire); res.value = value; res.value = m_ir->CreateLShr(res.value, 8); res.value = m_ir->CreateAnd(res.value, 7); break; } case SPU_RdEventStat: { const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events)), 32), get_type()); res.value = call("spu_get_events", &exec_get_events, m_thread, mask); break; } // Channels with a constant count of 1: case SPU_WrEventMask: case SPU_WrEventAck: case SPU_WrDec: case SPU_RdDec: case SPU_RdEventMask: case SPU_RdMachStat: case SPU_WrSRR0: case SPU_RdSRR0: case SPU_Set_Bkmk_Tag: case SPU_PM_Start_Ev: case SPU_PM_Stop_Ev: case MFC_RdTagMask: case MFC_LSA: case MFC_EAH: case MFC_EAL: case MFC_Size: case MFC_TagID: case MFC_WrTagMask: case MFC_WrListStallAck: { res.value = m_ir->getInt32(1); break; } default: { res.value = call("spu_read_channel_count", &exec_rchcnt, m_thread, m_ir->getInt32(op.ra)); break; } } set_vr(op.rt, insert(splat(0), 3, res)); } static void exec_wrch(spu_thread* _spu, u32 ch, u32 value) { if (!_spu->set_ch_value(ch, value) || _spu->state & cpu_flag::again) { spu_runtime::g_escape(_spu); } static_cast(_spu->test_stopped()); } static void exec_list_unstall(spu_thread* _spu, u32 tag) { for (u32 i = 0; i < _spu->mfc_size; i++) { if (_spu->mfc_queue[i].tag == (tag | 0x80)) { _spu->mfc_queue[i].tag &= 0x7f; } } _spu->do_mfc(); } static void exec_mfc_cmd(spu_thread* _spu) { if 
(!_spu->process_mfc_cmd() || _spu->state & cpu_flag::again) { spu_runtime::g_escape(_spu); } static_cast(_spu->test_stopped()); } void WRCH(spu_opcode_t op) // { const auto val = eval(extract(get_vr(op.rt), 3)); if (m_interp_magn) { call("spu_write_channel", &exec_wrch, m_thread, get_imm(op.ra).value, val.value); return; } switch (op.ra) { case SPU_WrSRR0: { m_ir->CreateStore(eval(val & 0x3fffc).value, spu_ptr(&spu_thread::srr0)); return; } case SPU_WrOutIntrMbox: { // TODO break; } case SPU_WrOutMbox: { // TODO break; } case MFC_WrTagMask: { // TODO m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_tag_mask)); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto _mfc = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_upd)), m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE)), _mfc, next); m_ir->SetInsertPoint(_mfc); update_pc(); call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); m_ir->CreateBr(next); m_ir->SetInsertPoint(next); return; } case MFC_WrTagUpdate: { if (true) { const auto tag_mask = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_tag_mask)); const auto mfc_fence = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_fence)); const auto completed = m_ir->CreateAnd(tag_mask, m_ir->CreateNot(mfc_fence)); const auto upd_ptr = spu_ptr(&spu_thread::ch_tag_upd); const auto stat_ptr = spu_ptr(&spu_thread::ch_tag_stat); const auto stat_val = m_ir->CreateOr(m_ir->CreateZExt(completed, get_type()), s64{smin}); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto next0 = llvm::BasicBlock::Create(m_context, "", m_function); const auto imm = llvm::BasicBlock::Create(m_context, "", m_function); const auto any = llvm::BasicBlock::Create(m_context, "", m_function); const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); const auto update = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpEQ(val.value, m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE)), imm, next0); m_ir->SetInsertPoint(imm); m_ir->CreateStore(val.value, upd_ptr); m_ir->CreateStore(stat_val, stat_ptr); m_ir->CreateBr(next); m_ir->SetInsertPoint(next0); m_ir->CreateCondBr(m_ir->CreateICmpULE(val.value, m_ir->getInt32(MFC_TAG_UPDATE_ALL)), any, fail, m_md_likely); // Illegal update, access violate with special address m_ir->SetInsertPoint(fail); const auto ptr = _ptr(m_memptr, 0xffdead04); m_ir->CreateStore(m_ir->getInt32("TAG\0"_u32), ptr); m_ir->CreateBr(next); m_ir->SetInsertPoint(any); const auto cond = m_ir->CreateSelect(m_ir->CreateICmpEQ(val.value, m_ir->getInt32(MFC_TAG_UPDATE_ANY)) , m_ir->CreateICmpNE(completed, m_ir->getInt32(0)), m_ir->CreateICmpEQ(completed, tag_mask)); m_ir->CreateStore(m_ir->CreateSelect(cond, m_ir->getInt32(MFC_TAG_UPDATE_IMMEDIATE), val.value), upd_ptr); m_ir->CreateCondBr(cond, update, next, m_md_likely); m_ir->SetInsertPoint(update); m_ir->CreateStore(stat_val, stat_ptr); m_ir->CreateBr(next); m_ir->SetInsertPoint(next); return; } } case MFC_LSA: { set_reg_fixed(s_reg_mfc_lsa, val.value); return; } case MFC_EAH: { if (auto ci = llvm::dyn_cast(val.value)) { if (ci->getZExtValue() == 0) { return; } } spu_log.warning("[0x%x] MFC_EAH: $%u is not a zero constant", m_pos, +op.rt); //m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::eah)); return; } case MFC_EAL: { set_reg_fixed(s_reg_mfc_eal, val.value); return; } case MFC_Size: { 
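// As with MFC_LSA/MFC_EAL above, the written value is only latched into a fixed shadow register
// (s_reg_mfc_size); MFC_Cmd reads these shadow registers back when the command is issued, so no
// store into the thread context is needed here.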
set_reg_fixed(s_reg_mfc_size, trunc(val).eval(m_ir)); return; } case MFC_TagID: { set_reg_fixed(s_reg_mfc_tag, trunc(val & 0x1f).eval(m_ir)); return; } case MFC_Cmd: { // Prevent store elimination (TODO) m_block->store_context_ctr[s_reg_mfc_eal]++; m_block->store_context_ctr[s_reg_mfc_lsa]++; m_block->store_context_ctr[s_reg_mfc_tag]++; m_block->store_context_ctr[s_reg_mfc_size]++; if (auto ci = llvm::dyn_cast(trunc(val).eval(m_ir))) { if (g_cfg.core.mfc_debug) { break; } bool must_use_cpp_functions = !!g_cfg.core.spu_accurate_dma; if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); g_cfg.core.rsx_fifo_accuracy || g_cfg.video.strict_rendering_mode || !g_use_rtm) { // TODO: don't require TSX (current implementation is TSX-only) if (cmdh == MFC_PUT_CMD || cmdh == MFC_SNDSIG_CMD) { must_use_cpp_functions = true; } } const auto eal = get_reg_fixed(s_reg_mfc_eal); const auto lsa = get_reg_fixed(s_reg_mfc_lsa); const auto tag = get_reg_fixed(s_reg_mfc_tag); const auto size = get_reg_fixed(s_reg_mfc_size); const auto mask = m_ir->CreateShl(m_ir->getInt32(1), zext(tag).eval(m_ir)); const auto exec = llvm::BasicBlock::Create(m_context, "", m_function); const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto pf = spu_ptr(&spu_thread::mfc_fence); const auto pb = spu_ptr(&spu_thread::mfc_barrier); switch (u64 cmd = ci->getZExtValue()) { case MFC_SDCRT_CMD: case MFC_SDCRTST_CMD: { return; } case MFC_PUTL_CMD: case MFC_PUTLB_CMD: case MFC_PUTLF_CMD: case MFC_PUTRL_CMD: case MFC_PUTRLB_CMD: case MFC_PUTRLF_CMD: case MFC_GETL_CMD: case MFC_GETLB_CMD: case MFC_GETLF_CMD: { ensure_gpr_stores(); [[fallthrough]]; } case MFC_SDCRZ_CMD: case MFC_GETLLAR_CMD: case MFC_PUTLLC_CMD: case MFC_PUTLLUC_CMD: case MFC_PUTQLLUC_CMD: { // TODO m_ir->CreateBr(next); m_ir->SetInsertPoint(exec); m_ir->CreateUnreachable(); m_ir->SetInsertPoint(fail); m_ir->CreateUnreachable(); m_ir->SetInsertPoint(next); m_ir->CreateStore(ci, spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); update_pc(); call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); return; } case MFC_SNDSIG_CMD: case MFC_SNDSIGB_CMD: case MFC_SNDSIGF_CMD: case MFC_PUT_CMD: case MFC_PUTB_CMD: case MFC_PUTF_CMD: case MFC_PUTR_CMD: case MFC_PUTRB_CMD: case MFC_PUTRF_CMD: case MFC_GET_CMD: case MFC_GETB_CMD: case MFC_GETF_CMD: { // Try to obtain constant size u64 csize = -1; if (auto ci = llvm::dyn_cast(size.value)) { csize = ci->getZExtValue(); } if (cmd >= MFC_SNDSIG_CMD && csize != 4) { csize = -1; } llvm::Value* src = m_ir->CreateGEP(get_type(), m_lsptr, zext(lsa).eval(m_ir)); llvm::Value* dst = m_ir->CreateGEP(get_type(), m_memptr, zext(eal).eval(m_ir)); if (cmd & MFC_GET_CMD) { std::swap(src, dst); } llvm::Value* barrier = m_ir->CreateLoad(get_type(), pb); if (cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK)) { barrier = m_ir->CreateOr(barrier, m_ir->CreateLoad(get_type(), pf)); } const auto cond = m_ir->CreateIsNull(m_ir->CreateAnd(mask, barrier)); m_ir->CreateCondBr(cond, exec, fail, m_md_likely); m_ir->SetInsertPoint(exec); const auto copy = llvm::BasicBlock::Create(m_context, "", m_function); // Always use interpreter function for MFC debug option if (!must_use_cpp_functions) { const auto mmio = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpUGE(eal.value, m_ir->getInt32(0xe0000000)), mmio, copy, m_md_unlikely); m_ir->SetInsertPoint(mmio); } m_ir->CreateStore(ci, 
spu_ptr(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd)); call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread); m_ir->CreateBr(next); m_ir->SetInsertPoint(copy); llvm::Type* vtype = get_type(); switch (csize) { case 0: case umax: { break; } case 1: { vtype = get_type(); break; } case 2: { vtype = get_type(); break; } case 4: { vtype = get_type(); break; } case 8: { vtype = get_type(); break; } default: { if (csize % 16 || csize > 0x4000) { spu_log.error("[0x%x] MFC_Cmd: invalid size %u", m_pos, csize); } } } // Check if the LS address is constant and 256 bit aligned u64 clsa = umax; if (auto ci = llvm::dyn_cast(lsa.value)) { clsa = ci->getZExtValue(); } u32 stride = 16; if (m_use_avx && csize >= 32 && !(clsa % 32)) { vtype = get_type(); stride = 32; } if (csize > 0 && csize <= 16) { // Generate single copy operation m_ir->CreateStore(m_ir->CreateLoad(vtype, src), dst); } else if (csize <= stride * 16 && !(csize % 32)) { // Generate fixed sequence of copy operations for (u32 i = 0; i < csize; i += stride) { const auto _src = m_ir->CreateGEP(get_type(), src, m_ir->getInt32(i)); const auto _dst = m_ir->CreateGEP(get_type(), dst, m_ir->getInt32(i)); if (csize - i < stride) { m_ir->CreateStore(m_ir->CreateLoad(get_type(), _src), _dst); } else { m_ir->CreateAlignedStore(m_ir->CreateAlignedLoad(vtype, _src, llvm::MaybeAlign{16}), _dst, llvm::MaybeAlign{16}); } } } else if (csize) { // TODO auto spu_memcpy = [](u8* dst, const u8* src, u32 size) { std::memcpy(dst, src, size); }; call("spu_memcpy", +spu_memcpy, dst, src, zext(size).eval(m_ir)); } // Disable certain thing m_ir->CreateStore(m_ir->getInt32(0), spu_ptr(&spu_thread::last_faddr)); m_ir->CreateBr(next); break; } case MFC_BARRIER_CMD: case MFC_EIEIO_CMD: case MFC_SYNC_CMD: { const auto cond = m_ir->CreateIsNull(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size))); m_ir->CreateCondBr(cond, exec, fail, m_md_likely); m_ir->SetInsertPoint(exec); m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); m_ir->CreateBr(next); break; } default: { // TODO spu_log.error("[0x%x] MFC_Cmd: unknown command (0x%x)", m_pos, cmd); m_ir->CreateBr(next); m_ir->SetInsertPoint(exec); m_ir->CreateUnreachable(); break; } } // Fallback: enqueue the command m_ir->SetInsertPoint(fail); // Get MFC slot, redirect to invalid memory address const auto slot = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::mfc_size)); const auto off0 = m_ir->CreateAdd(m_ir->CreateMul(slot, m_ir->getInt32(sizeof(spu_mfc_cmd))), m_ir->getInt32(::offset32(&spu_thread::mfc_queue))); const auto ptr0 = m_ir->CreateGEP(get_type(), m_thread, m_ir->CreateZExt(off0, get_type())); const auto ptr1 = m_ir->CreateGEP(get_type(), m_memptr, m_ir->getInt64(0xffdeadf0)); const auto pmfc = m_ir->CreateSelect(m_ir->CreateICmpULT(slot, m_ir->getInt32(16)), ptr0, ptr1); m_ir->CreateStore(ci, _ptr(pmfc, ::offset32(&spu_mfc_cmd::cmd))); switch (u64 cmd = ci->getZExtValue()) { case MFC_GETLLAR_CMD: case MFC_PUTLLC_CMD: case MFC_PUTLLUC_CMD: case MFC_PUTQLLUC_CMD: { break; } case MFC_PUTL_CMD: case MFC_PUTLB_CMD: case MFC_PUTLF_CMD: case MFC_PUTRL_CMD: case MFC_PUTRLB_CMD: case MFC_PUTRLF_CMD: case MFC_GETL_CMD: case MFC_GETLB_CMD: case MFC_GETLF_CMD: { break; } case MFC_SDCRZ_CMD: { break; } case MFC_SNDSIG_CMD: case MFC_SNDSIGB_CMD: case MFC_SNDSIGF_CMD: case MFC_PUT_CMD: case MFC_PUTB_CMD: case MFC_PUTF_CMD: case MFC_PUTR_CMD: case MFC_PUTRB_CMD: case MFC_PUTRF_CMD: case MFC_GET_CMD: case MFC_GETB_CMD: case MFC_GETF_CMD: { m_ir->CreateStore(tag.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::tag))); 
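// Enqueue path for plain transfer commands: the remaining fields (size, lsa, eal) are written into
// the queued spu_mfc_cmd below, the fence mask is set for this tag, and the barrier mask is also
// set for barrier variants, so later commands on the same tag stay ordered while the command sits
// in the queue.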
m_ir->CreateStore(size.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::size))); m_ir->CreateStore(lsa.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::lsa))); m_ir->CreateStore(eal.value, _ptr(pmfc, ::offset32(&spu_mfc_cmd::eal))); m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pf), mask), pf); if (cmd & MFC_BARRIER_MASK) m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pb), mask), pb); break; } case MFC_BARRIER_CMD: case MFC_EIEIO_CMD: case MFC_SYNC_CMD: { m_ir->CreateStore(m_ir->getInt32(-1), pb); m_ir->CreateStore(m_ir->CreateOr(m_ir->CreateLoad(get_type(), pf), mask), pf); break; } default: { m_ir->CreateUnreachable(); break; } } m_ir->CreateStore(m_ir->CreateAdd(slot, m_ir->getInt32(1)), spu_ptr(&spu_thread::mfc_size)); m_ir->CreateBr(next); m_ir->SetInsertPoint(next); return; } // Fallback to unoptimized WRCH implementation (TODO) spu_log.warning("[0x%x] MFC_Cmd: $%u is not a constant", m_pos, +op.rt); break; } case MFC_WrListStallAck: { const auto mask = eval(splat(1) << (val & 0x1f)); const auto _ptr = spu_ptr(&spu_thread::ch_stall_mask); const auto _old = m_ir->CreateLoad(get_type(), _ptr); const auto _new = m_ir->CreateAnd(_old, m_ir->CreateNot(mask.value)); m_ir->CreateStore(_new, _ptr); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto _mfc = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpNE(_old, _new), _mfc, next); m_ir->SetInsertPoint(_mfc); ensure_gpr_stores(); update_pc(); call("spu_list_unstall", &exec_list_unstall, m_thread, eval(val & 0x1f).value); m_ir->CreateBr(next); m_ir->SetInsertPoint(next); return; } case SPU_WrDec: { call("spu_get_events", &exec_get_events, m_thread, m_ir->getInt32(SPU_EVENT_TM)); #if defined(ARCH_X64) if (utils::get_tsc_freq() && !(g_cfg.core.spu_loop_detection) && (g_cfg.core.clocks_scale == 100)) { const auto tsc = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_rdtsc)); const auto tscx = m_ir->CreateMul(m_ir->CreateUDiv(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)); const auto tscm = m_ir->CreateUDiv(m_ir->CreateMul(m_ir->CreateURem(tsc, m_ir->getInt64(utils::get_tsc_freq())), m_ir->getInt64(80000000)), m_ir->getInt64(utils::get_tsc_freq())); const auto tsctb = m_ir->CreateAdd(tscx, tscm); m_ir->CreateStore(tsctb, spu_ptr(&spu_thread::ch_dec_start_timestamp)); } else #endif { m_ir->CreateStore(call("get_timebased_time", &get_timebased_time), spu_ptr(&spu_thread::ch_dec_start_timestamp)); } m_ir->CreateStore(val.value, spu_ptr(&spu_thread::ch_dec_value)); m_ir->CreateStore(m_ir->getInt8(0), spu_ptr(&spu_thread::is_dec_frozen)); return; } case SPU_Set_Bkmk_Tag: case SPU_PM_Start_Ev: case SPU_PM_Stop_Ev: { return; } default: break; } update_pc(); ensure_gpr_stores(); call("spu_write_channel", &exec_wrch, m_thread, m_ir->getInt32(op.ra), val.value); } void LNOP(spu_opcode_t) // { } void NOP(spu_opcode_t) // { } void SYNC(spu_opcode_t) // { // This instruction must be used following a store instruction that modifies the instruction stream. m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !m_interp_magn) { m_block->block_end = m_ir->GetInsertBlock(); update_pc(m_pos + 4); tail_chunk(m_dispatch); } } void DSYNC(spu_opcode_t) // { // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. 
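// Like SYNC above, this is lowered to a single sequentially consistent fence on the host; unlike
// SYNC it never dispatches to a new chunk, since DSYNC does not imply a modified instruction stream.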
m_ir->CreateFence(llvm::AtomicOrdering::SequentiallyConsistent); } void MFSPR(spu_opcode_t op) // { // Check SPUInterpreter for notes. set_vr(op.rt, splat(0)); } void MTSPR(spu_opcode_t) // { // Check SPUInterpreter for notes. } template auto mpyh(TA&& a, TB&& b) { return bitcast(bitcast((std::forward(a) >> 16)) * bitcast(std::forward(b))) << 16; } template auto mpyu(TA&& a, TB&& b) { return (std::forward(a) << 16 >> 16) * (std::forward(b) << 16 >> 16); } void SF(spu_opcode_t op) { set_vr(op.rt, get_vr(op.rb) - get_vr(op.ra)); } void OR(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) | get_vr(op.rb)); } void BG(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); set_vr(op.rt, zext(a <= b)); } void SFH(spu_opcode_t op) { set_vr(op.rt, get_vr(op.rb) - get_vr(op.ra)); } void NOR(spu_opcode_t op) { set_vr(op.rt, ~(get_vr(op.ra) | get_vr(op.rb))); } void ABSDB(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); set_vr(op.rt, absd(a, b)); } void ROT(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); set_vr(op.rt, rol(a, b)); } void ROTM(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); auto minusb = eval(-b); if (auto [ok, x] = match_expr(b, -match()); ok) { minusb = eval(x); } if (auto k = get_known_bits(minusb); !!(k.Zero & 32)) { set_vr(op.rt, a >> (minusb & 31)); return; } set_vr(op.rt, inf_lshr(a, minusb & 63)); } void ROTMA(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); auto minusb = eval(-b); if (auto [ok, x] = match_expr(b, -match()); ok) { minusb = eval(x); } if (auto k = get_known_bits(minusb); !!(k.Zero & 32)) { set_vr(op.rt, a >> (minusb & 31)); return; } set_vr(op.rt, inf_ashr(a, minusb & 63)); } void SHL(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); if (auto k = get_known_bits(b); !!(k.Zero & 32)) { set_vr(op.rt, a << (b & 31)); return; } set_vr(op.rt, inf_shl(a, b & 63)); } void ROTH(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); set_vr(op.rt, rol(a, b)); } void ROTHM(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); auto minusb = eval(-b); if (auto [ok, x] = match_expr(b, -match()); ok) { minusb = eval(x); } if (auto k = get_known_bits(minusb); !!(k.Zero & 16)) { set_vr(op.rt, a >> (minusb & 15)); return; } set_vr(op.rt, inf_lshr(a, minusb & 31)); } void ROTMAH(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); auto minusb = eval(-b); if (auto [ok, x] = match_expr(b, -match()); ok) { minusb = eval(x); } if (auto k = get_known_bits(minusb); !!(k.Zero & 16)) { set_vr(op.rt, a >> (minusb & 15)); return; } set_vr(op.rt, inf_ashr(a, minusb & 31)); } void SHLH(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); if (auto k = get_known_bits(b); !!(k.Zero & 16)) { set_vr(op.rt, a << (b & 15)); return; } set_vr(op.rt, inf_shl(a, b & 31)); } void ROTI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto i = get_imm(op.i7, false); set_vr(op.rt, rol(a, i)); } void ROTMI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto i = get_imm(op.i7, false); set_vr(op.rt, inf_lshr(a, -i & 63)); } void ROTMAI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto i = get_imm(op.i7, false); set_vr(op.rt, inf_ashr(a, -i & 63)); } void SHLI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto i = get_imm(op.i7, false); set_vr(op.rt, inf_shl(a, i & 63)); } void ROTHI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto i = get_imm(op.i7, false); set_vr(op.rt, rol(a, i)); } void ROTHMI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto i = 
get_imm(op.i7, false); set_vr(op.rt, inf_lshr(a, -i & 31)); } void ROTMAHI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto i = get_imm(op.i7, false); set_vr(op.rt, inf_ashr(a, -i & 31)); } void SHLHI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto i = get_imm(op.i7, false); set_vr(op.rt, inf_shl(a, i & 31)); } void A(spu_opcode_t op) { if (auto [a, b] = match_vrs(op.ra, op.rb); a && b) { static const auto MP = match(); if (auto [ok, a0, b0, b1, a1] = match_expr(a, mpyh(MP, MP) + mpyh(MP, MP)); ok) { if (auto [ok, a2, b2] = match_expr(b, mpyu(MP, MP)); ok && a2.eq(a0, a1) && b2.eq(b0, b1)) { // 32-bit multiplication spu_log.notice("mpy32 in %s at 0x%05x", m_hash, m_pos); set_vr(op.rt, a0 * b0); return; } } } set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); } void AND(spu_opcode_t op) { if (match_vr(op.ra, [&](auto a, auto /*MP1*/) { if (auto b = match_vr_as(a, op.rb)) { set_vr(op.rt, a & b); return true; } return match_vr(op.rb, [&](auto /*b*/, auto /*MP2*/) { set_vr(op.rt, a & get_vr_as(a, op.rb)); return true; }); })) { return; } set_vr(op.rt, get_vr(op.ra) & get_vr(op.rb)); } void CG(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); set_vr(op.rt, zext(a + b < a)); } void AH(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); } void NAND(spu_opcode_t op) { set_vr(op.rt, ~(get_vr(op.ra) & get_vr(op.rb))); } void AVGB(spu_opcode_t op) { set_vr(op.rt, avg(get_vr(op.ra), get_vr(op.rb))); } void GB(spu_opcode_t op) { // GFNI trick to extract selected bit from bytes // By treating the first input as constant, and the second input as variable, // with only 1 bit set in our constant, gf2p8affineqb will extract that selected bit // from each byte of the second operand if (m_use_gfni) { const auto a = get_vr(op.ra); const auto as = zshuffle(a, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 12, 8, 4, 0); set_vr(op.rt, gf2p8affineqb(build(0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0), as, 0x0)); return; } const auto a = get_vr(op.ra); const auto m = zext(bitcast(trunc(a))); set_vr(op.rt, insert(splat(0), 3, eval(m))); } void GBH(spu_opcode_t op) { if (m_use_gfni) { const auto a = get_vr(op.ra); const auto as = zshuffle(a, 16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0); set_vr(op.rt, gf2p8affineqb(build(0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0), as, 0x0)); return; } const auto a = get_vr(op.ra); const auto m = zext(bitcast(trunc(a))); set_vr(op.rt, insert(splat(0), 3, eval(m))); } void GBB(spu_opcode_t op) { const auto a = get_vr(op.ra); if (m_use_gfni) { const auto as = zshuffle(a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); const auto m = gf2p8affineqb(build(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0), as, 0x0); set_vr(op.rt, zshuffle(m, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)); return; } const auto m = zext(bitcast(trunc(a))); set_vr(op.rt, insert(splat(0), 3, eval(m))); } void FSM(spu_opcode_t op) { // FSM following a comparison instruction if (match_vr(op.ra, [&](auto c, auto MP) { using VT = typename decltype(MP)::type; if (auto [ok, x] = match_expr(c, sext(match]>())); ok) { set_vr(op.rt, (splat_scalar(c))); return true; } return false; })) { return; } const auto v = extract(get_vr(op.ra), 3); const auto m = bitcast(trunc(v)); set_vr(op.rt, sext(m)); } void FSMH(spu_opcode_t op) { const auto v = extract(get_vr(op.ra), 3); const auto m = bitcast(trunc(v)); set_vr(op.rt, sext(m)); } void 
FSMB(spu_opcode_t op) { const auto v = extract(get_vr(op.ra), 3); const auto m = bitcast(trunc(v)); set_vr(op.rt, sext(m)); } template static auto byteswap(TA&& a) { return zshuffle(std::forward(a), 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } void ROTQBYBI(spu_opcode_t op) { const auto a = get_vr(op.ra); // Data with swapped endian from a load instruction if (auto [ok, as] = match_expr(a, byteswap(match())); ok) { const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); const auto sh = sc + (splat_scalar(get_vr(op.rb)) >> 3); if (m_use_avx512_icl) { set_vr(op.rt, vpermb(as, sh)); return; } set_vr(op.rt, pshufb(as, (sh & 0xf))); return; } const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const auto sh = sc - (splat_scalar(get_vr(op.rb)) >> 3); if (m_use_avx512_icl) { set_vr(op.rt, vpermb(a, sh)); return; } set_vr(op.rt, pshufb(a, (sh & 0xf))); } void ROTQMBYBI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); auto minusb = eval(-(b >> 3)); if (auto [ok, v0, v1] = match_expr(b, match() - match()); ok) { if (auto [ok1, data] = get_const_vector(v0.value, m_pos); ok1) { if (data == v128::from32p(7)) { minusb = eval(v1 >> 3); } } } const auto minusbx = eval(bitcast(minusb) & 0x1f); // Data with swapped endian from a load instruction if (auto [ok, as] = match_expr(a, byteswap(match())); ok) { const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); const auto sh = sc - splat_scalar(minusbx); set_vr(op.rt, pshufb(as, sh)); return; } const auto sc = build(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127); const auto sh = sc + splat_scalar(minusbx); set_vr(op.rt, pshufb(a, sh)); } void SHLQBYBI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); // Data with swapped endian from a load instruction if (auto [ok, as] = match_expr(a, byteswap(match())); ok) { const auto sc = build(127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112); const auto sh = sc + (splat_scalar(b) >> 3); set_vr(op.rt, pshufb(as, sh)); return; } const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const auto sh = sc - (splat_scalar(b) >> 3); set_vr(op.rt, pshufb(a, sh)); } template auto spu_get_insertion_shuffle_mask(T&& index) { const auto c = bitcast(build(0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10)); using e_type = std::remove_extent_t; const auto v = splat(static_cast(sizeof(e_type) == 8 ? 0x01020304050607ull : 0x010203ull)); return insert(c, std::forward(index), v); } void CBX(spu_opcode_t op) { if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) { // Optimization with aligned stack assumption. Strange because SPU code could use CBD instead, but encountered in wild. set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) & 0xf)); return; } const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); set_vr(op.rt, spu_get_insertion_shuffle_mask(~s & 0xf)); } void CHX(spu_opcode_t op) { if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) { // See CBX. set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) >> 1 & 0x7)); return; } const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); set_vr(op.rt, spu_get_insertion_shuffle_mask(~s >> 1 & 0x7)); } void CWX(spu_opcode_t op) { if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) { // See CBX. 
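// With the aligned-stack assumption only $rb contributes to the word index.
// spu_get_insertion_shuffle_mask() builds the same control word hardware CWD/CWX would produce:
// an identity pattern of bytes 0x10..0x1f with the addressed word's four bytes replaced by
// 00 01 02 03, so a following SHUFB (matched elsewhere in this file) becomes a scalar insert
// at that slot.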
set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) >> 2 & 0x3)); return; } const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); set_vr(op.rt, spu_get_insertion_shuffle_mask(~s >> 2 & 0x3)); } void CDX(spu_opcode_t op) { if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) { // See CBX. set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_scalar(get_vr(op.rb)) >> 3 & 0x1)); return; } const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); set_vr(op.rt, spu_get_insertion_shuffle_mask(~s >> 3 & 0x1)); } void ROTQBI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = splat_scalar(get_vr(op.rb) & 0x7); set_vr(op.rt, fshl(a, zshuffle(a, 3, 0, 1, 2), b)); } void ROTQMBI(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); auto minusb = eval(-b); if (auto [ok, x] = match_expr(b, -match()); ok) { minusb = eval(x); } const auto bx = splat_scalar(minusb) & 0x7; set_vr(op.rt, fshr(zshuffle(a, 1, 2, 3, 4), a, bx)); } void SHLQBI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = splat_scalar(get_vr(op.rb) & 0x7); set_vr(op.rt, fshl(a, zshuffle(a, 4, 0, 1, 2), b)); } #if defined(ARCH_X64) static __m128i exec_rotqby(__m128i a, u8 b) { alignas(32) const __m128i buf[2]{a, a}; return _mm_loadu_si128(reinterpret_cast(reinterpret_cast(buf) + (16 - (b & 0xf)))); } #elif defined(ARCH_ARM64) #else #error "Unimplemented" #endif void ROTQBY(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); #if defined(ARCH_X64) if (!m_use_ssse3) { value_t r; r.value = call("spu_rotqby", &exec_rotqby, a.value, eval(extract(b, 12)).value); set_vr(op.rt, r); return; } #endif // Data with swapped endian from a load instruction if (auto [ok, as] = match_expr(a, byteswap(match())); ok) { const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); const auto sh = eval(sc + splat_scalar(b)); if (m_use_avx512_icl) { set_vr(op.rt, vpermb(as, sh)); return; } set_vr(op.rt, pshufb(as, (sh & 0xf))); return; } const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const auto sh = eval(sc - splat_scalar(b)); if (m_use_avx512_icl) { set_vr(op.rt, vpermb(a, sh)); return; } set_vr(op.rt, pshufb(a, (sh & 0xf))); } void ROTQMBY(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); auto minusb = eval(-b); if (auto [ok, x] = match_expr(b, -match()); ok) { minusb = eval(x); } const auto minusbx = bitcast(minusb); // Data with swapped endian from a load instruction if (auto [ok, as] = match_expr(a, byteswap(match())); ok) { const auto sc = build(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); const auto sh = sc - (splat_scalar(minusbx) & 0x1f); set_vr(op.rt, pshufb(as, sh)); return; } const auto sc = build(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127); const auto sh = sc + (splat_scalar(minusbx) & 0x1f); set_vr(op.rt, pshufb(a, sh)); } void SHLQBY(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); // Data with swapped endian from a load instruction if (auto [ok, as] = match_expr(a, byteswap(match())); ok) { const auto sc = build(127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112); const auto sh = sc + (splat_scalar(b) & 0x1f); set_vr(op.rt, pshufb(as, sh)); return; } const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const auto sh = sc - (splat_scalar(b) & 0x1f); set_vr(op.rt, pshufb(a, sh)); } template static llvm_calli orx(T&& a) { return {"spu_orx", 
{std::forward(a)}}; } void ORX(spu_opcode_t op) { register_intrinsic("spu_orx", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto x = zshuffle(a, 2, 3, 0, 1) | a; const auto y = zshuffle(x, 1, 0, 3, 2) | x; return zshuffle(y, 4, 4, 4, 3); }); set_vr(op.rt, orx(get_vr(op.ra))); } void CBD(spu_opcode_t op) { if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) { // Known constant with aligned stack assumption (optimization). set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) & 0xf)); return; } const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7); set_vr(op.rt, spu_get_insertion_shuffle_mask(~a & 0xf)); } void CHD(spu_opcode_t op) { if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) { // See CBD. set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) >> 1 & 0x7)); return; } const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7); set_vr(op.rt, spu_get_insertion_shuffle_mask(~a >> 1 & 0x7)); } void CWD(spu_opcode_t op) { if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) { // See CBD. set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) >> 2 & 0x3)); return; } const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7); set_vr(op.rt, spu_get_insertion_shuffle_mask(~a >> 2 & 0x3)); } void CDD(spu_opcode_t op) { if (m_finfo && m_finfo->fn && op.ra == s_reg_sp) { // See CBD. set_vr(op.rt, spu_get_insertion_shuffle_mask(~get_imm(op.i7) >> 3 & 0x1)); return; } const auto a = get_scalar(get_vr(op.ra)) + get_imm(op.i7); set_vr(op.rt, spu_get_insertion_shuffle_mask(~a >> 3 & 0x1)); } void ROTQBII(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = eval(get_imm(op.i7, false) & 0x7); set_vr(op.rt, fshl(a, zshuffle(a, 3, 0, 1, 2), b)); } void ROTQMBII(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = eval(-get_imm(op.i7, false) & 0x7); set_vr(op.rt, fshr(zshuffle(a, 1, 2, 3, 4), a, b)); } void SHLQBII(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = eval(get_imm(op.i7, false) & 0x7); set_vr(op.rt, fshl(a, zshuffle(a, 4, 0, 1, 2), b)); } void ROTQBYI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const auto sh = (sc - get_imm(op.i7, false)) & 0xf; set_vr(op.rt, pshufb(a, sh)); } void ROTQMBYI(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto sc = build(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127); const auto sh = sc + (-get_imm(op.i7, false) & 0x1f); set_vr(op.rt, pshufb(a, sh)); } void SHLQBYI(spu_opcode_t op) { if (get_reg_raw(op.ra) && !op.i7) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); // For expressions matching const auto a = get_vr(op.ra); const auto sc = build(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const auto sh = sc - (get_imm(op.i7, false) & 0x1f); set_vr(op.rt, pshufb(a, sh)); } void CGT(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } void XOR(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) ^ get_vr(op.rb)); } void CGTH(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } void EQV(spu_opcode_t op) { set_vr(op.rt, ~(get_vr(op.ra) ^ get_vr(op.rb))); } void CGTB(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } void SUMB(spu_opcode_t op) { if (m_use_avx512) { const auto [a, b] = get_vrs(op.ra, op.rb); const auto zeroes = splat(0); if (op.ra == op.rb && !m_interp_magn) { set_vr(op.rt, vdbpsadbw(a, zeroes, 0)); return; } const auto ax = vdbpsadbw(a, zeroes, 0); const auto bx = vdbpsadbw(b, 
zeroes, 0); set_vr(op.rt, shuffle2(ax, bx, 0, 9, 2, 11, 4, 13, 6, 15)); return; } if (m_use_vnni) { const auto [a, b] = get_vrs(op.ra, op.rb); const auto zeroes = splat(0); const auto ones = splat(0x01010101); const auto ax = bitcast(vpdpbusd(zeroes, a, ones)); const auto bx = bitcast(vpdpbusd(zeroes, b, ones)); set_vr(op.rt, shuffle2(ax, bx, 0, 8, 2, 10, 4, 12, 6, 14)); return; } const auto [a, b] = get_vrs(op.ra, op.rb); const auto ahs = eval((a >> 8) + (a & 0xff)); const auto bhs = eval((b >> 8) + (b & 0xff)); const auto lsh = shuffle2(ahs, bhs, 0, 9, 2, 11, 4, 13, 6, 15); const auto hsh = shuffle2(ahs, bhs, 1, 8, 3, 10, 5, 12, 7, 14); set_vr(op.rt, lsh + hsh); } void CLZ(spu_opcode_t op) { set_vr(op.rt, ctlz(get_vr(op.ra))); } void XSWD(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) << 32 >> 32); } void XSHW(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) << 16 >> 16); } void CNTB(spu_opcode_t op) { set_vr(op.rt, ctpop(get_vr(op.ra))); } void XSBH(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) << 8 >> 8); } void CLGT(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } void ANDC(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) & ~get_vr(op.rb)); } void CLGTH(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } void ORC(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) | ~get_vr(op.rb)); } void CLGTB(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_vr(op.rb))); } void CEQ(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) == get_vr(op.rb))); } void MPYHHU(spu_opcode_t op) { set_vr(op.rt, (get_vr(op.ra) >> 16) * (get_vr(op.rb) >> 16)); } void ADDX(spu_opcode_t op) { set_vr(op.rt, llvm_sum{get_vr(op.ra), get_vr(op.rb), get_vr(op.rt) & 1}); } void SFX(spu_opcode_t op) { set_vr(op.rt, get_vr(op.rb) - get_vr(op.ra) - (~get_vr(op.rt) & 1)); } void CGX(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); const auto x = (get_vr(op.rt) << 31) >> 31; const auto s = eval(a + b); set_vr(op.rt, noncast(sext(s < a) | (sext(s == noncast(x)) & x)) >> 31); } void BGX(spu_opcode_t op) { const auto [a, b] = get_vrs(op.ra, op.rb); const auto c = get_vr(op.rt) << 31; set_vr(op.rt, noncast(sext(b > a) | (sext(a == b) & c)) >> 31); } void MPYHHA(spu_opcode_t op) { set_vr(op.rt, (get_vr(op.ra) >> 16) * (get_vr(op.rb) >> 16) + get_vr(op.rt)); } void MPYHHAU(spu_opcode_t op) { set_vr(op.rt, (get_vr(op.ra) >> 16) * (get_vr(op.rb) >> 16) + get_vr(op.rt)); } void MPY(spu_opcode_t op) { set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_vr(op.rb) << 16 >> 16)); } void MPYH(spu_opcode_t op) { set_vr(op.rt, mpyh(get_vr(op.ra), get_vr(op.rb))); } void MPYHH(spu_opcode_t op) { set_vr(op.rt, (get_vr(op.ra) >> 16) * (get_vr(op.rb) >> 16)); } void MPYS(spu_opcode_t op) { set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_vr(op.rb) << 16 >> 16) >> 16); } void CEQH(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) == get_vr(op.rb))); } void MPYU(spu_opcode_t op) { set_vr(op.rt, mpyu(get_vr(op.ra), get_vr(op.rb))); } void CEQB(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) == get_vr(op.rb))); } void FSMBI(spu_opcode_t op) { const auto m = bitcast(get_imm(op.i16)); set_vr(op.rt, sext(m)); } void IL(spu_opcode_t op) { set_vr(op.rt, get_imm(op.si16)); } void ILHU(spu_opcode_t op) { set_vr(op.rt, get_imm(op.i16) << 16); } void ILH(spu_opcode_t op) { set_vr(op.rt, get_imm(op.i16)); } void IOHL(spu_opcode_t op) { set_vr(op.rt, get_vr(op.rt) | get_imm(op.i16)); } void ORI(spu_opcode_t op) { if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), 
false); // For expressions matching set_vr(op.rt, get_vr(op.ra) | get_imm(op.si10)); } void ORHI(spu_opcode_t op) { if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) | get_imm(op.si10)); } void ORBI(spu_opcode_t op) { if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) | get_imm(op.si10)); } void SFI(spu_opcode_t op) { set_vr(op.rt, get_imm(op.si10) - get_vr(op.ra)); } void SFHI(spu_opcode_t op) { set_vr(op.rt, get_imm(op.si10) - get_vr(op.ra)); } void ANDI(spu_opcode_t op) { if (get_reg_raw(op.ra) && op.si10 == -1) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) & get_imm(op.si10)); } void ANDHI(spu_opcode_t op) { if (get_reg_raw(op.ra) && op.si10 == -1) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) & get_imm(op.si10)); } void ANDBI(spu_opcode_t op) { if (get_reg_raw(op.ra) && static_cast(op.si10) == -1) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) & get_imm(op.si10)); } void AI(spu_opcode_t op) { if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) + get_imm(op.si10)); } void AHI(spu_opcode_t op) { if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) + get_imm(op.si10)); } void XORI(spu_opcode_t op) { if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) ^ get_imm(op.si10)); } void XORHI(spu_opcode_t op) { if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) ^ get_imm(op.si10)); } void XORBI(spu_opcode_t op) { if (get_reg_raw(op.ra) && !op.si10) return set_reg_fixed(op.rt, get_reg_raw(op.ra), false); set_vr(op.rt, get_vr(op.ra) ^ get_imm(op.si10)); } void CGTI(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CGTHI(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CGTBI(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CLGTI(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CLGTHI(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void CLGTBI(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) > get_imm(op.si10))); } void MPYI(spu_opcode_t op) { set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * get_imm(op.si10)); } void MPYUI(spu_opcode_t op) { set_vr(op.rt, (get_vr(op.ra) << 16 >> 16) * (get_imm(op.si10) & 0xffff)); } void CEQI(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); } void CEQHI(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); } void CEQBI(spu_opcode_t op) { set_vr(op.rt, sext(get_vr(op.ra) == get_imm(op.si10))); } void ILA(spu_opcode_t op) { set_vr(op.rt, get_imm(op.i18)); } void SELB(spu_opcode_t op) { if (match_vr(op.rc, [&](auto c, auto MP) { using VT = typename decltype(MP)::type; // If the control mask comes from a comparison instruction, replace SELB with select if (auto [ok, x] = match_expr(c, sext(match]>())); ok) { if constexpr (std::extent_v == 2) // u64[2] { // Try to select floats as floats if a OR b is typed as f64[2] if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) { set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); return true; } } if constexpr 
(std::extent_v == 4) // u32[4] { // Match division (adjusted) (TODO) if (auto a = match_vr(op.ra)) { static const auto MT = match(); if (auto [div_ok, diva, divb] = match_expr(a, MT / MT); div_ok) { if (auto b = match_vr(op.rb)) { if (auto [add1_ok] = match_expr(b, bitcast(a) + splat(1)); add1_ok) { if (auto [fm_ok, a1, b1] = match_expr(x, bitcast(fm(MT, MT)) > splat(-1)); fm_ok) { if (auto [fnma_ok] = match_expr(a1, fnms(divb, bitcast(b), diva)); fnma_ok) { if (fabs(b1).eval(m_ir) == fsplat(1.0).eval(m_ir)) { set_vr(op.rt4, diva / divb); return true; } if (auto [sel_ok] = match_expr(b1, bitcast((bitcast(diva) & 0x80000000) | 0x3f800000)); sel_ok) { set_vr(op.rt4, diva / divb); return true; } } } } } } } if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) { set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); return true; } if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) { set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); return true; } } if (auto [ok, y] = match_expr(x, bitcast]>(match>>())); ok) { // Don't ruin FSMB/FSM/FSMH instructions return false; } set_vr(op.rt4, select(x, get_vr(op.rb), get_vr(op.ra))); return true; } return false; })) { return; } const auto c = get_vr(op.rc); // Check if the constant mask doesn't require bit granularity if (auto [ok, mask] = get_const_vector(c.value, m_pos); ok) { bool sel_32 = true; for (u32 i = 0; i < 4; i++) { if (mask._u32[i] && mask._u32[i] != 0xFFFFFFFF) { sel_32 = false; break; } } if (sel_32) { if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) { set_vr(op.rt4, select(noncast(c) != 0, get_vr(op.rb), get_vr(op.ra))); return; } else if (auto [a, b] = match_vrs(op.ra, op.rb); a || b) { set_vr(op.rt4, select(noncast(c) != 0, get_vr(op.rb), get_vr(op.ra))); return; } set_vr(op.rt4, select(noncast(c) != 0, get_vr(op.rb), get_vr(op.ra))); return; } bool sel_16 = true; for (u32 i = 0; i < 8; i++) { if (mask._u16[i] && mask._u16[i] != 0xFFFF) { sel_16 = false; break; } } if (sel_16) { set_vr(op.rt4, select(bitcast(c) != 0, get_vr(op.rb), get_vr(op.ra))); return; } bool sel_8 = true; for (u32 i = 0; i < 16; i++) { if (mask._u8[i] && mask._u8[i] != 0xFF) { sel_8 = false; break; } } if (sel_8) { set_vr(op.rt4, select(bitcast(c) != 0,get_vr(op.rb), get_vr(op.ra))); return; } } const auto op1 = get_reg_raw(op.rb); const auto op2 = get_reg_raw(op.ra); if ((op1 && op1->getType() == get_type()) || (op2 && op2->getType() == get_type())) { // Optimization: keep xfloat values in doubles even if the mask is unpredictable (hard way) const auto c = get_vr(op.rc); const auto b = get_vr(op.rb); const auto a = get_vr(op.ra); const auto m = conv_xfloat_mask(c.value); const auto x = m_ir->CreateAnd(double_as_uint64(b.value), m); const auto y = m_ir->CreateAnd(double_as_uint64(a.value), m_ir->CreateNot(m)); set_reg_fixed(op.rt4, uint64_as_double(m_ir->CreateOr(x, y))); return; } set_vr(op.rt4, (get_vr(op.rb) & c) | (get_vr(op.ra) & ~c)); } void SHUFB(spu_opcode_t op) // { if (match_vr(op.rc, [&](auto c, auto MP) { using VT = typename decltype(MP)::type; // If the mask comes from a constant generation instruction, replace SHUFB with insert if (auto [ok, i] = match_expr(c, spu_get_insertion_shuffle_mask(match())); ok) { set_vr(op.rt4, insert(get_vr(op.rb), i, get_scalar(get_vr(op.ra)))); return true; } return false; })) { return; } const auto c = get_vr(op.rc); if (auto [ok, mask] = get_const_vector(c.value, m_pos); ok) { // Optimization: SHUFB with constant mask if (((mask._u64[0] | mask._u64[1]) & 0xe0e0e0e0e0e0e0e0) == 0) { // Trivial insert or constant 
shuffle (TODO) static constexpr struct mask_info { u64 i1; u64 i0; decltype(&cpu_translator::get_type) type; u64 extract_from; u64 insert_to; } s_masks[30] { { 0x0311121314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 15 }, { 0x1003121314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 14 }, { 0x1011031314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 13 }, { 0x1011120314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 12 }, { 0x1011121303151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 11 }, { 0x1011121314031617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 10 }, { 0x1011121314150317, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 9 }, { 0x1011121314151603, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 12, 8 }, { 0x1011121314151617, 0x03191a1b1c1d1e1f, &cpu_translator::get_type, 12, 7 }, { 0x1011121314151617, 0x18031a1b1c1d1e1f, &cpu_translator::get_type, 12, 6 }, { 0x1011121314151617, 0x1819031b1c1d1e1f, &cpu_translator::get_type, 12, 5 }, { 0x1011121314151617, 0x18191a031c1d1e1f, &cpu_translator::get_type, 12, 4 }, { 0x1011121314151617, 0x18191a1b031d1e1f, &cpu_translator::get_type, 12, 3 }, { 0x1011121314151617, 0x18191a1b1c031e1f, &cpu_translator::get_type, 12, 2 }, { 0x1011121314151617, 0x18191a1b1c1d031f, &cpu_translator::get_type, 12, 1 }, { 0x1011121314151617, 0x18191a1b1c1d1e03, &cpu_translator::get_type, 12, 0 }, { 0x0203121314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 6, 7 }, { 0x1011020314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 6, 6 }, { 0x1011121302031617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 6, 5 }, { 0x1011121314150203, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 6, 4 }, { 0x1011121314151617, 0x02031a1b1c1d1e1f, &cpu_translator::get_type, 6, 3 }, { 0x1011121314151617, 0x181902031c1d1e1f, &cpu_translator::get_type, 6, 2 }, { 0x1011121314151617, 0x18191a1b02031e1f, &cpu_translator::get_type, 6, 1 }, { 0x1011121314151617, 0x18191a1b1c1d0203, &cpu_translator::get_type, 6, 0 }, { 0x0001020314151617, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 3, 3 }, { 0x1011121300010203, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 3, 2 }, { 0x1011121314151617, 0x000102031c1d1e1f, &cpu_translator::get_type, 3, 1 }, { 0x1011121314151617, 0x18191a1b00010203, &cpu_translator::get_type, 3, 0 }, { 0x0001020304050607, 0x18191a1b1c1d1e1f, &cpu_translator::get_type, 1, 1 }, { 0x1011121303151617, 0x0001020304050607, &cpu_translator::get_type, 1, 0 }, }; // Check important constants from CWD-like constant generation instructions for (const auto& cm : s_masks) { if (mask._u64[0] == cm.i0 && mask._u64[1] == cm.i1) { const auto t = (this->*cm.type)(); const auto a = get_reg_fixed(op.ra, t); const auto b = get_reg_fixed(op.rb, t); const auto e = m_ir->CreateExtractElement(a, cm.extract_from); set_reg_fixed(op.rt4, m_ir->CreateInsertElement(b, e, cm.insert_to)); return; } } } // Adjusted shuffle mask v128 smask = ~mask & v128::from8p(op.ra == op.rb ? 0xf : 0x1f); // Blend mask for encoded constants v128 bmask{}; for (u32 i = 0; i < 16; i++) { if (mask._bytes[i] >= 0xe0) bmask._bytes[i] = 0x80; else if (mask._bytes[i] >= 0xc0) bmask._bytes[i] = 0xff; } const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); const auto c = make_const_vector(smask, get_type()); const auto d = make_const_vector(bmask, get_type()); llvm::Value* r = d; if ((~mask._u64[0] | ~mask._u64[1]) & 0x8080808080808080) [[likely]] { r = m_ir->CreateShuffleVector(b.value, op.ra == op.rb ? 
b.value : a.value, m_ir->CreateZExt(c, get_type())); if ((mask._u64[0] | mask._u64[1]) & 0x8080808080808080) { r = m_ir->CreateSelect(m_ir->CreateICmpSLT(make_const_vector(mask, get_type()), llvm::ConstantInt::get(get_type(), 0)), d, r); } } set_reg_fixed(op.rt4, r); return; } // Check whether shuffle mask doesn't contain fixed value selectors bool perm_only = false; if (auto k = get_known_bits(c); !!(k.Zero & 0x80)) { perm_only = true; } const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); // Data with swapped endian from a load instruction if (auto [ok, as] = match_expr(a, byteswap(match())); ok) { if (auto [ok, bs] = match_expr(b, byteswap(match())); ok) { // Undo endian swapping, and rely on pshufb/vperm2b to re-reverse endianness if (m_use_avx512_icl && (op.ra != op.rb)) { if (perm_only) { set_vr(op.rt4, vperm2b(as, bs, c)); return; } const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); const auto mm = select(noncast(m) >= 0, splat(0), m); const auto ab = vperm2b(as, bs, c); set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); return; } const auto x = pshufb(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); const auto ax = pshufb(as, c); const auto bx = pshufb(bs, c); if (perm_only) set_vr(op.rt4, select_by_bit4(c, ax, bx)); else set_vr(op.rt4, select_by_bit4(c, ax, bx) | x); return; } if (auto [ok, data] = get_const_vector(b.value, m_pos); ok) { if (data == v128::from8p(data._u8[0])) { if (m_use_avx512_icl) { if (perm_only) { set_vr(op.rt4, vperm2b256to128(as, b, c)); return; } const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); const auto mm = select(noncast(m) >= 0, splat(0), m); const auto ab = vperm2b256to128(as, b, c); set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); return; } // See above const auto x = pshufb(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); const auto ax = pshufb(as, c); if (perm_only) set_vr(op.rt4, select_by_bit4(c, ax, b)); else set_vr(op.rt4, select_by_bit4(c, ax, b) | x); return; } } } if (auto [ok, bs] = match_expr(b, byteswap(match())); ok) { if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) { if (data == v128::from8p(data._u8[0])) { // See above const auto x = pshufb(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); const auto bx = pshufb(bs, c); if (perm_only) set_vr(op.rt4, select_by_bit4(c, a, bx)); else set_vr(op.rt4, select_by_bit4(c, a, bx) | x); return; } } } if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn)) { if (auto [ok, data] = get_const_vector(b.value, m_pos); ok) { if (data == v128::from8p(data._u8[0])) { if (perm_only) { set_vr(op.rt4, vperm2b256to128(a, b, eval(c ^ 0xf))); return; } const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); const auto mm = select(noncast(m) >= 0, splat(0), m); const auto ab = vperm2b256to128(a, b, eval(c ^ 0xf)); set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); return; } } if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) { if (data == v128::from8p(data._u8[0])) { if (perm_only) { set_vr(op.rt4, vperm2b256to128(b, a, eval(c ^ 0x1f))); return; } const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); const auto mm = select(noncast(m) >= 0, splat(0), m); const auto ab = vperm2b256to128(b, a, eval(c ^ 0x1f)); set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); return; } } if (perm_only) { set_vr(op.rt4, vperm2b(a, b, eval(c ^ 0xf))); return; } const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); const auto mm = select(noncast(m) >= 0, splat(0), m); const auto cr = eval(c ^ 0xf); const auto ab = vperm2b(a, b, cr); set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); return; } const auto x = pshufb(build(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); const auto cr = eval(c ^ 0xf); const auto ax = pshufb(a, cr); const auto bx = pshufb(b, cr); if (perm_only) set_vr(op.rt4, select_by_bit4(cr, ax, bx)); else set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x); } void MPYA(spu_opcode_t op) { set_vr(op.rt4, (get_vr(op.ra) << 16 >> 16) * (get_vr(op.rb) << 16 >> 16) + get_vr(op.rc)); } void FSCRRD(spu_opcode_t op) // { // Hack set_vr(op.rt, splat(0)); } void FSCRWR(spu_opcode_t /*op*/) // { // Hack } void DFCGT(spu_opcode_t op) // { return UNK(op); } void DFCEQ(spu_opcode_t op) // { return UNK(op); } void DFCMGT(spu_opcode_t op) // { return UNK(op); } void DFCMEQ(spu_opcode_t op) // { return UNK(op); } void DFTSV(spu_opcode_t op) // { return UNK(op); } void DFA(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); } void DFS(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); } void DFM(spu_opcode_t op) { set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); } void DFMA(spu_opcode_t op) { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); if (g_cfg.core.use_accurate_dfma) set_vr(op.rt, fmuladd(a, b, c, true)); else set_vr(op.rt, a * b + c); } void DFMS(spu_opcode_t op) { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); if (g_cfg.core.use_accurate_dfma) set_vr(op.rt, fmuladd(a, b, -c, true)); else set_vr(op.rt, a * b - c); } void DFNMS(spu_opcode_t op) { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); if (g_cfg.core.use_accurate_dfma) set_vr(op.rt, fmuladd(-a, b, c, true)); else set_vr(op.rt, c - (a * b)); } void DFNMA(spu_opcode_t op) { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rt); if (g_cfg.core.use_accurate_dfma) set_vr(op.rt, -fmuladd(a, b, c, true)); else set_vr(op.rt, -(a * b + c)); } bool is_input_positive(value_t a) { if (auto [ok, v0, v1] = match_expr(a, match() * match()); ok && v0.eq(v1)) { return true; } return false; } // clamping helpers value_t clamp_positive_smax(value_t v) { return eval(bitcast(min(bitcast(v),splat(0x7f7fffff)))); } value_t clamp_negative_smax(value_t v) { if (is_input_positive(v)) { return v; } return eval(bitcast(min(bitcast(v),splat(0xff7fffff)))); } value_t clamp_smax(value_t v) { if (m_use_avx512) { if (is_input_positive(v)) { return eval(clamp_positive_smax(v)); } if (auto [ok, data] = get_const_vector(v.value, m_pos); ok) { // Avoid pessimation when input is constant return eval(clamp_positive_smax(clamp_negative_smax(v))); } return eval(vrangeps(v, fsplat(std::bit_cast(0x7f7fffff)), 0x2, 0xff)); } return eval(clamp_positive_smax(clamp_negative_smax(v))); } // FMA favouring zeros value_t xmuladd(value_t a, value_t b, value_t c) { const auto ma = eval(sext(fcmp_uno(a != fsplat(0.)))); const auto mb = eval(sext(fcmp_uno(b != fsplat(0.)))); const auto ca = eval(bitcast(bitcast(a) & mb)); const auto cb = eval(bitcast(bitcast(b) & 
ma)); return eval(fmuladd(ca, cb, c)); } // Checks for positive and negative zero, or Denormal (treated as zero) // If sign is +-1 check equality against all sign bits bool is_spu_float_zero(v128 a, int sign = 0) { for (u32 i = 0; i < 4; i++) { const u32 exponent = a._u32[i] & 0x7f800000u; if (exponent || (sign && (sign >= 0) != (a._s32[i] >= 0))) { // Normalized number return false; } } return true; } template static llvm_calli frest(T&& a) { return {"spu_frest", {std::forward(a)}}; } void FREST(spu_opcode_t op) { register_intrinsic("spu_frest", [&](llvm::CallInst* ci) { const auto a = bitcast(value(ci->getOperand(0))); const auto a_fraction = (a >> splat(18)) & splat(0x1F); const auto a_exponent = (a >> splat(23)) & splat(0xFF); const auto a_sign = (a & splat(0x80000000)); value_t final_result = eval(splat(0)); for (u32 i = 0; i < 4; i++) { const auto eval_fraction = eval(extract(a_fraction, i)); const auto eval_exponent = eval(extract(a_exponent, i)); const auto eval_sign = eval(extract(a_sign, i)); value_t r_fraction = load_const(m_spu_frest_fraction_lut, eval_fraction); value_t r_exponent = load_const(m_spu_frest_exponent_lut, eval_exponent); final_result = eval(insert(final_result, i, eval(r_fraction | eval_sign | r_exponent))); } return bitcast(final_result); }); set_vr(op.rt, frest(get_vr(op.ra))); } template static llvm_calli frsqest(T&& a) { return {"spu_frsqest", {std::forward(a)}}; } void FRSQEST(spu_opcode_t op) { register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci) { const auto a = bitcast(value(ci->getOperand(0))); const auto a_fraction = (a >> splat(18)) & splat(0x3F); const auto a_exponent = (a >> splat(23)) & splat(0xFF); value_t final_result = eval(splat(0)); for (u32 i = 0; i < 4; i++) { const auto eval_fraction = eval(extract(a_fraction, i)); const auto eval_exponent = eval(extract(a_exponent, i)); value_t r_fraction = load_const(m_spu_frsqest_fraction_lut, eval_fraction); value_t r_exponent = load_const(m_spu_frsqest_exponent_lut, eval_exponent); final_result = eval(insert(final_result, i, eval(r_fraction | r_exponent))); } return bitcast(final_result); }); set_vr(op.rt, frsqest(get_vr(op.ra))); } template static llvm_calli fcgt(T&& a, U&& b) { return {"spu_fcgt", {std::forward(a), std::forward(b)}}; } void FCGT(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { set_vr(op.rt, sext(fcmp_ord(get_vr(op.ra) > get_vr(op.rb)))); return; } register_intrinsic("spu_fcgt", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); const value_t ab[2]{a, b}; std::bitset<2> safe_int_compare(0); std::bitset<2> safe_nonzero_compare(0); for (u32 i = 0; i < 2; i++) { if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) { safe_int_compare.set(i); safe_nonzero_compare.set(i); for (u32 j = 0; j < 4; j++) { const u32 value = data._u32[j]; const u8 exponent = static_cast(value >> 23); if (value >= 0x7f7fffffu || !exponent) { // Positive or negative zero, Denormal (treated as zero), Negative constant, or Normalized number with exponent +127 // Cannot use signed integer compare safely // Note: Technically this optimization is accurate for any positive value, but due to the fact that // we don't produce "extended range" values the same way as real hardware, it's not safe to apply // this optimization for values outside of the range of x86 floating point hardware.
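// Why the exclusions above are needed: for finite, non-negative, normalized IEEE-754 singles the
// raw bit pattern grows monotonically with the value, so a signed 32-bit integer compare orders
// them exactly like a float compare. A zero exponent (+/-0 or a denormal, which the SPU flushes
// to zero), a negative encoding, or a value at or above 0x7f7fffff (where SPU "extended range"
// results diverge from x86 floats) breaks that equivalence, so such constants disable the trick.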
safe_int_compare.reset(i); if (!exponent) safe_nonzero_compare.reset(i); } } } } if (safe_int_compare.any()) { return eval(sext(bitcast(a) > bitcast(b))); } if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate || g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) { const auto ai = eval(bitcast(a)); const auto bi = eval(bitcast(b)); if (!safe_nonzero_compare.any()) { return eval(sext(fcmp_uno(a != b) & select((ai & bi) >= 0, ai > bi, ai < bi))); } else { return eval(sext(select((ai & bi) >= 0, ai > bi, ai < bi))); } } else { return eval(sext(fcmp_ord(a > b))); } }); set_vr(op.rt, fcgt(get_vr(op.ra), get_vr(op.rb))); } template static llvm_calli fcmgt(T&& a, U&& b) { return {"spu_fcmgt", {std::forward(a), std::forward(b)}}; } void FCMGT(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { set_vr(op.rt, sext(fcmp_ord(fabs(get_vr(op.ra)) > fabs(get_vr(op.rb))))); return; } register_intrinsic("spu_fcmgt", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); const value_t ab[2]{a, b}; std::bitset<2> safe_int_compare(0); for (u32 i = 0; i < 2; i++) { if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) { safe_int_compare.set(i); for (u32 j = 0; j < 4; j++) { const u32 value = data._u32[j]; const u8 exponent = static_cast(value >> 23); if ((value & 0x7fffffffu) >= 0x7f7fffffu || !exponent) { // See above safe_int_compare.reset(i); } } } } const auto ma = eval(fabs(a)); const auto mb = eval(fabs(b)); const auto mai = eval(bitcast(ma)); const auto mbi = eval(bitcast(mb)); if (safe_int_compare.any()) { return eval(sext(mai > mbi)); } if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { return eval(sext(fcmp_uno(ma > mb) & (mai > mbi))); } else { return eval(sext(fcmp_ord(ma > mb))); } }); set_vr(op.rt, fcmgt(get_vr(op.ra), get_vr(op.rb))); } template static llvm_calli fa(T&& a, U&& b) { return {"spu_fa", {std::forward(a), std::forward(b)}}; } void FA(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { set_vr(op.rt, get_vr(op.ra) + get_vr(op.rb)); return; } register_intrinsic("spu_fa", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); return a + b; }); set_vr(op.rt, fa(get_vr(op.ra), get_vr(op.rb))); } template static llvm_calli fs(T&& a, U&& b) { return {"spu_fs", {std::forward(a), std::forward(b)}}; } void FS(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { set_vr(op.rt, get_vr(op.ra) - get_vr(op.rb)); return; } register_intrinsic("spu_fs", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { const auto bc = clamp_smax(b); // for #4478 return eval(a - bc); } else { return eval(a - b); } }); set_vr(op.rt, fs(get_vr(op.ra), get_vr(op.rb))); } template static llvm_calli fm(T&& a, U&& b) { return llvm_calli{"spu_fm", {std::forward(a), std::forward(b)}}.set_order_equality_hint(1, 1); } void FM(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { set_vr(op.rt, get_vr(op.ra) * get_vr(op.rb)); return; } register_intrinsic("spu_fm", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { if (a.value == b.value) { return eval(a * b); } const auto ma = 
sext(fcmp_uno(a != fsplat(0.))); const auto mb = sext(fcmp_uno(b != fsplat(0.))); return eval(bitcast(bitcast(a * b) & ma & mb)); } else { return eval(a * b); } }); if (op.ra == op.rb && !m_interp_magn) { const auto a = get_vr(op.ra); set_vr(op.rt, fm(a, a)); return; } const auto [a, b] = get_vrs(op.ra, op.rb); // Resistance 2 doesn't like this if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) { // FM(a, re_accurate(div)) if (const auto [ok_re_acc, div] = match_expr(b, re_accurate(match())); ok_re_acc) { erase_stores(b); set_vr(op.rt, a / div); return; } // FM(re_accurate(div), b) if (const auto [ok_re_acc, div] = match_expr(a, re_accurate(match())); ok_re_acc) { erase_stores(a); set_vr(op.rt, b / div); return; } } set_vr(op.rt, fm(a, b)); } template static llvm_calli fesd(T&& a) { return {"spu_fesd", {std::forward(a)}}; } void FESD(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { const auto r = zshuffle(get_vr(op.ra), 1, 3); const auto d = bitcast(r); const auto a = eval(d & 0x7fffffffffffffff); const auto s = eval(d & 0x8000000000000000); const auto i = select(a == 0x47f0000000000000, eval(s | 0x7ff0000000000000), d); const auto n = select(a > 0x47f0000000000000, splat(0x7ff8000000000000), i); set_vr(op.rt, bitcast(n)); return; } register_intrinsic("spu_fesd", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); return fpcast(zshuffle(a, 1, 3)); }); set_vr(op.rt, fesd(get_vr(op.ra))); } template static llvm_calli frds(T&& a) { return {"spu_frds", {std::forward(a)}}; } void FRDS(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { const auto r = get_vr(op.ra); const auto d = bitcast(r); const auto a = eval(d & 0x7fffffffffffffff); const auto s = eval(d & 0x8000000000000000); const auto i = select(a > 0x47f0000000000000, eval(s | 0x47f0000000000000), d); const auto n = select(a > 0x7ff0000000000000, splat(0x47f8000000000000), i); const auto z = select(a < 0x3810000000000000, s, n); set_vr(op.rt, zshuffle(bitcast(z), 2, 0, 3, 1), nullptr, false); return; } register_intrinsic("spu_frds", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); return zshuffle(fpcast(a), 2, 0, 3, 1); }); set_vr(op.rt, frds(get_vr(op.ra))); } template static llvm_calli fceq(T&& a, U&& b) { return {"spu_fceq", {std::forward(a), std::forward(b)}}; } void FCEQ(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { set_vr(op.rt, sext(fcmp_ord(get_vr(op.ra) == get_vr(op.rb)))); return; } register_intrinsic("spu_fceq", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); const value_t ab[2]{a, b}; std::bitset<2> safe_float_compare(0); std::bitset<2> safe_int_compare(0); for (u32 i = 0; i < 2; i++) { if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) { safe_float_compare.set(i); safe_int_compare.set(i); for (u32 j = 0; j < 4; j++) { const u32 value = data._u32[j]; const u8 exponent = static_cast(value >> 23); // unsafe if nan if (exponent == 255) { safe_float_compare.reset(i); } // unsafe if denormal or 0 if (!exponent) { safe_int_compare.reset(i); } } } } if (safe_float_compare.any()) { return eval(sext(fcmp_ord(a == b))); } if (safe_int_compare.any()) { return eval(sext(bitcast(a) == bitcast(b))); } if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { return eval(sext(fcmp_ord(a == b)) | sext(bitcast(a) == bitcast(b))); } else { return eval(sext(fcmp_ord(a == b))); } }); 
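// Note on the approximate path above: ordered float equality alone would miss operands whose bit
// patterns are NaN on the host but stand for valid extended-range numbers on the SPU, so it is
// OR'ed with a raw 32-bit integer equality; bit-identical lanes therefore still compare equal.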
set_vr(op.rt, fceq(get_vr(op.ra), get_vr(op.rb))); } template static llvm_calli fcmeq(T&& a, U&& b) { return {"spu_fcmeq", {std::forward(a), std::forward(b)}}; } void FCMEQ(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { set_vr(op.rt, sext(fcmp_ord(fabs(get_vr(op.ra)) == fabs(get_vr(op.rb))))); return; } register_intrinsic("spu_fcmeq", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); const value_t ab[2]{a, b}; std::bitset<2> safe_float_compare(0); std::bitset<2> safe_int_compare(0); for (u32 i = 0; i < 2; i++) { if (auto [ok, data] = get_const_vector(ab[i].value, m_pos, __LINE__ + i); ok) { safe_float_compare.set(i); safe_int_compare.set(i); for (u32 j = 0; j < 4; j++) { const u32 value = data._u32[j]; const u8 exponent = static_cast(value >> 23); // unsafe if nan if (exponent == 255) { safe_float_compare.reset(i); } // unsafe if denormal or 0 if (!exponent) { safe_int_compare.reset(i); } } } } const auto fa = eval(fabs(a)); const auto fb = eval(fabs(b)); if (safe_float_compare.any()) { return eval(sext(fcmp_ord(fa == fb))); } if (safe_int_compare.any()) { return eval(sext(bitcast(fa) == bitcast(fb))); } if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { return eval(sext(fcmp_ord(fa == fb)) | sext(bitcast(fa) == bitcast(fb))); } else { return eval(sext(fcmp_ord(fa == fb))); } }); set_vr(op.rt, fcmeq(get_vr(op.ra), get_vr(op.rb))); } value_t fma32x4(value_t a, value_t b, value_t c) { // Optimization: Emit only a floating multiply if the addend is zero // This is odd since SPU code could just use the FM instruction, but it seems common enough if (auto [ok, data] = get_const_vector(c.value, m_pos); ok) { if (is_spu_float_zero(data, -1)) { return eval(a * b); } if (!m_use_fma && is_spu_float_zero(data, +1)) { return eval(a * b + fsplat(0.f)); } } if ([&]() { if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) { if (!is_spu_float_zero(data, +1)) { return false; } if (auto [ok0, data0] = get_const_vector(b.value, m_pos); ok0) { if (is_spu_float_zero(data0, +1)) { return true; } } } if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) { if (!is_spu_float_zero(data, -1)) { return false; } if (auto [ok0, data0] = get_const_vector(b.value, m_pos); ok0) { if (is_spu_float_zero(data0, -1)) { return true; } } } return false; }()) { // Just return the added value if both a and b is +0 or -0 (+0 and -0 arent't allowed alone) return c; } if (m_use_fma) { return eval(fmuladd(a, b, c, true)); } // Convert to doubles const auto xa = fpcast(a); const auto xb = fpcast(b); const auto xc = fpcast(c); const auto xr = fmuladd(xa, xb, xc, false); return eval(fpcast(xr)); } template static llvm_calli fnms(T&& a, U&& b, V&& c) { return llvm_calli{"spu_fnms", {std::forward(a), std::forward(b), std::forward(c)}}.set_order_equality_hint(1, 1, 0); } void FNMS(spu_opcode_t op) { // See FMA. 
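// FNMS computes c - a*b per lane. In the approximate/relaxed path below both multiplicands are
// first clamped with clamp_smax, presumably so that the host-side multiply cannot overflow to
// infinity where real SPU hardware would have produced a large "extended range" value instead.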
if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); set_vr(op.rt4, fmuladd(-a, b, c)); return; } register_intrinsic("spu_fnms", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); const auto c = value(ci->getOperand(2)); if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate || g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) { return fma32x4(eval(-clamp_smax(a)), clamp_smax(b), c); } else { return fma32x4(eval(-a), b, c); } }); set_vr(op.rt4, fnms(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); } template static llvm_calli fma(T&& a, U&& b, V&& c) { return llvm_calli{"spu_fma", {std::forward(a), std::forward(b), std::forward(c)}}.set_order_equality_hint(1, 1, 0); } template static llvm_calli re_accurate(T&& a) { return {"spu_re_acc", {std::forward(a)}}; } void FMA(spu_opcode_t op) { // Hardware FMA produces the same result as multiple + add on the limited double range (xfloat). if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); set_vr(op.rt4, fmuladd(a, b, c)); return; } register_intrinsic("spu_fma", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); const auto c = value(ci->getOperand(2)); if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { const auto ma = sext(fcmp_uno(a != fsplat(0.))); const auto mb = sext(fcmp_uno(b != fsplat(0.))); const auto ca = bitcast(bitcast(a) & mb); const auto cb = bitcast(bitcast(b) & ma); return fma32x4(eval(ca), eval(cb), c); } else { return fma32x4(a, b, c); } }); register_intrinsic("spu_re_acc", [&](llvm::CallInst* ci) { const auto div = value(ci->getOperand(0)); return fsplat(1.0f) / div; }); const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); static const auto MT = match(); auto check_sqrt_pattern_for_float = [&](f32 float_value) -> bool { auto match_fnms = [&](f32 float_value) { auto res = match_expr(a, fnms(MT, MT, fsplat(float_value))); if (std::get<0>(res)) return res; return match_expr(b, fnms(MT, MT, fsplat(float_value))); }; auto match_fm_half = [&]() { auto res = match_expr(a, fm(MT, fsplat(0.5))); if (std::get<0>(res)) return res; res = match_expr(a, fm(fsplat(0.5), MT)); if (std::get<0>(res)) return res; res = match_expr(b, fm(MT, fsplat(0.5))); if (std::get<0>(res)) return res; return match_expr(b, fm(fsplat(0.5), MT)); }; if (auto [ok_fnma, a1, b1] = match_fnms(float_value); ok_fnma) { if (auto [ok_fm2, fm_half_mul] = match_fm_half(); ok_fm2 && fm_half_mul.eq(b1)) { if (fm_half_mul.eq(b1)) { if (auto [ok_fm1, a3, b3] = match_expr(c, fm(MT, MT)); ok_fm1 && a3.eq(a1)) { if (auto [ok_sqrte, src] = match_expr(a3, spu_rsqrte(MT)); ok_sqrte && src.eq(b3)) { erase_stores(a, b, c, a3); set_vr(op.rt4, fsqrt(fabs(src))); return true; } } else if (auto [ok_fm1, a3, b3] = match_expr(c, fm(MT, MT)); ok_fm1 && b3.eq(a1)) { if (auto [ok_sqrte, src] = match_expr(b3, spu_rsqrte(MT)); ok_sqrte && src.eq(a3)) { erase_stores(a, b, c, b3); set_vr(op.rt4, fsqrt(fabs(src))); return true; } } } else if (fm_half_mul.eq(a1)) { if (auto [ok_fm1, a3, b3] = match_expr(c, fm(MT, MT)); ok_fm1 && a3.eq(b1)) { if (auto [ok_sqrte, src] = match_expr(a3, spu_rsqrte(MT)); ok_sqrte && src.eq(b3)) { erase_stores(a, b, c, a3); set_vr(op.rt4, fsqrt(fabs(src))); return true; } } else if (auto [ok_fm1, a3, b3] = match_expr(c, fm(MT, MT)); ok_fm1 && b3.eq(b1)) { if (auto [ok_sqrte, src] = 
match_expr(b3, spu_rsqrte(MT)); ok_sqrte && src.eq(a3)) { erase_stores(a, b, c, b3); set_vr(op.rt4, fsqrt(fabs(src))); return true; } } } } } return false; }; if (check_sqrt_pattern_for_float(1.0f)) return; if (check_sqrt_pattern_for_float(std::bit_cast(std::bit_cast(1.0f) + 1))) return; auto check_accurate_reciprocal_pattern_for_float = [&](f32 float_value) -> bool { // FMA(FNMS(div, spu_re(div), float_value), spu_re(div), spu_re(div)) if (auto [ok_fnms, div] = match_expr(a, fnms(MT, b, fsplat(float_value))); ok_fnms && op.rb == op.rc) { if (auto [ok_re] = match_expr(b, spu_re(div)); ok_re) { erase_stores(b); set_vr(op.rt4, re_accurate(div)); return true; } } // FMA(FNMS(spu_re(div), div, float_value), spu_re(div), spu_re(div)) if (auto [ok_fnms, div] = match_expr(a, fnms(b, MT, fsplat(float_value))); ok_fnms && op.rb == op.rc) { if (auto [ok_re] = match_expr(b, spu_re(div)); ok_re) { erase_stores(b); set_vr(op.rt4, re_accurate(div)); return true; } } // FMA(spu_re(div), FNMS(div, spu_re(div), float_value), spu_re(div)) if (auto [ok_fnms, div] = match_expr(a, fnms(MT, a, fsplat(float_value))); ok_fnms && op.ra == op.rc) { if (auto [ok_re] = match_expr(a, spu_re(div)); ok_re) { erase_stores(a); set_vr(op.rt4, re_accurate(div)); return true; } } // FMA(spu_re(div), FNMS(spu_re(div), div, float_value), spu_re(div)) if (auto [ok_fnms, div] = match_expr(a, fnms(a, MT, fsplat(float_value))); ok_fnms && op.ra == op.rc) { if (auto [ok_re] = match_expr(a, spu_re(div)); ok_re) { erase_stores(a); set_vr(op.rt4, re_accurate(div)); return true; } } return false; }; if (check_accurate_reciprocal_pattern_for_float(1.0f)) return; if (check_accurate_reciprocal_pattern_for_float(std::bit_cast(std::bit_cast(1.0f) + 1))) return; // NFS Most Wanted doesn't like this if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) { // Those patterns are not safe vs non optimization as inaccuracy from spu_re will spread with early fm before the accuracy is improved // Match division (fast) // FMA(FNMS(fm(diva<*> spu_re(divb)), divb, diva), spu_re(divb), fm(diva<*> spu_re(divb))) if (auto [ok_fnma, divb, diva] = match_expr(a, fnms(c, MT, MT)); ok_fnma) { if (auto [ok_fm, fm1, fm2] = match_expr(c, fm(MT, MT)); ok_fm && ((fm1.eq(diva) && fm2.eq(b)) || (fm1.eq(b) && fm2.eq(diva)))) { if (auto [ok_re] = match_expr(b, spu_re(divb)); ok_re) { erase_stores(b, c); set_vr(op.rt4, diva / divb); return; } } } // FMA(spu_re(divb), FNMS(fm(diva <*> spu_re(divb)), divb, diva), fm(diva <*> spu_re(divb))) if (auto [ok_fnma, divb, diva] = match_expr(b, fnms(c, MT, MT)); ok_fnma) { if (auto [ok_fm, fm1, fm2] = match_expr(c, fm(MT, MT)); ok_fm && ((fm1.eq(diva) && fm2.eq(a)) || (fm1.eq(a) && fm2.eq(diva)))) { if (auto [ok_re] = match_expr(a, spu_re(divb)); ok_re) { erase_stores(a, c); set_vr(op.rt4, diva / divb); return; } } } } // Not all patterns can be simplified because of block scope // Those todos don't necessarily imply a missing pattern if (auto [ok_re, mystery] = match_expr(a, spu_re(MT)); ok_re) { spu_log.todo("[%s:0x%05x] Unmatched spu_re(a) found in FMA", m_hash, m_pos); } if (auto [ok_re, mystery] = match_expr(b, spu_re(MT)); ok_re) { spu_log.todo("[%s:0x%05x] Unmatched spu_re(b) found in FMA", m_hash, m_pos); } if (auto [ok_resq, mystery] = match_expr(c, spu_rsqrte(MT)); ok_resq) { spu_log.todo("[%s:0x%05x] Unmatched spu_rsqrte(c) found in FMA", m_hash, m_pos); } set_vr(op.rt4, fma(a, b, c)); } template static llvm_calli fms(T&& a, U&& b, V&& c) { return llvm_calli{"spu_fms", {std::forward(a), std::forward(b), 
std::forward(c)}}.set_order_equality_hint(1, 1, 0); } void FMS(spu_opcode_t op) { // See FMA. if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { const auto [a, b, c] = get_vrs(op.ra, op.rb, op.rc); set_vr(op.rt4, fmuladd(a, b, -c)); return; } register_intrinsic("spu_fms", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); const auto b = value(ci->getOperand(1)); const auto c = value(ci->getOperand(2)); if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate) { return fma32x4(clamp_smax(a), clamp_smax(b), eval(-c)); } else { return fma32x4(a, b, eval(-c)); } }); set_vr(op.rt4, fms(get_vr(op.ra), get_vr(op.rb), get_vr(op.rc))); } template static llvm_calli fi(T&& a, U&& b) { return {"spu_fi", {std::forward(a), std::forward(b)}}; } template static llvm_calli spu_re(T&& a) { return {"spu_re", {std::forward(a)}}; } template static llvm_calli spu_rsqrte(T&& a) { return {"spu_rsqrte", {std::forward(a)}}; } void FI(spu_opcode_t op) { register_intrinsic("spu_fi", [&](llvm::CallInst* ci) { // TODO: adjustment for denormals(for accurate xfloat only?) const auto a = bitcast(value(ci->getOperand(0))); const auto b = bitcast(value(ci->getOperand(1))); const auto base = (b & 0x007ffc00u) << 9; // Base fraction const auto ymul = (b & 0x3ff) * (a & 0x7ffff); // Step fraction * Y fraction (fixed point at 2^-32) const auto comparison = (ymul > base); // Should exponent be adjusted? const auto bnew = (base - ymul) >> (zext(comparison) ^ 9); // Shift one less bit if exponent is adjusted const auto base_result = (b & 0xff800000u) | (bnew & ~0xff800000u); // Inject old sign and exponent const auto adjustment = bitcast(sext(comparison)) & (1 << 23); // exponent adjustement for negative bnew return bitcast(base_result - adjustment); }); const auto [a, b] = get_vrs(op.ra, op.rb); switch (g_cfg.core.spu_xfloat_accuracy) { case xfloat_accuracy::approximate: { // For approximate, create a pattern but do not optimize yet register_intrinsic("spu_re", [&](llvm::CallInst* ci) { const auto a = bitcast(value(ci->getOperand(0))); const auto a_fraction = (a >> splat(18)) & splat(0x1F); const auto a_exponent = (a >> splat(23)) & splat(0xFF); const auto a_sign = (a & splat(0x80000000)); value_t b = eval(splat(0)); for (u32 i = 0; i < 4; i++) { const auto eval_fraction = eval(extract(a_fraction, i)); const auto eval_exponent = eval(extract(a_exponent, i)); const auto eval_sign = eval(extract(a_sign, i)); value_t r_fraction = load_const(m_spu_frest_fraction_lut, eval_fraction); value_t r_exponent = load_const(m_spu_frest_exponent_lut, eval_exponent); b = eval(insert(b, i, eval(r_fraction | eval_sign | r_exponent))); } const auto base = (b & 0x007ffc00u) << 9; // Base fraction const auto ymul = (b & 0x3ff) * (a & 0x7ffff); // Step fraction * Y fraction (fixed point at 2^-32) const auto comparison = (ymul > base); // Should exponent be adjusted? 
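// Worked example for the shift below: zext(comparison) is 1 when the interpolation term exceeds
// the base fraction and 0 otherwise, so (zext(comparison) ^ 9) selects a shift of 8 instead of 9.
// Shifting one bit less compensates for the exponent being lowered by the 'adjustment' term later.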
const auto bnew = (base - ymul) >> (zext(comparison) ^ 9); // Shift one less bit if exponent is adjusted const auto base_result = (b & 0xff800000u) | (bnew & ~0xff800000u); // Inject old sign and exponent const auto adjustment = bitcast(sext(comparison)) & (1 << 23); // exponent adjustment for negative bnew return bitcast(base_result - adjustment); }); register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci) { const auto a = bitcast(value(ci->getOperand(0))); const auto a_fraction = (a >> splat(18)) & splat(0x3F); const auto a_exponent = (a >> splat(23)) & splat(0xFF); value_t b = eval(splat(0)); for (u32 i = 0; i < 4; i++) { const auto eval_fraction = eval(extract(a_fraction, i)); const auto eval_exponent = eval(extract(a_exponent, i)); value_t r_fraction = load_const(m_spu_frsqest_fraction_lut, eval_fraction); value_t r_exponent = load_const(m_spu_frsqest_exponent_lut, eval_exponent); b = eval(insert(b, i, eval(r_fraction | r_exponent))); } const auto base = (b & 0x007ffc00u) << 9; // Base fraction const auto ymul = (b & 0x3ff) * (a & 0x7ffff); // Step fraction * Y fraction (fixed point at 2^-32) const auto comparison = (ymul > base); // Should exponent be adjusted? const auto bnew = (base - ymul) >> (zext(comparison) ^ 9); // Shift one less bit if exponent is adjusted const auto base_result = (b & 0xff800000u) | (bnew & ~0xff800000u); // Inject old sign and exponent const auto adjustment = bitcast(sext(comparison)) & (1 << 23); // exponent adjustment for negative bnew return bitcast(base_result - adjustment); }); break; } case xfloat_accuracy::relaxed: { // For relaxed, aggressively optimize and use intrinsics; those make the results vary per CPU register_intrinsic("spu_re", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); return fre(a); }); register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci) { const auto a = value(ci->getOperand(0)); return frsqe(a); }); break; } default: break; } // Do not pattern match for accurate if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::approximate || g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::relaxed) { if (const auto [ok, mb] = match_expr(b, frest(match())); ok && mb.eq(a)) { erase_stores(b); set_vr(op.rt, spu_re(a)); return; } if (const auto [ok, mb] = match_expr(b, frsqest(match())); ok && mb.eq(a)) { erase_stores(b); set_vr(op.rt, spu_rsqrte(a)); return; } } const auto r = eval(fi(a, b)); if (!m_interp_magn && g_cfg.core.spu_xfloat_accuracy != xfloat_accuracy::accurate) spu_log.todo("[%s:0x%05x] Unmatched spu_fi found", m_hash, m_pos); set_vr(op.rt, r); } void CFLTS(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { value_t a = get_vr(op.ra); value_t s; if (m_interp_magn) s = eval(vsplat(bitcast(((1023 + 173) - get_imm(op.i8)) << 52))); else s = eval(fsplat(std::exp2(static_cast(173 - op.i8)))); if (op.i8 != 173 || m_interp_magn) a = eval(a * s); value_t r; if (auto ca = llvm::dyn_cast(a.value)) { const f64 data[4] { ca->getElementAsDouble(0), ca->getElementAsDouble(1), ca->getElementAsDouble(2), ca->getElementAsDouble(3) }; v128 result; for (u32 i = 0; i < 4; i++) { if (data[i] >= std::exp2(31.f)) { result._s32[i] = smax; } else if (data[i] < std::exp2(-31.f)) { result._s32[i] = smin; } else { result._s32[i] = static_cast(data[i]); } } r.value = make_const_vector(result, get_type()); set_vr(op.rt, r); return; } if (llvm::isa(a.value)) { set_vr(op.rt, splat(0)); return; } r.value = m_ir->CreateFPToSI(a.value, get_type()); set_vr(op.rt, r ^ sext(fcmp_ord(a >=
fsplat(std::exp2(31.f))))); } else { value_t a = get_vr(op.ra); value_t s; if (m_interp_magn) s = eval(vsplat(load_const(m_scale_float_to, get_imm(op.i8)))); else s = eval(fsplat(std::exp2(static_cast(static_cast(173 - op.i8))))); if (op.i8 != 173 || m_interp_magn) a = eval(a * s); value_t r; r.value = m_ir->CreateFPToSI(a.value, get_type()); set_vr(op.rt, r ^ sext(bitcast(a) > splat(((31 + 127) << 23) - 1))); } } void CFLTU(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { value_t a = get_vr(op.ra); value_t s; if (m_interp_magn) s = eval(vsplat(bitcast(((1023 + 173) - get_imm(op.i8)) << 52))); else s = eval(fsplat(std::exp2(static_cast(173 - op.i8)))); if (op.i8 != 173 || m_interp_magn) a = eval(a * s); value_t r; if (auto ca = llvm::dyn_cast(a.value)) { const f64 data[4] { ca->getElementAsDouble(0), ca->getElementAsDouble(1), ca->getElementAsDouble(2), ca->getElementAsDouble(3) }; v128 result; for (u32 i = 0; i < 4; i++) { if (data[i] >= std::exp2(32.f)) { result._u32[i] = umax; } else if (data[i] < 0.) { result._u32[i] = 0; } else { result._u32[i] = static_cast(data[i]); } } r.value = make_const_vector(result, get_type()); set_vr(op.rt, r); return; } if (llvm::isa(a.value)) { set_vr(op.rt, splat(0)); return; } r.value = m_ir->CreateFPToUI(a.value, get_type()); set_vr(op.rt, select(fcmp_ord(a >= fsplat(std::exp2(32.f))), splat(-1), r & sext(fcmp_ord(a >= fsplat(0.))))); } else { value_t a = get_vr(op.ra); value_t s; if (m_interp_magn) s = eval(vsplat(load_const(m_scale_float_to, get_imm(op.i8)))); else s = eval(fsplat(std::exp2(static_cast(static_cast(173 - op.i8))))); if (op.i8 != 173 || m_interp_magn) a = eval(a * s); value_t r; if (m_use_avx512) { const auto sc = eval(bitcast(max(bitcast(a),splat(0x0)))); r.value = m_ir->CreateFPToUI(sc.value, get_type()); set_vr(op.rt, r); return; } r.value = m_ir->CreateFPToUI(a.value, get_type()); set_vr(op.rt, select(bitcast(a) > splat(((32 + 127) << 23) - 1), splat(-1), r & ~(bitcast(a) >> 31))); } } void CSFLT(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { value_t a = get_vr(op.ra); value_t r; if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) { r.value = build(data._s32[0], data._s32[1], data._s32[2], data._s32[3]).eval(m_ir); } else { r.value = m_ir->CreateSIToFP(a.value, get_type()); } value_t s; if (m_interp_magn) s = eval(vsplat(bitcast((get_imm(op.i8) + (1023 - 155)) << 52))); else s = eval(fsplat(std::exp2(static_cast(op.i8 - 155)))); if (op.i8 != 155 || m_interp_magn) r = eval(r * s); set_vr(op.rt, r); } else { value_t r; r.value = m_ir->CreateSIToFP(get_vr(op.ra).value, get_type()); value_t s; if (m_interp_magn) s = eval(vsplat(load_const(m_scale_to_float, get_imm(op.i8)))); else s = eval(fsplat(std::exp2(static_cast(static_cast(op.i8 - 155))))); if (op.i8 != 155 || m_interp_magn) r = eval(r * s); set_vr(op.rt, r); } } void CUFLT(spu_opcode_t op) { if (g_cfg.core.spu_xfloat_accuracy == xfloat_accuracy::accurate) { value_t a = get_vr(op.ra); value_t r; if (auto [ok, data] = get_const_vector(a.value, m_pos); ok) { r.value = build(data._u32[0], data._u32[1], data._u32[2], data._u32[3]).eval(m_ir); } else { r.value = m_ir->CreateUIToFP(a.value, get_type()); } value_t s; if (m_interp_magn) s = eval(vsplat(bitcast((get_imm(op.i8) + (1023 - 155)) << 52))); else s = eval(fsplat(std::exp2(static_cast(op.i8 - 155)))); if (op.i8 != 155 || m_interp_magn) r = eval(r * s); set_vr(op.rt, r); } else { value_t r; r.value = m_ir->CreateUIToFP(get_vr(op.ra).value, 
get_type()); value_t s; if (m_interp_magn) s = eval(vsplat(load_const(m_scale_to_float, get_imm(op.i8)))); else s = eval(fsplat(std::exp2(static_cast(static_cast(op.i8 - 155))))); if (op.i8 != 155 || m_interp_magn) r = eval(r * s); set_vr(op.rt, r); } } void make_store_ls(value_t addr, value_t data) { const auto bswapped = byteswap(data); m_ir->CreateStore(bswapped.eval(m_ir), m_ir->CreateGEP(get_type(), m_lsptr, addr.value)); } auto make_load_ls(value_t addr) { value_t data; data.value = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, addr.value)); return byteswap(data); } void STQX(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); for (auto pair : std::initializer_list, value_t>>{{a, b}, {b, a}}) { if (auto [ok, data] = get_const_vector(pair.first.value, m_pos); ok) { data._u32[3] %= SPU_LS_SIZE; if (data._u32[3] % 0x10 == 0) { value_t addr = eval(splat(data._u32[3]) + zext(extract(pair.second, 3) & 0x3fff0)); make_store_ls(addr, get_vr(op.rt)); return; } } } value_t addr = eval(zext((extract(a, 3) + extract(b, 3)) & 0x3fff0)); make_store_ls(addr, get_vr(op.rt)); } void LQX(spu_opcode_t op) { const auto a = get_vr(op.ra); const auto b = get_vr(op.rb); for (auto pair : std::initializer_list, value_t>>{{a, b}, {b, a}}) { if (auto [ok, data] = get_const_vector(pair.first.value, m_pos); ok) { data._u32[3] %= SPU_LS_SIZE; if (data._u32[3] % 0x10 == 0) { value_t addr = eval(splat(data._u32[3]) + zext(extract(pair.second, 3) & 0x3fff0)); set_vr(op.rt, make_load_ls(addr)); return; } } } value_t addr = eval(zext((extract(a, 3) + extract(b, 3)) & 0x3fff0)); set_vr(op.rt, make_load_ls(addr)); } void STQA(spu_opcode_t op) { value_t addr = eval((get_imm(op.i16, false) << 2) & 0x3fff0); make_store_ls(addr, get_vr(op.rt)); } void LQA(spu_opcode_t op) { value_t addr = eval((get_imm(op.i16, false) << 2) & 0x3fff0); set_vr(op.rt, make_load_ls(addr)); } llvm::Value* get_pc_as_u64(u32 addr) { return m_ir->CreateAdd(m_ir->CreateZExt(m_base_pc, get_type()), m_ir->getInt64(addr - m_base)); } void STQR(spu_opcode_t op) // { value_t addr; addr.value = m_interp_magn ? m_ir->CreateZExt(m_interp_pc, get_type()) : get_pc_as_u64(m_pos); addr = eval(((get_imm(op.i16, false) << 2) + addr) & (m_interp_magn ? 0x3fff0 : ~0xf)); make_store_ls(addr, get_vr(op.rt)); } void LQR(spu_opcode_t op) // { value_t addr; addr.value = m_interp_magn ? m_ir->CreateZExt(m_interp_pc, get_type()) : get_pc_as_u64(m_pos); addr = eval(((get_imm(op.i16, false) << 2) + addr) & (m_interp_magn ? 
0x3fff0 : ~0xf)); set_vr(op.rt, make_load_ls(addr)); } void STQD(spu_opcode_t op) { if (m_finfo && m_finfo->fn) { if (op.rt <= s_reg_sp || (op.rt >= s_reg_80 && op.rt <= s_reg_127)) { if (m_block->bb->reg_save_dom[op.rt] && get_reg_raw(op.rt) == m_finfo->load[op.rt]) { return; } } } value_t addr = eval(zext(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm(op.si10) << 4)); make_store_ls(addr, get_vr(op.rt)); } void LQD(spu_opcode_t op) { value_t addr = eval(zext(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm(op.si10) << 4)); set_vr(op.rt, make_load_ls(addr)); } void make_halt(value_t cond) { const auto next = llvm::BasicBlock::Create(m_context, "", m_function); const auto halt = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(cond.value, halt, next, m_md_unlikely); m_ir->SetInsertPoint(halt); if (m_interp_magn) m_ir->CreateStore(m_function->getArg(2), spu_ptr(&spu_thread::pc)); else update_pc(); const auto ptr = _ptr(m_memptr, 0xffdead00); m_ir->CreateStore(m_ir->getInt32("HALT"_u32), ptr); m_ir->CreateBr(next); m_ir->SetInsertPoint(next); } void HGT(spu_opcode_t op) { const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); make_halt(cond); } void HEQ(spu_opcode_t op) { const auto cond = eval(extract(get_vr(op.ra), 3) == extract(get_vr(op.rb), 3)); make_halt(cond); } void HLGT(spu_opcode_t op) { const auto cond = eval(extract(get_vr(op.ra), 3) > extract(get_vr(op.rb), 3)); make_halt(cond); } void HGTI(spu_opcode_t op) { const auto cond = eval(extract(get_vr(op.ra), 3) > get_imm(op.si10)); make_halt(cond); } void HEQI(spu_opcode_t op) { const auto cond = eval(extract(get_vr(op.ra), 3) == get_imm(op.si10)); make_halt(cond); } void HLGTI(spu_opcode_t op) { const auto cond = eval(extract(get_vr(op.ra), 3) > get_imm(op.si10)); make_halt(cond); } void HBR([[maybe_unused]] spu_opcode_t op) // { // TODO: use the hint. } void HBRA([[maybe_unused]] spu_opcode_t op) // { // TODO: use the hint. } void HBRR([[maybe_unused]] spu_opcode_t op) // { // TODO: use the hint. 
} // TODO static u32 exec_check_interrupts(spu_thread* _spu, u32 addr) { _spu->set_interrupt_status(true); if (_spu->ch_events.load().count) { _spu->interrupts_enabled = false; _spu->srr0 = addr; // Test for BR/BRA instructions (they are equivalent at zero pc) const u32 br = _spu->_ref(0); if ((br & 0xfd80007f) == 0x30000000) { return (br >> 5) & 0x3fffc; } return 0; } return addr; } llvm::BasicBlock* add_block_indirect(spu_opcode_t op, value_t addr, bool ret = true) { if (m_interp_magn) { m_interp_bblock = llvm::BasicBlock::Create(m_context, "", m_function); const auto cblock = m_ir->GetInsertBlock(); const auto result = llvm::BasicBlock::Create(m_context, "", m_function); const auto e_exec = llvm::BasicBlock::Create(m_context, "", m_function); const auto d_test = llvm::BasicBlock::Create(m_context, "", m_function); const auto d_exec = llvm::BasicBlock::Create(m_context, "", m_function); const auto d_done = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); m_ir->CreateCondBr(get_imm(op.e).value, e_exec, d_test, m_md_unlikely); m_ir->SetInsertPoint(e_exec); const auto e_addr = call("spu_check_interrupts", &exec_check_interrupts, m_thread, addr.value); m_ir->CreateBr(d_test); m_ir->SetInsertPoint(d_test); const auto target = m_ir->CreatePHI(get_type(), 2); target->addIncoming(addr.value, result); target->addIncoming(e_addr, e_exec); m_ir->CreateCondBr(get_imm(op.d).value, d_exec, d_done, m_md_unlikely); m_ir->SetInsertPoint(d_exec); m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); m_ir->CreateBr(d_done); m_ir->SetInsertPoint(d_done); m_ir->CreateBr(m_interp_bblock); m_ir->SetInsertPoint(cblock); m_interp_pc = target; return result; } if (llvm::isa(addr.value)) { // Fixed branch excludes the possibility it's a function return (TODO) ret = false; } if (m_finfo && m_finfo->fn && op.opcode) { const auto cblock = m_ir->GetInsertBlock(); const auto result = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); ret_function(); m_ir->SetInsertPoint(cblock); return result; } // Load stack addr if necessary value_t sp; if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe) { if (op.opcode) { sp = eval(extract(get_reg_fixed(1), 3) & 0x3fff0); } else { sp.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::gpr, 1, &v128::_u32, 3)); } } const auto cblock = m_ir->GetInsertBlock(); const auto result = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); if (op.e) { addr.value = call("spu_check_interrupts", &exec_check_interrupts, m_thread, addr.value); } if (op.d) { m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); } m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); if (ret && g_cfg.core.spu_block_size >= spu_block_size_type::mega) { // Compare address stored in stack mirror with addr const auto stack0 = eval(zext(sp) + ::offset32(&spu_thread::stack_mirror)); const auto stack1 = eval(stack0 + 8); const auto _ret = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_thread, stack0.value)); const auto link = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_thread, stack1.value)); const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); const auto done = llvm::BasicBlock::Create(m_context, "", m_function); const auto next = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->CreateCondBr(m_ir->CreateICmpEQ(addr.value, m_ir->CreateTrunc(link, get_type())), next, fail, m_md_likely); 
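// Fast-return path (maintained by set_link): each call frame in spu_thread::stack_mirror appears
// to hold the expected return PC, a copy of the instruction word at that PC, and the compiled
// continuation to jump to. If the indirect target still matches the recorded PC and the code at
// that address is unchanged, the return becomes a direct tail call to the recorded chunk; any
// mismatch falls through to the generic dispatch emitted further down.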
m_ir->SetInsertPoint(next); const auto cmp2 = m_ir->CreateLoad(get_type(), m_ir->CreateGEP(get_type(), m_lsptr, addr.value)); m_ir->CreateCondBr(m_ir->CreateICmpEQ(cmp2, m_ir->CreateTrunc(_ret, get_type())), done, fail, m_md_likely); m_ir->SetInsertPoint(done); // Clear stack mirror and return by tail call to the provided return address m_ir->CreateStore(splat(-1).eval(m_ir), m_ir->CreateGEP(get_type(), m_thread, stack0.value)); const auto targ = m_ir->CreateAdd(m_ir->CreateLShr(_ret, 32), get_segment_base()); const auto type = m_finfo->chunk->getFunctionType(); const auto fval = m_ir->CreateIntToPtr(targ, type->getPointerTo()); tail_chunk({type, fval}, m_ir->CreateTrunc(m_ir->CreateLShr(link, 32), get_type())); m_ir->SetInsertPoint(fail); } if (g_cfg.core.spu_block_size >= spu_block_size_type::mega) { // Try to load chunk address from the function table const auto fail = llvm::BasicBlock::Create(m_context, "", m_function); const auto done = llvm::BasicBlock::Create(m_context, "", m_function); const auto ad32 = m_ir->CreateSub(addr.value, m_base_pc); m_ir->CreateCondBr(m_ir->CreateICmpULT(ad32, m_ir->getInt32(m_size)), done, fail, m_md_likely); m_ir->SetInsertPoint(done); const auto ad64 = m_ir->CreateZExt(ad32, get_type()); const auto pptr = dyn_cast(m_ir->CreateGEP(m_function_table->getValueType(), m_function_table, {m_ir->getInt64(0), m_ir->CreateLShr(ad64, 2, "", true)})); tail_chunk({m_dispatch->getFunctionType(), m_ir->CreateLoad(pptr->getResultElementType(), pptr)}); m_ir->SetInsertPoint(fail); } tail_chunk(nullptr); m_ir->SetInsertPoint(cblock); return result; } llvm::BasicBlock* add_block_next() { if (m_interp_magn) { const auto cblock = m_ir->GetInsertBlock(); m_ir->SetInsertPoint(m_interp_bblock); const auto target = m_ir->CreatePHI(get_type(), 2); target->addIncoming(m_interp_pc_next, cblock); target->addIncoming(m_interp_pc, m_interp_bblock->getSinglePredecessor()); m_ir->SetInsertPoint(cblock); m_interp_pc = target; return m_interp_bblock; } return add_block(m_pos + 4); } void BIZ(spu_opcode_t op) // { if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto rt = get_vr(op.rt); // Checking for zero doesn't care about the order of the bytes, // so load the data before it's byteswapped if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) { m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(bitcast(as), 0) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); return; } const auto ox = get_vr(op.rt); // Instead of extracting the value generated by orx, just test the input to orx with ptest if (auto [ok, as] = match_expr(ox, orx(match())); ok) { m_block->block_end = m_ir->GetInsertBlock(); const auto a = extract(bitcast(as), 0); const auto b = extract(bitcast(as), 1); const auto cond = eval((a | b) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); return; } // Check sign bit instead (optimization) if (match_vr(op.rt, [&](auto c, auto MP) { using VT = typename decltype(MP)::type; if (auto [ok, x] = match_expr(c, sext(match]>())); ok) { const auto a = get_vr(op.rt); const auto cond = eval(bitcast(trunc(a)) >= 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); return true; } 
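// No sign-extended comparison feeding rt: fall through so the generic path below extracts the
// preferred word (element 3) and tests it against zero directly.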
return false; })) { return; } const auto cond = eval(extract(get_vr(op.rt), 3) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); } void BINZ(spu_opcode_t op) // { if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto rt = get_vr(op.rt); // Checking for zero doesn't care about the order of the bytes, // so load the data before it's byteswapped if (auto [ok, as] = match_expr(rt, byteswap(match())); ok) { m_block->block_end = m_ir->GetInsertBlock(); const auto cond = eval(extract(bitcast(as), 0) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); return; } const auto ox = get_vr(op.rt); // Instead of extracting the value generated by orx, just test the input to orx with ptest if (auto [ok, as] = match_expr(ox, orx(match())); ok) { m_block->block_end = m_ir->GetInsertBlock(); const auto a = extract(bitcast(as), 0); const auto b = extract(bitcast(as), 1); const auto cond = eval((a | b) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); return; } // Check sign bit instead (optimization) if (match_vr(op.rt, [&](auto c, auto MP) { using VT = typename decltype(MP)::type; if (auto [ok, x] = match_expr(c, sext(match]>())); ok) { const auto a = get_vr(op.rt); const auto cond = eval(bitcast(trunc(a)) < 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); return true; } return false; })) { return; } const auto cond = eval(extract(get_vr(op.rt), 3) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); } void BIHZ(spu_opcode_t op) // { if (m_block) m_block->block_end = m_ir->GetInsertBlock(); // Check sign bits of 2 vector elements (optimization) if (match_vr(op.rt, [&](auto c, auto MP) { using VT = typename decltype(MP)::type; if (auto [ok, x] = match_expr(c, sext(match]>())); ok) { const auto a = get_vr(op.rt); const auto cond = eval((bitcast(trunc(a)) & 0x3000) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); return true; } return false; })) { return; } const auto cond = eval(extract(get_vr(op.rt), 6) == 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); } void BIHNZ(spu_opcode_t op) // { if (m_block) m_block->block_end = m_ir->GetInsertBlock(); // Check sign bits of 2 vector elements (optimization) if (match_vr(op.rt, [&](auto c, auto MP) { using VT = typename decltype(MP)::type; if (auto [ok, x] = match_expr(c, sext(match]>())); ok) { const auto a = get_vr(op.rt); const auto cond = eval((bitcast(trunc(a)) & 0x3000) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); return true; } return false; })) { return; } const auto cond = eval(extract(get_vr(op.rt), 6) != 0); const auto addr = eval(extract(get_vr(op.ra), 3) & 
0x3fffc); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(cond.value, target, add_block_next()); } void BI(spu_opcode_t op) // { if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); if (m_interp_magn) { m_ir->CreateBr(add_block_indirect(op, addr)); return; } // Create jump table if necessary (TODO) const auto tfound = m_targets.find(m_pos); if (op.d && tfound != m_targets.end() && tfound->second.size() == 1 && tfound->second[0] == spu_branch_target(m_pos, 1)) { // Interrupts-disable pattern m_ir->CreateStore(m_ir->getFalse(), spu_ptr(&spu_thread::interrupts_enabled)); return; } if (!op.d && !op.e && tfound != m_targets.end() && tfound->second.size() > 1) { // Shift aligned address for switch const auto addrfx = m_ir->CreateSub(addr.value, m_base_pc); const auto sw_arg = m_ir->CreateLShr(addrfx, 2, "", true); // Initialize jump table targets std::map targets; for (u32 target : tfound->second) { if (m_block_info[target / 4]) { targets.emplace(target, nullptr); } } // Initialize target basic blocks for (auto& pair : targets) { pair.second = add_block(pair.first); } if (targets.empty()) { // Emergency exit spu_log.error("[%s] [0x%05x] No jump table targets at 0x%05x (%u)", m_hash, m_entry, m_pos, tfound->second.size()); m_ir->CreateBr(add_block_indirect(op, addr)); return; } // Get jump table bounds (optimization) const u32 start = targets.begin()->first; const u32 end = targets.rbegin()->first + 4; // Emit switch instruction aiming for a jumptable in the end (indirectbr could guarantee it) const auto sw = m_ir->CreateSwitch(sw_arg, llvm::BasicBlock::Create(m_context, "", m_function), (end - start) / 4); for (u32 pos = start; pos < end; pos += 4) { if (m_block_info[pos / 4] && targets.count(pos)) { const auto found = targets.find(pos); if (found != targets.end()) { sw->addCase(m_ir->getInt32(pos / 4 - m_base / 4), found->second); continue; } } sw->addCase(m_ir->getInt32(pos / 4 - m_base / 4), sw->getDefaultDest()); } // Exit function on unexpected target m_ir->SetInsertPoint(sw->getDefaultDest()); m_ir->CreateStore(addr.value, spu_ptr(&spu_thread::pc)); if (m_finfo && m_finfo->fn) { // Can't afford external tail call in true functions m_ir->CreateStore(m_ir->getInt32("BIJT"_u32), _ptr(m_memptr, 0xffdead20)); m_ir->CreateCall(m_test_state, {m_thread}); m_ir->CreateBr(sw->getDefaultDest()); } else { tail_chunk(nullptr); } } else { // Simple indirect branch m_ir->CreateBr(add_block_indirect(op, addr)); } } void BISL(spu_opcode_t op) // { if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); set_link(op); m_ir->CreateBr(add_block_indirect(op, addr, false)); } void IRET(spu_opcode_t op) // { if (m_block) m_block->block_end = m_ir->GetInsertBlock(); value_t srr0; srr0.value = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::srr0)); m_ir->CreateBr(add_block_indirect(op, srr0)); } void BISLED(spu_opcode_t op) // { if (m_block) m_block->block_end = m_ir->GetInsertBlock(); const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc); set_link(op); const auto mask = m_ir->CreateTrunc(m_ir->CreateLShr(m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::ch_events), true), 32), get_type()); const auto res = call("spu_get_events", &exec_get_events, m_thread, mask); const auto target = add_block_indirect(op, addr); m_ir->CreateCondBr(m_ir->CreateICmpNE(res, m_ir->getInt32(0)), target, add_block_next()); } void BRZ(spu_opcode_t op) // { if (m_interp_magn) { 
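// In interpreter mode a conditional branch emits no control flow: the taken target is computed
// from the current interpreter PC and then chosen against the fall-through PC with a select, and
// instruction dispatch simply continues from whichever address wins.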
	void BRZ(spu_opcode_t op) //
	{
		if (m_interp_magn)
		{
			value_t<u32> target;
			target.value = m_interp_pc;
			target = eval((target + (get_imm<u32>(op.i16, false) << 2)) & 0x3fffc);
			m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 3) == 0).value, target.value, m_interp_pc_next);
			return;
		}

		const u32 target = spu_branch_target(m_pos, op.i16);

		const auto rt = get_vr<u8[16]>(op.rt);

		// Checking for zero doesn't care about the order of the bytes,
		// so load the data before it's byteswapped
		if (auto [ok, as] = match_expr(rt, byteswap(match())); ok)
		{
			if (target != m_pos + 4)
			{
				m_block->block_end = m_ir->GetInsertBlock();
				const auto cond = eval(extract(bitcast<u32[4]>(as), 0) == 0);
				m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
				return;
			}
		}

		const auto ox = get_vr<u32[4]>(op.rt);

		// Instead of extracting the value generated by orx, just test the input to orx with ptest
		if (auto [ok, as] = match_expr(ox, orx(match())); ok)
		{
			if (target != m_pos + 4)
			{
				m_block->block_end = m_ir->GetInsertBlock();
				const auto a = extract(bitcast<u64[2]>(as), 0);
				const auto b = extract(bitcast<u64[2]>(as), 1);
				const auto cond = eval((a | b) == 0);
				m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
				return;
			}
		}

		// Check sign bit instead (optimization)
		if (match_vr<s8[16], s16[8], s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
		{
			using VT = typename decltype(MP)::type;

			if (auto [ok, x] = match_expr(c, sext<VT>(match<bool[std::extent_v<VT>]>())); ok)
			{
				if (target != m_pos + 4)
				{
					m_block->block_end = m_ir->GetInsertBlock();
					const auto a = get_vr<s8[16]>(op.rt);
					const auto cond = eval(bitcast<s16>(trunc<bool[16]>(a)) >= 0);
					m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
					return true;
				}
			}

			return false;
		}))
		{
			return;
		}

		if (target != m_pos + 4)
		{
			m_block->block_end = m_ir->GetInsertBlock();
			const auto cond = eval(extract(get_vr(op.rt), 3) == 0);
			m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
		}
	}

	void BRNZ(spu_opcode_t op) //
	{
		if (m_interp_magn)
		{
			value_t<u32> target;
			target.value = m_interp_pc;
			target = eval((target + (get_imm<u32>(op.i16, false) << 2)) & 0x3fffc);
			m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr(op.rt), 3) != 0).value, target.value, m_interp_pc_next);
			return;
		}

		const u32 target = spu_branch_target(m_pos, op.i16);

		const auto rt = get_vr<u8[16]>(op.rt);

		// Checking for zero doesn't care about the order of the bytes,
		// so load the data before it's byteswapped
		if (auto [ok, as] = match_expr(rt, byteswap(match())); ok)
		{
			if (target != m_pos + 4)
			{
				m_block->block_end = m_ir->GetInsertBlock();
				const auto cond = eval(extract(bitcast<u32[4]>(as), 0) != 0);
				m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
				return;
			}
		}

		const auto ox = get_vr<u32[4]>(op.rt);

		// Instead of extracting the value generated by orx, just test the input to orx with ptest
		if (auto [ok, as] = match_expr(ox, orx(match())); ok)
		{
			if (target != m_pos + 4)
			{
				m_block->block_end = m_ir->GetInsertBlock();
				const auto a = extract(bitcast<u64[2]>(as), 0);
				const auto b = extract(bitcast<u64[2]>(as), 1);
				const auto cond = eval((a | b) != 0);
				m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
				return;
			}
		}

		// Check sign bit instead (optimization)
		if (match_vr<s8[16], s16[8], s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
		{
			using VT = typename decltype(MP)::type;

			if (auto [ok, x] = match_expr(c, sext<VT>(match<bool[std::extent_v<VT>]>())); ok)
			{
				if (target != m_pos + 4)
				{
					m_block->block_end = m_ir->GetInsertBlock();
					const auto a = get_vr<s8[16]>(op.rt);
					const auto cond = eval(bitcast<s16>(trunc<bool[16]>(a)) < 0);
					m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
					return true;
				}
			}

			return false;
		}))
		{
			return;
		}

		if (target != m_pos + 4)
		{
			m_block->block_end = m_ir->GetInsertBlock();
			const auto cond = eval(extract(get_vr(op.rt), 3) != 0);
			m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
		}
	}
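	// BRHZ/BRHNZ below (like BIHZ/BIHNZ above) test the preferred halfword slot, u16 element 6,
	// i.e. bytes 12-13 of the register. When the register is a sign-extended comparison mask,
	// trunc<bool[16]> packs one bit per byte into a 16-bit value, so bits 12 and 13 correspond
	// to that halfword and masking with 0x3000 tests it without an extract; the word-sized
	// handlers use the sign bit (bit 15, byte 15) of the same packed value for element 3.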
	void BRHZ(spu_opcode_t op) //
	{
		if (m_interp_magn)
		{
			value_t<u32> target;
			target.value = m_interp_pc;
			target = eval((target + (get_imm<u32>(op.i16, false) << 2)) & 0x3fffc);
			m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr<u16[8]>(op.rt), 6) == 0).value, target.value, m_interp_pc_next);
			return;
		}

		const u32 target = spu_branch_target(m_pos, op.i16);

		// Check sign bits of 2 vector elements (optimization)
		if (match_vr<s8[16], s16[8], s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
		{
			using VT = typename decltype(MP)::type;

			if (auto [ok, x] = match_expr(c, sext<VT>(match<bool[std::extent_v<VT>]>())); ok)
			{
				if (target != m_pos + 4)
				{
					m_block->block_end = m_ir->GetInsertBlock();
					const auto a = get_vr<s8[16]>(op.rt);
					const auto cond = eval((bitcast<s16>(trunc<bool[16]>(a)) & 0x3000) == 0);
					m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
					return true;
				}
			}

			return false;
		}))
		{
			return;
		}

		if (target != m_pos + 4)
		{
			m_block->block_end = m_ir->GetInsertBlock();
			const auto cond = eval(extract(get_vr<u16[8]>(op.rt), 6) == 0);
			m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
		}
	}

	void BRHNZ(spu_opcode_t op) //
	{
		if (m_interp_magn)
		{
			value_t<u32> target;
			target.value = m_interp_pc;
			target = eval((target + (get_imm<u32>(op.i16, false) << 2)) & 0x3fffc);
			m_interp_pc = m_ir->CreateSelect(eval(extract(get_vr<u16[8]>(op.rt), 6) != 0).value, target.value, m_interp_pc_next);
			return;
		}

		const u32 target = spu_branch_target(m_pos, op.i16);

		// Check sign bits of 2 vector elements (optimization)
		if (match_vr<s8[16], s16[8], s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
		{
			using VT = typename decltype(MP)::type;

			if (auto [ok, x] = match_expr(c, sext<VT>(match<bool[std::extent_v<VT>]>())); ok)
			{
				if (target != m_pos + 4)
				{
					m_block->block_end = m_ir->GetInsertBlock();
					const auto a = get_vr<s8[16]>(op.rt);
					const auto cond = eval((bitcast<s16>(trunc<bool[16]>(a)) & 0x3000) != 0);
					m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
					return true;
				}
			}

			return false;
		}))
		{
			return;
		}

		if (target != m_pos + 4)
		{
			m_block->block_end = m_ir->GetInsertBlock();
			const auto cond = eval(extract(get_vr<u16[8]>(op.rt), 6) != 0);
			m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
		}
	}

	void BRA(spu_opcode_t op) //
	{
		if (m_interp_magn)
		{
			m_interp_pc = eval((get_imm<u32>(op.i16, false) << 2) & 0x3fffc).value;
			return;
		}

		const u32 target = spu_branch_target(0, op.i16);

		m_block->block_end = m_ir->GetInsertBlock();
		m_ir->CreateBr(add_block(target, true));
	}

	void BRASL(spu_opcode_t op) //
	{
		set_link(op);
		BRA(op);
	}

	void BR(spu_opcode_t op) //
	{
		if (m_interp_magn)
		{
			value_t<u32> target;
			target.value = m_interp_pc;
			target = eval((target + (get_imm<u32>(op.i16, false) << 2)) & 0x3fffc);
			m_interp_pc = target.value;
			return;
		}

		const u32 target = spu_branch_target(m_pos, op.i16);

		if (target != m_pos + 4)
		{
			m_block->block_end = m_ir->GetInsertBlock();
			m_ir->CreateBr(add_block(target));
		}
	}

	void BRSL(spu_opcode_t op) //
	{
		set_link(op);

		const u32 target = spu_branch_target(m_pos, op.i16);

		if (m_finfo && m_finfo->fn && target != m_pos + 4)
		{
			if (auto fn = add_function(target)->fn)
			{
				call_function(fn);
				return;
			}
			else
			{
				spu_log.fatal("[0x%x] Can't add function 0x%x", m_pos, target);
				return;
			}
		}

		BR(op);
	}
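	// set_link below, besides writing the link register, optionally primes the stack mirror:
	// for mega/giga block sizes it stores the segment-relative chunk pointer of the return site
	// together with the return PC at stack_mirror + $SP, presumably so an indirect return can
	// validate the prediction and jump straight to the compiled chunk instead of going through
	// the dispatcher.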
	void set_link(spu_opcode_t op)
	{
		if (m_interp_magn)
		{
			value_t<u32> next;
			next.value = m_interp_pc_next;
			set_vr(op.rt, insert(splat<u32[4]>(0), 3, next));
			return;
		}

		set_vr(op.rt, insert(splat<u32[4]>(0), 3, value<u32>(get_pc(m_pos + 4)) & 0x3fffc));

		if (m_finfo && m_finfo->fn)
		{
			return;
		}

		if (g_cfg.core.spu_block_size >= spu_block_size_type::mega && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1])
		{
			// Store the return function chunk address at the stack mirror
			const auto pfunc = add_function(m_pos + 4);
			const auto stack0 = eval(zext<u64>(extract(get_reg_fixed(1), 3) & 0x3fff0) + ::offset32(&spu_thread::stack_mirror));
			const auto stack1 = eval(stack0 + 8);
			const auto rel_ptr = m_ir->CreateSub(m_ir->CreatePtrToInt(pfunc->chunk, get_type<u64>()), get_segment_base());
			const auto ptr_plus_op = m_ir->CreateOr(m_ir->CreateShl(rel_ptr, 32), m_ir->getInt64(m_next_op));
			const auto base_plus_pc = m_ir->CreateOr(m_ir->CreateShl(m_ir->CreateZExt(m_base_pc, get_type<u64>()), 32), m_ir->getInt64(m_pos + 4));
			m_ir->CreateStore(ptr_plus_op, m_ir->CreateGEP(get_type<u8>(), m_thread, stack0.value));
			m_ir->CreateStore(base_plus_pc, m_ir->CreateGEP(get_type<u8>(), m_thread, stack1.value));
		}
	}

	llvm::Value* get_segment_base()
	{
		const auto type = llvm::FunctionType::get(get_type<void>(), {}, false);
		const auto func = llvm::cast<llvm::Function>(m_module->getOrInsertFunction("spu_segment_base", type).getCallee());
		m_engine->updateGlobalMapping("spu_segment_base", reinterpret_cast<u64>(jit_runtime::alloc(0, 0)));
		return m_ir->CreatePtrToInt(func, get_type<u64>());
	}

	static decltype(&spu_llvm_recompiler::UNK) decode(u32 op);
};

std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u8 magn)
{
	return std::make_unique<spu_llvm_recompiler>(magn);
}

const spu_decoder<spu_llvm_recompiler> s_spu_llvm_decoder;

decltype(&spu_llvm_recompiler::UNK) spu_llvm_recompiler::decode(u32 op)
{
	return s_spu_llvm_decoder.decode(op);
}

#else

std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_llvm_recompiler(u8 magn)
{
	if (magn)
	{
		return nullptr;
	}

	fmt::throw_exception("LLVM is not available in this build.");
}

#endif // LLVM_AVAILABLE