#include "stdafx.h" #include "Utilities/JIT.h" #include "Utilities/sysinfo.h" #include "Emu/Memory/vm.h" #include "Emu/System.h" #include "Emu/IdManager.h" #include "Emu/Cell/PPUThread.h" #include "Emu/Cell/ErrorCodes.h" #include "Emu/Cell/lv2/sys_spu.h" #include "Emu/Cell/lv2/sys_event_flag.h" #include "Emu/Cell/lv2/sys_event.h" #include "Emu/Cell/lv2/sys_interrupt.h" #include "Emu/Cell/SPUDisAsm.h" #include "Emu/Cell/SPUThread.h" #include "Emu/Cell/SPUInterpreter.h" #include "Emu/Cell/SPURecompiler.h" #include "Emu/Cell/RawSPUThread.h" #include #include #include #include // Verify AVX availability for TSX transactions static const bool s_tsx_avx = utils::has_avx(); // For special case static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx(); static FORCE_INLINE bool cmp_rdata(const decltype(spu_thread::rdata)& lhs, const decltype(spu_thread::rdata)& rhs) { const v128 a = (lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]); const v128 b = (lhs[2] ^ rhs[2]) | (lhs[3] ^ rhs[3]); const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]); const v128 d = (lhs[6] ^ rhs[6]) | (lhs[7] ^ rhs[7]); const v128 r = (a | b) | (c | d); return !(r._u64[0] | r._u64[1]); } static FORCE_INLINE void mov_rdata(decltype(spu_thread::rdata)& dst, const decltype(spu_thread::rdata)& src) { { const v128 data0 = src[0]; const v128 data1 = src[1]; const v128 data2 = src[2]; dst[0] = data0; dst[1] = data1; dst[2] = data2; } { const v128 data0 = src[3]; const v128 data1 = src[4]; const v128 data2 = src[5]; dst[3] = data0; dst[4] = data1; dst[5] = data2; } { const v128 data0 = src[6]; const v128 data1 = src[7]; dst[6] = data0; dst[7] = data1; } } extern u64 get_timebased_time(); extern u64 get_system_time(); extern const spu_decoder g_spu_interpreter_precise; extern const spu_decoder g_spu_interpreter_fast; extern thread_local u64 g_tls_fault_spu; template <> void fmt_class_string::format(std::string& out, u64 arg) { format_enum(out, arg, [](spu_decoder_type type) { switch (type) { case spu_decoder_type::precise: return "Interpreter (precise)"; case spu_decoder_type::fast: return "Interpreter (fast)"; case spu_decoder_type::asmjit: return "Recompiler (ASMJIT)"; case spu_decoder_type::llvm: return "Recompiler (LLVM)"; } return unknown; }); } template <> void fmt_class_string::format(std::string& out, u64 arg) { format_enum(out, arg, [](spu_block_size_type type) { switch (type) { case spu_block_size_type::safe: return "Safe"; case spu_block_size_type::mega: return "Mega"; case spu_block_size_type::giga: return "Giga"; } return unknown; }); } namespace spu { namespace scheduler { std::array, 65536> atomic_instruction_table = {}; constexpr u32 native_jiffy_duration_us = 1500; //About 1ms resolution with a half offset void acquire_pc_address(u32 pc, u32 timeout_ms = 3) { const u8 max_concurrent_instructions = (u8)g_cfg.core.preferred_spu_threads; const u32 pc_offset = pc >> 2; if (atomic_instruction_table[pc_offset].load(std::memory_order_consume) >= max_concurrent_instructions) { if (timeout_ms > 0) { const u64 timeout = timeout_ms * 1000u; //convert to microseconds const u64 start = get_system_time(); auto remaining = timeout; while (atomic_instruction_table[pc_offset].load(std::memory_order_consume) >= max_concurrent_instructions) { if (remaining >= native_jiffy_duration_us) std::this_thread::sleep_for(1ms); else std::this_thread::yield(); const auto now = get_system_time(); const auto elapsed = now - start; if (elapsed > timeout) break; remaining = timeout - elapsed; } } else { //Slight pause if function is overburdened 
		void release_pc_address(u32 pc)
		{
			const u32 pc_offset = pc >> 2;

			atomic_instruction_table[pc_offset]--;
		}

		struct concurrent_execution_watchdog
		{
			u32 pc = 0;
			bool active = false;

			concurrent_execution_watchdog(spu_thread& spu)
				: pc(spu.pc)
			{
				if (g_cfg.core.preferred_spu_threads > 0)
				{
					acquire_pc_address(pc, (u32)g_cfg.core.spu_delay_penalty);
					active = true;
				}
			}

			~concurrent_execution_watchdog()
			{
				if (active)
					release_pc_address(pc);
			}
		};
	}
}
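// The helpers below are assembled once at startup with asmjit and implement
// the SPU reservation primitives on top of hardware transactions (TSX). They
// share one register layout: rbx points at the 64-bit reservation word for
// the 128-byte line, rbp at the line data in host memory, and r12 counts
// transaction attempts. As used by the callers further down, the store
// helpers return 0 = failed, 1 = succeeded, 2 = fall back to the software
// path.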
const auto spu_putllc_tx = build_function_asm<u32(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
{
	using namespace asmjit;

	Label fall = c.newLabel();
	Label fail = c.newLabel();
	Label _ret = c.newLabel();
	Label skip = c.newLabel();
	Label next = c.newLabel();

	if (utils::has_avx() && !s_tsx_avx)
	{
		c.vzeroupper();
	}

	// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
	c.push(x86::rbp);
	c.push(x86::r13);
	c.push(x86::r12);
	c.push(x86::rbx);
	c.sub(x86::rsp, 168);
#ifdef _WIN32
	if (s_tsx_avx)
	{
		c.vmovups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
		c.vmovups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
	}
	else
	{
		c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
		c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
		c.movups(x86::oword_ptr(x86::rsp, 32), x86::xmm8);
		c.movups(x86::oword_ptr(x86::rsp, 48), x86::xmm9);
		c.movups(x86::oword_ptr(x86::rsp, 64), x86::xmm10);
		c.movups(x86::oword_ptr(x86::rsp, 80), x86::xmm11);
		c.movups(x86::oword_ptr(x86::rsp, 96), x86::xmm12);
		c.movups(x86::oword_ptr(x86::rsp, 112), x86::xmm13);
		c.movups(x86::oword_ptr(x86::rsp, 128), x86::xmm14);
		c.movups(x86::oword_ptr(x86::rsp, 144), x86::xmm15);
	}
#endif

	// Prepare registers
	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
	c.mov(x86::rbx, x86::qword_ptr(x86::rax));
	c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
	c.mov(x86::rbp, x86::qword_ptr(x86::rax));
	c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
	c.shr(args[0], 4);
	c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
	c.xor_(x86::r12d, x86::r12d);
	c.mov(x86::r13, args[1]);

	// Prepare data
	if (s_tsx_avx)
	{
		c.vmovups(x86::ymm0, x86::yword_ptr(args[2], 0));
		c.vmovups(x86::ymm1, x86::yword_ptr(args[2], 32));
		c.vmovups(x86::ymm2, x86::yword_ptr(args[2], 64));
		c.vmovups(x86::ymm3, x86::yword_ptr(args[2], 96));
		c.vmovups(x86::ymm4, x86::yword_ptr(args[3], 0));
		c.vmovups(x86::ymm5, x86::yword_ptr(args[3], 32));
		c.vmovups(x86::ymm6, x86::yword_ptr(args[3], 64));
		c.vmovups(x86::ymm7, x86::yword_ptr(args[3], 96));
	}
	else
	{
		c.movaps(x86::xmm0, x86::oword_ptr(args[2], 0));
		c.movaps(x86::xmm1, x86::oword_ptr(args[2], 16));
		c.movaps(x86::xmm2, x86::oword_ptr(args[2], 32));
		c.movaps(x86::xmm3, x86::oword_ptr(args[2], 48));
		c.movaps(x86::xmm4, x86::oword_ptr(args[2], 64));
		c.movaps(x86::xmm5, x86::oword_ptr(args[2], 80));
		c.movaps(x86::xmm6, x86::oword_ptr(args[2], 96));
		c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112));
		c.movaps(x86::xmm8, x86::oword_ptr(args[3], 0));
		c.movaps(x86::xmm9, x86::oword_ptr(args[3], 16));
		c.movaps(x86::xmm10, x86::oword_ptr(args[3], 32));
		c.movaps(x86::xmm11, x86::oword_ptr(args[3], 48));
		c.movaps(x86::xmm12, x86::oword_ptr(args[3], 64));
		c.movaps(x86::xmm13, x86::oword_ptr(args[3], 80));
		c.movaps(x86::xmm14, x86::oword_ptr(args[3], 96));
		c.movaps(x86::xmm15, x86::oword_ptr(args[3], 112));
	}

	// Begin transaction
	build_transaction_enter(c, fall, x86::r12, 4);
	c.mov(x86::rax, x86::qword_ptr(x86::rbx));
	c.and_(x86::rax, -128);
	c.cmp(x86::rax, x86::r13);
	c.jne(fail);
	c.test(x86::qword_ptr(x86::rbx), 127);
	c.jnz(skip);

	if (s_tsx_avx)
	{
		c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
		c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
		c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
		c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
		c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
		c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
		c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
		c.vptest(x86::ymm0, x86::ymm0);
	}
	else
	{
		c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
		c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
		c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
		c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
		c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
		c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
		c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
		c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
		c.orps(x86::xmm0, x86::xmm1);
		c.orps(x86::xmm2, x86::xmm3);
		c.orps(x86::xmm4, x86::xmm5);
		c.orps(x86::xmm6, x86::xmm7);
		c.orps(x86::xmm0, x86::xmm2);
		c.orps(x86::xmm4, x86::xmm6);
		c.orps(x86::xmm0, x86::xmm4);
		c.ptest(x86::xmm0, x86::xmm0);
	}

	c.jnz(fail);

	if (s_tsx_avx)
	{
		c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm4);
		c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm5);
		c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm6);
		c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm7);
	}
	else
	{
		c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm8);
		c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm9);
		c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm10);
		c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm11);
		c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm12);
		c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm13);
		c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm14);
		c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15);
	}

	c.sub(x86::qword_ptr(x86::rbx), -128);
	c.xend();
	c.mov(x86::eax, 1);
	c.jmp(_ret);

	c.bind(skip);
	c.xor_(x86::eax, x86::eax);
	c.xor_(x86::r12d, x86::r12d);
	build_transaction_abort(c, 0);
	//c.jmp(fall);

	c.bind(fall);
	c.sar(x86::eax, 24);
	c.js(fail);
	c.lock().add(x86::qword_ptr(x86::rbx), 1);
	c.lock().bts(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::wait));

	// Touch memory if transaction failed without RETRY flag on the first attempt
	c.cmp(x86::r12, 1);
	c.jne(next);
	c.xor_(x86::rbp, 0xf80);
	c.lock().add(x86::dword_ptr(x86::rbp), 0);
	c.xor_(x86::rbp, 0xf80);

	Label fall2 = c.newLabel();
	Label fail2 = c.newLabel();

	// Lightened transaction: only compare and swap data
	c.bind(next);
	build_transaction_enter(c, fall2, x86::r12, 666);

	if (s_tsx_avx)
	{
		c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
		c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
		c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
		c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
		c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
		c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
		c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
		c.vptest(x86::ymm0, x86::ymm0);
	}
	else
	{
		c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
		c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
		c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
		c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
		c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
		c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
		c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
		c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
		c.orps(x86::xmm0, x86::xmm1);
		c.orps(x86::xmm2, x86::xmm3);
		c.orps(x86::xmm4, x86::xmm5);
		c.orps(x86::xmm6, x86::xmm7);
		c.orps(x86::xmm0, x86::xmm2);
		c.orps(x86::xmm4, x86::xmm6);
		c.orps(x86::xmm0, x86::xmm4);
		c.ptest(x86::xmm0, x86::xmm0);
	}

	c.jnz(fail2);

	if (s_tsx_avx)
	{
		c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm4);
		c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm5);
		c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm6);
		c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm7);
	}
	else
	{
		c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm8);
		c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm9);
		c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm10);
		c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm11);
		c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm12);
		c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm13);
		c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm14);
		c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15);
	}

	c.xend();
	c.lock().add(x86::qword_ptr(x86::rbx), 127);
	c.mov(x86::eax, 1);
	c.jmp(_ret);

	c.bind(fall2);
	c.sar(x86::eax, 24);
	c.js(fail2);
	c.mov(x86::eax, 2);
	c.jmp(_ret);

	c.bind(fail);
	build_transaction_abort(c, 0xff);
	c.xor_(x86::eax, x86::eax);
	c.jmp(_ret);

	c.bind(fail2);
	build_transaction_abort(c, 0xff);
	c.lock().sub(x86::qword_ptr(x86::rbx), 1);
	c.xor_(x86::eax, x86::eax);
	//c.jmp(_ret);

	c.bind(_ret);
#ifdef _WIN32
	if (s_tsx_avx)
	{
		c.vmovups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
		c.vmovups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
	}
	else
	{
		c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
		c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
		c.movups(x86::xmm8, x86::oword_ptr(x86::rsp, 32));
		c.movups(x86::xmm9, x86::oword_ptr(x86::rsp, 48));
		c.movups(x86::xmm10, x86::oword_ptr(x86::rsp, 64));
		c.movups(x86::xmm11, x86::oword_ptr(x86::rsp, 80));
		c.movups(x86::xmm12, x86::oword_ptr(x86::rsp, 96));
		c.movups(x86::xmm13, x86::oword_ptr(x86::rsp, 112));
		c.movups(x86::xmm14, x86::oword_ptr(x86::rsp, 128));
		c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144));
	}
#endif

	if (s_tsx_avx)
	{
		c.vzeroupper();
	}

	c.add(x86::rsp, 168);
	c.pop(x86::rbx);
	c.pop(x86::r12);
	c.pop(x86::r13);
	c.pop(x86::rbp);
	c.ret();
});
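// spu_getll_tx reads a 128-byte line and its reservation timestamp inside a
// single transaction. A return value of 1 is an impossible timestamp (valid
// ones have the low 7 bits clear), signalling to the caller that the
// transaction never committed and a fallback is required.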
const auto spu_getll_tx = build_function_asm<u64(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
{
	using namespace asmjit;

	Label fall = c.newLabel();
	Label _ret = c.newLabel();

	if (utils::has_avx() && !s_tsx_avx)
	{
		c.vzeroupper();
	}

	// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
	c.push(x86::rbp);
	c.push(x86::r13);
	c.push(x86::r12);
	c.push(x86::rbx);
	c.sub(x86::rsp, 72);
#ifdef _WIN32
	if (!s_tsx_avx)
	{
		c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
		c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
	}
#endif

	// Prepare registers
	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
	c.mov(x86::rbx, x86::qword_ptr(x86::rax));
	c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
	c.mov(x86::rbp, x86::qword_ptr(x86::rax));
	c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
	c.shr(args[0], 4);
	c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
	c.xor_(x86::r12d, x86::r12d);
	c.mov(x86::r13, args[1]);

	// Begin transaction
	build_transaction_enter(c, fall, x86::r12, 16);
	c.mov(x86::rax, x86::qword_ptr(x86::rbx));

	if (s_tsx_avx)
	{
		c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
		c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
		c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
		c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
	}
	else
	{
		c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
		c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
		c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
		c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
		c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
		c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
		c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
		c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
	}

	c.xend();

	if (s_tsx_avx)
	{
		c.vmovups(x86::yword_ptr(x86::r13, 0), x86::ymm0);
		c.vmovups(x86::yword_ptr(x86::r13, 32), x86::ymm1);
		c.vmovups(x86::yword_ptr(x86::r13, 64), x86::ymm2);
		c.vmovups(x86::yword_ptr(x86::r13, 96), x86::ymm3);
	}
	else
	{
		c.movaps(x86::oword_ptr(x86::r13, 0), x86::xmm0);
		c.movaps(x86::oword_ptr(x86::r13, 16), x86::xmm1);
		c.movaps(x86::oword_ptr(x86::r13, 32), x86::xmm2);
		c.movaps(x86::oword_ptr(x86::r13, 48), x86::xmm3);
		c.movaps(x86::oword_ptr(x86::r13, 64), x86::xmm4);
		c.movaps(x86::oword_ptr(x86::r13, 80), x86::xmm5);
		c.movaps(x86::oword_ptr(x86::r13, 96), x86::xmm6);
		c.movaps(x86::oword_ptr(x86::r13, 112), x86::xmm7);
	}

	c.and_(x86::rax, -128);
	c.jmp(_ret);

	c.bind(fall);
	c.mov(x86::eax, 1);
	//c.jmp(_ret);

	c.bind(_ret);
#ifdef _WIN32
	if (!s_tsx_avx)
	{
		c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
		c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
	}
#endif

	if (s_tsx_avx)
	{
		c.vzeroupper();
	}

	c.add(x86::rsp, 72);
	c.pop(x86::rbx);
	c.pop(x86::r12);
	c.pop(x86::r13);
	c.pop(x86::rbp);
	c.ret();
});
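// spu_getll_inexact uses no transaction at all: it optimistically copies the
// line, then re-reads the reservation word and, if lock bits were set,
// compares the data too, retrying until it observes a consistent snapshot
// (r12 merely counts the retries).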
const auto spu_getll_inexact = build_function_asm<u64(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
{
	using namespace asmjit;

	Label _ret = c.newLabel();

	if (utils::has_avx() && !s_tsx_avx)
	{
		c.vzeroupper();
	}

	// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
	c.push(x86::rbp);
	c.push(x86::r13);
	c.push(x86::r12);
	c.push(x86::rbx);
	c.sub(x86::rsp, 72);
#ifdef _WIN32
	if (!s_tsx_avx)
	{
		c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
		c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
		c.movups(x86::oword_ptr(x86::rsp, 32), x86::xmm8);
		c.movups(x86::oword_ptr(x86::rsp, 48), x86::xmm9);
	}
#endif

	// Prepare registers
	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
	c.mov(x86::rbx, x86::qword_ptr(x86::rax));
	c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
	c.mov(x86::rbp, x86::qword_ptr(x86::rax));
	c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
	c.shr(args[0], 4);
	c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
	c.xor_(x86::r12d, x86::r12d);
	c.mov(x86::r13, args[1]);

	// Begin copying
	Label begin = c.newLabel();
	Label test0 = c.newLabel();
	c.bind(begin);
	c.mov(x86::rax, x86::qword_ptr(x86::rbx));

	if (s_tsx_avx)
	{
		c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
		c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
		c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
		c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
	}
	else
	{
		c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
		c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
		c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
		c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
		c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
		c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
		c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
		c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
	}

	// Verify and retry if necessary.
	c.mov(args[0], x86::rax);
	c.xor_(args[0], x86::qword_ptr(x86::rbx));
	c.test(args[0], -128);
	c.jz(test0);
	c.lea(x86::r12, x86::qword_ptr(x86::r12, 1));
	c.jmp(begin);

	c.bind(test0);
	c.test(x86::eax, 127);
	c.jz(_ret);
	c.and_(x86::rax, -128);

	// If there are lock bits set, verify data as well.
	if (s_tsx_avx)
	{
		c.vxorps(x86::ymm4, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
		c.vxorps(x86::ymm5, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
		c.vorps(x86::ymm5, x86::ymm5, x86::ymm4);
		c.vxorps(x86::ymm4, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
		c.vorps(x86::ymm5, x86::ymm5, x86::ymm4);
		c.vxorps(x86::ymm4, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
		c.vorps(x86::ymm5, x86::ymm5, x86::ymm4);
		c.vptest(x86::ymm5, x86::ymm5);
	}
	else
	{
		c.xorps(x86::xmm9, x86::xmm9);
		c.movaps(x86::xmm8, x86::xmm0);
		c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 0));
		c.orps(x86::xmm9, x86::xmm8);
		c.movaps(x86::xmm8, x86::xmm1);
		c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 16));
		c.orps(x86::xmm9, x86::xmm8);
		c.movaps(x86::xmm8, x86::xmm2);
		c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 32));
		c.orps(x86::xmm9, x86::xmm8);
		c.movaps(x86::xmm8, x86::xmm3);
		c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 48));
		c.orps(x86::xmm9, x86::xmm8);
		c.movaps(x86::xmm8, x86::xmm4);
		c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 64));
		c.orps(x86::xmm9, x86::xmm8);
		c.movaps(x86::xmm8, x86::xmm5);
		c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 80));
		c.orps(x86::xmm9, x86::xmm8);
		c.movaps(x86::xmm8, x86::xmm6);
		c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 96));
		c.orps(x86::xmm9, x86::xmm8);
		c.movaps(x86::xmm8, x86::xmm7);
		c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 112));
		c.orps(x86::xmm9, x86::xmm8);
		c.ptest(x86::xmm9, x86::xmm9);
	}

	c.jz(_ret);
	c.lea(x86::r12, x86::qword_ptr(x86::r12, 2));
	c.jmp(begin);

	c.bind(_ret);

	if (s_tsx_avx)
	{
		c.vmovups(x86::yword_ptr(x86::r13, 0), x86::ymm0);
		c.vmovups(x86::yword_ptr(x86::r13, 32), x86::ymm1);
		c.vmovups(x86::yword_ptr(x86::r13, 64), x86::ymm2);
		c.vmovups(x86::yword_ptr(x86::r13, 96), x86::ymm3);
	}
	else
	{
		c.movaps(x86::oword_ptr(x86::r13, 0), x86::xmm0);
		c.movaps(x86::oword_ptr(x86::r13, 16), x86::xmm1);
		c.movaps(x86::oword_ptr(x86::r13, 32), x86::xmm2);
		c.movaps(x86::oword_ptr(x86::r13, 48), x86::xmm3);
		c.movaps(x86::oword_ptr(x86::r13, 64), x86::xmm4);
		c.movaps(x86::oword_ptr(x86::r13, 80), x86::xmm5);
		c.movaps(x86::oword_ptr(x86::r13, 96), x86::xmm6);
		c.movaps(x86::oword_ptr(x86::r13, 112), x86::xmm7);
	}

#ifdef _WIN32
	if (!s_tsx_avx)
	{
		c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
		c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
		c.movups(x86::xmm8, x86::oword_ptr(x86::rsp, 32));
		c.movups(x86::xmm9, x86::oword_ptr(x86::rsp, 48));
	}
#endif

	if (s_tsx_avx)
	{
		c.vzeroupper();
	}

	c.add(x86::rsp, 72);
	c.pop(x86::rbx);
	c.pop(x86::r12);
	c.pop(x86::r13);
	c.pop(x86::rbp);
	c.ret();
});
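// spu_putlluc_tx is the unconditional-store counterpart: it performs no
// timestamp comparison, only refuses to run while the line is locked (low 7
// bits of the reservation word set), and bumps the timestamp by 128 on
// commit.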
const auto spu_putlluc_tx = build_function_asm<u32(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
{
	using namespace asmjit;

	Label fall = c.newLabel();
	Label _ret = c.newLabel();
	Label skip = c.newLabel();
	Label next = c.newLabel();

	if (utils::has_avx() && !s_tsx_avx)
	{
		c.vzeroupper();
	}

	// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
	c.push(x86::rbp);
	c.push(x86::r13);
	c.push(x86::r12);
	c.push(x86::rbx);
	c.sub(x86::rsp, 40);
#ifdef _WIN32
	if (!s_tsx_avx)
	{
		c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
		c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
	}
#endif

	// Prepare registers
	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
	c.mov(x86::rbx, x86::qword_ptr(x86::rax));
	c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
	c.mov(x86::rbp, x86::qword_ptr(x86::rax));
	c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
	c.shr(args[0], 4);
	c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
	c.xor_(x86::r12d, x86::r12d);
	c.mov(x86::r13, args[1]);

	// Prepare data
	if (s_tsx_avx)
	{
		c.vmovups(x86::ymm0, x86::yword_ptr(args[1], 0));
		c.vmovups(x86::ymm1, x86::yword_ptr(args[1], 32));
		c.vmovups(x86::ymm2, x86::yword_ptr(args[1], 64));
		c.vmovups(x86::ymm3, x86::yword_ptr(args[1], 96));
	}
	else
	{
		c.movaps(x86::xmm0, x86::oword_ptr(args[1], 0));
		c.movaps(x86::xmm1, x86::oword_ptr(args[1], 16));
		c.movaps(x86::xmm2, x86::oword_ptr(args[1], 32));
		c.movaps(x86::xmm3, x86::oword_ptr(args[1], 48));
		c.movaps(x86::xmm4, x86::oword_ptr(args[1], 64));
		c.movaps(x86::xmm5, x86::oword_ptr(args[1], 80));
		c.movaps(x86::xmm6, x86::oword_ptr(args[1], 96));
		c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
	}

	// Begin transaction
	build_transaction_enter(c, fall, x86::r12, 8);
	c.test(x86::dword_ptr(x86::rbx), 127);
	c.jnz(skip);

	if (s_tsx_avx)
	{
		c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0);
		c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1);
		c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2);
		c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3);
	}
	else
	{
		c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0);
		c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1);
		c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2);
		c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3);
		c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4);
		c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5);
		c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6);
		c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7);
	}

	c.sub(x86::qword_ptr(x86::rbx), -128);
	c.xend();
	c.mov(x86::eax, 1);
	c.jmp(_ret);

	c.bind(skip);
	c.xor_(x86::eax, x86::eax);
	c.xor_(x86::r12d, x86::r12d);
	build_transaction_abort(c, 0);
	//c.jmp(fall);

	c.bind(fall);
	c.lock().add(x86::qword_ptr(x86::rbx), 1);
	c.lock().bts(x86::dword_ptr(args[2], ::offset32(&spu_thread::state)), static_cast<u32>(cpu_flag::wait));

	// Touch memory if transaction failed without RETRY flag on the first attempt
	c.cmp(x86::r12, 1);
	c.jne(next);
	c.xor_(x86::rbp, 0xf80);
	c.lock().add(x86::dword_ptr(x86::rbp), 0);
	c.xor_(x86::rbp, 0xf80);

	Label fall2 = c.newLabel();

	// Lightened transaction
	c.bind(next);
	build_transaction_enter(c, fall2, x86::r12, 666);

	if (s_tsx_avx)
	{
		c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0);
		c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1);
		c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2);
		c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3);
	}
	else
	{
		c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0);
		c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1);
		c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2);
		c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3);
		c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4);
		c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5);
		c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6);
		c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7);
	}

	c.xend();
	c.lock().add(x86::qword_ptr(x86::rbx), 127);
	c.mov(x86::eax, 1);
	c.jmp(_ret);

	c.bind(fall2);
	c.mov(x86::eax, 2);
	//c.jmp(_ret);

	c.bind(_ret);
#ifdef _WIN32
	if (!s_tsx_avx)
	{
		c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
		c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
	}
#endif

	if (s_tsx_avx)
	{
		c.vzeroupper();
	}

	c.add(x86::rsp, 40);
	c.pop(x86::rbx);
	c.pop(x86::r12);
	c.pop(x86::r13);
	c.pop(x86::rbp);
	c.ret();
});

void spu_int_ctrl_t::set(u64 ints)
{
	// leave only enabled interrupts
	ints &= mask;

	// notify if at least 1 bit was set
	if (ints && ~stat.fetch_or(ints) & ints && tag)
	{
		reader_lock rlock(id_manager::g_mutex);

		if (tag)
		{
			if (auto handler = tag->handler.lock())
			{
				handler->exec();
			}
		}
	}
}

const spu_imm_table_t g_spu_imm;

spu_imm_table_t::scale_table_t::scale_table_t()
{
	for (s32 i = -155; i < 174; i++)
	{
		m_data[i + 155].vf = _mm_set1_ps(static_cast<float>(std::exp2(i)));
	}
}
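// The scale table spans 2^-155..2^173, the exponent range of the SPU
// float-conversion instructions: slot (i + 155) holds exp2(i) splatted
// across all four lanes, so (assuming the accessor applies the same +155
// bias) a lookup of exponent 0 yields 1.0f in every lane.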
spu_imm_table_t::spu_imm_table_t()
{
	for (u32 i = 0; i < std::size(sldq_pshufb); i++)
	{
		for (u32 j = 0; j < 16; j++)
		{
			sldq_pshufb[i]._u8[j] = static_cast<u8>(j - i);
		}
	}

	for (u32 i = 0; i < std::size(srdq_pshufb); i++)
	{
		const u32 im = (0u - i) & 0x1f;

		for (u32 j = 0; j < 16; j++)
		{
			srdq_pshufb[i]._u8[j] = (j + im > 15) ? 0xff : static_cast<u8>(j + im);
		}
	}

	for (u32 i = 0; i < std::size(rldq_pshufb); i++)
	{
		for (u32 j = 0; j < 16; j++)
		{
			rldq_pshufb[i]._u8[j] = static_cast<u8>((j - i) & 0xf);
		}
	}
}
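// The three tables above precompute shuffle-control masks, one entry per
// shift amount: sldq_pshufb shifts a quadword left by i bytes, srdq_pshufb
// shifts right (out-of-range 0xff selectors insert zero bytes) and
// rldq_pshufb rotates, so any SPU byte shift can be emulated with a single
// pshufb, e.g. (sketch) _mm_shuffle_epi8(value, sldq_pshufb[n].vi).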
std::string spu_thread::get_name() const
{
	return fmt::format("%sSPU[0x%x] Thread (%s)", offset >= RAW_SPU_BASE_ADDR ? "Raw" : "", id, spu_name.get());
}

std::string spu_thread::dump() const
{
	std::string ret = cpu_thread::dump();

	if (group)
	{
		fmt::append(ret, "\nGroup ID: 0x%x", group->id);
	}

	fmt::append(ret, "\nBlock Weight: %u (Retreats: %u)", block_counter, block_failure);
	fmt::append(ret, "\n[%s]", ch_mfc_cmd);
	fmt::append(ret, "\nTag Mask: 0x%08x", ch_tag_mask);
	fmt::append(ret, "\nMFC Stall: 0x%08x", ch_stall_mask);
	fmt::append(ret, "\nMFC Queue Size: %u", mfc_size);

	for (u32 i = 0; i < 16; i++)
	{
		if (i < mfc_size)
		{
			fmt::append(ret, "\n%s", mfc_queue[i]);
		}
		else
		{
			fmt::append(ret, "\n[-]");
		}
	}

	ret += "\nRegisters:\n=========";

	for (u32 i = 0; i < 128; i++)
	{
		fmt::append(ret, "\nGPR[%d] = %s", i, gpr[i]);
	}

	return ret;
}

void spu_thread::cpu_init()
{
	gpr = {};
	fpscr.Reset();

	ch_mfc_cmd = {};

	srr0 = 0;
	mfc_size = 0;
	mfc_barrier = 0;
	mfc_fence = 0;
	ch_tag_upd = 0;
	ch_tag_mask = 0;
	mfc_prxy_mask = 0;
	ch_tag_stat.data.release({});
	ch_stall_mask = 0;
	ch_stall_stat.data.release({});
	ch_atomic_stat.data.release({});

	ch_in_mbox.clear();

	ch_out_mbox.data.release({});
	ch_out_intr_mbox.data.release({});

	snr_config = 0;

	ch_snr1.data.release({});
	ch_snr2.data.release({});

	ch_event_mask = 0;
	ch_event_stat = 0;
	interrupts_enabled = false;
	raddr = 0;

	ch_dec_start_timestamp = get_timebased_time(); // ???
	ch_dec_value = 0;

	run_ctrl.release(0);
	status.release(0);
	npc.release(0);

	int_ctrl[0].clear();
	int_ctrl[1].clear();
	int_ctrl[2].clear();

	gpr[1]._u32[3] = 0x3FFF0; // initial stack frame pointer
}

void spu_thread::cpu_stop()
{
	if (!group && offset >= RAW_SPU_BASE_ADDR)
	{
		// Save next PC and current SPU Interrupt Status
		npc = pc | (interrupts_enabled);
	}
	else if (group && is_stopped())
	{
		if (verify(HERE, group->running--) == 1)
		{
			{
				std::lock_guard lock(group->mutex);
				group->stop_count++;
				group->run_state = SPU_THREAD_GROUP_STATUS_INITIALIZED;

				if (!group->join_state)
				{
					group->join_state = SYS_SPU_THREAD_GROUP_JOIN_ALL_THREADS_EXIT;
				}

				if (const auto ppu = std::exchange(group->waiter, nullptr))
				{
					// Send exit status directly to the joining thread
					ppu->gpr[4] = group->join_state;
					ppu->gpr[5] = group->exit_status;
					group->join_state.release(0);
				}
			}

			// Notify on last thread stopped
			group->cond.notify_all();
		}
	}
}

extern thread_local std::string(*g_tls_log_prefix)();

void spu_thread::cpu_task()
{
	// Get next PC and SPU Interrupt status
	pc = npc.exchange(0);

	set_interrupt_status((pc & 1) != 0);

	pc &= 0x3fffc;

	std::fesetround(FE_TOWARDZERO);

	if (g_cfg.core.set_daz_and_ftz && g_cfg.core.spu_decoder != spu_decoder_type::precise)
	{
		// Set DAZ and FTZ
		_mm_setcsr(_mm_getcsr() | 0x8840);
	}

	g_tls_log_prefix = []
	{
		const auto cpu = static_cast<spu_thread*>(get_current_cpu_thread());
		return fmt::format("%s [0x%05x]", thread_ctrl::get_name(), cpu->pc);
	};

	if (jit)
	{
		// Register SPU runtime user
		spu_runtime::passive_lock _passive_lock(jit->get_runtime());

		while (true)
		{
			if (UNLIKELY(state))
			{
				if (check_state())
				{
					if (state & cpu_flag::jit_return)
					{
						// Handle jit_return as a special case
						jit->get_runtime().handle_return(this);
						continue;
					}

					break;
				}
			}

			spu_runtime::g_gateway(*this, vm::_ptr<u8>(offset), nullptr);
		}

		// Print some stats
		LOG_NOTICE(SPU, "Stats: Block Weight: %u (Retreats: %u);", block_counter, block_failure);
		cpu_stop();
		return;
	}

	if (spu_runtime::g_interpreter)
	{
		while (true)
		{
			if (UNLIKELY(state))
			{
				if (check_state())
					break;
			}

			spu_runtime::g_interpreter(*this, vm::_ptr<u8>(offset), nullptr);
		}

		cpu_stop();
		return;
	}

	// Select opcode table
	const auto& table = *(
		g_cfg.core.spu_decoder == spu_decoder_type::precise ? &g_spu_interpreter_precise.get_table() :
		g_cfg.core.spu_decoder == spu_decoder_type::fast ?
&g_spu_interpreter_fast.get_table() : (fmt::throw_exception("Invalid SPU decoder"), nullptr)); // LS pointer const auto base = vm::_ptr(offset); while (true) { if (UNLIKELY(state)) { if (check_state()) break; } const u32 op = *reinterpret_cast*>(base + pc); if (table[spu_decode(op)](*this, {op})) pc += 4; } cpu_stop(); } void spu_thread::cpu_mem() { //vm::passive_lock(*this); } void spu_thread::cpu_unmem() { //state.test_and_set(cpu_flag::memory); } spu_thread::~spu_thread() { // Deallocate Local Storage vm::dealloc_verbose_nothrow(offset); // Deallocate RawSPU ID if (!group && offset >= RAW_SPU_BASE_ADDR) { g_raw_spu_id[index] = 0; g_raw_spu_ctr--; } } spu_thread::spu_thread(vm::addr_t ls, lv2_spu_group* group, u32 index, std::string_view name) : cpu_thread(idm::last_id()) , spu_name(name) , index(index) , offset(ls) , group(group) { if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit) { jit = spu_recompiler_base::make_asmjit_recompiler(); } if (g_cfg.core.spu_decoder == spu_decoder_type::llvm) { jit = spu_recompiler_base::make_llvm_recompiler(); } if (g_cfg.core.spu_decoder != spu_decoder_type::fast && g_cfg.core.spu_decoder != spu_decoder_type::precise) { if (g_cfg.core.spu_block_size != spu_block_size_type::safe) { // Initialize stack mirror std::memset(stack_mirror.data(), 0xff, sizeof(stack_mirror)); } } if (!group && offset >= RAW_SPU_BASE_ADDR) { cpu_init(); } } void spu_thread::push_snr(u32 number, u32 value) { // Get channel const auto channel = number & 1 ? &ch_snr2 : &ch_snr1; // Check corresponding SNR register settings if ((snr_config >> number) & 1) { channel->push_or(*this, value); } else { channel->push(*this, value); } } void spu_thread::do_dma_transfer(const spu_mfc_cmd& args) { const bool is_get = (args.cmd & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_START_MASK)) == MFC_GET_CMD; u32 eal = args.eal; u32 lsa = args.lsa & 0x3ffff; // SPU Thread Group MMIO (LS and SNR) and RawSPU MMIO if (eal >= RAW_SPU_BASE_ADDR) { const u32 index = (eal - SYS_SPU_THREAD_BASE_LOW) / SYS_SPU_THREAD_OFFSET; // thread number in group const u32 offset = (eal - SYS_SPU_THREAD_BASE_LOW) % SYS_SPU_THREAD_OFFSET; // LS offset or MMIO register if (eal < SYS_SPU_THREAD_BASE_LOW) { // RawSPU MMIO auto thread = idm::get>(find_raw_spu((eal - RAW_SPU_BASE_ADDR) / RAW_SPU_OFFSET)); if (!thread) { fmt::throw_exception("RawSPU not found (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)" HERE, args.cmd, args.lsa, args.eal, args.tag, args.size); } u32 value; if ((eal - RAW_SPU_BASE_ADDR) % RAW_SPU_OFFSET + args.size - 1 < 0x40000) // LS access { } else if (args.size == 4 && is_get && thread->read_reg(eal, value)) { _ref(lsa) = value; return; } else if (args.size == 4 && !is_get && thread->write_reg(eal, _ref(lsa))) { return; } else { fmt::throw_exception("Invalid RawSPU MMIO offset (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)" HERE, args.cmd, args.lsa, args.eal, args.tag, args.size); } } else if (this->offset >= RAW_SPU_BASE_ADDR) { fmt::throw_exception("SPU MMIO used for RawSPU (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)" HERE, args.cmd, args.lsa, args.eal, args.tag, args.size); } else if (group && group->threads[index]) { auto& spu = static_cast(*group->threads[index]); if (offset + args.size - 1 < 0x40000) // LS access { eal = spu.offset + offset; // redirect access } else if (!is_get && args.size == 4 && (offset == SYS_SPU_THREAD_SNR1 || offset == SYS_SPU_THREAD_SNR2)) { spu.push_snr(SYS_SPU_THREAD_SNR2 == offset, _ref(lsa)); return; } else { fmt::throw_exception("Invalid MMIO 
		else
		{
			fmt::throw_exception("Invalid thread type (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)" HERE, args.cmd, args.lsa, args.eal, args.tag, args.size);
		}
	}

	u8* dst = (u8*)vm::base(eal);
	u8* src = (u8*)vm::base(offset + lsa);

	if (UNLIKELY(!is_get && !g_use_rtm))
	{
		switch (u32 size = args.size)
		{
		case 1:
		{
			auto& res = vm::reservation_lock(eal, 1);
			*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
			res.release(res.load() + 127);
			break;
		}
		case 2:
		{
			auto& res = vm::reservation_lock(eal, 2);
			*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
			res.release(res.load() + 127);
			break;
		}
		case 4:
		{
			auto& res = vm::reservation_lock(eal, 4);
			*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
			res.release(res.load() + 127);
			break;
		}
		case 8:
		{
			auto& res = vm::reservation_lock(eal, 8);
			*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
			res.release(res.load() + 127);
			break;
		}
		default:
		{
			if (((eal & 127) + size) <= 128)
			{
				// Lock one cache line
				auto& res = vm::reservation_lock(eal, 128);

				while (size)
				{
					*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);

					dst += 16;
					src += 16;
					size -= 16;
				}

				res.release(res.load() + 127);
				break;
			}

			auto lock = vm::passive_lock(eal & -128u, ::align(eal + size, 128));

			while (size >= 128)
			{
				mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));

				dst += 128;
				src += 128;
				size -= 128;
			}

			while (size)
			{
				*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);

				dst += 16;
				src += 16;
				size -= 16;
			}

			lock->release(0);
			break;
		}
		}

		return;
	}

	if (is_get)
	{
		std::swap(dst, src);
	}

	switch (u32 size = args.size)
	{
	case 1:
	{
		*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
		break;
	}
	case 2:
	{
		*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
		break;
	}
	case 4:
	{
		*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
		break;
	}
	case 8:
	{
		*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
		break;
	}
	default:
	{
		while (size >= 128)
		{
			mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));

			dst += 128;
			src += 128;
			size -= 128;
		}

		while (size)
		{
			*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);

			dst += 16;
			src += 16;
			size -= 16;
		}

		break;
	}
	}
}
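// Note on the reservation encoding used by the locked stores above: the low
// 7 bits of the 64-bit reservation word act as lock bits, so a writer
// increments by 1 to lock and adds the remaining 127 on release, a net +128
// that also advances the timestamp of the 128-byte line.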
bool spu_thread::do_dma_check(const spu_mfc_cmd& args)
{
	const u32 mask = utils::rol32(1, args.tag);

	if (UNLIKELY(mfc_barrier & mask || (args.cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK) && mfc_fence & mask)))
	{
		// Check for special value combination (normally impossible)
		if (false)
		{
			// Update barrier/fence masks if necessary
			mfc_barrier = 0;
			mfc_fence = 0;

			for (u32 i = 0; i < mfc_size; i++)
			{
				if ((mfc_queue[i].cmd & ~0xc) == MFC_BARRIER_CMD)
				{
					mfc_barrier |= -1;
					continue;
				}

				if (true)
				{
					const u32 _mask = utils::rol32(1u, mfc_queue[i].tag);

					// A command with barrier hard blocks that tag until it's been dealt with
					if (mfc_queue[i].cmd & MFC_BARRIER_MASK)
					{
						mfc_barrier |= _mask;
					}

					// A new command that has a fence can't be executed until the stalled list has been dealt with
					mfc_fence |= _mask;
				}
			}

			if (mfc_barrier & mask || (args.cmd & MFC_FENCE_MASK && mfc_fence & mask))
			{
				return false;
			}

			return true;
		}

		return false;
	}

	return true;
}

bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
{
	struct list_element
	{
		be_t<u16> sb; // Stall-and-Notify bit (0x8000)
		be_t<u16> ts; // List Transfer Size
		be_t<u32> ea; // External Address Low
	} item{};

	while (args.size)
	{
		if (UNLIKELY(item.sb & 0x8000))
		{
			ch_stall_mask |= utils::rol32(1, args.tag);

			if (!ch_stall_stat.get_count())
			{
				ch_event_stat |= SPU_EVENT_SN;
			}

			ch_stall_stat.set_value(utils::rol32(1, args.tag) | ch_stall_stat.get_value());

			args.tag |= 0x80; // Set stalled status
			return false;
		}

		args.lsa &= 0x3fff0;
		item = _ref<list_element>(args.eal & 0x3fff8);

		const u32 size = item.ts & 0x7fff;
		const u32 addr = item.ea;

		LOG_TRACE(SPU, "LIST: addr=0x%x, size=0x%x, lsa=0x%05x, sb=0x%x", addr, size, args.lsa | (addr & 0xf), item.sb);

		if (size)
		{
			spu_mfc_cmd transfer;
			transfer.eal = addr;
			transfer.eah = 0;
			transfer.lsa = args.lsa | (addr & 0xf);
			transfer.tag = args.tag;
			transfer.cmd = MFC(args.cmd & ~MFC_LIST_MASK);
			transfer.size = size;

			do_dma_transfer(transfer);
			const u32 add_size = std::max<u32>(size, 16);
			args.lsa += add_size;
		}

		args.eal += 8;
		args.size -= 8;
	}

	return true;
}

void spu_thread::do_putlluc(const spu_mfc_cmd& args)
{
	const u32 addr = args.eal & -128u;

	if (raddr && addr == raddr)
	{
		// Last check for event before we clear the reservation
		if ((vm::reservation_acquire(addr, 128) & -128) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(addr)))
		{
			ch_event_stat |= SPU_EVENT_LR;
		}

		raddr = 0;
	}

	const auto& to_write = _ref<decltype(rdata)>(args.lsa & 0x3ff80);

	// Store unconditionally
	if (LIKELY(g_use_rtm))
	{
		const u32 result = spu_putlluc_tx(addr, to_write.data(), this);

		if (result == 2)
		{
			cpu_thread::suspend_all cpu_lock(this);

			// Try to obtain bit 7 (+64)
			if (!atomic_storage<u64>::bts(vm::reservation_acquire(addr, 128).raw(), 6))
			{
				auto& data = vm::_ref<decltype(rdata)>(addr);
				mov_rdata(data, to_write);

				// Keep checking written data against a rogue transaction sneak in
				while (std::atomic_thread_fence(std::memory_order_seq_cst), !cmp_rdata(data, to_write))
				{
					mov_rdata(data, to_write);
				}

				vm::reservation_acquire(addr, 128) += 63;
			}
			else
			{
				// Give up if another PUTLLUC command took precedence
				vm::reservation_acquire(addr, 128) -= 1;
			}
		}
	}
	else
	{
		auto& data = vm::_ref<decltype(rdata)>(addr);
		auto& res = vm::reservation_lock(addr, 128);
		*reinterpret_cast<atomic_t<u32>*>(&data) += 0;

		if (g_cfg.core.spu_accurate_putlluc)
		{
			// Full lock (heavyweight)
			// TODO: vm::check_addr
			vm::writer_lock lock(addr);
			mov_rdata(data, to_write);
			res.release(res.load() + 127);
		}
		else
		{
			mov_rdata(data, to_write);
			res.release(res.load() + 127);
		}
	}

	vm::reservation_notifier(addr, 128).notify_all();
}

void spu_thread::do_mfc(bool wait)
{
	u32 removed = 0;
	u32 barrier = 0;
	u32 fence = 0;

	// Process enqueued commands
	std::remove_if(mfc_queue + 0, mfc_queue + mfc_size, [&](spu_mfc_cmd& args)
	{
		if ((args.cmd & ~0xc) == MFC_BARRIER_CMD)
		{
			if (&args - mfc_queue <= removed)
			{
				// Remove barrier-class command if it's the first in the queue
				_mm_mfence();
				removed++;
				return true;
			}

			// Block all tags
			barrier |= -1;
			return false;
		}

		// Select tag bit in the tag mask or the stall mask
		const u32 mask = utils::rol32(1, args.tag);

		if (barrier & mask)
		{
			fence |= mask;
			return false;
		}

		if (args.cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK) && fence & mask)
		{
			if (args.cmd & MFC_BARRIER_MASK)
			{
				barrier |= mask;
			}

			return false;
		}

		if (args.cmd & MFC_LIST_MASK)
		{
			if (!(args.tag & 0x80))
			{
				if (do_list_transfer(args))
				{
					removed++;
					return true;
				}
			}

			if (args.cmd & MFC_BARRIER_MASK)
			{
				barrier |= mask;
			}

			fence |= mask;
			return false;
		}

		if (args.cmd == MFC_PUTQLLUC_CMD)
		{
			if (fence & mask)
			{
				return false;
			}

			do_putlluc(args);
		}
		else if (args.size)
		{
			do_dma_transfer(args);
		}

		removed++;
		return true;
	});

	mfc_size -= removed;
	mfc_barrier = barrier;
	mfc_fence = fence;

	if (removed && ch_tag_upd)
	{
		const u32 completed = get_mfc_completed();

		if (completed && ch_tag_upd == 1)
		{
			ch_tag_stat.set_value(completed);
			ch_tag_upd = 0;
		}
		else if (completed == ch_tag_mask && ch_tag_upd == 2)
		{
			ch_tag_stat.set_value(completed);
			ch_tag_upd = 0;
		}
	}
}
u32 spu_thread::get_mfc_completed()
{
	return ch_tag_mask & ~mfc_fence;
}

bool spu_thread::process_mfc_cmd()
{
	// Stall infinitely if MFC queue is full
	while (UNLIKELY(mfc_size >= 16))
	{
		state += cpu_flag::wait;

		if (is_stopped())
		{
			return false;
		}

		thread_ctrl::wait();
	}

	spu::scheduler::concurrent_execution_watchdog watchdog(*this);
	LOG_TRACE(SPU, "DMAC: cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x", ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);

	switch (ch_mfc_cmd.cmd)
	{
	case MFC_GETLLAR_CMD:
	{
		const u32 addr = ch_mfc_cmd.eal & -128u;
		auto& data = vm::_ref<decltype(rdata)>(addr);
		auto& dst = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
		u64 ntime;

		const bool is_polling = false; // TODO

		if (is_polling)
		{
			rtime = vm::reservation_acquire(addr, 128) & -128;

			while (cmp_rdata(rdata, data) && (vm::reservation_acquire(addr, 128)) == rtime)
			{
				state += cpu_flag::wait;

				if (is_stopped())
				{
					break;
				}

				thread_ctrl::wait_for(100);
			}

			if (test_stopped())
			{
				return false;
			}
		}

		if (LIKELY(g_use_rtm && !g_cfg.core.spu_accurate_getllar && raddr != addr))
		{
			// TODO: maybe always start from a transaction
			ntime = spu_getll_inexact(addr, dst.data());
		}
		else if (g_use_rtm)
		{
			ntime = spu_getll_tx(addr, dst.data());

			if (ntime == 1)
			{
				if (!g_cfg.core.spu_accurate_getllar)
				{
					ntime = spu_getll_inexact(addr, dst.data());
				}
				else
				{
					cpu_thread::suspend_all cpu_lock(this);

					while (vm::reservation_acquire(addr, 128) & 127)
					{
						busy_wait(100);
					}

					ntime = vm::reservation_acquire(addr, 128);
					mov_rdata(dst, data);
				}
			}
		}
		else
		{
			auto& res = vm::reservation_lock(addr, 128);
			const u64 old_time = res.load() & -128;

			if (g_cfg.core.spu_accurate_getllar)
			{
				*reinterpret_cast<atomic_t<u32>*>(&data) += 0;

				// Full lock (heavyweight)
				// TODO: vm::check_addr
				vm::writer_lock lock(addr);

				ntime = old_time;
				mov_rdata(dst, data);
				res.release(old_time);
			}
			else
			{
				ntime = old_time;
				mov_rdata(dst, data);
				res.release(old_time);
			}
		}

		if (raddr && raddr != addr)
		{
			// Last check for event before we replace the reservation with a new one
			if ((vm::reservation_acquire(raddr, 128) & -128) != rtime || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
			{
				ch_event_stat |= SPU_EVENT_LR;
			}
		}
		else if (raddr == addr)
		{
			// Lost previous reservation on polling
			if (ntime != rtime || !cmp_rdata(rdata, dst))
			{
				ch_event_stat |= SPU_EVENT_LR;
			}
		}

		raddr = addr;
		rtime = ntime;
		mov_rdata(rdata, dst);

		ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
		return true;
	}
	case MFC_PUTLLC_CMD:
	{
		// Store conditionally
		const u32 addr = ch_mfc_cmd.eal & -128u;
		u32 result = 0;

		if (raddr == addr)
		{
			const auto& to_write = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);

			if (LIKELY(g_use_rtm))
			{
				result = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data());

				if (result == 2)
				{
					result = 0;

					cpu_thread::suspend_all cpu_lock(this);

					// Give up if other PUTLLC/PUTLLUC commands are in progress
					if (!vm::reservation_acquire(addr, 128).try_dec(rtime + 1))
					{
						auto& data = vm::_ref<decltype(rdata)>(addr);

						if ((vm::reservation_acquire(addr, 128) & -128) == rtime && cmp_rdata(rdata, data))
						{
							mov_rdata(data, to_write);
							vm::reservation_acquire(addr, 128) += 127;
							result = 1;
						}
						else
						{
							vm::reservation_acquire(addr, 128) -= 1;
						}
					}
				}
			}
			else if (auto& data = vm::_ref<decltype(rdata)>(addr); rtime == (vm::reservation_acquire(raddr, 128) & -128) && cmp_rdata(rdata, data))
			{
				auto& res = vm::reservation_lock(raddr, 128);
				const u64 old_time = res.load() & -128;

				if (rtime == old_time)
				{
					*reinterpret_cast<atomic_t<u32>*>(&data) += 0;

					// Full lock (heavyweight)
					// TODO: vm::check_addr
					vm::writer_lock lock(addr);

					if (cmp_rdata(rdata, data))
					{
						mov_rdata(data, to_write);
						res.release(old_time + 128);
						result = 1;
					}
					else
					{
						res.release(old_time);
					}
				}
				else
				{
					res.release(old_time);
				}
			}
		}

		if (result)
		{
			vm::reservation_notifier(addr, 128).notify_all();
			ch_atomic_stat.set_value(MFC_PUTLLC_SUCCESS);
		}
		else
		{
			if (raddr)
			{
				// Last check for event before we clear the reservation
				if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
				{
					ch_event_stat |= SPU_EVENT_LR;
				}
			}

			ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
		}

		raddr = 0;
		return true;
	}
	case MFC_PUTLLUC_CMD:
	{
		do_putlluc(ch_mfc_cmd);
		ch_atomic_stat.set_value(MFC_PUTLLUC_SUCCESS);
		return true;
	}
	case MFC_PUTQLLUC_CMD:
	{
		const u32 mask = utils::rol32(1, ch_mfc_cmd.tag);

		if (UNLIKELY((mfc_barrier | mfc_fence) & mask))
		{
			mfc_queue[mfc_size++] = ch_mfc_cmd;
			mfc_fence |= mask;
		}
		else
		{
			do_putlluc(ch_mfc_cmd);
		}

		return true;
	}
	case MFC_SNDSIG_CMD:
	case MFC_SNDSIGB_CMD:
	case MFC_SNDSIGF_CMD:
	{
		if (ch_mfc_cmd.size != 4)
		{
			break;
		}

		// Fallthrough
	}
	case MFC_PUT_CMD:
	case MFC_PUTB_CMD:
	case MFC_PUTF_CMD:
	case MFC_PUTR_CMD:
	case MFC_PUTRB_CMD:
	case MFC_PUTRF_CMD:
	case MFC_GET_CMD:
	case MFC_GETB_CMD:
	case MFC_GETF_CMD:
	{
		if (LIKELY(ch_mfc_cmd.size <= 0x4000))
		{
			if (LIKELY(do_dma_check(ch_mfc_cmd)))
			{
				if (ch_mfc_cmd.size)
				{
					do_dma_transfer(ch_mfc_cmd);
				}

				return true;
			}

			mfc_queue[mfc_size++] = ch_mfc_cmd;
			mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag);

			if (ch_mfc_cmd.cmd & MFC_BARRIER_MASK)
			{
				mfc_barrier |= utils::rol32(1, ch_mfc_cmd.tag);
			}

			return true;
		}

		break;
	}
	case MFC_PUTL_CMD:
	case MFC_PUTLB_CMD:
	case MFC_PUTLF_CMD:
	case MFC_PUTRL_CMD:
	case MFC_PUTRLB_CMD:
	case MFC_PUTRLF_CMD:
	case MFC_GETL_CMD:
	case MFC_GETLB_CMD:
	case MFC_GETLF_CMD:
	{
		if (LIKELY(ch_mfc_cmd.size <= 0x4000))
		{
			auto& cmd = mfc_queue[mfc_size];
			cmd = ch_mfc_cmd;

			if (LIKELY(do_dma_check(cmd)))
			{
				if (LIKELY(do_list_transfer(cmd)))
				{
					return true;
				}
			}

			mfc_size++;
			mfc_fence |= utils::rol32(1, cmd.tag);

			if (cmd.cmd & MFC_BARRIER_MASK)
			{
				mfc_barrier |= utils::rol32(1, cmd.tag);
			}

			return true;
		}

		break;
	}
	case MFC_BARRIER_CMD:
	case MFC_EIEIO_CMD:
	case MFC_SYNC_CMD:
	{
		if (mfc_size == 0)
		{
			_mm_mfence();
		}
		else
		{
			mfc_queue[mfc_size++] = ch_mfc_cmd;
			mfc_barrier |= -1;
		}

		return true;
	}
	default:
	{
		break;
	}
	}

	fmt::throw_exception("Unknown command (cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)" HERE,
		ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
}
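// Reservation-lost tracking: every path above that drops or replaces the
// active reservation re-checks the timestamp and the 128-byte contents
// before raising SPU_EVENT_LR, so the event fires whenever either has
// changed since GETLLAR.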
ch_event_stat & mask1 : ch_event_stat.atomic_op([&](u32& stat) -> u32 { if (u32 res = stat & mask1) { stat &= ~SPU_EVENT_WAITING; return res; } stat |= SPU_EVENT_WAITING; return 0; }); } void spu_thread::set_events(u32 mask) { if (mask & ~SPU_EVENT_IMPLEMENTED) { fmt::throw_exception("SPU Events not implemented (mask=0x%x)" HERE, mask); } // Set new events, get old event mask const u32 old_stat = ch_event_stat.fetch_or(mask); // Notify if some events were set if (~old_stat & mask && old_stat & SPU_EVENT_WAITING && ch_event_stat & SPU_EVENT_WAITING) { notify(); } } void spu_thread::set_interrupt_status(bool enable) { if (enable) { // Detect enabling interrupts with events masked if (ch_event_mask & ~SPU_EVENT_INTR_IMPLEMENTED) { fmt::throw_exception("SPU Interrupts not implemented (mask=0x%x)" HERE, +ch_event_mask); } interrupts_enabled = true; } else { interrupts_enabled = false; } } u32 spu_thread::get_ch_count(u32 ch) { LOG_TRACE(SPU, "get_ch_count(ch=%d [%s])", ch, ch < 128 ? spu_ch_name[ch] : "???"); switch (ch) { case SPU_WrOutMbox: return ch_out_mbox.get_count() ^ 1; case SPU_WrOutIntrMbox: return ch_out_intr_mbox.get_count() ^ 1; case SPU_RdInMbox: return ch_in_mbox.get_count(); case MFC_RdTagStat: return ch_tag_stat.get_count(); case MFC_RdListStallStat: return ch_stall_stat.get_count(); case MFC_WrTagUpdate: return ch_tag_upd == 0; case SPU_RdSigNotify1: return ch_snr1.get_count(); case SPU_RdSigNotify2: return ch_snr2.get_count(); case MFC_RdAtomicStat: return ch_atomic_stat.get_count(); case SPU_RdEventStat: return get_events() != 0; case MFC_Cmd: return 16 - mfc_size; } fmt::throw_exception("Unknown/illegal channel (ch=%d [%s])" HERE, ch, ch < 128 ? spu_ch_name[ch] : "???"); } s64 spu_thread::get_ch_value(u32 ch) { LOG_TRACE(SPU, "get_ch_value(ch=%d [%s])", ch, ch < 128 ? 
spu_ch_name[ch] : "???"); auto read_channel = [&](spu_channel& channel) -> s64 { if (channel.get_count() == 0) { state += cpu_flag::wait; } for (int i = 0; i < 10 && channel.get_count() == 0; i++) { busy_wait(); } u32 out = 0; while (!channel.try_pop(out)) { if (is_stopped()) { return -1; } thread_ctrl::wait(); } check_state(); return out; }; switch (ch) { case SPU_RdSRR0: { return srr0; } case SPU_RdInMbox: { if (ch_in_mbox.get_count() == 0) { state += cpu_flag::wait; } while (true) { for (int i = 0; i < 10 && ch_in_mbox.get_count() == 0; i++) { busy_wait(); } u32 out = 0; if (const uint old_count = ch_in_mbox.try_pop(out)) { if (old_count == 4 /* SPU_IN_MBOX_THRESHOLD */) // TODO: check this { int_ctrl[2].set(SPU_INT2_STAT_SPU_MAILBOX_THRESHOLD_INT); } check_state(); return out; } if (is_stopped()) { return -1; } thread_ctrl::wait(); } } case MFC_RdTagStat: { if (ch_tag_stat.get_count()) { u32 out = ch_tag_stat.get_value(); ch_tag_stat.set_value(0, false); return out; } // Will stall infinitely return read_channel(ch_tag_stat); } case MFC_RdTagMask: { return ch_tag_mask; } case SPU_RdSigNotify1: { return read_channel(ch_snr1); } case SPU_RdSigNotify2: { return read_channel(ch_snr2); } case MFC_RdAtomicStat: { if (ch_atomic_stat.get_count()) { u32 out = ch_atomic_stat.get_value(); ch_atomic_stat.set_value(0, false); return out; } // Will stall infinitely return read_channel(ch_atomic_stat); } case MFC_RdListStallStat: { if (ch_stall_stat.get_count()) { u32 out = ch_stall_stat.get_value(); ch_stall_stat.set_value(0, false); return out; } // Will stall infinitely return read_channel(ch_stall_stat); } case SPU_RdDec: { u32 out = ch_dec_value - (u32)(get_timebased_time() - ch_dec_start_timestamp); //Polling: We might as well hint to the scheduler to slot in another thread since this one is counting down if (g_cfg.core.spu_loop_detection && out > spu::scheduler::native_jiffy_duration_us) { state += cpu_flag::wait; std::this_thread::yield(); } return out; } case SPU_RdEventMask: { return ch_event_mask; } case SPU_RdEventStat: { u32 res = get_events(); if (res) { return res; } const u32 mask1 = ch_event_mask; if (mask1 & SPU_EVENT_LR && raddr) { if (mask1 != SPU_EVENT_LR && mask1 != SPU_EVENT_LR + SPU_EVENT_TM) { // Combining LR with other flags needs another solution fmt::throw_exception("Not supported: event mask 0x%x" HERE, mask1); } const auto pseudo_lock = vm::reservation_notifier(raddr, 128).try_shared_lock(); if (!pseudo_lock) { fmt::throw_exception("Unexpected: reservation notifier lock failed"); } while (res = get_events(), !res) { state += cpu_flag::wait; if (is_stopped()) { return -1; } pseudo_lock.wait(100); } check_state(); return res; } while (res = get_events(true), !res) { state += cpu_flag::wait; if (is_stopped()) { return -1; } thread_ctrl::wait_for(100); } check_state(); return res; } case SPU_RdMachStat: { // HACK: "Not isolated" status // Return SPU Interrupt status in LSB return interrupts_enabled == true; } } fmt::throw_exception("Unknown/illegal channel (ch=%d [%s])" HERE, ch, ch < 128 ? spu_ch_name[ch] : "???"); } bool spu_thread::set_ch_value(u32 ch, u32 value) { LOG_TRACE(SPU, "set_ch_value(ch=%d [%s], value=0x%x)", ch, ch < 128 ? 
spu_ch_name[ch] : "???", value); switch (ch) { case SPU_WrSRR0: { srr0 = value; return true; } case SPU_WrOutIntrMbox: { if (offset >= RAW_SPU_BASE_ADDR) { while (!ch_out_intr_mbox.try_push(value)) { state += cpu_flag::wait; if (is_stopped()) { return false; } thread_ctrl::wait(); } int_ctrl[2].set(SPU_INT2_STAT_MAILBOX_INT); check_state(); return true; } state += cpu_flag::wait; const u32 code = value >> 24; { if (code < 64) { /* ===== sys_spu_thread_send_event (used by spu_printf) ===== */ u32 spup = code & 63; u32 data = 0; if (!ch_out_mbox.try_pop(data)) { fmt::throw_exception("sys_spu_thread_send_event(value=0x%x, spup=%d): Out_MBox is empty" HERE, value, spup); } if (u32 count = ch_in_mbox.get_count()) { fmt::throw_exception("sys_spu_thread_send_event(value=0x%x, spup=%d): In_MBox is not empty (count=%d)" HERE, value, spup, count); } LOG_TRACE(SPU, "sys_spu_thread_send_event(spup=%d, data0=0x%x, data1=0x%x)", spup, value & 0x00ffffff, data); const auto queue = (std::lock_guard{group->mutex}, this->spup[spup].lock()); if (!queue) { LOG_WARNING(SPU, "sys_spu_thread_send_event(spup=%d, data0=0x%x, data1=0x%x): event queue not connected", spup, (value & 0x00ffffff), data); ch_in_mbox.set_values(1, CELL_ENOTCONN); return true; } ch_in_mbox.set_values(1, CELL_OK); if (!queue->send(SYS_SPU_THREAD_EVENT_USER_KEY, id, ((u64)spup << 32) | (value & 0x00ffffff), data)) { ch_in_mbox.set_values(1, CELL_EBUSY); } return true; } else if (code < 128) { /* ===== sys_spu_thread_throw_event ===== */ u32 spup = code & 63; u32 data = 0; if (!ch_out_mbox.try_pop(data)) { fmt::throw_exception("sys_spu_thread_throw_event(value=0x%x, spup=%d): Out_MBox is empty" HERE, value, spup); } LOG_TRACE(SPU, "sys_spu_thread_throw_event(spup=%d, data0=0x%x, data1=0x%x)", spup, value & 0x00ffffff, data); const auto queue = (std::lock_guard{group->mutex}, this->spup[spup].lock()); if (!queue) { LOG_WARNING(SPU, "sys_spu_thread_throw_event(spup=%d, data0=0x%x, data1=0x%x): event queue not connected", spup, (value & 0x00ffffff), data); return true; } // TODO: check passing spup value if (!queue->send(SYS_SPU_THREAD_EVENT_USER_KEY, id, ((u64)spup << 32) | (value & 0x00ffffff), data)) { LOG_WARNING(SPU, "sys_spu_thread_throw_event(spup=%d, data0=0x%x, data1=0x%x) failed (queue is full)", spup, (value & 0x00ffffff), data); } return true; } else if (code == 128) { /* ===== sys_event_flag_set_bit ===== */ u32 flag = value & 0xffffff; u32 data = 0; if (!ch_out_mbox.try_pop(data)) { fmt::throw_exception("sys_event_flag_set_bit(value=0x%x (flag=%d)): Out_MBox is empty" HERE, value, flag); } if (u32 count = ch_in_mbox.get_count()) { fmt::throw_exception("sys_event_flag_set_bit(value=0x%x (flag=%d)): In_MBox is not empty (%d)" HERE, value, flag, count); } LOG_TRACE(SPU, "sys_event_flag_set_bit(id=%d, value=0x%x (flag=%d))", data, value, flag); ch_in_mbox.set_values(1, CELL_OK); // Use the syscall to set flag if (s32 res = sys_event_flag_set(data, 1ull << flag)) { ch_in_mbox.set_values(1, res); } return true; } else if (code == 192) { /* ===== sys_event_flag_set_bit_impatient ===== */ u32 flag = value & 0xffffff; u32 data = 0; if (!ch_out_mbox.try_pop(data)) { fmt::throw_exception("sys_event_flag_set_bit_impatient(value=0x%x (flag=%d)): Out_MBox is empty" HERE, value, flag); } LOG_TRACE(SPU, "sys_event_flag_set_bit_impatient(id=%d, value=0x%x (flag=%d))", data, value, flag); // Use the syscall to set flag sys_event_flag_set(data, 1ull << flag); return true; } else { if (ch_out_mbox.get_count()) { 
fmt::throw_exception("SPU_WrOutIntrMbox: unknown data (value=0x%x); Out_MBox = 0x%x" HERE, value, ch_out_mbox.get_value()); } else { fmt::throw_exception("SPU_WrOutIntrMbox: unknown data (value=0x%x)" HERE, value); } } } } case SPU_WrOutMbox: { while (!ch_out_mbox.try_push(value)) { state += cpu_flag::wait; if (is_stopped()) { return false; } thread_ctrl::wait(); } check_state(); return true; } case MFC_WrTagMask: { ch_tag_mask = value; if (ch_tag_upd) { const u32 completed = get_mfc_completed(); if (completed && ch_tag_upd == 1) { ch_tag_stat.set_value(completed); ch_tag_upd = 0; } else if (completed == value && ch_tag_upd == 2) { ch_tag_stat.set_value(completed); ch_tag_upd = 0; } } return true; } case MFC_WrTagUpdate: { if (value > 2) { break; } const u32 completed = get_mfc_completed(); if (!value) { ch_tag_upd = 0; ch_tag_stat.set_value(completed); } else if (completed && value == 1) { ch_tag_upd = 0; ch_tag_stat.set_value(completed); } else if (completed == ch_tag_mask && value == 2) { ch_tag_upd = 0; ch_tag_stat.set_value(completed); } else { ch_tag_upd = value; ch_tag_stat.set_value(0, false); } return true; } case MFC_LSA: { ch_mfc_cmd.lsa = value; return true; } case MFC_EAH: { ch_mfc_cmd.eah = value; return true; } case MFC_EAL: { ch_mfc_cmd.eal = value; return true; } case MFC_Size: { ch_mfc_cmd.size = value & 0x7fff; return true; } case MFC_TagID: { ch_mfc_cmd.tag = value & 0x1f; return true; } case MFC_Cmd: { ch_mfc_cmd.cmd = MFC(value & 0xff); return process_mfc_cmd(); } case MFC_WrListStallAck: { // Reset stall status for specified tag const u32 tag_mask = utils::rol32(1, value); if (ch_stall_mask & tag_mask) { ch_stall_mask &= ~tag_mask; for (u32 i = 0; i < mfc_size; i++) { if (mfc_queue[i].tag == (value | 0x80)) { // Unset stall bit mfc_queue[i].tag &= 0x7f; } } do_mfc(true); } return true; } case SPU_WrDec: { ch_dec_start_timestamp = get_timebased_time(); ch_dec_value = value; return true; } case SPU_WrEventMask: { ch_event_mask = value; return true; } case SPU_WrEventAck: { ch_event_stat &= ~value; return true; } case 69: { return true; } } fmt::throw_exception("Unknown/illegal channel (ch=%d [%s], value=0x%x)" HERE, ch, ch < 128 ? 
spu_ch_name[ch] : "???", value); } bool spu_thread::stop_and_signal(u32 code) { LOG_TRACE(SPU, "stop_and_signal(code=0x%x)", code); if (offset >= RAW_SPU_BASE_ADDR) { state += cpu_flag::wait; status.atomic_op([code](u32& status) { status = (status & 0xffff) | (code << 16); status |= SPU_STATUS_STOPPED_BY_STOP; status &= ~SPU_STATUS_RUNNING; }); int_ctrl[2].set(SPU_INT2_STAT_SPU_STOP_AND_SIGNAL_INT); state += cpu_flag::stop; check_state(); return true; } switch (code) { case 0x000: { LOG_WARNING(SPU, "STOP 0x0"); // HACK: find an ILA instruction for (u32 addr = pc; addr < 0x40000; addr += 4) { const u32 instr = _ref(addr); if (instr >> 25 == 0x21) { pc = addr; return false; } if (instr > 0x1fffff) { break; } } // HACK: wait for executable code while (!_ref(pc)) { state += cpu_flag::wait; if (is_stopped()) { return false; } thread_ctrl::wait_for(1000); } check_state(); return false; } case 0x001: { state += cpu_flag::wait; thread_ctrl::wait_for(1000); // hack check_state(); return true; } case 0x002: { state += cpu_flag::ret; return true; } case 0x110: { /* ===== sys_spu_thread_receive_event ===== */ u32 spuq = 0; if (!ch_out_mbox.try_pop(spuq)) { fmt::throw_exception("sys_spu_thread_receive_event(): Out_MBox is empty" HERE); } if (u32 count = ch_in_mbox.get_count()) { LOG_ERROR(SPU, "sys_spu_thread_receive_event(): In_MBox is not empty (%d)", count); return ch_in_mbox.set_values(1, CELL_EBUSY), true; } LOG_TRACE(SPU, "sys_spu_thread_receive_event(spuq=0x%x)", spuq); if (group->type & SYS_SPU_THREAD_GROUP_TYPE_EXCLUSIVE_NON_CONTEXT) // this check may be inaccurate { return ch_in_mbox.set_values(1, CELL_EINVAL), true; } std::shared_ptr queue; state += cpu_flag::wait; while (true) { queue.reset(); // Check group status, wait if necessary while (group->run_state >= SPU_THREAD_GROUP_STATUS_WAITING && group->run_state <= SPU_THREAD_GROUP_STATUS_SUSPENDED) { if (is_stopped()) { return false; } thread_ctrl::wait(); } reader_lock rlock(id_manager::g_mutex); std::lock_guard lock(group->mutex); if (group->run_state >= SPU_THREAD_GROUP_STATUS_WAITING && group->run_state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED) { // Try again continue; } for (auto& v : this->spuq) { if (spuq == v.first) { queue = v.second.lock(); if (queue) { break; } } } if (!queue) { check_state(); return ch_in_mbox.set_values(1, CELL_EINVAL), true; // TODO: check error value } std::lock_guard qlock(queue->mutex); if (queue->events.empty()) { queue->sq.emplace_back(this); group->run_state = SPU_THREAD_GROUP_STATUS_WAITING; for (auto& thread : group->threads) { if (thread) { thread->state += cpu_flag::suspend; } } // Wait break; } else { // Return the event immediately const auto event = queue->events.front(); const auto data1 = static_cast(std::get<1>(event)); const auto data2 = static_cast(std::get<2>(event)); const auto data3 = static_cast(std::get<3>(event)); ch_in_mbox.set_values(4, CELL_OK, data1, data2, data3); queue->events.pop_front(); check_state(); return true; } } while (true) { if (is_stopped()) { return false; } if (!state.test_and_reset(cpu_flag::signal)) { thread_ctrl::wait(); } else { break; } } std::lock_guard lock(group->mutex); if (group->run_state == SPU_THREAD_GROUP_STATUS_WAITING) { group->run_state = SPU_THREAD_GROUP_STATUS_RUNNING; } else if (group->run_state == SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED) { group->run_state = SPU_THREAD_GROUP_STATUS_SUSPENDED; } for (auto& thread : group->threads) { if (thread) { thread->state -= cpu_flag::suspend; if (thread.get() != this) { 
					thread_ctrl::notify(*thread);
				}
			}
		}

		check_state();
		return true;
	}
	case 0x111:
	{
		/* ===== sys_spu_thread_tryreceive_event ===== */

		u32 spuq = 0;

		if (!ch_out_mbox.try_pop(spuq))
		{
			fmt::throw_exception("sys_spu_thread_tryreceive_event(): Out_MBox is empty" HERE);
		}

		if (u32 count = ch_in_mbox.get_count())
		{
			LOG_ERROR(SPU, "sys_spu_thread_tryreceive_event(): In_MBox is not empty (%d)", count);
			return ch_in_mbox.set_values(1, CELL_EBUSY), true;
		}

		LOG_TRACE(SPU, "sys_spu_thread_tryreceive_event(spuq=0x%x)", spuq);

		std::lock_guard lock(group->mutex);

		std::shared_ptr<lv2_event_queue> queue;

		for (auto& v : this->spuq)
		{
			if (spuq == v.first)
			{
				if (queue = v.second.lock())
				{
					break;
				}
			}
		}

		if (!queue)
		{
			return ch_in_mbox.set_values(1, CELL_EINVAL), true;
		}

		std::lock_guard qlock(queue->mutex);

		if (queue->events.empty())
		{
			return ch_in_mbox.set_values(1, CELL_EBUSY), true;
		}

		const auto event = queue->events.front();
		const auto data1 = static_cast<u32>(std::get<1>(event));
		const auto data2 = static_cast<u32>(std::get<2>(event));
		const auto data3 = static_cast<u32>(std::get<3>(event));
		ch_in_mbox.set_values(4, CELL_OK, data1, data2, data3);
		queue->events.pop_front();
		return true;
	}
	case 0x100:
	{
		if (ch_out_mbox.get_count())
		{
			fmt::throw_exception("STOP code 0x100: Out_MBox is not empty" HERE);
		}

		_mm_mfence();
		return true;
	}
	case 0x101:
	{
		/* ===== sys_spu_thread_group_exit ===== */

		state += cpu_flag::wait;

		u32 value = 0;

		if (!ch_out_mbox.try_pop(value))
		{
			fmt::throw_exception("sys_spu_thread_group_exit(): Out_MBox is empty" HERE);
		}

		LOG_TRACE(SPU, "sys_spu_thread_group_exit(status=0x%x)", value);

		std::lock_guard lock(group->mutex);

		for (auto& thread : group->threads)
		{
			if (thread && thread.get() != this)
			{
				thread->state += cpu_flag::stop;
				thread_ctrl::notify(*thread);
			}
		}

		group->exit_status = value;
		group->join_state = SYS_SPU_THREAD_GROUP_JOIN_GROUP_EXIT;

		state += cpu_flag::stop;
		check_state();
		return true;
	}
	case 0x102:
	{
		/* ===== sys_spu_thread_exit ===== */

		state += cpu_flag::wait;

		if (!ch_out_mbox.get_count())
		{
			fmt::throw_exception("sys_spu_thread_exit(): Out_MBox is empty" HERE);
		}

		LOG_TRACE(SPU, "sys_spu_thread_exit(status=0x%x)", ch_out_mbox.get_value());
		status |= SPU_STATUS_STOPPED_BY_STOP;
		state += cpu_flag::stop;
		check_state();
		return true;
	}
	}

	if (!ch_out_mbox.get_count())
	{
		fmt::throw_exception("Unknown STOP code: 0x%x (Out_MBox is empty)" HERE, code);
	}
	else
	{
		fmt::throw_exception("Unknown STOP code: 0x%x (Out_MBox=0x%x)" HERE, code, ch_out_mbox.get_value());
	}
}

void spu_thread::halt()
{
	LOG_TRACE(SPU, "halt()");

	if (offset >= RAW_SPU_BASE_ADDR)
	{
		status.atomic_op([](u32& status)
		{
			status |= SPU_STATUS_STOPPED_BY_HALT;
			status &= ~SPU_STATUS_RUNNING;
		});

		int_ctrl[2].set(SPU_INT2_STAT_SPU_HALT_OR_STEP_INT);

		throw cpu_flag::stop;
	}

	status |= SPU_STATUS_STOPPED_BY_HALT;
	fmt::throw_exception("Halt" HERE);
}

void spu_thread::fast_call(u32 ls_addr)
{
	// LS:0x0: this is originally the entry point of the interrupt handler, but interrupts are not implemented
	_ref<u32>(0) = 0x00000002; // STOP 2

	auto old_pc = pc;
	auto old_lr = gpr[0]._u32[3];
	auto old_stack = gpr[1]._u32[3]; // only saved and restored (may be wrong)

	pc = ls_addr;
	gpr[0]._u32[3] = 0x0;

	try
	{
		cpu_task();
	}
	catch (cpu_flag _s)
	{
		state += _s;

		if (_s != cpu_flag::ret)
			throw;
	}

	state -= cpu_flag::ret;

	pc = old_pc;
	gpr[0]._u32[3] = old_lr;
	gpr[1]._u32[3] = old_stack;
}

DECLARE(spu_thread::g_raw_spu_ctr){};
DECLARE(spu_thread::g_raw_spu_id){};