diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 808ee6dc57..5c897a0e21 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -973,18 +973,26 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
     ppu.raddr = addr;
 
+    u64 count = 0;
+
     while (LIKELY(g_use_rtm))
     {
         ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & -128;
         ppu.rdata = data;
 
-        if (LIKELY(vm::reservation_acquire(addr, sizeof(T)) == ppu.rtime))
+        if (LIKELY((vm::reservation_acquire(addr, sizeof(T)) & -128) == ppu.rtime))
         {
+            if (UNLIKELY(count >= 10))
+            {
+                LOG_ERROR(PPU, "%s took too long: %u", sizeof(T) == 4 ? "LWARX" : "LDARX", count);
+            }
+
             return static_cast<T>(ppu.rdata << data_off >> size_off);
         }
         else
         {
             _mm_pause();
+            count++;
         }
     }
 
@@ -1040,7 +1048,7 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
     return ppu_load_acquire_reservation<u64>(ppu, addr);
 }
 
-const auto ppu_stwcx_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
+const auto ppu_stwcx_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
 {
     using namespace asmjit;
 
@@ -1057,11 +1065,12 @@ const auto ppu_stwcx_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
+const auto ppu_stdcx_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
 {
     using namespace asmjit;
 
@@ -1155,11 +1185,12 @@ const auto ppu_stdcx_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
+const auto spu_putllc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
 {
     using namespace asmjit;
 
     Label fall = c.newLabel();
     Label fail = c.newLabel();
-    Label retry = c.newLabel();
     Label _ret = c.newLabel();
 
     if (utils::has_avx() && !s_tsx_avx)
@@ -194,16 +196,21 @@ const auto spu_putllc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
+const auto spu_getll_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
 {
     using namespace asmjit;
 
@@ -406,7 +548,7 @@ const auto spu_getll_tx = build_function_asm([](
     c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
     c.shr(args[0], 4);
     c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0]));
-    c.mov(args[0].r32(), 2);
+    c.xor_(args[0].r32(), args[0].r32());
 
     // Begin transaction
     Label begin = build_transaction_enter(c, fall);
@@ -453,10 +595,13 @@ const auto spu_getll_tx = build_function_asm([](
     }
 
     c.and_(x86::rax, -128);
+    c.mov(x86::qword_ptr(args[2]), x86::rax);
+    c.mov(x86::rax, args[0]);
     c.jmp(_ret);
 
     // Touch memory after transaction failure
     c.bind(fall);
+    c.lea(args[0], x86::qword_ptr(args[0], 1));
     c.pause();
     c.xor_(x86::r11, 0xf80);
     c.xor_(x86::r10, 0xf80);
@@ -464,11 +609,7 @@ const auto spu_getll_tx = build_function_asm([](
     c.mov(x86::rax, x86::qword_ptr(x86::r10));
     c.xor_(x86::r11, 0xf80);
     c.xor_(x86::r10, 0xf80);
-    c.sub(args[0], 1);
-    c.jnz(begin);
-    c.mov(x86::eax, 1);
-    c.jmp(_ret);
-
+    c.jmp(begin);
     c.bind(_ret);
 
 #ifdef _WIN32
@@ -488,7 +629,166 @@ const auto spu_getll_tx = build_function_asm([](
     c.ret();
 });
 
-const auto spu_putlluc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
+const auto spu_getll_fast = build_function_asm([](asmjit::X86Assembler& c, auto& args)
+{
+    using namespace asmjit;
+
+    Label _ret = c.newLabel();
+
+    if (utils::has_avx() && !s_tsx_avx)
+    {
+        c.vzeroupper();
+    }
+
+    // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
+#ifdef _WIN32
+    if (!s_tsx_avx)
+    {
+        c.sub(x86::rsp, 72);
+        c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
+        c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
+        c.movups(x86::oword_ptr(x86::rsp, 32), x86::xmm8);
+        c.movups(x86::oword_ptr(x86::rsp, 48), x86::xmm9);
+    }
+#endif
+
+    // Prepare registers
+    c.mov(x86::rax, imm_ptr(&vm::g_reservations));
+    c.mov(x86::r10, x86::qword_ptr(x86::rax));
+    c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
+    c.mov(x86::r11, x86::qword_ptr(x86::rax));
+    c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
+    c.shr(args[0], 4);
+    c.lea(x86::r10, x86::qword_ptr(x86::r10, args[0]));
+    c.xor_(args[0].r32(), args[0].r32());
+
+    // Begin copying
+    Label begin = c.newLabel();
+    Label test0 = c.newLabel();
+    c.bind(begin);
+    c.mov(x86::rax, x86::qword_ptr(x86::r10));
+
+    if (s_tsx_avx)
+    {
+        c.vmovaps(x86::ymm0, x86::yword_ptr(x86::r11, 0));
+        c.vmovaps(x86::ymm1, x86::yword_ptr(x86::r11, 32));
+        c.vmovaps(x86::ymm2, x86::yword_ptr(x86::r11, 64));
+        c.vmovaps(x86::ymm3, x86::yword_ptr(x86::r11, 96));
+    }
+    else
+    {
+        c.movaps(x86::xmm0, x86::oword_ptr(x86::r11, 0));
+        c.movaps(x86::xmm1, x86::oword_ptr(x86::r11, 16));
+        c.movaps(x86::xmm2, x86::oword_ptr(x86::r11, 32));
+        c.movaps(x86::xmm3, x86::oword_ptr(x86::r11, 48));
+        c.movaps(x86::xmm4, x86::oword_ptr(x86::r11, 64));
+        c.movaps(x86::xmm5, x86::oword_ptr(x86::r11, 80));
+        c.movaps(x86::xmm6, x86::oword_ptr(x86::r11, 96));
+        c.movaps(x86::xmm7, x86::oword_ptr(x86::r11, 112));
+    }
+
+    // Verify and retry if necessary.
+    c.cmp(x86::rax, x86::qword_ptr(x86::r10));
+    c.je(test0);
+    c.pause();
+    c.lea(args[0], x86::qword_ptr(args[0], 1));
+    c.jmp(begin);
+
+    c.bind(test0);
+    c.test(x86::eax, 0x7f);
+    c.jz(_ret);
+    c.and_(x86::rax, -128);
+
+    // If there are lock bits set, verify data as well.
+    if (s_tsx_avx)
+    {
+        c.vxorps(x86::ymm4, x86::ymm0, x86::yword_ptr(x86::r11, 0));
+        c.vxorps(x86::ymm5, x86::ymm1, x86::yword_ptr(x86::r11, 32));
+        c.vorps(x86::ymm5, x86::ymm5, x86::ymm4);
+        c.vxorps(x86::ymm4, x86::ymm2, x86::yword_ptr(x86::r11, 64));
+        c.vorps(x86::ymm5, x86::ymm5, x86::ymm4);
+        c.vxorps(x86::ymm4, x86::ymm3, x86::yword_ptr(x86::r11, 96));
+        c.vorps(x86::ymm5, x86::ymm5, x86::ymm4);
+        c.vptest(x86::ymm5, x86::ymm5);
+    }
+    else
+    {
+        c.xorps(x86::xmm9, x86::xmm9);
+        c.movaps(x86::xmm8, x86::xmm0);
+        c.xorps(x86::xmm8, x86::oword_ptr(x86::r11, 0));
+        c.orps(x86::xmm9, x86::xmm8);
+        c.movaps(x86::xmm8, x86::xmm1);
+        c.xorps(x86::xmm8, x86::oword_ptr(x86::r11, 16));
+        c.orps(x86::xmm9, x86::xmm8);
+        c.movaps(x86::xmm8, x86::xmm2);
+        c.xorps(x86::xmm8, x86::oword_ptr(x86::r11, 32));
+        c.orps(x86::xmm9, x86::xmm8);
+        c.movaps(x86::xmm8, x86::xmm3);
+        c.xorps(x86::xmm8, x86::oword_ptr(x86::r11, 48));
+        c.orps(x86::xmm9, x86::xmm8);
+        c.movaps(x86::xmm8, x86::xmm4);
+        c.xorps(x86::xmm8, x86::oword_ptr(x86::r11, 64));
+        c.orps(x86::xmm9, x86::xmm8);
+        c.movaps(x86::xmm8, x86::xmm5);
+        c.xorps(x86::xmm8, x86::oword_ptr(x86::r11, 80));
+        c.orps(x86::xmm9, x86::xmm8);
+        c.movaps(x86::xmm8, x86::xmm6);
+        c.xorps(x86::xmm8, x86::oword_ptr(x86::r11, 96));
+        c.orps(x86::xmm9, x86::xmm8);
+        c.movaps(x86::xmm8, x86::xmm7);
+        c.xorps(x86::xmm8, x86::oword_ptr(x86::r11, 112));
+        c.orps(x86::xmm9, x86::xmm8);
+        c.ptest(x86::xmm9, x86::xmm9);
+    }
+
+    c.jz(_ret);
+    c.lea(args[0], x86::qword_ptr(args[0], 2));
+    c.jmp(begin);
+
+    c.bind(_ret);
+
+    if (s_tsx_avx)
+    {
+        c.vmovups(x86::yword_ptr(args[1], 0), x86::ymm0);
+        c.vmovups(x86::yword_ptr(args[1], 32), x86::ymm1);
+        c.vmovups(x86::yword_ptr(args[1], 64), x86::ymm2);
+        c.vmovups(x86::yword_ptr(args[1], 96), x86::ymm3);
+    }
+    else
+    {
+        c.movaps(x86::oword_ptr(args[1], 0), x86::xmm0);
+        c.movaps(x86::oword_ptr(args[1], 16), x86::xmm1);
+        c.movaps(x86::oword_ptr(args[1], 32), x86::xmm2);
+        c.movaps(x86::oword_ptr(args[1], 48), x86::xmm3);
+        c.movaps(x86::oword_ptr(args[1], 64), x86::xmm4);
+        c.movaps(x86::oword_ptr(args[1], 80), x86::xmm5);
+        c.movaps(x86::oword_ptr(args[1], 96), x86::xmm6);
+        c.movaps(x86::oword_ptr(args[1], 112), x86::xmm7);
+    }
+
+    c.mov(x86::qword_ptr(args[2]), x86::rax);
+    c.mov(x86::rax, args[0]);
+
+#ifdef _WIN32
+    if (!s_tsx_avx)
+    {
+        c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
+        c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
+        c.movups(x86::xmm8, x86::oword_ptr(x86::rsp, 32));
+        c.movups(x86::xmm9, x86::oword_ptr(x86::rsp, 48));
+        c.add(x86::rsp, 72);
+    }
+#endif
+
+    if (s_tsx_avx)
+    {
+        c.vzeroupper();
+    }
+
+    c.ret();
+});
+
+const auto spu_putlluc_tx = build_function_asm([](asmjit::X86Assembler& c, auto& args)
 {
     using namespace asmjit;
 
@@ -501,10 +801,14 @@ const auto spu_putlluc_tx = build_function_asm
-    if (count > 5)
+    if (count >= 10)
     {
         LOG_ERROR(SPU, "%s took too long: %u", args.cmd, count);
     }
@@ -1474,31 +1848,9 @@ bool spu_thread::process_mfc_cmd()
     if (LIKELY(g_use_rtm))
     {
-        u64 count = 1;
+        const u64 count = g_cfg.core.spu_accurate_getllar ? spu_getll_tx(addr, dst.data(), &ntime) : spu_getll_fast(addr, dst.data(), &ntime);
 
-        if (g_cfg.core.spu_accurate_getllar)
-        {
-            while ((ntime = spu_getll_tx(addr, dst.data())) & 1)
-            {
-                std::this_thread::yield();
-                count += 2;
-            }
-        }
-        else
-        {
-            for (;; count++, busy_wait(300))
-            {
-                ntime = vm::reservation_acquire(addr, 128) & -128;
-                dst = data;
-
-                if (LIKELY(vm::reservation_acquire(addr, 128) == ntime))
-                {
-                    break;
-                }
-            }
-        }
-
-        if (count > 15)
+        if (count >= 10)
         {
             LOG_ERROR(SPU, "%s took too long: %u", ch_mfc_cmd.cmd, count);
         }
@@ -1565,17 +1917,20 @@ bool spu_thread::process_mfc_cmd()
     if (LIKELY(g_use_rtm))
     {
-        while (true)
+        u64 count = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data());
+
+        if ((count >> 63) == 0)
         {
-            result = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data());
+            result = 1;
+        }
+        else
+        {
+            count = ~count;
+        }
 
-            if (result < 2)
-            {
-                break;
-            }
-
-            // Retry
-            std::this_thread::yield();
+        if (count >= 10)
+        {
+            LOG_ERROR(SPU, "%s took too long: %u (r=%u)", ch_mfc_cmd.cmd, count, result);
         }
     }
     else if (auto& data = vm::_ref<decltype(rdata)>(addr); rdata == data)
@@ -1868,7 +2223,7 @@ s64 spu_thread::get_ch_value(u32 ch)
             busy_wait();
         }
 
-        u32 out;
+        u32 out = 0;
 
         while (!channel.try_pop(out))
         {
@@ -1898,7 +2253,7 @@ s64 spu_thread::get_ch_value(u32 ch)
             busy_wait();
         }
 
-        u32 out;
+        u32 out = 0;
 
         if (const uint old_count = ch_in_mbox.try_pop(out))
         {
@@ -2084,7 +2439,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
             /* ===== sys_spu_thread_send_event (used by spu_printf) ===== */
 
             u32 spup = code & 63;
-            u32 data;
+            u32 data = 0;
 
             if (!ch_out_mbox.try_pop(data))
             {
@@ -2121,7 +2476,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
             /* ===== sys_spu_thread_throw_event ===== */
 
             u32 spup = code & 63;
-            u32 data;
+            u32 data = 0;
 
             if (!ch_out_mbox.try_pop(data))
             {
@@ -2151,7 +2506,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
             /* ===== sys_event_flag_set_bit ===== */
 
             u32 flag = value & 0xffffff;
-            u32 data;
+            u32 data = 0;
 
             if (!ch_out_mbox.try_pop(data))
             {
@@ -2180,7 +2535,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
             /* ===== sys_event_flag_set_bit_impatient ===== */
 
             u32 flag = value & 0xffffff;
-            u32 data;
+            u32 data = 0;
 
             if (!ch_out_mbox.try_pop(data))
             {
@@ -2437,7 +2792,7 @@ bool spu_thread::stop_and_signal(u32 code)
     {
         /* ===== sys_spu_thread_receive_event ===== */
 
-        u32 spuq;
+        u32 spuq = 0;
 
         if (!ch_out_mbox.try_pop(spuq))
         {
@@ -2581,7 +2936,7 @@ bool spu_thread::stop_and_signal(u32 code)
     {
         /* ===== sys_spu_thread_tryreceive_event ===== */
 
-        u32 spuq;
+        u32 spuq = 0;
 
         if (!ch_out_mbox.try_pop(spuq))
        {
@@ -2647,7 +3002,7 @@ bool spu_thread::stop_and_signal(u32 code)
     {
         /* ===== sys_spu_thread_group_exit ===== */
 
-        u32 value;
+        u32 value = 0;
 
         if (!ch_out_mbox.try_pop(value))
         {
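For readers outside the patch itself, the retry pattern this diff converges on (bounded spin with a retry counter, no sentinel return values, and a log message once the count reaches 10) can be sketched in isolation. The snippet below is a minimal stand-alone illustration, not RPCS3 code: `g_reservation`, `g_data` and `load_with_reservation` are hypothetical stand-ins for `vm::reservation_acquire()`, the guest data and `ppu_load_acquire_reservation()`, and `fprintf` stands in for `LOG_ERROR`.

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <emmintrin.h> // _mm_pause

// Hypothetical stand-ins for the emulator's reservation bookkeeping.
static std::atomic<std::uint64_t> g_reservation{0};
static std::uint64_t g_data = 0;

// Acquire a consistent snapshot of (time stamp, data). The low 7 bits of the
// time stamp act as lock flags, so both reads are masked with & -128 before
// being compared, mirroring the patched LWARX/LDARX path.
static std::uint64_t load_with_reservation(std::uint64_t& out_time)
{
    std::uint64_t count = 0;

    while (true)
    {
        out_time = g_reservation.load(std::memory_order_acquire) & -128;
        const std::uint64_t data = g_data;

        if ((g_reservation.load(std::memory_order_acquire) & -128) == out_time)
        {
            // As in the patch: keep the result, but complain when the retry
            // count is unusually large instead of yielding or bailing out.
            if (count >= 10)
            {
                std::fprintf(stderr, "reservation load took too long: %llu\n",
                    static_cast<unsigned long long>(count));
            }

            return data;
        }

        _mm_pause();
        count++;
    }
}

int main()
{
    std::uint64_t time = 0;
    const std::uint64_t value = load_with_reservation(time);
    std::printf("time=%llu value=%llu\n",
        static_cast<unsigned long long>(time),
        static_cast<unsigned long long>(value));
}
```

Compared with the removed code, the loop never gives up and never returns a sentinel; it simply keeps retrying and reports unusually long waits, which is what the new `count >= 10` checks do in both the PPU and SPU paths of the patch.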