mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-07 15:31:26 +12:00
SPU: New GETLLAR technique
This commit is contained in:
parent
3ec73b651e
commit
e423128a32
1 changed files with 39 additions and 330 deletions
|
@ -510,283 +510,6 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
|
||||||
c.ret();
|
c.ret();
|
||||||
});
|
});
|
||||||
|
|
||||||
const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
|
|
||||||
{
|
|
||||||
using namespace asmjit;
|
|
||||||
|
|
||||||
Label fall = c.newLabel();
|
|
||||||
Label _ret = c.newLabel();
|
|
||||||
|
|
||||||
if (utils::has_avx() && !s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.vzeroupper();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
|
|
||||||
c.push(x86::rbp);
|
|
||||||
c.push(x86::r13);
|
|
||||||
c.push(x86::r12);
|
|
||||||
c.push(x86::rbx);
|
|
||||||
c.sub(x86::rsp, 72);
|
|
||||||
#ifdef _WIN32
|
|
||||||
if (!s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
|
|
||||||
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Prepare registers
|
|
||||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
|
||||||
c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
|
|
||||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
|
||||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
|
||||||
c.and_(args[0].r32(), 0xff80);
|
|
||||||
c.shr(args[0].r32(), 1);
|
|
||||||
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
|
||||||
c.xor_(x86::r12d, x86::r12d);
|
|
||||||
c.mov(x86::r13, args[1]);
|
|
||||||
|
|
||||||
// Begin transaction
|
|
||||||
build_transaction_enter(c, fall, x86::r12, 16);
|
|
||||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
|
||||||
|
|
||||||
if (s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
|
||||||
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
|
||||||
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
|
||||||
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
|
||||||
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
|
||||||
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
|
||||||
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
|
||||||
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
|
||||||
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
|
||||||
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
|
||||||
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
|
||||||
}
|
|
||||||
|
|
||||||
c.xend();
|
|
||||||
|
|
||||||
if (s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.vmovups(x86::yword_ptr(x86::r13, 0), x86::ymm0);
|
|
||||||
c.vmovups(x86::yword_ptr(x86::r13, 32), x86::ymm1);
|
|
||||||
c.vmovups(x86::yword_ptr(x86::r13, 64), x86::ymm2);
|
|
||||||
c.vmovups(x86::yword_ptr(x86::r13, 96), x86::ymm3);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 0), x86::xmm0);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 16), x86::xmm1);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 32), x86::xmm2);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 48), x86::xmm3);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 64), x86::xmm4);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 80), x86::xmm5);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 96), x86::xmm6);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 112), x86::xmm7);
|
|
||||||
}
|
|
||||||
|
|
||||||
c.and_(x86::rax, -128);
|
|
||||||
c.jmp(_ret);
|
|
||||||
|
|
||||||
c.bind(fall);
|
|
||||||
c.mov(x86::eax, 1);
|
|
||||||
//c.jmp(_ret);
|
|
||||||
|
|
||||||
c.bind(_ret);
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
|
||||||
if (!s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
|
|
||||||
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.vzeroupper();
|
|
||||||
}
|
|
||||||
|
|
||||||
c.add(x86::rsp, 72);
|
|
||||||
c.pop(x86::rbx);
|
|
||||||
c.pop(x86::r12);
|
|
||||||
c.pop(x86::r13);
|
|
||||||
c.pop(x86::rbp);
|
|
||||||
c.ret();
|
|
||||||
});
|
|
||||||
|
|
||||||
const auto spu_getll_inexact = build_function_asm<u64(*)(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
|
|
||||||
{
|
|
||||||
using namespace asmjit;
|
|
||||||
|
|
||||||
Label _ret = c.newLabel();
|
|
||||||
|
|
||||||
if (utils::has_avx() && !s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.vzeroupper();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
|
|
||||||
c.push(x86::rbp);
|
|
||||||
c.push(x86::r13);
|
|
||||||
c.push(x86::r12);
|
|
||||||
c.push(x86::rbx);
|
|
||||||
c.sub(x86::rsp, 72);
|
|
||||||
#ifdef _WIN32
|
|
||||||
if (!s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
|
|
||||||
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
|
|
||||||
c.movups(x86::oword_ptr(x86::rsp, 32), x86::xmm8);
|
|
||||||
c.movups(x86::oword_ptr(x86::rsp, 48), x86::xmm9);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Prepare registers
|
|
||||||
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
|
|
||||||
c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
|
|
||||||
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
|
|
||||||
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
|
|
||||||
c.and_(args[0].r32(), 0xff80);
|
|
||||||
c.shr(args[0].r32(), 1);
|
|
||||||
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
|
|
||||||
c.xor_(x86::r12d, x86::r12d);
|
|
||||||
c.mov(x86::r13, args[1]);
|
|
||||||
|
|
||||||
// Begin copying
|
|
||||||
Label begin = c.newLabel();
|
|
||||||
Label test0 = c.newLabel();
|
|
||||||
c.bind(begin);
|
|
||||||
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
|
|
||||||
|
|
||||||
if (s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
|
||||||
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
|
||||||
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
|
||||||
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
|
|
||||||
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
|
|
||||||
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
|
|
||||||
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
|
|
||||||
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
|
|
||||||
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
|
|
||||||
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
|
|
||||||
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Verify and retry if necessary.
|
|
||||||
c.mov(args[0], x86::rax);
|
|
||||||
c.xor_(args[0], x86::qword_ptr(x86::rbx));
|
|
||||||
c.test(args[0], -128);
|
|
||||||
c.jz(test0);
|
|
||||||
c.lea(x86::r12, x86::qword_ptr(x86::r12, 1));
|
|
||||||
c.jmp(begin);
|
|
||||||
|
|
||||||
c.bind(test0);
|
|
||||||
c.test(x86::eax, 127);
|
|
||||||
c.jz(_ret);
|
|
||||||
c.and_(x86::rax, -128);
|
|
||||||
|
|
||||||
// If there are lock bits set, verify data as well.
|
|
||||||
if (s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.vxorps(x86::ymm4, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
|
|
||||||
c.vxorps(x86::ymm5, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
|
|
||||||
c.vorps(x86::ymm5, x86::ymm5, x86::ymm4);
|
|
||||||
c.vxorps(x86::ymm4, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
|
|
||||||
c.vorps(x86::ymm5, x86::ymm5, x86::ymm4);
|
|
||||||
c.vxorps(x86::ymm4, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
|
|
||||||
c.vorps(x86::ymm5, x86::ymm5, x86::ymm4);
|
|
||||||
c.vptest(x86::ymm5, x86::ymm5);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
c.xorps(x86::xmm9, x86::xmm9);
|
|
||||||
c.movaps(x86::xmm8, x86::xmm0);
|
|
||||||
c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 0));
|
|
||||||
c.orps(x86::xmm9, x86::xmm8);
|
|
||||||
c.movaps(x86::xmm8, x86::xmm1);
|
|
||||||
c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 16));
|
|
||||||
c.orps(x86::xmm9, x86::xmm8);
|
|
||||||
c.movaps(x86::xmm8, x86::xmm2);
|
|
||||||
c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 32));
|
|
||||||
c.orps(x86::xmm9, x86::xmm8);
|
|
||||||
c.movaps(x86::xmm8, x86::xmm3);
|
|
||||||
c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 48));
|
|
||||||
c.orps(x86::xmm9, x86::xmm8);
|
|
||||||
c.movaps(x86::xmm8, x86::xmm4);
|
|
||||||
c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 64));
|
|
||||||
c.orps(x86::xmm9, x86::xmm8);
|
|
||||||
c.movaps(x86::xmm8, x86::xmm5);
|
|
||||||
c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 80));
|
|
||||||
c.orps(x86::xmm9, x86::xmm8);
|
|
||||||
c.movaps(x86::xmm8, x86::xmm6);
|
|
||||||
c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 96));
|
|
||||||
c.orps(x86::xmm9, x86::xmm8);
|
|
||||||
c.movaps(x86::xmm8, x86::xmm7);
|
|
||||||
c.xorps(x86::xmm8, x86::oword_ptr(x86::rbp, 112));
|
|
||||||
c.orps(x86::xmm9, x86::xmm8);
|
|
||||||
c.ptest(x86::xmm9, x86::xmm9);
|
|
||||||
}
|
|
||||||
|
|
||||||
c.jz(_ret);
|
|
||||||
c.lea(x86::r12, x86::qword_ptr(x86::r12, 2));
|
|
||||||
c.jmp(begin);
|
|
||||||
|
|
||||||
c.bind(_ret);
|
|
||||||
|
|
||||||
if (s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.vmovups(x86::yword_ptr(x86::r13, 0), x86::ymm0);
|
|
||||||
c.vmovups(x86::yword_ptr(x86::r13, 32), x86::ymm1);
|
|
||||||
c.vmovups(x86::yword_ptr(x86::r13, 64), x86::ymm2);
|
|
||||||
c.vmovups(x86::yword_ptr(x86::r13, 96), x86::ymm3);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 0), x86::xmm0);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 16), x86::xmm1);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 32), x86::xmm2);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 48), x86::xmm3);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 64), x86::xmm4);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 80), x86::xmm5);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 96), x86::xmm6);
|
|
||||||
c.movaps(x86::oword_ptr(x86::r13, 112), x86::xmm7);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
|
||||||
if (!s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
|
|
||||||
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
|
|
||||||
c.movups(x86::xmm8, x86::oword_ptr(x86::rsp, 32));
|
|
||||||
c.movups(x86::xmm9, x86::oword_ptr(x86::rsp, 48));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (s_tsx_avx)
|
|
||||||
{
|
|
||||||
c.vzeroupper();
|
|
||||||
}
|
|
||||||
|
|
||||||
c.add(x86::rsp, 72);
|
|
||||||
c.pop(x86::rbx);
|
|
||||||
c.pop(x86::r12);
|
|
||||||
c.pop(x86::r13);
|
|
||||||
c.pop(x86::rbp);
|
|
||||||
c.ret();
|
|
||||||
});
|
|
||||||
|
|
||||||
const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, spu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
|
const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, spu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
|
||||||
{
|
{
|
||||||
using namespace asmjit;
|
using namespace asmjit;
|
||||||
|
@ -1994,7 +1717,7 @@ bool spu_thread::process_mfc_cmd()
|
||||||
case MFC_GETLLAR_CMD:
|
case MFC_GETLLAR_CMD:
|
||||||
{
|
{
|
||||||
const u32 addr = ch_mfc_cmd.eal & -128;
|
const u32 addr = ch_mfc_cmd.eal & -128;
|
||||||
auto& data = vm::_ref<decltype(rdata)>(addr);
|
const auto& data = vm::_ref<decltype(rdata)>(addr);
|
||||||
auto& dst = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
|
auto& dst = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
|
||||||
u64 ntime;
|
u64 ntime;
|
||||||
|
|
||||||
|
@ -2022,67 +1745,53 @@ bool spu_thread::process_mfc_cmd()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (g_use_rtm && !g_cfg.core.spu_accurate_getllar && raddr != addr) [[likely]]
|
for (u64 i = 0;; [&]()
|
||||||
{
|
{
|
||||||
// TODO: maybe always start from a transaction
|
if (ntime & 127 && g_use_rtm && !(state & cpu_flag::wait))
|
||||||
ntime = spu_getll_inexact(addr, dst.data());
|
|
||||||
}
|
|
||||||
else if (g_use_rtm)
|
|
||||||
{
|
|
||||||
ntime = spu_getll_tx(addr, dst.data());
|
|
||||||
|
|
||||||
if (ntime == 1)
|
|
||||||
{
|
{
|
||||||
if (!g_cfg.core.spu_accurate_getllar)
|
state += cpu_flag::wait;
|
||||||
{
|
|
||||||
ntime = spu_getll_inexact(addr, dst.data());
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
cpu_thread::suspend_all cpu_lock(this);
|
|
||||||
|
|
||||||
while (vm::reservation_acquire(addr, 128) & 127)
|
|
||||||
{
|
|
||||||
busy_wait(100);
|
|
||||||
}
|
|
||||||
|
|
||||||
ntime = vm::reservation_acquire(addr, 128);
|
|
||||||
mov_rdata(dst, data);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
auto& res = vm::reservation_lock(addr, 128);
|
|
||||||
const u64 old_time = res.load() & -128;
|
|
||||||
|
|
||||||
if (g_cfg.core.spu_accurate_getllar)
|
if (++i < 25) [[likely]]
|
||||||
{
|
{
|
||||||
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
|
busy_wait(300);
|
||||||
|
|
||||||
const auto render = get_rsx_if_needs_res_pause(addr);
|
|
||||||
|
|
||||||
if (render) render->pause();
|
|
||||||
|
|
||||||
const auto& super_data = *vm::get_super_ptr<decltype(rdata)>(addr);
|
|
||||||
{
|
|
||||||
// Full lock (heavyweight)
|
|
||||||
// TODO: vm::check_addr
|
|
||||||
vm::writer_lock lock(addr);
|
|
||||||
|
|
||||||
ntime = old_time;
|
|
||||||
mov_rdata(dst, super_data);
|
|
||||||
res.release(old_time);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (render) render->unpause();
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ntime = old_time;
|
std::this_thread::yield();
|
||||||
mov_rdata(dst, data);
|
|
||||||
res.release(old_time);
|
|
||||||
}
|
}
|
||||||
|
}())
|
||||||
|
{
|
||||||
|
ntime = vm::reservation_acquire(addr, 128);
|
||||||
|
|
||||||
|
if (ntime & 127)
|
||||||
|
{
|
||||||
|
// There's an on-going reservation store, wait
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
mov_rdata(dst, data);
|
||||||
|
|
||||||
|
if (u64 time0 = vm::reservation_acquire(addr, 128);
|
||||||
|
ntime != time0)
|
||||||
|
{
|
||||||
|
// Reservation data has been modified recently
|
||||||
|
if (time0 & 127) i += 12, ntime = time0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (g_cfg.core.spu_accurate_getllar && !cmp_rdata(dst, data))
|
||||||
|
{
|
||||||
|
i += 2;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (test_stopped())
|
||||||
|
{
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (raddr && raddr != addr)
|
if (raddr && raddr != addr)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue