Remove XABORT in PPU/SPU transactions.

It's expensive for unknown reason. Simply XEND is usually much cheaper.
Add some minor improvements. Use g_sudo_addr.
This commit is contained in:
Nekotekina 2020-10-20 08:22:25 +03:00
parent 182a998cb6
commit dc8252bb9f
5 changed files with 161 additions and 104 deletions

View file

@ -1340,7 +1340,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
// Prepare registers
c.mov(x86::rbx, imm_ptr(+vm::g_reservations));
c.mov(x86::rax, imm_ptr(&vm::g_base_addr));
c.mov(x86::rax, imm_ptr(&vm::g_sudo_addr));
c.mov(x86::rbp, x86::qword_ptr(x86::rax));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.and_(x86::rbp, -128);
@ -1350,8 +1350,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(x86::rbx, args[0]));
c.and_(x86::rbx, -128 / 2);
c.prefetchw(x86::byte_ptr(x86::rbx));
c.and_(args[0].r32(), 63);
c.xor_(x86::r12d, x86::r12d);
c.mov(x86::r12d, 1);
c.mov(x86::r13, args[1]);
c.bswap(args[3]);
@ -1376,7 +1377,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
}
// Begin transaction
Label tx0 = build_transaction_enter(c, fall, x86::r12, 4);
Label tx0 = build_transaction_enter(c, fall, x86::r12d, 4);
c.xbegin(tx0);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.test(x86::eax, 127);
@ -1423,22 +1424,46 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
// Update reservation
c.sub(x86::qword_ptr(x86::rbx), -128);
c.xend();
c.mov(x86::eax, 1);
c.mov(x86::eax, x86::r12d);
c.jmp(_ret);
// XABORT is expensive so finish with xend instead
c.bind(fail);
// Load old data (unused)
if (s_tsx_avx)
{
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
}
c.xend();
c.xor_(x86::eax, x86::eax);
c.jmp(_ret);
c.bind(skip);
c.xor_(x86::eax, x86::eax);
c.xor_(x86::r12d, x86::r12d);
build_transaction_abort(c, 0);
c.xend();
c.mov(x86::eax, _XABORT_EXPLICIT);
//c.jmp(fall);
c.bind(fall);
c.sar(x86::eax, 24);
c.js(fail);
// Touch memory if transaction failed without RETRY flag on the first attempt
c.cmp(x86::r12, 1);
c.jne(next);
// Touch memory if transaction failed with status 0
c.test(x86::eax, x86::eax);
c.jnz(next);
c.xor_(x86::rbp, 0xf80);
c.lock().add(x86::dword_ptr(x86::rbp), 0);
c.xor_(x86::rbp, 0xf80);
@ -1454,19 +1479,19 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
c.mov(x86::eax, 1);
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
c.test(x86::eax, vm::rsrv_unique_lock);
c.jnz(fail3);
c.jnz(fall2);
// Allow only first shared lock to proceed
c.cmp(x86::rax, x86::r13);
c.jne(fail2);
Label tx1 = build_transaction_enter(c, fall2, x86::r12, 666);
Label tx1 = build_transaction_enter(c, fall2, x86::r12d, 666);
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
c.jc(fail3);
c.jc(fall2);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
@ -1504,30 +1529,47 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
c.ptest(x86::xmm0, x86::xmm0);
}
c.jnz(fail2);
c.jnz(fail3);
// Store 8 bytes
c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]);
c.xend();
c.lock().add(x86::qword_ptr(x86::rbx), 127);
c.lea(x86::rax, x86::qword_ptr(x86::r12, 1));
c.mov(x86::eax, x86::r12d);
c.jmp(_ret);
c.bind(fall2);
c.sar(x86::eax, 24);
c.js(fail2);
// XABORT is expensive so try to finish with xend instead
c.bind(fail3);
// Load old data (unused)
if (s_tsx_avx)
{
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
}
c.xend();
c.jmp(fail2);
c.bind(fall2);
c.mov(x86::eax, -1);
c.jmp(_ret);
c.bind(fail);
build_transaction_abort(c, 0xff);
c.xor_(x86::eax, x86::eax);
c.jmp(_ret);
c.bind(fail2);
build_transaction_abort(c, 0xff);
c.lock().sub(x86::qword_ptr(x86::rbx), 1);
c.xor_(x86::eax, x86::eax);
//c.jmp(_ret);