LLVM: enable some JIT events (Intel, Perf)

Made some related adjustments.
Currently incomplete.
Nekotekina 2021-12-24 20:33:32 +03:00
parent 510041a873
commit d836033212
16 changed files with 233 additions and 162 deletions
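
The LLVM side of the change (actually registering the event listeners) is not part of the excerpt below, which only shows the hand-written SPU transaction helpers. For orientation, enabling Intel (VTune) and perf JIT events in LLVM normally comes down to attaching the corresponding JITEventListeners to the execution engine. The following is a minimal sketch, not the commit's code, assuming an MCJIT-style llvm::ExecutionEngine and an LLVM build configured with LLVM_USE_INTEL_JITEVENTS / LLVM_USE_PERF; the helper name register_jit_listeners is invented for illustration:

#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JITEventListener.h"

// Hypothetical sketch: attach the Intel (VTune) and perf JIT event listeners
// so external profilers can see LLVM's JIT-compiled functions.
static void register_jit_listeners(llvm::ExecutionEngine& engine)
{
	// Emits method-load events for Intel VTune; the factory returns nullptr
	// when LLVM was built without LLVM_USE_INTEL_JITEVENTS.
	if (auto* intel = llvm::JITEventListener::createIntelJITEventListener())
		engine.RegisterJITEventListener(intel);

#ifdef __linux__
	// Writes a jitdump file for Linux perf; the factory returns nullptr
	// when LLVM was built without LLVM_USE_PERF.
	if (auto* perf = llvm::JITEventListener::createPerfJITEventListener())
		engine.RegisterJITEventListener(perf);
#endif
}

On Linux the perf listener produces a jitdump file, so a recorded profile has to be post-processed with perf inject --jit before perf report can resolve the JIT symbols.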

@@ -405,7 +405,7 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
return res;
}
const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>("spu_putllc_tx", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@@ -420,12 +420,8 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.sub(x86::rsp, 168);
#ifdef _WIN32
c.sub(x86::rsp, 168);
if (s_tsx_avx)
{
c.vmovups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
@@ -447,16 +443,14 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
build_swap_rdx_with(c, args, x86::r10);
c.mov(args[1], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(args[1], x86::qword_ptr(args[1], args[0]));
c.prefetchw(x86::byte_ptr(args[1], 0));
c.prefetchw(x86::byte_ptr(args[1], 64));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.prefetchw(x86::byte_ptr(x86::rbx));
c.mov(x86::r13, args[1]);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
// Prepare data
if (s_tsx_avx)
@@ -504,8 +498,6 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
c.jae(fall);
});
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
@@ -514,10 +506,10 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
if (s_tsx_avx)
{
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(x86::rbp, 96));
c.vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(args[1], 0));
c.vxorps(x86::ymm1, x86::ymm1, x86::yword_ptr(args[1], 32));
c.vxorps(x86::ymm2, x86::ymm2, x86::yword_ptr(args[1], 64));
c.vxorps(x86::ymm3, x86::ymm3, x86::yword_ptr(args[1], 96));
c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
@@ -525,14 +517,14 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
}
else
{
c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
c.xorps(x86::xmm0, x86::oword_ptr(args[1], 0));
c.xorps(x86::xmm1, x86::oword_ptr(args[1], 16));
c.xorps(x86::xmm2, x86::oword_ptr(args[1], 32));
c.xorps(x86::xmm3, x86::oword_ptr(args[1], 48));
c.xorps(x86::xmm4, x86::oword_ptr(args[1], 64));
c.xorps(x86::xmm5, x86::oword_ptr(args[1], 80));
c.xorps(x86::xmm6, x86::oword_ptr(args[1], 96));
c.xorps(x86::xmm7, x86::oword_ptr(args[1], 112));
c.orps(x86::xmm0, x86::xmm1);
c.orps(x86::xmm2, x86::xmm3);
c.orps(x86::xmm4, x86::xmm5);
@@ -547,25 +539,25 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm4);
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm5);
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm6);
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm7);
c.vmovaps(x86::yword_ptr(args[1], 0), x86::ymm4);
c.vmovaps(x86::yword_ptr(args[1], 32), x86::ymm5);
c.vmovaps(x86::yword_ptr(args[1], 64), x86::ymm6);
c.vmovaps(x86::yword_ptr(args[1], 96), x86::ymm7);
}
else
{
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm8);
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm9);
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm10);
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm11);
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm12);
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm13);
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm14);
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm15);
c.movaps(x86::oword_ptr(args[1], 0), x86::xmm8);
c.movaps(x86::oword_ptr(args[1], 16), x86::xmm9);
c.movaps(x86::oword_ptr(args[1], 32), x86::xmm10);
c.movaps(x86::oword_ptr(args[1], 48), x86::xmm11);
c.movaps(x86::oword_ptr(args[1], 64), x86::xmm12);
c.movaps(x86::oword_ptr(args[1], 80), x86::xmm13);
c.movaps(x86::oword_ptr(args[1], 96), x86::xmm14);
c.movaps(x86::oword_ptr(args[1], 112), x86::xmm15);
}
c.xend();
c.lock().add(x86::qword_ptr(x86::rbx), 64);
c.lock().add(x86::qword_ptr(x86::r11), 64);
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
@@ -577,21 +569,21 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
// Load previous data to store back to rdata
if (s_tsx_avx)
{
c.vmovaps(x86::ymm0, x86::yword_ptr(x86::rbp, 0));
c.vmovaps(x86::ymm1, x86::yword_ptr(x86::rbp, 32));
c.vmovaps(x86::ymm2, x86::yword_ptr(x86::rbp, 64));
c.vmovaps(x86::ymm3, x86::yword_ptr(x86::rbp, 96));
c.vmovaps(x86::ymm0, x86::yword_ptr(args[1], 0));
c.vmovaps(x86::ymm1, x86::yword_ptr(args[1], 32));
c.vmovaps(x86::ymm2, x86::yword_ptr(args[1], 64));
c.vmovaps(x86::ymm3, x86::yword_ptr(args[1], 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
c.movaps(x86::xmm0, x86::oword_ptr(args[1], 0));
c.movaps(x86::xmm1, x86::oword_ptr(args[1], 16));
c.movaps(x86::xmm2, x86::oword_ptr(args[1], 32));
c.movaps(x86::xmm3, x86::oword_ptr(args[1], 48));
c.movaps(x86::xmm4, x86::oword_ptr(args[1], 64));
c.movaps(x86::xmm5, x86::oword_ptr(args[1], 80));
c.movaps(x86::xmm6, x86::oword_ptr(args[1], 96));
c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
}
c.xend();
@@ -603,7 +595,7 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
c.jmp(_ret);
c.bind(fail2);
c.lock().sub(x86::qword_ptr(x86::rbx), 64);
c.lock().sub(x86::qword_ptr(x86::r11), 64);
c.bind(load);
// Store previous data back to rdata
@@ -652,6 +644,7 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
c.movups(x86::xmm14, x86::oword_ptr(x86::rsp, 128));
c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144));
}
c.add(x86::rsp, 168);
#endif
if (s_tsx_avx)
@@ -659,15 +652,18 @@ const auto spu_putllc_tx = built_function<u64(*)(u32 raddr, u64 rtime, void* _ol
c.vzeroupper();
}
c.add(x86::rsp, 168);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
#ifdef __linux__
// Hack for perf profiling (TODO)
Label ret2 = c.newLabel();
c.lea(x86::rdx, x86::qword_ptr(ret2));
c.push(x86::rdx);
c.push(x86::rdx);
c.bind(ret2);
#endif
c.ret();
});
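
A second thread running through this diff: the asmjit blobs now receive explicit names ("spu_putllc_tx" and so on) via built_function, and each Linux epilogue gains a "Hack for perf profiling (TODO)" block that pushes the address of its own ret twice, so execution bounces through the blob's code before returning to the caller, presumably to help perf attribute samples to the region. For raw (non-LLVM) JIT code like this, the conventional way to give perf a symbol name is the legacy /tmp/perf-<pid>.map file, one "start size name" line per region. A rough sketch under that assumption; the helper name perf_map_announce is hypothetical and not part of this commit:

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <unistd.h>

// Hypothetical helper: append one entry to the legacy /tmp/perf-<pid>.map
// file so perf can name a hand-emitted JIT region.
static void perf_map_announce(const void* addr, std::size_t size, const char* name)
{
	// Lazily open the map file once for the whole process.
	static std::FILE* map = []
	{
		char path[64];
		std::snprintf(path, sizeof(path), "/tmp/perf-%d.map", static_cast<int>(getpid()));
		return std::fopen(path, "a");
	}();

	if (map)
	{
		// Line format understood by perf: "START SIZE NAME", hex fields.
		std::fprintf(map, "%" PRIxPTR " %zx %s\n", reinterpret_cast<std::uintptr_t>(addr), size, name);
		std::fflush(map);
	}
}

With entries like these in place, a plain perf record / perf report run can label samples landing in spu_putllc_tx and friends instead of showing anonymous addresses, while LLVM-generated code goes through the jitdump path sketched earlier.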
const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>("spu_putlluc_tx", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@@ -680,30 +676,20 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.sub(x86::rsp, 40);
#ifdef _WIN32
c.sub(x86::rsp, 40);
if (!s_tsx_avx)
{
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
}
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.prefetchw(x86::byte_ptr(x86::rbx));
c.mov(x86::r13, args[1]);
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
c.prefetchw(x86::byte_ptr(x86::r11, 0));
c.prefetchw(x86::byte_ptr(x86::r11, 64));
// Prepare data
if (s_tsx_avx)
@@ -725,6 +711,10 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
}
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(args[1], x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
build_get_tsc(c, stamp0);
@@ -739,35 +729,29 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
c.jae(fall);
});
c.prefetchw(x86::byte_ptr(x86::rbp, 0));
c.prefetchw(x86::byte_ptr(x86::rbp, 64));
// // Check pause flag
// c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
// c.jc(fall);
c.xbegin(tx1);
if (s_tsx_avx)
{
c.vmovaps(x86::yword_ptr(x86::rbp, 0), x86::ymm0);
c.vmovaps(x86::yword_ptr(x86::rbp, 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(x86::rbp, 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(x86::rbp, 96), x86::ymm3);
c.vmovaps(x86::yword_ptr(x86::r11, 0), x86::ymm0);
c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1);
c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2);
c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3);
}
else
{
c.movaps(x86::oword_ptr(x86::rbp, 0), x86::xmm0);
c.movaps(x86::oword_ptr(x86::rbp, 16), x86::xmm1);
c.movaps(x86::oword_ptr(x86::rbp, 32), x86::xmm2);
c.movaps(x86::oword_ptr(x86::rbp, 48), x86::xmm3);
c.movaps(x86::oword_ptr(x86::rbp, 64), x86::xmm4);
c.movaps(x86::oword_ptr(x86::rbp, 80), x86::xmm5);
c.movaps(x86::oword_ptr(x86::rbp, 96), x86::xmm6);
c.movaps(x86::oword_ptr(x86::rbp, 112), x86::xmm7);
c.movaps(x86::oword_ptr(x86::r11, 0), x86::xmm0);
c.movaps(x86::oword_ptr(x86::r11, 16), x86::xmm1);
c.movaps(x86::oword_ptr(x86::r11, 32), x86::xmm2);
c.movaps(x86::oword_ptr(x86::r11, 48), x86::xmm3);
c.movaps(x86::oword_ptr(x86::r11, 64), x86::xmm4);
c.movaps(x86::oword_ptr(x86::r11, 80), x86::xmm5);
c.movaps(x86::oword_ptr(x86::r11, 96), x86::xmm6);
c.movaps(x86::oword_ptr(x86::r11, 112), x86::xmm7);
}
c.xend();
c.lock().add(x86::qword_ptr(x86::rbx), 32);
c.lock().add(x86::qword_ptr(args[1]), 32);
// stx++
c.add(x86::qword_ptr(args[2]), 1);
build_get_tsc(c);
@@ -786,6 +770,7 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
}
c.add(x86::rsp, 40);
#endif
if (s_tsx_avx)
@@ -793,15 +778,18 @@ const auto spu_putlluc_tx = built_function<u64(*)(u32 raddr, const void* rdata,
c.vzeroupper();
}
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
#ifdef __linux__
// Hack for perf profiling (TODO)
Label ret2 = c.newLabel();
c.lea(x86::rdx, x86::qword_ptr(ret2));
c.push(x86::rdx);
c.push(x86::rdx);
c.bind(ret2);
#endif
c.ret();
});
const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>("spu_getllar_tx", [](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@@ -815,8 +803,6 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.sub(x86::rsp, 40);
#ifdef _WIN32
@@ -828,13 +814,12 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r12);
build_swap_rdx_with(c, args, x86::r10);
c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.lea(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
c.mov(x86::r13, args[1]);
c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
@@ -853,7 +838,7 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
c.jc(fall);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.mov(x86::rax, x86::qword_ptr(x86::r11));
c.and_(x86::rax, -128);
c.cmp(x86::rax, args[3]);
c.jne(fall);
@@ -926,9 +911,16 @@ const auto spu_getllar_tx = built_function<u64(*)(u32 raddr, void* rdata, cpu_th
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::rbp);
#ifdef __linux__
// Hack for perf profiling (TODO)
Label ret2 = c.newLabel();
c.lea(x86::rdx, x86::qword_ptr(ret2));
c.push(x86::rdx);
c.push(x86::rdx);
c.bind(ret2);
#endif
c.ret();
});