From 9e4f43f4d1d92e4ef860b13ca7167a6a4a9f5a55 Mon Sep 17 00:00:00 2001 From: Whatcookie Date: Thu, 13 Aug 2020 10:00:56 -0400 Subject: [PATCH] SPU LLVM: Add icelake optimized paths for SHUFB (#8712) --- rpcs3/Emu/CPU/CPUTranslator.h | 15 +++++++++++++++ rpcs3/Emu/Cell/SPURecompiler.cpp | 24 ++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index f9f10610d4..60b273eedb 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -2785,6 +2785,21 @@ public: return result; } + /* Emits a call to the LLVM intrinsic x86_vgf2p8affineqb_128 (x86 GF2P8AFFINEQB, 128-bit form). a and b are evaluated through m_ir and passed as the first two operands; c is wrapped in llvm_const_int, evaluated, and passed as the third (immediate) operand. The call result is stored in result.value and returned. NOTE(review): the template parameter list and the type arguments of value_t and llvm_const_int look stripped in this copy of the patch - confirm against the upstream commit. */ template + value_t gf2p8affineqb(T1 a, T2 b, u8 c) + { + value_t result; + + const auto data0 = a.eval(m_ir); + const auto data1 = b.eval(m_ir); + + const auto immediate = (llvm_const_int{c}); + const auto imm8 = immediate.eval(m_ir); + + result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_vgf2p8affineqb_128), {data0, data1, imm8}); + return result; + } + template value_t vperm2b(T1 a, T2 b, T3 c) { diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index aed86e32c6..baa7a3eb51 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -7277,10 +7277,20 @@ public: { if (auto [ok, v1] = match_expr(b, byteswap(match())); ok) { - // Undo endian swapping, and rely on pshufb to re-reverse endianness - const auto x = avg(noncast(sext((c & 0xc0) == 0xc0)), noncast(sext((c & 0xe0) == 0xc0))); + // Undo endian swapping, and rely on pshufb/vperm2b to re-reverse endianness const auto as = byteswap(a); const auto bs = byteswap(b); + + /* Icelake path, taken when m_use_avx512_icl is set and either op.ra != op.rb or m_interp_magn is set. m is gf2p8affineqb applied to the build(0x02, 0x04, ...) constant below, c, and immediate 0x7f; mm keeps m only in lanes where noncast(m) is negative and is splat(0) elsewhere. The byteswapped sources as and bs go through a single vperm2b indexed by c; lanes where noncast(c) >= 0 take that lookup and the remaining lanes take mm. The branch returns early, so the avg/pshufb fallback after it is skipped. NOTE(review): mm presumably stands in for the constant term x that the fallback ORs into its result - verify. */ if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn)) + { + const auto m = gf2p8affineqb(build(0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04), c, 0x7f); + const auto mm = select(noncast(m) >= 0, splat(0), m); + const auto ab = vperm2b(as, bs, c); + set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); + return; + } + + const auto x = 
avg(noncast(sext((c & 0xc0) == 0xc0)), noncast(sext((c & 0xe0) == 0xc0))); const auto ax = pshufb(as, c); const auto bx = pshufb(bs, c); set_vr(op.rt4, select(noncast(c << 3) >= 0, ax, bx) | x); @@ -7319,6 +7329,16 @@ public: } } + /* Icelake path with the same guard as the byteswap variant in the previous hunk. m and mm are derived from the uninverted c exactly as there. The control is then inverted (cr = eval(~c)), vperm2b receives the sources in the order (b, a), and the final select is flipped accordingly: lanes where noncast(cr) >= 0 take mm, the other lanes take the vperm2b lookup. The branch returns early, so the avg/pshufb fallback after it is skipped. */ if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn)) + { + const auto m = gf2p8affineqb(build(0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04), c, 0x7f); + const auto mm = select(noncast(m) >= 0, splat(0), m); + const auto cr = eval(~c); + const auto ab = vperm2b(b, a, cr); + set_vr(op.rt4, select(noncast(cr) >= 0, mm, ab)); + return; + } + const auto x = avg(noncast(sext((c & 0xc0) == 0xc0)), noncast(sext((c & 0xe0) == 0xc0))); const auto cr = eval(c ^ 0xf); const auto ax = pshufb(a, cr);