diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index dbae5a5c4e..7d498a7c6d 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -30,6 +30,7 @@ #endif #include "util/types.hpp" +#include "util/sysinfo.hpp" #include "Utilities/StrFmt.h" #include "Utilities/BitField.h" #include "Utilities/JIT.h" @@ -3442,6 +3443,11 @@ public: template value_t vperm2b(T1 a, T2 b, T3 c) { + if (!utils::has_fast_vperm2b()) + { + return vperm2b256to128(a, b, c); + } + value_t result; const auto data0 = a.eval(m_ir); diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index e525b0e2fa..8a337c19c4 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -1289,7 +1289,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op) if (m_use_avx512_icl) { const auto i = eval(~c); - set_vr(op.vd, vperm2b256to128(b, a, i)); + set_vr(op.vd, vperm2b(b, a, i)); return; } diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 72149fc591..2802babc95 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -8313,13 +8313,13 @@ public: { if (perm_only) { - set_vr(op.rt4, vperm2b256to128(as, bs, c)); + set_vr(op.rt4, vperm2b(as, bs, c)); return; } const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); const auto mm = select(noncast(m) >= 0, splat(0), m); - const auto ab = vperm2b256to128(as, bs, c); + const auto ab = vperm2b(as, bs, c); set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); return; } @@ -8371,18 +8371,18 @@ public: } } - if (m_use_avx512_icl && (op.ra != op.rb)) + if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn)) { if (perm_only) { - set_vr(op.rt4, vperm2b256to128(a, b, eval(c ^ 0xf))); + set_vr(op.rt4, vperm2b(a, b, eval(c ^ 0xf))); return; } const auto m = gf2p8affineqb(c, build(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f); const auto mm = select(noncast(m) >= 0, splat(0), m); const auto cr = eval(c ^ 0xf); - const auto ab = vperm2b256to128(a, b, cr); + const auto ab = vperm2b(a, b, cr); set_vr(op.rt4, select(noncast(c) >= 0, ab, mm)); return; } diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp index 49ec8e4237..8ee13a2aee 100755 --- a/rpcs3/util/sysinfo.cpp +++ b/rpcs3/util/sysinfo.cpp @@ -227,6 +227,19 @@ bool utils::has_fma4() #endif } +// The Zen4 based CPUs support VPERMI2B/VPERMT2B in a single uop. +// Current Intel CPUs (as of 2022) need 3 uops to execute these instructions. +// Check for SSE4A (CPUID 0x80000001 ECX bit 6, which Intel doesn't support; bit 5 is ABM/LZCNT) as well as VBMI. +bool utils::has_fast_vperm2b() +{ +#if defined(ARCH_X64) + static const bool g_value = has_avx512() && (get_cpuid(7, 0)[2] & 0x2) == 0x2 && get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x40) == 0x40; + return g_value; +#else + return false; +#endif +} + bool utils::has_erms() { #if defined(ARCH_X64) diff --git a/rpcs3/util/sysinfo.hpp b/rpcs3/util/sysinfo.hpp index 110b8bc4f7..23ebefd8e7 100755 --- a/rpcs3/util/sysinfo.hpp +++ b/rpcs3/util/sysinfo.hpp @@ -37,6 +37,8 @@ namespace utils bool has_fma4(); + bool has_fast_vperm2b(); + bool has_erms(); bool has_fsrm();