From a0bf103e8bb665b97afc0f598e5b3065dc87c036 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Thu, 28 Jun 2018 16:21:08 +0300
Subject: [PATCH] Implement cpu_translator::pshufb<>()

Remove spu_translator::pshufb<>()
Improve PSHUFB emulation (pre-SSSE3)
Emit static shufflevector for the constant mask
PPU: Inline VPERM instruction
---
 rpcs3/Emu/CPU/CPUTranslator.h    | 72 ++++++++++++++++++++++++++++++++
 rpcs3/Emu/Cell/PPUThread.cpp     |  4 +-
 rpcs3/Emu/Cell/PPUTranslator.cpp | 10 +++--
 rpcs3/Emu/Cell/PPUTranslator.h   |  2 +-
 rpcs3/Emu/Cell/SPURecompiler.cpp | 31 +--------------
 5 files changed, 83 insertions(+), 36 deletions(-)

diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index f2fb45f498..4780d392aa 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -903,6 +903,9 @@ protected:
 	// Endianness, affects vector element numbering (TODO)
 	bool m_is_be;
 
+	// Allow PSHUFB intrinsic
+	bool m_use_ssse3;
+
 	// IR builder
 	llvm::IRBuilder<>* m_ir;
 
@@ -1173,6 +1176,75 @@ public:
 		return result;
 	}
 
+	// Byte shuffle with PSHUFB semantics over 16 lanes: an index byte with its top bit set
+	// yields zero in that lane, otherwise its low 4 bits select a byte of `a`.
+	// A constant index is lowered to a static shufflevector; otherwise the SSSE3 intrinsic
+	// is called when m_use_ssse3 is set, or an IR loop emulates the instruction.
+	template
+	value_t pshufb(T1 a, T2 b)
+	{
+		value_t result;
+
+		const auto data0 = a.eval(m_ir);
+		const auto index = b.eval(m_ir);
+		const auto zeros = llvm::ConstantAggregateZero::get(get_type());
+
+		if (auto c = llvm::dyn_cast(index))
+		{
+			// Convert PSHUFB index back to LLVM vector shuffle mask
+			v128 mask{};
+
+			const auto cv = llvm::dyn_cast(c);
+
+			if (cv)
+			{
+				for (u32 i = 0; i < 16; i++)
+				{
+					const u64 b = cv->getElementAsInteger(i);
+					mask._u8[i] = b < 128 ? b % 16 : 16;
+				}
+			}
+
+			if (cv || llvm::isa(c))
+			{
+				result.value = llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef((const u8*)mask._bytes, 16));
+				result.value = m_ir->CreateZExt(result.value, get_type());
+				result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
+				return result;
+			}
+		}
+
+		if (m_use_ssse3)
+		{
+			result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_ssse3_pshuf_b_128), {data0, index});
+		}
+		else
+		{
+			// Emulate PSHUFB with a 16-iteration IR loop (TODO)
+			const auto mask = m_ir->CreateAnd(index, 0xf);
+			const auto loop = llvm::BasicBlock::Create(m_context, "", m_ir->GetInsertBlock()->getParent());
+			const auto next = llvm::BasicBlock::Create(m_context, "", m_ir->GetInsertBlock()->getParent());
+			const auto prev = m_ir->GetInsertBlock();
+
+			m_ir->CreateBr(loop);
+			m_ir->SetInsertPoint(loop);
+			const auto i = m_ir->CreatePHI(get_type(), 2);
+			const auto v = m_ir->CreatePHI(get_type(), 2);
+			i->addIncoming(m_ir->getInt32(0), prev);
+			i->addIncoming(m_ir->CreateAdd(i, m_ir->getInt32(1)), loop);
+			v->addIncoming(zeros, prev);
+			result.value = m_ir->CreateInsertElement(v, m_ir->CreateExtractElement(data0, m_ir->CreateExtractElement(mask, i)), i);
+			v->addIncoming(result.value, loop);
+			// i is the pre-increment lane index: loop back only while i < 15, so the body runs
+			// for lanes 0..15 and never extracts/inserts at the out-of-range lane 16
+			m_ir->CreateCondBr(m_ir->CreateICmpULT(i, m_ir->getInt32(15)), loop, next);
+			m_ir->SetInsertPoint(next);
+			result.value = m_ir->CreateSelect(m_ir->CreateICmpSLT(index, zeros), zeros, result.value);
+		}
+
+		return result;
+	}
+
 	template
 	R get_const_vector(llvm::Constant*, u32 a, u32 b);
 
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index b611e7fff7..45b1e38015 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -1302,7 +1302,7 @@ extern void ppu_initialize(const ppu_module& info)
 		{ "__stdcx", (u64)&ppu_stdcx },
 		{ "__vexptefp", (u64)&sse_exp2_ps },
 		{ "__vlogefp", (u64)&sse_log2_ps },
-		{ "__vperm", s_use_ssse3 ? (u64)&sse_altivec_vperm : (u64)&sse_altivec_vperm_v0 },
+		{ "__vperm", s_use_ssse3 ? (u64)&sse_altivec_vperm : (u64)&sse_altivec_vperm_v0 }, // Obsolete
 		{ "__lvsl", (u64)&sse_altivec_lvsl },
 		{ "__lvsr", (u64)&sse_altivec_lvsr },
 		{ "__lvlx", s_use_ssse3 ? (u64)&sse_cellbe_lvlx : (u64)&sse_cellbe_lvlx_v0 },
@@ -1685,7 +1685,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
 	module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
 
 	// Initialize translator
-	PPUTranslator translator(jit.get_context(), module.get(), module_part);
+	PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.has_ssse3());
 
 	// Define some types
 	const auto _void = Type::getVoidTy(jit.get_context());
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index f1148439fb..766adcfe92 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -11,13 +11,14 @@ using namespace llvm;
 
 const ppu_decoder s_ppu_decoder;
 
-PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info)
+PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, bool ssse3)
 	: cpu_translator(module, false)
 	, m_info(info)
 	, m_pure_attr(AttributeList::get(m_context, AttributeList::FunctionIndex, {Attribute::NoUnwind, Attribute::ReadNone}))
 {
 	// Bind context
 	m_context = context;
+	m_use_ssse3 = ssse3;
 
 	// There is no weak linkage on JIT, so let's create variables with different names for each module part
 	const u32 gsuffix = m_info.name.empty() ? info.funcs[0].addr : info.funcs[0].addr - m_info.segs[0].addr;
@@ -1193,8 +1194,11 @@ void PPUTranslator::VOR(ppu_opcode_t op)
 
 void PPUTranslator::VPERM(ppu_opcode_t op)
 {
-	const auto abc = GetVrs(VrType::vi8, op.va, op.vb, op.vc);
-	SetVr(op.vd, Call(GetType(), m_pure_attr, "__vperm", abc[0], abc[1], abc[2]));
+	const auto a = get_vr(op.va);
+	const auto b = get_vr(op.vb);
+	const auto c = get_vr(op.vc);
+	const auto i = eval(~c & 0x1f);
+	set_vr(op.vd, select(bitcast(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
 }
 
 void PPUTranslator::VPKPX(ppu_opcode_t op)
diff --git a/rpcs3/Emu/Cell/PPUTranslator.h b/rpcs3/Emu/Cell/PPUTranslator.h
index 1df4e5716b..bc428bb6ab 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.h
+++ b/rpcs3/Emu/Cell/PPUTranslator.h
@@ -313,7 +313,7 @@ public:
 	// Handle compilation errors
 	void CompilationError(const std::string& error);
 
-	PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info);
+	PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, bool ssse3);
 	~PPUTranslator();
 
 	// Get thread context struct type
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index b55fb7e90c..2b19860474 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -1892,36 +1892,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 		m_ir->CreateRetVoid();
 	}
 
-	template
-	value_t pshufb(T1 a, T2 b)
-	{
-		value_t result;
-
-		if (m_spurt->m_jit.has_ssse3())
-		{
-			result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::x86_ssse3_pshuf_b_128), {a.eval(m_ir), b.eval(m_ir)});
-		}
-		else
-		{
-			const auto data0 = a.eval(m_ir);
-			const auto index = b.eval(m_ir);
-			const auto mask = m_ir->CreateAnd(index, 0xf);
-			const auto zero = llvm::ConstantInt::get(get_type(), 0u);
-
-			result.value = zero;
-
-			for (u32 i = 0; i < 16; i++)
-			{
-				const auto x = m_ir->CreateExtractElement(data0, m_ir->CreateExtractElement(mask, i));
-				result.value = m_ir->CreateInsertElement(result.value, x, i);
-			}
-
-			result.value = m_ir->CreateSelect(m_ir->CreateICmpSLT(index, zero), zero, result.value);
-		}
-
-		return result;
-	}
-
 public:
 	spu_llvm_recompiler()
 		: spu_recompiler_base()
@@ -1942,6 +1912,7 @@ public:
 			m_cache = fxm::get();
 			m_spurt = fxm::get_always();
 			m_context = m_spurt->m_jit.get_context();
+			m_use_ssse3 = m_spurt->m_jit.has_ssse3();
 		}
 	}
 