SPU: remove SSSE3 dependency
This commit is contained in:
parent 61de20a633
commit 4aee4ed6d7

6 changed files with 639 additions and 113 deletions
@@ -6,6 +6,7 @@
 #include "SPUThread.h"
 #include "SPUInterpreter.h"
 #include "SPUASMJITRecompiler.h"
+#include "Utilities/sysinfo.h"

 #include <cmath>

@@ -20,7 +21,7 @@
 #define SPU_OFF_16(x, ...) asmjit::x86::word_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__))
 #define SPU_OFF_8(x, ...) asmjit::x86::byte_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__))

-const spu_decoder<spu_interpreter_fast> s_spu_interpreter; // TODO: remove
+extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast; // TODO: avoid
 const spu_decoder<spu_recompiler> s_spu_decoder;

 spu_recompiler::spu_recompiler()
@@ -101,6 +102,8 @@ void spu_recompiler::compile(spu_function_t& f)
 	this->qw1 = &qw1_var;
 	X86Gp qw2_var = compiler.newUInt64("qw2");
 	this->qw2 = &qw2_var;
+	X86Gp qw3_var = compiler.newUInt64("qw3");
+	this->qw3 = &qw3_var;

 	std::array<X86Xmm, 6> vec_vars;

@@ -351,7 +354,7 @@ void spu_recompiler::InterpreterCall(spu_opcode_t op)
 	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, u32(SPUThread*, u32, spu_inter_func_t)>(gate)), asmjit::FuncSignature3<u32, void*, u32, void*>(asmjit::CallConv::kIdHost));
 	call->setArg(0, *cpu);
 	call->setArg(1, asmjit::imm_u(op.opcode));
-	call->setArg(2, asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*>(s_spu_interpreter.decode(op.opcode))));
+	call->setArg(2, asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*>(g_spu_interpreter_fast.decode(op.opcode))));
 	call->setRet(0, *addr);

 	// return immediately if an error occured
@@ -1029,9 +1032,24 @@ void spu_recompiler::STQX(spu_opcode_t op)
 	c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
 	c->and_(*addr, 0x3fff0);

+	if (utils::has_ssse3())
+	{
 		const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
 		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
 		c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt);
+	}
+	else
+	{
+		c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0));
+		c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1));
+		c->bswap(*qw0);
+		c->bswap(*qw1);
+		c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 0), *qw1);
+		c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 8), *qw0);
+		c->unuse(*qw0);
+		c->unuse(*qw1);
+	}

 	c->unuse(*addr);
 }

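Note on the non-SSSE3 branch added to STQX (and to the other store paths further down): the pshufb byte reversal is replaced by two 64-bit bswap operations, with the halves stored swapped. A minimal scalar sketch of what that emitted sequence does; the helper name store_be128 and the byteswap macros are illustrative, not part of the commit:

	#include <cstdint>
	#include <cstring>

	#if defined(_MSC_VER)
	#include <stdlib.h>
	#define BSWAP64 _byteswap_uint64
	#else
	#define BSWAP64 __builtin_bswap64
	#endif

	// Store a 16-byte register image to SPU local storage in big-endian order
	// using only scalar 64-bit byte swaps (the shape of the non-SSSE3 store path).
	inline void store_be128(uint8_t* ls, const uint8_t* reg)
	{
	    uint64_t lo, hi;
	    std::memcpy(&lo, reg + 0, 8);  // reg._u64[0]
	    std::memcpy(&hi, reg + 8, 8);  // reg._u64[1]
	    lo = BSWAP64(lo);
	    hi = BSWAP64(hi);
	    std::memcpy(ls + 0, &hi, 8);   // halves are also swapped, matching the pshufb reversal
	    std::memcpy(ls + 8, &lo, 8);
	}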
@@ -1079,9 +1097,8 @@ void spu_recompiler::HBR(spu_opcode_t op)
 void spu_recompiler::GB(spu_opcode_t op)
 {
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
-	c->pshufb(va, XmmConst(_mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0)));
-	c->psllq(va, 7);
-	c->pmovmskb(*addr, va);
+	c->pslld(va, 31);
+	c->movmskps(*addr, va);
 	c->pxor(va, va);
 	c->pinsrw(va, *addr, 6);
 	c->movdqa(SPU_OFF_128(gpr, op.rt), va);
@@ -1091,8 +1108,8 @@ void spu_recompiler::GB(spu_opcode_t op)
 void spu_recompiler::GBH(spu_opcode_t op)
 {
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
-	c->pshufb(va, XmmConst(_mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0)));
-	c->psllq(va, 7);
+	c->psllw(va, 15);
+	c->packsswb(va, XmmConst(_mm_setzero_si128()));
 	c->pmovmskb(*addr, va);
 	c->pxor(va, va);
 	c->pinsrw(va, *addr, 6);
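Both gather instructions now avoid pshufb by moving each element's bit 0 into its sign bit and collecting the sign bits with a movemask. A standalone SSE2 sketch of the two idioms; the helper names are illustrative:

	#include <emmintrin.h> // SSE2
	#include <cstdint>

	// Gather the least significant bit of each 32-bit element (cf. SPU GB):
	// shift bit 0 into the sign bit, then read the four sign bits with movmskps.
	inline uint32_t gather_word_lsb(__m128i a)
	{
	    return static_cast<uint32_t>(_mm_movemask_ps(_mm_castsi128_ps(_mm_slli_epi32(a, 31))));
	}

	// Gather the least significant bit of each 16-bit element (cf. SPU GBH):
	// shift bit 0 into the halfword sign bit, narrow with signed saturation so
	// it becomes a byte sign bit, then read the eight bits with pmovmskb.
	inline uint32_t gather_halfword_lsb(__m128i a)
	{
	    return static_cast<uint32_t>(_mm_movemask_epi8(
	        _mm_packs_epi16(_mm_slli_epi16(a, 15), _mm_setzero_si128())));
	}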
@@ -1171,21 +1188,54 @@ void spu_recompiler::LQX(spu_opcode_t op)
 	c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
 	c->and_(*addr, 0x3fff0);

+	if (utils::has_ssse3())
+	{
 		const XmmLink& vt = XmmAlloc();
 		c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr));
 		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
 		c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
+	}
+	else
+	{
+		c->mov(*qw0, asmjit::x86::qword_ptr(*ls, *addr, 0, 0));
+		c->mov(*qw1, asmjit::x86::qword_ptr(*ls, *addr, 0, 8));
+		c->bswap(*qw0);
+		c->bswap(*qw1);
+		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1);
+		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0);
+		c->unuse(*qw0);
+		c->unuse(*qw1);
+	}

 	c->unuse(*addr);
 }

 void spu_recompiler::ROTQBYBI(spu_opcode_t op)
 {
+	auto body = [](u8* t, const u8* _a, u32 v) noexcept
+	{
+		const auto a = *(__m128i*)_a;
+		alignas(32) const __m128i buf[2]{a, a};
+		*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (v >> 3 & 0xf))));
+	};
+
+	if (!utils::has_ssse3())
+	{
+		c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
+		c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
+		c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
+		asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
+		call->setArg(0, *qw0);
+		call->setArg(1, *qw1);
+		call->setArg(2, *addr);
+		return;
+	}
+
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
 	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.rldq_pshufb));
 	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
 	c->and_(*addr, 0xf << 3);
-	c->shl(*addr, 1);
-	c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr));
+	c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr, 1));
 	c->movdqa(SPU_OFF_128(gpr, op.rt), va);
 	c->unuse(*addr);
 	c->unuse(*qw0);
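When SSSE3 is unavailable, ROTQBYBI (and the related byte-rotate/shift ops below) emit a call into the body lambda, which duplicates the 16-byte value into a double-sized buffer and performs a single unaligned load at the rotated offset. The same trick as a standalone SSE2 helper; the name is illustrative and lane numbering follows the host little-endian layout, while the recompiler applies it to byte-reversed register images:

	#include <emmintrin.h> // SSE2
	#include <cstdint>

	// Rotate the 16 byte lanes of a by n positions without pshufb: write the
	// value twice back to back, then reload 16 bytes starting 16 - n bytes in.
	// Lane i of the result receives lane (i - n) mod 16 of the input.
	inline __m128i rotate_lanes_sse2(__m128i a, uint32_t n)
	{
	    alignas(32) __m128i buf[2] = { a, a };
	    return _mm_loadu_si128(reinterpret_cast<const __m128i*>(
	        reinterpret_cast<const uint8_t*>(buf) + (16 - (n & 0xf))));
	}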
@@ -1193,14 +1243,30 @@ void spu_recompiler::ROTQBYBI(spu_opcode_t op)

 void spu_recompiler::ROTQMBYBI(spu_opcode_t op)
 {
+	auto body = [](u8* t, const u8* _a, u32 v) noexcept
+	{
+		const auto a = *(__m128i*)_a;
+		alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
+		*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (v >> 3 & 0x1f)));
+	};
+
+	if (!utils::has_ssse3())
+	{
+		c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
+		c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
+		c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
+		asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
+		call->setArg(0, *qw0);
+		call->setArg(1, *qw1);
+		call->setArg(2, *addr);
+		return;
+	}
+
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
 	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.srdq_pshufb));
 	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
-	c->shr(*addr, 3);
-	c->neg(*addr);
-	c->and_(*addr, 0x1f);
-	c->shl(*addr, 4);
-	c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr));
+	c->and_(*addr, 0x1f << 3);
+	c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr, 1));
 	c->movdqa(SPU_OFF_128(gpr, op.rt), va);
 	c->unuse(*addr);
 	c->unuse(*qw0);
@@ -1208,12 +1274,30 @@ void spu_recompiler::ROTQMBYBI(spu_opcode_t op)

 void spu_recompiler::SHLQBYBI(spu_opcode_t op)
 {
+	auto body = [](u8* t, const u8* _a, u32 v) noexcept
+	{
+		const auto a = *(__m128i*)_a;
+		alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
+		*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (v >> 3 & 0x1f))));
+	};
+
+	if (!utils::has_ssse3())
+	{
+		c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
+		c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
+		c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
+		asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
+		call->setArg(0, *qw0);
+		call->setArg(1, *qw1);
+		call->setArg(2, *addr);
+		return;
+	}
+
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
 	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.sldq_pshufb));
 	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
 	c->and_(*addr, 0x1f << 3);
-	c->shl(*addr, 1);
-	c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr));
+	c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr, 1));
 	c->movdqa(SPU_OFF_128(gpr, op.rt), va);
 	c->unuse(*addr);
 	c->unuse(*qw0);
@@ -1327,6 +1411,25 @@ void spu_recompiler::SHLQBI(spu_opcode_t op)

 void spu_recompiler::ROTQBY(spu_opcode_t op)
 {
+	auto body = [](u8* t, const u8* _a, u32 v) noexcept
+	{
+		const auto a = *(__m128i*)_a;
+		alignas(32) const __m128i buf[2]{a, a};
+		*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (v & 0xf))));
+	};
+
+	if (!utils::has_ssse3())
+	{
+		c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
+		c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
+		c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
+		asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
+		call->setArg(0, *qw0);
+		call->setArg(1, *qw1);
+		call->setArg(2, *addr);
+		return;
+	}
+
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
 	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.rldq_pshufb));
 	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
@@ -1340,10 +1443,28 @@ void spu_recompiler::ROTQBY(spu_opcode_t op)

 void spu_recompiler::ROTQMBY(spu_opcode_t op)
 {
+	auto body = [](u8* t, const u8* _a, u32 v) noexcept
+	{
+		const auto a = *(__m128i*)_a;
+		alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
+		*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (v & 0x1f)));
+	};
+
+	if (!utils::has_ssse3())
+	{
+		c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
+		c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
+		c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
+		asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
+		call->setArg(0, *qw0);
+		call->setArg(1, *qw1);
+		call->setArg(2, *addr);
+		return;
+	}
+
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
 	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.srdq_pshufb));
 	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
-	c->neg(*addr);
 	c->and_(*addr, 0x1f);
 	c->shl(*addr, 4);
 	c->pshufb(va, asmjit::x86::oword_ptr(*qw0, *addr));
@@ -1354,6 +1475,25 @@ void spu_recompiler::ROTQMBY(spu_opcode_t op)

 void spu_recompiler::SHLQBY(spu_opcode_t op)
 {
+	auto body = [](u8* t, const u8* _a, u32 v) noexcept
+	{
+		const auto a = *(__m128i*)_a;
+		alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
+		*(__m128i*)t = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (v & 0x1f))));
+	};
+
+	if (!utils::has_ssse3())
+	{
+		c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
+		c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
+		c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
+		asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, u32)>(body)), asmjit::FuncSignature3<void, void*, void*, u32>(asmjit::CallConv::kIdHost));
+		call->setArg(0, *qw0);
+		call->setArg(1, *qw1);
+		call->setArg(2, *addr);
+		return;
+	}
+
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
 	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.sldq_pshufb));
 	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
@@ -1523,7 +1663,27 @@ void spu_recompiler::ROTQBYI(spu_opcode_t op)
 {
 	const int s = op.i7 & 0xf;
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
+	const XmmLink& v2 = XmmAlloc();
+
+	if (s == 0)
+	{
+	}
+	else if (s == 4 || s == 8 || s == 12)
+	{
+		c->pshufd(va, va, ::rol8(0xE4, s / 2));
+	}
+	else if (utils::has_ssse3())
+	{
 		c->palignr(va, va, 16 - s);
+	}
+	else
+	{
+		c->movdqa(v2, va);
+		c->psrldq(va, 16 - s);
+		c->pslldq(v2, s);
+		c->por(va, v2);
+	}
+
 	c->movdqa(SPU_OFF_128(gpr, op.rt), va);
 }

@@ -1588,12 +1748,25 @@ void spu_recompiler::SUMB(spu_opcode_t op)
 {
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
 	const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
-	const XmmLink& vi = XmmAlloc();
-	c->movdqa(vi, XmmConst(_mm_set1_epi8(1)));
-	c->pmaddubsw(va, vi);
-	c->pmaddubsw(vb, vi);
-	c->phaddw(va, vb);
-	c->pshufb(va, XmmConst(_mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0)));
+	const XmmLink& v1 = XmmAlloc();
+	const XmmLink& v2 = XmmAlloc();
+	c->movdqa(v2, XmmConst(_mm_set1_epi16(0xff)));
+	c->movdqa(v1, va);
+	c->psrlw(va, 8);
+	c->pand(v1, v2);
+	c->pand(v2, vb);
+	c->psrlw(vb, 8);
+	c->paddw(va, v1);
+	c->paddw(vb, v2);
+	c->movdqa(v2, XmmConst(_mm_set1_epi32(0xffff)));
+	c->movdqa(v1, va);
+	c->psrld(va, 16);
+	c->pand(v1, v2);
+	c->pandn(v2, vb);
+	c->pslld(vb, 16);
+	c->paddw(va, v1);
+	c->paddw(vb, v2);
+	c->por(va, vb);
 	c->movdqa(SPU_OFF_128(gpr, op.rt), va);
 }

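SUMB no longer needs pmaddubsw/phaddw/pshufb; the new mask-and-add sequence computes, for every 32-bit element, the byte sum of rb in the upper halfword and the byte sum of ra in the lower halfword. A scalar reference of that per-word result; the function names are illustrative:

	#include <cstdint>

	// Byte sum of one 32-bit element (order-independent, so endianness-agnostic).
	inline uint32_t sum_bytes(uint32_t w)
	{
	    return (w & 0xff) + ((w >> 8) & 0xff) + ((w >> 16) & 0xff) + (w >> 24);
	}

	// Per-word result produced by the new SUMB lowering: rb's byte sum in the
	// upper halfword, ra's byte sum in the lower halfword.
	inline void sumb_reference(uint32_t rt[4], const uint32_t ra[4], const uint32_t rb[4])
	{
	    for (int i = 0; i < 4; i++)
	    {
	        rt[i] = (sum_bytes(rb[i]) << 16) | sum_bytes(ra[i]);
	    }
	}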
@@ -1657,16 +1830,24 @@ void spu_recompiler::CNTB(spu_opcode_t op)
 	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
 	const XmmLink& v1 = XmmAlloc();
 	const XmmLink& vm = XmmAlloc();
+	c->movdqa(vm, XmmConst(_mm_set1_epi8(0x55)));
 	c->movdqa(v1, va);
-	c->psrlq(v1, 4);
-	c->movdqa(vm, XmmConst(_mm_set1_epi8(0xf)));
 	c->pand(va, vm);
+	c->psrlq(v1, 1);
 	c->pand(v1, vm);
-	c->movdqa(vm, XmmConst(_mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0)));
-	c->pshufb(vm, va);
-	c->movdqa(va, XmmConst(_mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0)));
-	c->pshufb(va, v1);
-	c->paddb(va, vm);
+	c->paddb(va, v1);
+	c->movdqa(vm, XmmConst(_mm_set1_epi8(0x33)));
+	c->movdqa(v1, va);
+	c->pand(va, vm);
+	c->psrlq(v1, 2);
+	c->pand(v1, vm);
+	c->paddb(va, v1);
+	c->movdqa(vm, XmmConst(_mm_set1_epi8(0x0f)));
+	c->movdqa(v1, va);
+	c->pand(va, vm);
+	c->psrlq(v1, 4);
+	c->pand(v1, vm);
+	c->paddb(va, v1);
 	c->movdqa(SPU_OFF_128(gpr, op.rt), va);
 }

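CNTB drops the SSSE3 nibble-LUT (pshufb) approach in favour of the classic bit-parallel population count with the 0x55/0x33/0x0f masks, applied to all 16 bytes at once. The same three steps on a single byte, shown here only for reference (illustrative helper, not part of the commit):

	#include <cstdint>

	// Per-byte population count as computed by the new CNTB sequence.
	inline uint8_t popcount8(uint8_t x)
	{
	    x = (x & 0x55) + ((x >> 1) & 0x55); // sums of adjacent bit pairs
	    x = (x & 0x33) + ((x >> 2) & 0x33); // sums of adjacent 2-bit fields
	    x = (x & 0x0f) + ((x >> 4) & 0x0f); // final sum of the two nibbles
	    return x;
	}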
@@ -2318,11 +2499,25 @@ void spu_recompiler::BRZ(spu_opcode_t op)
 }

 void spu_recompiler::STQA(spu_opcode_t op)
+{
+	if (utils::has_ssse3())
 	{
 		const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
 		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
 		c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16)), vt);
 	}
+	else
+	{
+		c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0));
+		c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1));
+		c->bswap(*qw0);
+		c->bswap(*qw1);
+		c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 0), *qw1);
+		c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 8), *qw0);
+		c->unuse(*qw0);
+		c->unuse(*qw1);
+	}
+}

 void spu_recompiler::BRNZ(spu_opcode_t op)
 {
@@ -2400,11 +2595,25 @@ void spu_recompiler::BRHNZ(spu_opcode_t op)
 }

 void spu_recompiler::STQR(spu_opcode_t op)
+{
+	if (utils::has_ssse3())
 	{
 		const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
 		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
 		c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16)), vt);
 	}
+	else
+	{
+		c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0));
+		c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1));
+		c->bswap(*qw0);
+		c->bswap(*qw1);
+		c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 0), *qw1);
+		c->mov(asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 8), *qw0);
+		c->unuse(*qw0);
+		c->unuse(*qw1);
+	}
+}

 void spu_recompiler::BRA(spu_opcode_t op)
 {
@@ -2430,12 +2639,26 @@ void spu_recompiler::BRA(spu_opcode_t op)
 }

 void spu_recompiler::LQA(spu_opcode_t op)
+{
+	if (utils::has_ssse3())
 	{
 		const XmmLink& vt = XmmAlloc();
 		c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16)));
 		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
 		c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
 	}
+	else
+	{
+		c->mov(*qw0, asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 0));
+		c->mov(*qw1, asmjit::x86::qword_ptr(*ls, spu_ls_target(0, op.i16) + 8));
+		c->bswap(*qw0);
+		c->bswap(*qw1);
+		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1);
+		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0);
+		c->unuse(*qw0);
+		c->unuse(*qw1);
+	}
+}

 void spu_recompiler::BRASL(spu_opcode_t op)
 {
@@ -2515,12 +2738,26 @@ void spu_recompiler::BRSL(spu_opcode_t op)
 }

 void spu_recompiler::LQR(spu_opcode_t op)
+{
+	if (utils::has_ssse3())
 	{
 		const XmmLink& vt = XmmAlloc();
 		c->movdqa(vt, asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16)));
 		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
 		c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
 	}
+	else
+	{
+		c->mov(*qw0, asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 0));
+		c->mov(*qw1, asmjit::x86::qword_ptr(*ls, spu_ls_target(m_pos, op.i16) + 8));
+		c->bswap(*qw0);
+		c->bswap(*qw1);
+		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1);
+		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0);
+		c->unuse(*qw0);
+		c->unuse(*qw1);
+	}
+}

 void spu_recompiler::IL(spu_opcode_t op)
 {
@@ -2630,9 +2867,24 @@ void spu_recompiler::STQD(spu_opcode_t op)
 	if (op.si10) c->add(*addr, op.si10 << 4);
 	c->and_(*addr, 0x3fff0);

+	if (utils::has_ssse3())
+	{
 		const XmmLink& vt = XmmGet(op.rt, XmmType::Int);
 		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
 		c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), vt);
+	}
+	else
+	{
+		c->mov(*qw0, SPU_OFF_64(gpr, op.rt, &v128::_u64, 0));
+		c->mov(*qw1, SPU_OFF_64(gpr, op.rt, &v128::_u64, 1));
+		c->bswap(*qw0);
+		c->bswap(*qw1);
+		c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 0), *qw1);
+		c->mov(asmjit::x86::qword_ptr(*ls, *addr, 0, 8), *qw0);
+		c->unuse(*qw0);
+		c->unuse(*qw1);
+	}

 	c->unuse(*addr);
 }

@@ -2642,10 +2894,25 @@ void spu_recompiler::LQD(spu_opcode_t op)
 	if (op.si10) c->add(*addr, op.si10 << 4);
 	c->and_(*addr, 0x3fff0);

+	if (utils::has_ssse3())
+	{
 		const XmmLink& vt = XmmAlloc();
 		c->movdqa(vt, asmjit::x86::oword_ptr(*ls, *addr));
 		c->pshufb(vt, XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f)));
 		c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
+	}
+	else
+	{
+		c->mov(*qw0, asmjit::x86::qword_ptr(*ls, *addr, 0, 0));
+		c->mov(*qw1, asmjit::x86::qword_ptr(*ls, *addr, 0, 8));
+		c->bswap(*qw0);
+		c->bswap(*qw1);
+		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw1);
+		c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw0);
+		c->unuse(*qw0);
+		c->unuse(*qw1);
+	}

 	c->unuse(*addr);
 }

@@ -2814,6 +3081,61 @@ void spu_recompiler::SELB(spu_opcode_t op)

 void spu_recompiler::SHUFB(spu_opcode_t op)
 {
+	alignas(16) static thread_local u8 s_lut[256]
+	{
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+	};
+
+	auto body = [](u8* t, const u8* a, const u8* b, const u8* c) noexcept
+	{
+		__m128i _a = *(__m128i*)a;
+		__m128i _b = *(__m128i*)b;
+		_mm_store_si128((__m128i*)(s_lut + 0x00), _a);
+		_mm_store_si128((__m128i*)(s_lut + 0x10), _b);
+		_mm_store_si128((__m128i*)(s_lut + 0x20), _a);
+		_mm_store_si128((__m128i*)(s_lut + 0x30), _b);
+		_mm_store_si128((__m128i*)(s_lut + 0x40), _a);
+		_mm_store_si128((__m128i*)(s_lut + 0x50), _b);
+		_mm_store_si128((__m128i*)(s_lut + 0x60), _a);
+		_mm_store_si128((__m128i*)(s_lut + 0x70), _b);
+		v128 mask = v128::fromV(_mm_xor_si128(*(__m128i*)c, _mm_set1_epi8(0xf)));
+
+		for (int i = 0; i < 16; i++)
+		{
+			t[i] = s_lut[mask._u8[i]];
+		}
+	};
+
+	if (!utils::has_ssse3())
+	{
+		c->lea(*qw0, SPU_OFF_128(gpr, op.rt4));
+		c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
+		c->lea(*qw2, SPU_OFF_128(gpr, op.rb));
+		c->lea(*qw3, SPU_OFF_128(gpr, op.rc));
+		asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u8*, const u8*, const u8*, const u8*)>(body)), asmjit::FuncSignature4<void, void*, void*, void*, void*>(asmjit::CallConv::kIdHost));
+		call->setArg(0, *qw0);
+		call->setArg(1, *qw1);
+		call->setArg(2, *qw2);
+		call->setArg(3, *qw3);
+		return;
+	}
+
 	const XmmLink& v0 = XmmGet(op.rc, XmmType::Int); // v0 = mask
 	const XmmLink& v1 = XmmAlloc();
 	const XmmLink& v2 = XmmAlloc();
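The non-SSSE3 SHUFB path calls the body lambda, which tiles ra and rb across the first 128 bytes of a thread-local 256-byte table (so any control byte with the top bit clear picks a source byte) and keeps the SHUFB special constants 0x00/0xFF/0x80 in the upper half; the control bytes are XORed with 0x0f to undo the byte-reversed register storage. A simplified standalone sketch of the same table-driven shuffle (illustrative only, not the committed code):

	#include <cstdint>
	#include <cstring>

	// Table-driven byte shuffle in the spirit of the non-SSSE3 SHUFB fallback.
	// Here the control bytes are used directly; the real code also XORs them
	// with 0x0f to account for the reversed byte order of stored registers.
	inline void shufb_sketch(uint8_t t[16], const uint8_t a[16], const uint8_t b[16], const uint8_t ctrl[16])
	{
	    uint8_t lut[256];
	    for (int i = 0; i < 128; i += 32)
	    {
	        std::memcpy(lut + i, a, 16);       // even 16-byte blocks: ra
	        std::memcpy(lut + i + 16, b, 16);  // odd 16-byte blocks: rb
	    }
	    std::memset(lut + 0x80, 0x00, 0x40);   // control 10xxxxxx selects 0x00
	    std::memset(lut + 0xC0, 0xFF, 0x20);   // control 110xxxxx selects 0xFF
	    std::memset(lut + 0xE0, 0x80, 0x20);   // control 111xxxxx selects 0x80
	    for (int i = 0; i < 16; i++)
	    {
	        t[i] = lut[ctrl[i]];
	    }
	}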
@@ -37,6 +37,7 @@ private:
 	asmjit::X86Gp* qw0;
 	asmjit::X86Gp* qw1;
 	asmjit::X86Gp* qw2;
+	asmjit::X86Gp* qw3;
 	std::array<asmjit::X86Xmm*, 6> vec;

 	// labels:

@@ -8,6 +8,10 @@
 #include <cmath>
 #include <cfenv>

+#if !defined(_MSC_VER) && !defined(__SSSE3__)
+#define _mm_shuffle_epi8
+#endif
+
 // Compare 16 packed unsigned bytes (greater than)
 inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
 {
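A note on the guard above: on a GCC/Clang build without SSSE3 the empty object-like macro makes every _mm_shuffle_epi8 call site expand to a plain comma expression, so the SSSE3-only fast interpreter bodies still compile (reading the intent is an assumption here; the expansion itself is just preprocessing):

	// Without -mssse3 (and not MSVC), after preprocessing:
	//   spu.gpr[op.rt].vi = _mm_shuffle_epi8(x, y);
	// becomes
	//   spu.gpr[op.rt].vi = (x, y); // comma expression, yields y
	// The fast functions are never dispatched on such CPUs anyway, because the
	// decoder tables substitute the precise (SSE2) versions at start-up.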
@@ -398,12 +402,12 @@ void spu_interpreter::HBR(SPUThread& spu, spu_opcode_t op)

 void spu_interpreter::GB(SPUThread& spu, spu_opcode_t op)
 {
-	spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_slli_epi64(_mm_shuffle_epi8(spu.gpr[op.ra].vi, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0)), 7)));
+	spu.gpr[op.rt] = v128::from32r(_mm_movemask_ps(_mm_castsi128_ps(_mm_slli_epi32(spu.gpr[op.ra].vi, 31))));
 }

 void spu_interpreter::GBH(SPUThread& spu, spu_opcode_t op)
 {
-	spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_slli_epi64(_mm_shuffle_epi8(spu.gpr[op.ra].vi, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0)), 7)));
+	spu.gpr[op.rt] = v128::from32r(_mm_movemask_epi8(_mm_packs_epi16(_mm_slli_epi16(spu.gpr[op.ra].vi, 15), _mm_setzero_si128())));
 }

 void spu_interpreter::GBB(SPUThread& spu, spu_opcode_t op)
@@ -442,17 +446,38 @@ void spu_interpreter::LQX(SPUThread& spu, spu_opcode_t op)
 	spu.gpr[op.rt] = spu._ref<v128>((spu.gpr[op.ra]._u32[3] + spu.gpr[op.rb]._u32[3]) & 0x3fff0);
 }

-void spu_interpreter::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_precise::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
+{
+	const auto a = spu.gpr[op.ra].vi;
+	alignas(32) const __m128i buf[2]{a, a};
+	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (spu.gpr[op.rb]._u32[3] >> 3 & 0xf))));
+}
+
+void spu_interpreter_fast::ROTQBYBI(SPUThread& spu, spu_opcode_t op)
 {
 	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[spu.gpr[op.rb]._u32[3] >> 3 & 0xf].vi);
 }

-void spu_interpreter::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_precise::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
 {
-	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[-(spu.gpr[op.rb]._s32[3] >> 3) & 0x1f].vi);
+	const auto a = spu.gpr[op.ra].vi;
+	alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
+	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (spu.gpr[op.rb]._u32[3] >> 3 & 0x1f)));
 }

-void spu_interpreter::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_fast::ROTQMBYBI(SPUThread& spu, spu_opcode_t op)
+{
+	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[spu.gpr[op.rb]._s32[3] >> 3 & 0x1f].vi);
+}
+
+void spu_interpreter_precise::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
+{
+	const auto a = spu.gpr[op.ra].vi;
+	alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
+	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (spu.gpr[op.rb]._u32[3] >> 3 & 0x1f))));
+}
+
+void spu_interpreter_fast::SHLQBYBI(SPUThread& spu, spu_opcode_t op)
 {
 	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[spu.gpr[op.rb]._u32[3] >> 3 & 0x1f].vi);
 }
@@ -509,7 +534,7 @@ void spu_interpreter::ROTQBI(SPUThread& spu, spu_opcode_t op)
 {
 	const auto a = spu.gpr[op.ra].vi;
 	const s32 n = spu.gpr[op.rb]._s32[3] & 0x7;
-	spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_alignr_epi8(a, a, 8), 64 - n));
+	spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n));
 }

 void spu_interpreter::ROTQMBI(SPUThread& spu, spu_opcode_t op)
@@ -526,17 +551,38 @@ void spu_interpreter::SHLQBI(SPUThread& spu, spu_opcode_t op)
 	spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n));
 }

-void spu_interpreter::ROTQBY(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_precise::ROTQBY(SPUThread& spu, spu_opcode_t op)
+{
+	const auto a = spu.gpr[op.ra].vi;
+	alignas(32) const __m128i buf[2]{a, a};
+	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (spu.gpr[op.rb]._u32[3] & 0xf))));
+}
+
+void spu_interpreter_fast::ROTQBY(SPUThread& spu, spu_opcode_t op)
 {
 	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[spu.gpr[op.rb]._u32[3] & 0xf].vi);
 }

-void spu_interpreter::ROTQMBY(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_precise::ROTQMBY(SPUThread& spu, spu_opcode_t op)
 {
-	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[-spu.gpr[op.rb]._s32[3] & 0x1f].vi);
+	const auto a = spu.gpr[op.ra].vi;
+	alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
+	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (spu.gpr[op.rb]._u32[3] & 0x1f)));
 }

-void spu_interpreter::SHLQBY(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_fast::ROTQMBY(SPUThread& spu, spu_opcode_t op)
+{
+	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[spu.gpr[op.rb]._s32[3] & 0x1f].vi);
+}
+
+void spu_interpreter_precise::SHLQBY(SPUThread& spu, spu_opcode_t op)
+{
+	const auto a = spu.gpr[op.ra].vi;
+	alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
+	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (spu.gpr[op.rb]._u32[3] & 0x1f))));
+}
+
+void spu_interpreter_fast::SHLQBY(SPUThread& spu, spu_opcode_t op)
 {
 	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[spu.gpr[op.rb]._u32[3] & 0x1f].vi);
 }
@@ -598,7 +644,7 @@ void spu_interpreter::ROTQBII(SPUThread& spu, spu_opcode_t op)
 {
 	const auto a = spu.gpr[op.ra].vi;
 	const s32 n = op.i7 & 0x7;
-	spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_alignr_epi8(a, a, 8), 64 - n));
+	spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_shuffle_epi32(a, 0x4E), 64 - n));
 }

 void spu_interpreter::ROTQMBII(SPUThread& spu, spu_opcode_t op)
@@ -615,17 +661,38 @@ void spu_interpreter::SHLQBII(SPUThread& spu, spu_opcode_t op)
 	spu.gpr[op.rt].vi = _mm_or_si128(_mm_slli_epi64(a, n), _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - n));
 }

-void spu_interpreter::ROTQBYI(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_precise::ROTQBYI(SPUThread& spu, spu_opcode_t op)
+{
+	const auto a = spu.gpr[op.ra].vi;
+	alignas(32) const __m128i buf[2]{a, a};
+	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (16 - (op.i7 & 0xf))));
+}
+
+void spu_interpreter_fast::ROTQBYI(SPUThread& spu, spu_opcode_t op)
 {
 	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.rldq_pshufb[op.i7 & 0xf].vi);
 }

-void spu_interpreter::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_precise::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
 {
-	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[0-op.i7 & 0x1f].vi);
+	const auto a = spu.gpr[op.ra].vi;
+	alignas(64) const __m128i buf[3]{a, _mm_setzero_si128(), _mm_setzero_si128()};
+	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (op.i7 & 0x1f)));
 }

-void spu_interpreter::SHLQBYI(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_fast::ROTQMBYI(SPUThread& spu, spu_opcode_t op)
+{
+	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.srdq_pshufb[op.i7 & 0x1f].vi);
+}
+
+void spu_interpreter_precise::SHLQBYI(SPUThread& spu, spu_opcode_t op)
+{
+	const auto a = spu.gpr[op.ra].vi;
+	alignas(64) const __m128i buf[3]{_mm_setzero_si128(), _mm_setzero_si128(), a};
+	spu.gpr[op.rt].vi = _mm_loadu_si128((__m128i*)((u8*)buf + (32 - (op.i7 & 0x1f))));
+}
+
+void spu_interpreter_fast::SHLQBYI(SPUThread& spu, spu_opcode_t op)
 {
 	spu.gpr[op.rt].vi = _mm_shuffle_epi8(spu.gpr[op.ra].vi, g_spu_imm.sldq_pshufb[op.i7 & 0x1f].vi);
 }
@@ -661,10 +728,21 @@ void spu_interpreter::CGTB(SPUThread& spu, spu_opcode_t op)

 void spu_interpreter::SUMB(SPUThread& spu, spu_opcode_t op)
 {
-	const auto ones = _mm_set1_epi8(1);
-	const auto a = _mm_maddubs_epi16(spu.gpr[op.ra].vi, ones);
-	const auto b = _mm_maddubs_epi16(spu.gpr[op.rb].vi, ones);
-	spu.gpr[op.rt].vi = _mm_shuffle_epi8(_mm_hadd_epi16(a, b), _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0));
+	const auto m1 = _mm_set1_epi16(0xff);
+	const auto m2 = _mm_set1_epi32(0xffff);
+	const auto a = spu.gpr[op.ra].vi;
+	const auto b = spu.gpr[op.rb].vi;
+	const auto a1 = _mm_srli_epi16(a, 8);
+	const auto a2 = _mm_and_si128(a, m1);
+	const auto b1 = _mm_srli_epi16(b, 8);
+	const auto b2 = _mm_and_si128(b, m1);
+	const auto sa = _mm_add_epi16(a1, a2);
+	const auto sb = _mm_add_epi16(b1, b2);
+	const auto s2 = _mm_and_si128(sa, m2);
+	const auto s1 = _mm_srli_epi32(sa, 16);
+	const auto s4 = _mm_andnot_si128(m2, sb);
+	const auto s3 = _mm_slli_epi32(sb, 16);
+	spu.gpr[op.rt].vi = _mm_or_si128(_mm_add_epi16(s1, s2), _mm_add_epi16(s3, s4));
 }

 void spu_interpreter::HGT(SPUThread& spu, spu_opcode_t op)
@@ -696,10 +774,14 @@ void spu_interpreter::XSHW(SPUThread& spu, spu_opcode_t op)

 void spu_interpreter::CNTB(SPUThread& spu, spu_opcode_t op)
 {
-	const auto counts = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
-	const auto mask = _mm_set1_epi8(0xf);
 	const auto a = spu.gpr[op.ra].vi;
-	spu.gpr[op.rt].vi = _mm_add_epi8(_mm_shuffle_epi8(counts, _mm_and_si128(a, mask)), _mm_shuffle_epi8(counts, _mm_and_si128(_mm_srli_epi64(a, 4), mask)));
+	const auto mask1 = _mm_set1_epi8(0x55);
+	const auto sum1 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(a, 1), mask1), _mm_and_si128(a, mask1));
+	const auto mask2 = _mm_set1_epi8(0x33);
+	const auto sum2 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(sum1, 2), mask2), _mm_and_si128(sum1, mask2));
+	const auto mask3 = _mm_set1_epi8(0x0f);
+	const auto sum3 = _mm_add_epi8(_mm_and_si128(_mm_srli_epi64(sum2, 4), mask3), _mm_and_si128(sum2, mask3));
+	spu.gpr[op.rt].vi = sum3;
 }

 void spu_interpreter::XSBH(SPUThread& spu, spu_opcode_t op)
@@ -1354,7 +1436,49 @@ void spu_interpreter::SELB(SPUThread& spu, spu_opcode_t op)
 	spu.gpr[op.rt4] = (spu.gpr[op.rc] & spu.gpr[op.rb]) | v128::andnot(spu.gpr[op.rc], spu.gpr[op.ra]);
 }

-void spu_interpreter::SHUFB(SPUThread& spu, spu_opcode_t op)
+void spu_interpreter_precise::SHUFB(SPUThread& spu, spu_opcode_t op)
+{
+	alignas(16) static thread_local u8 s_lut[256]
+	{
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+		0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+	};
+
+	const auto _a = spu.gpr[op.ra].vi;
+	const auto _b = spu.gpr[op.rb].vi;
+	_mm_store_si128((__m128i*)(s_lut + 0x00), _a);
+	_mm_store_si128((__m128i*)(s_lut + 0x10), _b);
+	_mm_store_si128((__m128i*)(s_lut + 0x20), _a);
+	_mm_store_si128((__m128i*)(s_lut + 0x30), _b);
+	_mm_store_si128((__m128i*)(s_lut + 0x40), _a);
+	_mm_store_si128((__m128i*)(s_lut + 0x50), _b);
+	_mm_store_si128((__m128i*)(s_lut + 0x60), _a);
+	_mm_store_si128((__m128i*)(s_lut + 0x70), _b);
+	v128 mask = v128::fromV(_mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi8(0xf)));
+	auto& t = spu.gpr[op.rt4];
+
+	for (int i = 0; i < 16; i++)
+	{
+		t._u8[i] = s_lut[mask._u8[i]];
+	}
+}
+
+void spu_interpreter_fast::SHUFB(SPUThread& spu, spu_opcode_t op)
 {
 	const auto index = _mm_xor_si128(spu.gpr[op.rc].vi, _mm_set1_epi32(0x0f0f0f0f));
 	const auto res1 = _mm_shuffle_epi8(spu.gpr[op.ra].vi, index);
@@ -66,9 +66,6 @@ struct spu_interpreter
 	static void FSMH(SPUThread&, spu_opcode_t);
 	static void FSMB(SPUThread&, spu_opcode_t);
 	static void LQX(SPUThread&, spu_opcode_t);
-	static void ROTQBYBI(SPUThread&, spu_opcode_t);
-	static void ROTQMBYBI(SPUThread&, spu_opcode_t);
-	static void SHLQBYBI(SPUThread&, spu_opcode_t);
 	static void CBX(SPUThread&, spu_opcode_t);
 	static void CHX(SPUThread&, spu_opcode_t);
 	static void CWX(SPUThread&, spu_opcode_t);
@@ -76,9 +73,6 @@ struct spu_interpreter
 	static void ROTQBI(SPUThread&, spu_opcode_t);
 	static void ROTQMBI(SPUThread&, spu_opcode_t);
 	static void SHLQBI(SPUThread&, spu_opcode_t);
-	static void ROTQBY(SPUThread&, spu_opcode_t);
-	static void ROTQMBY(SPUThread&, spu_opcode_t);
-	static void SHLQBY(SPUThread&, spu_opcode_t);
 	static void ORX(SPUThread&, spu_opcode_t);
 	static void CBD(SPUThread&, spu_opcode_t);
 	static void CHD(SPUThread&, spu_opcode_t);
@@ -87,9 +81,6 @@ struct spu_interpreter
 	static void ROTQBII(SPUThread&, spu_opcode_t);
 	static void ROTQMBII(SPUThread&, spu_opcode_t);
 	static void SHLQBII(SPUThread&, spu_opcode_t);
-	static void ROTQBYI(SPUThread&, spu_opcode_t);
-	static void ROTQMBYI(SPUThread&, spu_opcode_t);
-	static void SHLQBYI(SPUThread&, spu_opcode_t);
 	static void NOP(SPUThread&, spu_opcode_t);
 	static void CGT(SPUThread&, spu_opcode_t);
 	static void XOR(SPUThread&, spu_opcode_t);
@@ -175,7 +166,6 @@ struct spu_interpreter
 	static void HBRR(SPUThread&, spu_opcode_t);
 	static void ILA(SPUThread&, spu_opcode_t);
 	static void SELB(SPUThread&, spu_opcode_t);
-	static void SHUFB(SPUThread&, spu_opcode_t);
 	static void MPYA(SPUThread&, spu_opcode_t);
 	static void DFCGT(SPUThread&, spu_opcode_t);
 	static void DFCMGT(SPUThread&, spu_opcode_t);
@@ -186,6 +176,17 @@ struct spu_interpreter

 struct spu_interpreter_fast final : spu_interpreter
 {
+	static void ROTQBYBI(SPUThread&, spu_opcode_t);
+	static void ROTQMBYBI(SPUThread&, spu_opcode_t);
+	static void SHLQBYBI(SPUThread&, spu_opcode_t);
+	static void ROTQBY(SPUThread&, spu_opcode_t);
+	static void ROTQMBY(SPUThread&, spu_opcode_t);
+	static void SHLQBY(SPUThread&, spu_opcode_t);
+	static void ROTQBYI(SPUThread&, spu_opcode_t);
+	static void ROTQMBYI(SPUThread&, spu_opcode_t);
+	static void SHLQBYI(SPUThread&, spu_opcode_t);
+	static void SHUFB(SPUThread&, spu_opcode_t);
+
 	static void FREST(SPUThread&, spu_opcode_t);
 	static void FRSQEST(SPUThread&, spu_opcode_t);
 	static void FCGT(SPUThread&, spu_opcode_t);
@@ -218,6 +219,17 @@ struct spu_interpreter_fast final : spu_interpreter

 struct spu_interpreter_precise final : spu_interpreter
 {
+	static void ROTQBYBI(SPUThread&, spu_opcode_t);
+	static void ROTQMBYBI(SPUThread&, spu_opcode_t);
+	static void SHLQBYBI(SPUThread&, spu_opcode_t);
+	static void ROTQBY(SPUThread&, spu_opcode_t);
+	static void ROTQMBY(SPUThread&, spu_opcode_t);
+	static void SHLQBY(SPUThread&, spu_opcode_t);
+	static void ROTQBYI(SPUThread&, spu_opcode_t);
+	static void ROTQMBYI(SPUThread&, spu_opcode_t);
+	static void SHLQBYI(SPUThread&, spu_opcode_t);
+	static void SHUFB(SPUThread&, spu_opcode_t);
+
 	static void FREST(SPUThread&, spu_opcode_t);
 	static void FRSQEST(SPUThread&, spu_opcode_t);
 	static void FCGT(SPUThread&, spu_opcode_t);
@@ -271,6 +271,12 @@ public:
 		}
 	}

+	template <typename F>
+	spu_decoder(F&& init) : spu_decoder()
+	{
+		init(m_table);
+	}
+
 	const std::array<T, 2048>& get_table() const
 	{
 		return m_table;
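The new constructor lets a global decoder table be post-processed by a callable right after it is filled, which is what the SPUThread.cpp changes below rely on. A minimal self-contained illustration of the pattern; the type D is hypothetical, not the real spu_decoder:

	#include <array>

	struct D
	{
	    std::array<void(*)(), 4> m_table{};

	    D() = default;

	    // Forward the freshly built table to a callable (runs once, at construction).
	    template <typename F>
	    D(F&& init) : D()
	    {
	        init(m_table);
	    }
	};

	// Usage: tweak selected entries at static-initialization time.
	const D g_decoder([](auto& table)
	{
	    (void)table; // e.g. swap in CPU-specific implementations here
	});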
@@ -25,6 +25,15 @@

 const bool s_use_rtm = utils::has_rtm();

+const bool s_use_ssse3 =
+#ifdef _MSC_VER
+	utils::has_ssse3();
+#elif __SSSE3__
+	true;
+#else
+	false;
+#endif
+
 #ifdef _MSC_VER
 bool operator ==(const u128& lhs, const u128& rhs)
 {
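s_use_ssse3 is resolved at compile time when the whole binary already targets SSSE3 (__SSSE3__ on GCC/Clang), queried at runtime on MSVC, and false otherwise. utils::has_ssse3() itself is not part of this diff; a typical implementation would be the CPUID leaf-1 ECX bit 9 test, sketched here only as an assumption:

	#include <cstdint>

	#ifdef _MSC_VER
	#include <intrin.h>
	#else
	#include <cpuid.h>
	#endif

	// Conventional SSSE3 runtime check: CPUID leaf 1, ECX bit 9.
	inline bool cpu_has_ssse3()
	{
	#ifdef _MSC_VER
	    int regs[4];
	    __cpuid(regs, 1);
	    return (regs[2] & (1 << 9)) != 0;
	#else
	    unsigned int eax, ebx, ecx, edx;
	    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
	        return false;
	    return (ecx & (1u << 9)) != 0;
	#endif
	}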
@@ -37,10 +46,60 @@ extern u64 get_system_time();

 extern thread_local u64 g_tls_fault_spu;

-const spu_decoder<spu_interpreter_precise> s_spu_interpreter_precise;
-const spu_decoder<spu_interpreter_fast> s_spu_interpreter_fast;
+// Table of identical interpreter functions when precise contains SSE2 version, and fast contains SSSE3 functions
+const std::pair<spu_inter_func_t, spu_inter_func_t> s_spu_dispatch_table[]
+{
+#define FUNC(x) {&spu_interpreter_precise::x, &spu_interpreter_fast::x}
+	FUNC(ROTQBYBI),
+	FUNC(ROTQMBYBI),
+	FUNC(SHLQBYBI),
+	FUNC(ROTQBY),
+	FUNC(ROTQMBY),
+	FUNC(SHLQBY),
+	FUNC(ROTQBYI),
+	FUNC(ROTQMBYI),
+	FUNC(SHLQBYI),
+	FUNC(SHUFB),
+#undef FUNC
+};

-std::atomic<u64> g_num_spu_threads = { 0ull };
+extern const spu_decoder<spu_interpreter_precise> g_spu_interpreter_precise([](auto& table)
+{
+	if (s_use_ssse3)
+	{
+		for (auto& func : table)
+		{
+			for (const auto& pair : s_spu_dispatch_table)
+			{
+				if (pair.first == func)
+				{
+					func = pair.second;
+					break;
+				}
+			}
+		}
+	}
+});
+
+extern const spu_decoder<spu_interpreter_fast> g_spu_interpreter_fast([](auto& table)
+{
+	if (!s_use_ssse3)
+	{
+		for (auto& func : table)
+		{
+			for (const auto& pair : s_spu_dispatch_table)
+			{
+				if (pair.second == func)
+				{
+					func = pair.first;
+					break;
+				}
+			}
+		}
+	}
+});
+
+std::atomic<u64> g_num_spu_threads{0ull};

 template <>
 void fmt_class_string<spu_decoder_type>::format(std::string& out, u64 arg)
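Both interpreter tables are now built through the init-callback constructor: the (precise, fast) pairs in s_spu_dispatch_table let the precise table upgrade to the SSSE3 implementations when the CPU supports them, and the fast table fall back to the SSE2 ones when it does not. The same patch-a-function-table pattern in generic form; names and signature are illustrative, not from the commit:

	#include <array>
	#include <cstddef>
	#include <utility>

	using func_t = void(*)();

	// Walk a table of function pointers and swap any entry that appears in a
	// (baseline, optimized) pair list, in whichever direction is requested.
	template <std::size_t N, std::size_t M>
	void patch_table(std::array<func_t, N>& table,
	                 const std::pair<func_t, func_t> (&pairs)[M],
	                 bool use_optimized)
	{
	    for (auto& func : table)
	    {
	        for (const auto& pair : pairs)
	        {
	            // pair.first = baseline (SSE2), pair.second = optimized (SSSE3)
	            if (func == (use_optimized ? pair.first : pair.second))
	            {
	                func = use_optimized ? pair.second : pair.first;
	                break;
	            }
	        }
	    }
	}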
@@ -200,9 +259,11 @@ spu_imm_table_t::spu_imm_table_t()

 	for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
 	{
+		const u32 im = (0u - i) & 0x1f;
+
 		for (u32 j = 0; j < 16; j++)
 		{
-			srdq_pshufb[i]._u8[j] = (j + i > 15) ? 0xff : static_cast<u8>(j + i);
+			srdq_pshufb[i]._u8[j] = (j + im > 15) ? 0xff : static_cast<u8>(j + im);
 		}
 	}

@@ -330,8 +391,8 @@ void SPUThread::cpu_task()

 	// Select opcode table
 	const auto& table = *(
-		g_cfg.core.spu_decoder == spu_decoder_type::precise ? &s_spu_interpreter_precise.get_table() :
-		g_cfg.core.spu_decoder == spu_decoder_type::fast ? &s_spu_interpreter_fast.get_table() :
+		g_cfg.core.spu_decoder == spu_decoder_type::precise ? &g_spu_interpreter_precise.get_table() :
+		g_cfg.core.spu_decoder == spu_decoder_type::fast ? &g_spu_interpreter_fast.get_table() :
 		(fmt::throw_exception<std::logic_error>("Invalid SPU decoder"), nullptr));

 	// LS base address