rpcs3/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
Zion c133db6721 Update asmjit submodule (Testers wanted!) (#2782)
* Update asmjit submodule

* Add myself as a contributor 	😉
2017-05-18 14:22:45 +03:00

2788 lines
75 KiB
C++

#include "stdafx.h"
#include "Utilities/Config.h"
#include "Emu/Memory/Memory.h"
#include "Emu/System.h"
#include "SPUDisAsm.h"
#include "SPUThread.h"
#include "SPUInterpreter.h"
#include "SPUASMJITRecompiler.h"
#include <cmath>
#define ASMJIT_STATIC
#define ASMJIT_DEBUG
#include "asmjit.h"
// Memory-operand helpers. Each one builds an asmjit memory operand of the stated width
// (oword/qword/dword/word/byte) whose base is the `cpu` register variable and whose
// displacement is offset32() of the named SPUThread member. Extra arguments are forwarded
// to offset32() (used in this file for array indices and sub-member pointers).
#define SPU_OFF_128(x, ...) asmjit::x86::oword_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__))
#define SPU_OFF_64(x, ...) asmjit::x86::qword_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__))
#define SPU_OFF_32(x, ...) asmjit::x86::dword_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__))
#define SPU_OFF_16(x, ...) asmjit::x86::word_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__))
#define SPU_OFF_8(x, ...) asmjit::x86::byte_ptr(*cpu, offset32(&SPUThread::x, ##__VA_ARGS__))
// Decoder for interpreter handlers; InterpreterCall() uses it to find the fallback routine for an opcode.
const spu_decoder<spu_interpreter_fast> s_spu_interpreter; // TODO: remove
// Decoder used by compile() to select the spu_recompiler member function for each opcode.
const spu_decoder<spu_recompiler> s_spu_decoder;
// Debug switch defined elsewhere; when set, this file writes JIT logs to SPUJIT.log.
extern cfg::bool_entry g_cfg_spu_debug;
// Creates the shared JIT runtime and, in debug mode, (re)creates SPUJIT.log with a short header.
spu_recompiler::spu_recompiler()
	: m_jit(std::make_shared<asmjit::JitRuntime>())
{
	LOG_SUCCESS(SPU, "SPU Recompiler (ASMJIT) created...");

	// Nothing more to do unless SPU debugging is enabled
	if (!g_cfg_spu_debug)
	{
		return;
	}

	// Truncate/create the log file first, then write the header identifying the running title
	fs::file log(Emu.GetCachePath() + "SPUJIT.log", fs::rewrite);
	const std::string header = fmt::format("SPU JIT initialization...\n\nTitle: %s\nTitle ID: %s\n\n", Emu.GetTitle().c_str(), Emu.GetTitleID().c_str());
	log.write(header);
}
// Translates the SPU function `f` into native code with ASMJIT and stores the entry point in f.compiled.
// - Serialized by m_mutex; returns immediately if f.compiled is already set.
// - Throws (fmt::throw_exception) if the function address/size, a block entry or a jump table entry
//   is out of range or not 4-byte aligned.
// - The generated function takes two pointers (bound to the `cpu` and `ls` variables) and returns
//   the u32 held in the `addr` variable.
// - When g_cfg_spu_debug is set, the SPU disassembly and the ASMJIT listing are appended to SPUJIT.log.
void spu_recompiler::compile(spu_function_t& f)
{
std::lock_guard<std::mutex> lock(m_mutex);
if (f.compiled)
{
// return if function already compiled
return;
}
if (f.addr >= 0x40000 || f.addr % 4 || f.size == 0 || f.size > 0x40000 - f.addr || f.size % 4)
{
fmt::throw_exception("Invalid SPU function (addr=0x%05x, size=0x%x)" HERE, f.addr, f.size);
}
using namespace asmjit;
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
// Bias the disassembler base so that it can be addressed with SPU addresses (f.addr maps to f.data[0])
dis_asm.offset = reinterpret_cast<u8*>(f.data.data()) - f.addr;
StringLogger logger;
logger.addOptions(Logger::kOptionBinaryForm);
std::string log;
if (g_cfg_spu_debug)
{
fmt::append(log, "========== SPU FUNCTION 0x%05x - 0x%05x ==========\n\n", f.addr, f.addr + f.size);
}
// The members assigned below (m_func, codeHolder, c, cpu, ls, addr, qw0..qw2, vec, labels, jt, end)
// point at locals of this call, so they are only meaningful while compile() is running
this->m_func = &f;
asmjit::CodeHolder code;
code.init(m_jit->getCodeInfo());
this->codeHolder = &code;
X86Compiler compiler(&code);
this->c = &compiler;
if (g_cfg_spu_debug)
{
// Set logger
codeHolder->setLogger(&logger);
}
compiler.addFunc(FuncSignature2<u32, void*, void*>(asmjit::CallConv::kIdHost));
// Initialize variables
X86Gp cpu_var = compiler.newIntPtr("cpu");
compiler.setArg(0, cpu_var);
compiler.alloc(cpu_var, asmjit::x86::rbp); // ASMJIT bug workaround
this->cpu = &cpu_var;
X86Gp ls_var = compiler.newIntPtr("ls");
compiler.setArg(1, ls_var);
compiler.alloc(ls_var, asmjit::x86::rbx); // ASMJIT bug workaround
this->ls = &ls_var;
X86Gp addr_var = compiler.newUInt32("addr");
this->addr = &addr_var;
X86Gp qw0_var = compiler.newUInt64("qw0");
this->qw0 = &qw0_var;
X86Gp qw1_var = compiler.newUInt64("qw1");
this->qw1 = &qw1_var;
X86Gp qw2_var = compiler.newUInt64("qw2");
this->qw2 = &qw2_var;
// Pool of xmm variables handed out by XmmAlloc(); vec[i] holds a pointer to each free slot
std::array<X86Xmm, 6> vec_vars;
for (u32 i = 0; i < vec_vars.size(); i++)
{
vec_vars[i] = compiler.newXmm(fmt::format("vec%d", i).c_str());
vec.at(i) = vec_vars.data() + i;
}
// Initialize labels
// (label slots are indexed by SPU address / 4)
std::vector<Label> pos_labels{ 0x10000 };
this->labels = pos_labels.data();
// Register labels for block entries
for (const u32 addr : f.blocks)
{
if (addr < f.addr || addr >= f.addr + f.size || addr % 4)
{
fmt::throw_exception("Invalid function block entry (0x%05x)" HERE, addr);
}
pos_labels[addr / 4] = compiler.newLabel();
}
// Register label for post-the-end address
pos_labels[(f.addr + f.size) / 4 % 0x10000] = compiler.newLabel();
// Register label for jump table resolver
Label jt_label = compiler.newLabel();
this->jt = &jt_label;
// Jump table entries are only validated here; the compare/branch chain is emitted after the function body
for (const u32 addr : f.jtable)
{
if (addr < f.addr || addr >= f.addr + f.size || addr % 4)
{
fmt::throw_exception("Invalid jump table entry (0x%05x)" HERE, addr);
}
}
// Register label for the function return
Label end_label = compiler.newLabel();
this->end = &end_label;
// Start compilation
m_pos = f.addr;
for (const u32 op : f.data)
{
// Bind label if initialized
if (pos_labels[m_pos / 4].isValid())
{
compiler.bind(pos_labels[m_pos / 4]);
if (f.blocks.find(m_pos) != f.blocks.end())
{
compiler.comment("Block:");
}
}
if (g_cfg_spu_debug)
{
// Disasm
dis_asm.dump_pc = m_pos;
dis_asm.disasm(m_pos);
compiler.comment(dis_asm.last_opcode.c_str());
log += dis_asm.last_opcode.c_str();
log += '\n';
}
// Recompiler function
(this->*s_spu_decoder.decode(op))({ op });
// Collect allocated xmm vars
// (a null vec[i] marks a slot that was taken while translating this instruction -
// presumably cleared by XmmLink; release the variable and make the slot available again)
for (u32 i = 0; i < vec_vars.size(); i++)
{
if (!vec[i])
{
compiler.unuse(vec_vars[i]);
vec[i] = vec_vars.data() + i;
}
}
// Set next position
m_pos += 4;
}
if (g_cfg_spu_debug)
{
log += '\n';
}
// Generate default function end (go to the next address)
compiler.bind(pos_labels[m_pos / 4 % 0x10000]);
compiler.comment("Fallthrough:");
compiler.mov(addr_var, spu_branch_target(m_pos));
compiler.jmp(end_label);
// Generate jump table resolver (uses addr_var)
compiler.bind(jt_label);
if (f.jtable.size())
{
compiler.comment("Jump table resolver:");
}
// Linear chain of compare/branch pairs; if nothing matches, execution falls into the function end below
for (const u32 addr : f.jtable)
{
if ((addr % 4) == 0 && addr < 0x40000 && pos_labels[addr / 4].isValid())
{
// It could be binary search or something
compiler.cmp(addr_var, addr);
compiler.je(pos_labels[addr / 4]);
}
else
{
LOG_ERROR(SPU, "Unable to add jump table entry (0x%05x)", addr);
}
}
// Generate function end (returns addr_var)
compiler.bind(end_label);
compiler.unuse(cpu_var);
compiler.unuse(ls_var);
compiler.ret(addr_var);
// Finalization
compiler.endFunc();
compiler.finalize();
// Compile and store function address
typedef u32 (*Func)(void* x, void* y);
Func fn;
m_jit->add(&fn, codeHolder);
f.compiled = asmjit::Internal::ptr_cast<decltype(f.compiled)>(fn);
if (g_cfg_spu_debug)
{
// Add ASMJIT logs
log += logger.getString();
log += "\n\n\n";
// Append log file
fs::file(Emu.GetCachePath() + "SPUJIT.log", fs::write + fs::append).write(log);
}
}
// Hands out the first free xmm slot from the pool; throws when every slot is already in use.
spu_recompiler::XmmLink spu_recompiler::XmmAlloc() // get empty xmm register
{
	for (auto slot = vec.begin(); slot != vec.end(); ++slot)
	{
		// A non-null entry is a slot that is still available
		if (*slot)
		{
			return{ *slot };
		}
	}

	fmt::throw_exception("Out of Xmm Vars" HERE);
}
// Allocates an xmm slot and emits an aligned 128-bit load of SPU register `reg` into it.
// `type` only selects the flavour of the move instruction (integer / single / double);
// any other value throws.
spu_recompiler::XmmLink spu_recompiler::XmmGet(s8 reg, XmmType type) // get xmm register with specific SPU reg
{
	XmmLink loaded = XmmAlloc();
	const auto source = SPU_OFF_128(gpr, reg);

	if (type == XmmType::Int)
	{
		c->movdqa(loaded, source);
	}
	else if (type == XmmType::Float)
	{
		c->movaps(loaded, source);
	}
	else if (type == XmmType::Double)
	{
		c->movapd(loaded, source);
	}
	else
	{
		fmt::throw_exception("Invalid XmmType" HERE);
	}

	return loaded;
}
// Registers a 128-bit constant with the compiler (local scope) and returns a memory operand for it.
inline asmjit::X86Mem spu_recompiler::XmmConst(v128 data)
{
	const auto packed = asmjit::Data128::fromU64(data._u64[0], data._u64[1]);
	return c->newXmmConst(asmjit::kConstScopeLocal, packed);
}
// Convenience overload: wraps a float vector in v128 and forwards to XmmConst(v128).
inline asmjit::X86Mem spu_recompiler::XmmConst(__m128 data)
{
	const v128 wrapped = v128::fromF(data);
	return XmmConst(wrapped);
}
// Convenience overload: wraps an integer vector in v128 and forwards to XmmConst(v128).
inline asmjit::X86Mem spu_recompiler::XmmConst(__m128i data)
{
	const v128 wrapped = v128::fromV(data);
	return XmmConst(wrapped);
}
// Emits a call that executes `op` through its interpreter handler instead of native code.
// Generated sequence: store m_pos to SPUThread::pc, call the gate below, then leave the
// compiled function (jump to `end`) whenever the gate returns a non-zero value.
void spu_recompiler::InterpreterCall(spu_opcode_t op)
{
// Host-side trampoline invoked from generated code.
// Returns 0 to continue with the next instruction; otherwise a flag combined with the current pc:
//   0x2000000 | pc - thread state check requested a stop, or the handler changed pc
//   0x1000000 | pc - the handler threw; the exception is saved in pending_exception
auto gate = [](SPUThread* _spu, u32 opcode, spu_inter_func_t _func) noexcept -> u32
{
try
{
// TODO: check correctness
const u32 old_pc = _spu->pc;
if (test(_spu->state) && _spu->check_state())
{
return 0x2000000 | _spu->pc;
}
_func(*_spu, { opcode });
if (old_pc != _spu->pc)
{
// pc was modified by the handler: advance past it and report the new position
_spu->pc += 4;
return 0x2000000 | _spu->pc;
}
_spu->pc += 4;
return 0;
}
catch (...)
{
_spu->pending_exception = std::current_exception();
return 0x1000000 | _spu->pc;
}
};
c->mov(SPU_OFF_32(pc), m_pos);
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, u32(SPUThread*, u32, spu_inter_func_t)>(gate)), asmjit::FuncSignature3<u32, void*, u32, void*>(asmjit::CallConv::kIdHost));
call->setArg(0, *cpu);
call->setArg(1, asmjit::imm_u(op.opcode));
call->setArg(2, asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*>(s_spu_interpreter.decode(op.opcode))));
call->setRet(0, *addr);
// return immediately if an error occurred
c->test(*addr, *addr);
c->jnz(*end);
c->unuse(*addr);
}
// Emits a call to a host-side gate that runs the SPU code at the current SPUThread::pc
// (set by the caller, see BISL) until it returns to the link address. The generated code
// leaves the compiled function (jump to `end`) when the gate returns a non-zero value.
void spu_recompiler::FunctionCall()
{
// Host-side trampoline invoked from generated code; `link` is the expected return address.
// Returns 0 once pc equals `link`; otherwise a flag combined with the current pc:
//   0x2000000 | pc - execution stopped (state check or cpu_flag::ret) before returning to `link`
//   0x1000000 | pc - an exception was thrown; it is saved in pending_exception
// recursion_level is incremented on entry and decremented on every exit path.
auto gate = [](SPUThread* _spu, u32 link) noexcept -> u32
{
_spu->recursion_level++;
try
{
// TODO: check correctness
// Bits 0x4000000 / 0x8000000 of pc carry interrupt enable / disable requests
// (BISL ORs op.e << 26 and op.d << 27 into pc); consume them here
if (_spu->pc & 0x4000000)
{
if (_spu->pc & 0x8000000)
{
fmt::throw_exception("Undefined behaviour" HERE);
}
_spu->set_interrupt_status(true);
_spu->pc &= ~0x4000000;
}
else if (_spu->pc & 0x8000000)
{
_spu->set_interrupt_status(false);
_spu->pc &= ~0x8000000;
}
if (_spu->pc == link)
{
LOG_ERROR(SPU, "Branch-to-next");
}
else if (_spu->pc == link - 4)
{
LOG_ERROR(SPU, "Branch-to-self");
}
while (!test(_spu->state) || !_spu->check_state())
{
// Proceed recursively
spu_recompiler_base::enter(*_spu);
if (test(_spu->state & cpu_flag::ret))
{
break;
}
if (_spu->pc == link)
{
_spu->recursion_level--;
return 0; // Successfully returned
}
}
_spu->recursion_level--;
return 0x2000000 | _spu->pc;
}
catch (...)
{
_spu->pending_exception = std::current_exception();
_spu->recursion_level--;
return 0x1000000 | _spu->pc;
}
};
asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, u32(SPUThread*, u32)>(gate)), asmjit::FuncSignature2<u32, SPUThread*, u32>(asmjit::CallConv::kIdHost));
call->setArg(0, *cpu);
call->setArg(1, asmjit::imm_u(spu_branch_target(m_pos + 4)));
call->setRet(0, *addr);
// return immediately if an error occurred
c->test(*addr, *addr);
c->jnz(*end);
c->unuse(*addr);
}
// STOP: no native translation yet; executed through the interpreter fallback.
void spu_recompiler::STOP(spu_opcode_t op)
{
InterpreterCall(op); // TODO
}
// LNOP: intentionally emits no code.
void spu_recompiler::LNOP(spu_opcode_t op)
{
}
// SYNC: translated to a single host memory fence.
void spu_recompiler::SYNC(spu_opcode_t op)
{
// This instruction must be used following a store instruction that modifies the instruction stream.
c->mfence();
}
// DSYNC: translated to a single host memory fence.
void spu_recompiler::DSYNC(spu_opcode_t op)
{
// This instruction forces all earlier load, store, and channel instructions to complete before proceeding.
c->mfence();
}
// MFSPR: executed through the interpreter fallback.
void spu_recompiler::MFSPR(spu_opcode_t op)
{
InterpreterCall(op);
}
// RDCH (read channel): three channels are read inline from SPUThread fields,
// everything else goes through the interpreter fallback.
void spu_recompiler::RDCH(spu_opcode_t op)
{
	// Emits: load a 32-bit field, shift it left by 12 bytes (so it lands in the top dword
	// of the vector with the rest zeroed) and store the result to rt
	const auto emit_field_read = [&](const asmjit::X86Mem& field)
	{
		const auto& value = XmmAlloc();
		c->movd(value, field);
		c->pslldq(value, 12);
		c->movdqa(SPU_OFF_128(gpr, op.rt), value);
	};

	switch (op.ra)
	{
	case SPU_RdSRR0:
		emit_field_read(SPU_OFF_32(srr0));
		return;

	case MFC_RdTagMask:
		emit_field_read(SPU_OFF_32(ch_tag_mask));
		return;

	case SPU_RdEventMask:
		emit_field_read(SPU_OFF_32(ch_event_mask));
		return;

	default:
		InterpreterCall(op); // TODO
	}
}
// RCHCNT: no native translation yet; executed through the interpreter fallback.
void spu_recompiler::RCHCNT(spu_opcode_t op)
{
InterpreterCall(op); // TODO
}
// SF (subtract from): rt = rb - ra on each 32-bit lane.
void spu_recompiler::SF(spu_opcode_t op)
{
	const auto& diff = XmmGet(op.rb, XmmType::Int);
	const auto subtrahend = SPU_OFF_128(gpr, op.ra);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->psubd(diff, subtrahend);
	c->movdqa(dst, diff);
}
// OR: rt = ra | rb (bitwise, full 128 bits).
void spu_recompiler::OR(spu_opcode_t op)
{
	const auto& bits = XmmGet(op.rb, XmmType::Int);
	const auto other = SPU_OFF_128(gpr, op.ra);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->por(bits, other);
	c->movdqa(dst, bits);
}
// BG: per 32-bit lane, rt = 0 when ra > rb (unsigned), otherwise 1.
void spu_recompiler::BG(spu_opcode_t op)
{
	const auto& lhs = XmmGet(op.ra, XmmType::Int);
	const auto& rhs = XmmAlloc();

	// Flip the sign bits of both operands so the signed compare below orders them as unsigned
	c->movdqa(rhs, XmmConst(_mm_set1_epi32(0x80000000)));
	c->pxor(lhs, rhs);
	c->pxor(rhs, SPU_OFF_128(gpr, op.rb));

	// lhs = (ra > rb) ? -1 : 0, then +1 turns that into 0 / 1
	c->pcmpgtd(lhs, rhs);
	c->paddd(lhs, XmmConst(_mm_set1_epi32(1)));

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, lhs);
}
// SFH (subtract from halfword): rt = rb - ra on each 16-bit lane.
void spu_recompiler::SFH(spu_opcode_t op)
{
	const auto& diff = XmmGet(op.rb, XmmType::Int);
	const auto subtrahend = SPU_OFF_128(gpr, op.ra);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->psubw(diff, subtrahend);
	c->movdqa(dst, diff);
}
// NOR: rt = ~(ra | rb).
void spu_recompiler::NOR(spu_opcode_t op)
{
	const auto& bits = XmmGet(op.ra, XmmType::Int);
	c->por(bits, SPU_OFF_128(gpr, op.rb));

	// XOR with all-ones performs the bitwise NOT
	const auto all_ones = XmmConst(_mm_set1_epi32(0xffffffff));
	c->pxor(bits, all_ones);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, bits);
}
// ABSDB: per byte, rt = max(ra, rb) - min(ra, rb) using unsigned comparisons.
void spu_recompiler::ABSDB(spu_opcode_t op)
{
	const auto& hi = XmmGet(op.ra, XmmType::Int);
	const auto& lo = XmmGet(op.rb, XmmType::Int);
	const auto& saved_a = XmmAlloc();

	// Keep a copy of ra because `hi` is overwritten by the max
	c->movdqa(saved_a, hi);
	c->pmaxub(hi, lo);
	c->pminub(lo, saved_a);
	c->psubb(hi, lo);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, hi);
}
// ROT: rotates each 32-bit lane of ra left by the matching lane of rb.
// Implemented as a call from generated code to a host helper that receives the
// addresses of rt, ra and rb inside the SPUThread.
void spu_recompiler::ROT(spu_opcode_t op)
{
	auto rotate_words = [](u32* t, const u32* a, const s32* b) noexcept
	{
		for (u32 lane = 0; lane < 4; lane++)
		{
			t[lane] = rol32(a[lane], b[lane]);
		}
	};

	// Register addresses are materialized in qw0..qw2 and passed as the three arguments
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));

	const auto helper = asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u32*, const u32*, const s32*)>(rotate_words));
	asmjit::CCFuncCall* call = c->call(helper, asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);

	// Disabled inline variant:
	//for (u32 i = 0; i < 4; i++) // unrolled loop
	//{
	// c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i));
	// c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, i));
	// c->rol(qw0->r32(), *addr);
	// c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32());
	//}
}
// ROTM: logical right shift of each 32-bit lane of ra by (0 - rb) modulo 64;
// counts of 32..63 produce zero. Implemented as a call from generated code to a host
// helper that receives the addresses of rt, ra and rb inside the SPUThread.
void spu_recompiler::ROTM(spu_opcode_t op)
{
	auto body = [](u32* t, const u32* a, const u32* b) noexcept
	{
		for (u32 i = 0; i < 4; i++)
		{
			// Reduce the count to 6 bits: shifting a 64-bit value by 64 or more is undefined
			// behaviour in C++, and the instruction only uses the count modulo 64
			const u32 count = (0 - b[i]) & 0x3f;
			t[i] = static_cast<u32>(static_cast<u64>(a[i]) >> count);
		}
	};
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));
	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u32*, const u32*, const u32*)>(body)), asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);
	//for (u32 i = 0; i < 4; i++) // unrolled loop
	//{
	// c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i));
	// c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, i));
	// c->neg(*addr);
	// c->shr(*qw0, *addr);
	// c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32());
	//}
}
// ROTMA: arithmetic right shift of each 32-bit lane of ra by (0 - rb) modulo 64;
// counts of 32..63 leave only copies of the sign bit. Implemented as a call from generated
// code to a host helper that receives the addresses of rt, ra and rb inside the SPUThread.
void spu_recompiler::ROTMA(spu_opcode_t op)
{
	auto body = [](s32* t, const s32* a, const u32* b) noexcept
	{
		for (u32 i = 0; i < 4; i++)
		{
			// Reduce the count to 6 bits: shifting a 64-bit value by 64 or more is undefined
			// behaviour in C++, and the instruction only uses the count modulo 64
			const u32 count = (0 - b[i]) & 0x3f;
			t[i] = static_cast<s32>(static_cast<s64>(a[i]) >> count);
		}
	};
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));
	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(s32*, const s32*, const u32*)>(body)), asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);
	//for (u32 i = 0; i < 4; i++) // unrolled loop
	//{
	// c->movsxd(*qw0, SPU_OFF_32(gpr, op.ra, &v128::_u32, i));
	// c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, i));
	// c->neg(*addr);
	// c->sar(*qw0, *addr);
	// c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32());
	//}
}
// SHL: left shift of each 32-bit lane of ra by rb modulo 64; counts of 32..63 produce zero.
// Implemented as a call from generated code to a host helper that receives the addresses
// of rt, ra and rb inside the SPUThread.
void spu_recompiler::SHL(spu_opcode_t op)
{
	auto body = [](u32* t, const u32* a, const u32* b) noexcept
	{
		for (u32 i = 0; i < 4; i++)
		{
			// Reduce the count to 6 bits: shifting a 64-bit value by 64 or more is undefined
			// behaviour in C++, and the instruction only uses the low 6 bits of the count
			const u32 count = b[i] & 0x3f;
			t[i] = static_cast<u32>(static_cast<u64>(a[i]) << count);
		}
	};
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));
	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u32*, const u32*, const u32*)>(body)), asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);
	//for (u32 i = 0; i < 4; i++) // unrolled loop
	//{
	// c->mov(qw0->r32(), SPU_OFF_32(gpr, op.ra, &v128::_u32, i));
	// c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, i));
	// c->shl(*qw0, *addr);
	// c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), qw0->r32());
	//}
}
// ROTH: rotates each 16-bit lane of ra left by the matching lane of rb.
// Implemented as a call from generated code to a host helper that receives the
// addresses of rt, ra and rb inside the SPUThread.
void spu_recompiler::ROTH(spu_opcode_t op) //nf
{
	auto rotate_halfwords = [](u16* t, const u16* a, const s16* b) noexcept
	{
		for (u32 lane = 0; lane < 8; lane++)
		{
			t[lane] = rol16(a[lane], b[lane]);
		}
	};

	// Register addresses are materialized in qw0..qw2 and passed as the three arguments
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));

	const auto helper = asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u16*, const u16*, const s16*)>(rotate_halfwords));
	asmjit::CCFuncCall* call = c->call(helper, asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);

	// Disabled inline variant:
	//for (u32 i = 0; i < 8; i++) // unrolled loop
	//{
	// c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i));
	// c->movzx(*addr, SPU_OFF_16(gpr, op.rb, &v128::_u16, i));
	// c->rol(qw0->r16(), *addr);
	// c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16());
	//}
}
// ROTHM: logical right shift of each 16-bit lane of ra by (0 - rb) modulo 32;
// counts of 16..31 produce zero. Implemented as a call from generated code to a host
// helper that receives the addresses of rt, ra and rb inside the SPUThread.
void spu_recompiler::ROTHM(spu_opcode_t op)
{
	auto body = [](u16* t, const u16* a, const u16* b) noexcept
	{
		for (u32 i = 0; i < 8; i++)
		{
			// Compute the count in unsigned arithmetic and reduce it to 5 bits: the original
			// `0 - b[i]` promoted to a negative int, and a negative shift count is undefined
			// behaviour in C++
			const u32 count = (0u - b[i]) & 0x1f;
			t[i] = static_cast<u16>(static_cast<u32>(a[i]) >> count);
		}
	};
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));
	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u16*, const u16*, const u16*)>(body)), asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);
	//for (u32 i = 0; i < 8; i++) // unrolled loop
	//{
	// c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i));
	// c->movzx(*addr, SPU_OFF_16(gpr, op.rb, &v128::_u16, i));
	// c->neg(*addr);
	// c->shr(qw0->r32(), *addr);
	// c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16());
	//}
}
// ROTMAH: arithmetic right shift of each 16-bit lane of ra by (0 - rb) modulo 32;
// counts of 16..31 leave only copies of the sign bit. Implemented as a call from generated
// code to a host helper that receives the addresses of rt, ra and rb inside the SPUThread.
void spu_recompiler::ROTMAH(spu_opcode_t op)
{
	auto body = [](s16* t, const s16* a, const u16* b) noexcept
	{
		for (u32 i = 0; i < 8; i++)
		{
			// Compute the count in unsigned arithmetic and reduce it to 5 bits: the original
			// `0 - b[i]` promoted to a negative int, and a negative shift count is undefined
			// behaviour in C++
			const u32 count = (0u - b[i]) & 0x1f;
			t[i] = static_cast<s16>(static_cast<s32>(a[i]) >> count);
		}
	};
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));
	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(s16*, const s16*, const u16*)>(body)), asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);
	//for (u32 i = 0; i < 8; i++) // unrolled loop
	//{
	// c->movsx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i));
	// c->movzx(*addr, SPU_OFF_16(gpr, op.rb, &v128::_u16, i));
	// c->neg(*addr);
	// c->sar(qw0->r32(), *addr);
	// c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16());
	//}
}
// SHLH: left shift of each 16-bit lane of ra by rb modulo 32; counts of 16..31 produce zero.
// Implemented as a call from generated code to a host helper that receives the addresses
// of rt, ra and rb inside the SPUThread.
void spu_recompiler::SHLH(spu_opcode_t op)
{
	auto body = [](u16* t, const u16* a, const u16* b) noexcept
	{
		for (u32 i = 0; i < 8; i++)
		{
			// Reduce the count to 5 bits: shifting a 32-bit value by 32 or more is undefined
			// behaviour in C++, and the instruction only uses the low 5 bits of the count
			const u32 count = b[i] & 0x1f;
			t[i] = static_cast<u16>(static_cast<u32>(a[i]) << count);
		}
	};
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));
	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u16*, const u16*, const u16*)>(body)), asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);
	//for (u32 i = 0; i < 8; i++) // unrolled loop
	//{
	// c->movzx(qw0->r32(), SPU_OFF_16(gpr, op.ra, &v128::_u16, i));
	// c->movzx(*addr, SPU_OFF_16(gpr, op.rb, &v128::_u16, i));
	// c->shl(qw0->r32(), *addr);
	// c->mov(SPU_OFF_16(gpr, op.rt, &v128::_u16, i), qw0->r16());
	//}
}
// ROTI: rotates each 32-bit lane of ra left by the immediate (low 5 bits of i7).
void spu_recompiler::ROTI(spu_opcode_t op)
{
	const int amount = op.i7 & 0x1f;
	const auto& left = XmmGet(op.ra, XmmType::Int);
	const auto& right = XmmAlloc();

	// rotl(x, n) = (x << n) | (x >> (32 - n)); for n == 0 the right part shifts out entirely
	c->movdqa(right, left);
	c->pslld(left, amount);
	c->psrld(right, 32 - amount);
	c->por(left, right);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, left);
}
// ROTMI: logical right shift of each 32-bit lane of ra by (0 - i7) & 0x3f.
void spu_recompiler::ROTMI(spu_opcode_t op)
{
	const int amount = (0 - op.i7) & 0x3f;
	const auto& value = XmmGet(op.ra, XmmType::Int);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->psrld(value, amount);
	c->movdqa(dst, value);
}
// ROTMAI: arithmetic right shift of each 32-bit lane of ra by (0 - i7) & 0x3f.
void spu_recompiler::ROTMAI(spu_opcode_t op)
{
	const int amount = (0 - op.i7) & 0x3f;
	const auto& value = XmmGet(op.ra, XmmType::Int);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->psrad(value, amount);
	c->movdqa(dst, value);
}
// SHLI: left shift of each 32-bit lane of ra by i7 & 0x3f.
void spu_recompiler::SHLI(spu_opcode_t op)
{
	const int amount = op.i7 & 0x3f;
	const auto& value = XmmGet(op.ra, XmmType::Int);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->pslld(value, amount);
	c->movdqa(dst, value);
}
// ROTHI: rotates each 16-bit lane of ra left by the immediate (low 4 bits of i7).
void spu_recompiler::ROTHI(spu_opcode_t op)
{
	const int amount = op.i7 & 0xf;
	const auto& left = XmmGet(op.ra, XmmType::Int);
	const auto& right = XmmAlloc();

	// rotl(x, n) = (x << n) | (x >> (16 - n)); for n == 0 the right part shifts out entirely
	c->movdqa(right, left);
	c->psllw(left, amount);
	c->psrlw(right, 16 - amount);
	c->por(left, right);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, left);
}
// ROTHMI: logical right shift of each 16-bit lane of ra by (0 - i7) & 0x1f.
void spu_recompiler::ROTHMI(spu_opcode_t op)
{
	const int amount = (0 - op.i7) & 0x1f;
	const auto& value = XmmGet(op.ra, XmmType::Int);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->psrlw(value, amount);
	c->movdqa(dst, value);
}
// ROTMAHI: arithmetic right shift of each 16-bit lane of ra by (0 - i7) & 0x1f.
void spu_recompiler::ROTMAHI(spu_opcode_t op)
{
	const int amount = (0 - op.i7) & 0x1f;
	const auto& value = XmmGet(op.ra, XmmType::Int);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->psraw(value, amount);
	c->movdqa(dst, value);
}
// SHLHI: left shift of each 16-bit lane of ra by i7 & 0x1f.
void spu_recompiler::SHLHI(spu_opcode_t op)
{
	const int amount = op.i7 & 0x1f;
	const auto& value = XmmGet(op.ra, XmmType::Int);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->psllw(value, amount);
	c->movdqa(dst, value);
}
// A (add word): rt = ra + rb on each 32-bit lane.
void spu_recompiler::A(spu_opcode_t op)
{
	const auto& sum = XmmGet(op.rb, XmmType::Int);
	const auto addend = SPU_OFF_128(gpr, op.ra);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->paddd(sum, addend);
	c->movdqa(dst, sum);
}
// AND: rt = ra & rb (bitwise, full 128 bits).
void spu_recompiler::AND(spu_opcode_t op)
{
	const auto& bits = XmmGet(op.rb, XmmType::Int);
	const auto other = SPU_OFF_128(gpr, op.ra);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->pand(bits, other);
	c->movdqa(dst, bits);
}
// CG: per 32-bit lane, rt = 1 when ra + rb wraps around (unsigned carry out), else 0.
void spu_recompiler::CG(spu_opcode_t op)
{
	const auto& lhs = XmmGet(op.ra, XmmType::Int);
	const auto& sum = XmmGet(op.rb, XmmType::Int);
	const auto& sign = XmmAlloc();

	c->movdqa(sign, XmmConst(_mm_set1_epi32(0x80000000)));
	c->paddd(sum, lhs);

	// A carry occurred iff ra > (ra + rb) as unsigned; flipping the sign bits lets the
	// signed compare perform that unsigned comparison
	c->pxor(lhs, sign);
	c->pxor(sum, sign);
	c->pcmpgtd(lhs, sum);

	// Reduce the all-ones compare mask to the value 1
	c->psrld(lhs, 31);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, lhs);
}
// AH (add halfword): rt = ra + rb on each 16-bit lane.
void spu_recompiler::AH(spu_opcode_t op)
{
	const auto& sum = XmmGet(op.ra, XmmType::Int);
	const auto addend = SPU_OFF_128(gpr, op.rb);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->paddw(sum, addend);
	c->movdqa(dst, sum);
}
// NAND: rt = ~(ra & rb).
void spu_recompiler::NAND(spu_opcode_t op)
{
	const auto& bits = XmmGet(op.ra, XmmType::Int);
	c->pand(bits, SPU_OFF_128(gpr, op.rb));

	// XOR with all-ones performs the bitwise NOT
	const auto all_ones = XmmConst(_mm_set1_epi32(0xffffffff));
	c->pxor(bits, all_ones);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, bits);
}
// AVGB: per-byte unsigned average of ra and rb, emitted as a single pavgb.
void spu_recompiler::AVGB(spu_opcode_t op)
{
	const auto& avg = XmmGet(op.rb, XmmType::Int);
	const auto other = SPU_OFF_128(gpr, op.ra);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->pavgb(avg, other);
	c->movdqa(dst, avg);
}
// MTSPR: executed through the interpreter fallback.
void spu_recompiler::MTSPR(spu_opcode_t op)
{
InterpreterCall(op);
}
// WRCH (write channel): a handful of channels are written inline by copying word 3 of rt
// into the matching SPUThread field; channel 69 emits nothing; everything else goes
// through the interpreter fallback.
void spu_recompiler::WRCH(spu_opcode_t op)
{
	// Emits the load of the value being written (word 3 of rt) into the addr variable
	const auto load_value = [&]
	{
		c->mov(*addr, SPU_OFF_32(gpr, op.rt, &v128::_u32, 3));
	};

	switch (op.ra)
	{
	case SPU_WrSRR0:
		load_value();
		c->mov(SPU_OFF_32(srr0), *addr);
		c->unuse(*addr);
		return;

	case MFC_WrTagMask:
		load_value();
		c->mov(SPU_OFF_32(ch_tag_mask), *addr);
		c->unuse(*addr);
		return;

	case MFC_LSA:
		load_value();
		c->mov(SPU_OFF_32(ch_mfc_cmd, &spu_mfc_cmd::lsa), *addr);
		c->unuse(*addr);
		return;

	case MFC_EAH:
		load_value();
		c->mov(SPU_OFF_32(ch_mfc_cmd, &spu_mfc_cmd::eah), *addr);
		c->unuse(*addr);
		return;

	case MFC_EAL:
		load_value();
		c->mov(SPU_OFF_32(ch_mfc_cmd, &spu_mfc_cmd::eal), *addr);
		c->unuse(*addr);
		return;

	case MFC_Size:
		// Only the low 16 bits are stored
		load_value();
		c->mov(SPU_OFF_16(ch_mfc_cmd, &spu_mfc_cmd::size), addr->r16());
		c->unuse(*addr);
		return;

	case MFC_TagID:
		// Only the low 8 bits are stored
		load_value();
		c->mov(SPU_OFF_8(ch_mfc_cmd, &spu_mfc_cmd::tag), addr->r8());
		c->unuse(*addr);
		return;

	case 69:
		// Writes to this channel are ignored
		return;

	default:
		InterpreterCall(op); // TODO
	}
}
// BIZ: jumps to the jump table resolver with the target taken from word 3 of ra
// when word 3 of rt is zero.
void spu_recompiler::BIZ(spu_opcode_t op)
{
	const auto flags = op.e << 26 | op.d << 27;

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->and_(*addr, 0x3fffc);

	if (flags)
	{
		c->or_(*addr, flags); // interrupt flags neutralize jump table
	}

	const auto condition = SPU_OFF_32(gpr, op.rt, &v128::_u32, 3);
	c->cmp(condition, 0);
	c->je(*jt);
	c->unuse(*addr);
}
// BINZ: jumps to the jump table resolver with the target taken from word 3 of ra
// when word 3 of rt is non-zero.
void spu_recompiler::BINZ(spu_opcode_t op)
{
	const auto flags = op.e << 26 | op.d << 27;

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->and_(*addr, 0x3fffc);

	if (flags)
	{
		c->or_(*addr, flags); // interrupt flags neutralize jump table
	}

	const auto condition = SPU_OFF_32(gpr, op.rt, &v128::_u32, 3);
	c->cmp(condition, 0);
	c->jne(*jt);
	c->unuse(*addr);
}
// BIHZ: jumps to the jump table resolver with the target taken from word 3 of ra
// when halfword 6 of rt is zero.
void spu_recompiler::BIHZ(spu_opcode_t op)
{
	const auto flags = op.e << 26 | op.d << 27;

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->and_(*addr, 0x3fffc);

	if (flags)
	{
		c->or_(*addr, flags); // interrupt flags neutralize jump table
	}

	const auto condition = SPU_OFF_16(gpr, op.rt, &v128::_u16, 6);
	c->cmp(condition, 0);
	c->je(*jt);
	c->unuse(*addr);
}
// BIHNZ: jumps to the jump table resolver with the target taken from word 3 of ra
// when halfword 6 of rt is non-zero.
void spu_recompiler::BIHNZ(spu_opcode_t op)
{
	const auto flags = op.e << 26 | op.d << 27;

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->and_(*addr, 0x3fffc);

	if (flags)
	{
		c->or_(*addr, flags); // interrupt flags neutralize jump table
	}

	const auto condition = SPU_OFF_16(gpr, op.rt, &v128::_u16, 6);
	c->cmp(condition, 0);
	c->jne(*jt);
	c->unuse(*addr);
}
// STOPD: executed through the interpreter fallback.
void spu_recompiler::STOPD(spu_opcode_t op)
{
InterpreterCall(op);
}
// STQX: stores rt to local storage at (ra + rb) & 0x3fff0 (word 3 of each register),
// reversing the byte order of the 16-byte value on the way out.
void spu_recompiler::STQX(spu_opcode_t op)
{
	// Effective address, forced to a 16-byte boundary inside the 256 KiB range
	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->and_(*addr, 0x3fff0);

	const auto& value = XmmGet(op.rt, XmmType::Int);
	const auto byte_reverse = XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f));
	c->pshufb(value, byte_reverse);

	const auto target = asmjit::x86::oword_ptr(*ls, *addr);
	c->movdqa(target, value);
	c->unuse(*addr);
}
// BI: unconditional jump to the jump table resolver with the target taken from word 3 of ra.
// The addr variable stays live because the resolver consumes it.
void spu_recompiler::BI(spu_opcode_t op)
{
	const auto flags = op.e << 26 | op.d << 27;

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->and_(*addr, 0x3fffc);

	if (flags)
	{
		c->or_(*addr, flags); // interrupt flags neutralize jump table
	}

	c->jmp(*jt);
}
// BISL: writes the call target (word 3 of ra, plus interrupt flags) to SPUThread::pc,
// stores the return address in word 3 of rt and emits a FunctionCall().
void spu_recompiler::BISL(spu_opcode_t op)
{
	const auto flags = op.e << 26 | op.d << 27;

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->and_(*addr, 0x3fffc);

	if (flags)
	{
		c->or_(*addr, flags); // interrupt flags stored to PC
	}

	c->mov(SPU_OFF_32(pc), *addr);
	c->unuse(*addr);

	// Link register value: return address in the top dword, remaining dwords zero
	const auto& link = XmmAlloc();
	const auto link_value = XmmConst(_mm_set_epi32(spu_branch_target(m_pos + 4), 0, 0, 0));
	c->movdqa(link, link_value);
	c->movdqa(SPU_OFF_128(gpr, op.rt), link);
	c->unuse(link);

	FunctionCall();
}
// IRET: unconditional jump to the jump table resolver with the target taken from srr0.
void spu_recompiler::IRET(spu_opcode_t op)
{
	const auto flags = op.e << 26 | op.d << 27;

	c->mov(*addr, SPU_OFF_32(srr0));
	c->and_(*addr, 0x3fffc);

	if (flags)
	{
		c->or_(*addr, flags); // interrupt flags neutralize jump table
	}

	c->jmp(*jt);
}
// BISLED: not supported by the recompiler; throws at compile time when encountered.
void spu_recompiler::BISLED(spu_opcode_t op)
{
fmt::throw_exception("Unimplemented instruction" HERE);
}
// HBR: intentionally emits no code.
void spu_recompiler::HBR(spu_opcode_t op)
{
}
// GB: gathers the least significant bit of each 32-bit lane of ra into a small bit mask
// that is written to halfword 6 of rt; all other bits of rt are cleared.
void spu_recompiler::GB(spu_opcode_t op)
{
	const auto& value = XmmGet(op.ra, XmmType::Int);

	// Pack the low byte of every word into bytes 0..3 and zero the remaining bytes
	const auto pick_bytes = XmmConst(_mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0));
	c->pshufb(value, pick_bytes);

	// Move bit 0 of each byte into bit 7 so pmovmskb can collect it
	c->psllq(value, 7);
	c->pmovmskb(*addr, value);

	// Rebuild the result: everything zero except halfword 6
	c->pxor(value, value);
	c->pinsrw(value, *addr, 6);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, value);
	c->unuse(*addr);
}
// GBH: gathers the least significant bit of each 16-bit lane of ra into a bit mask
// that is written to halfword 6 of rt; all other bits of rt are cleared.
void spu_recompiler::GBH(spu_opcode_t op)
{
	const auto& value = XmmGet(op.ra, XmmType::Int);

	// Pack the low byte of every halfword into bytes 0..7 and zero the remaining bytes
	const auto pick_bytes = XmmConst(_mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0));
	c->pshufb(value, pick_bytes);

	// Move bit 0 of each byte into bit 7 so pmovmskb can collect it
	c->psllq(value, 7);
	c->pmovmskb(*addr, value);

	// Rebuild the result: everything zero except halfword 6
	c->pxor(value, value);
	c->pinsrw(value, *addr, 6);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, value);
	c->unuse(*addr);
}
// GBB: gathers the least significant bit of each byte of ra into a 16-bit mask
// that is written to halfword 6 of rt; all other bits of rt are cleared.
void spu_recompiler::GBB(spu_opcode_t op)
{
	const auto& value = XmmGet(op.ra, XmmType::Int);

	// Move bit 0 of each byte into bit 7 so pmovmskb can collect it
	c->psllq(value, 7);
	c->pmovmskb(*addr, value);

	// Rebuild the result: everything zero except halfword 6
	c->pxor(value, value);
	c->pinsrw(value, *addr, 6);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movdqa(dst, value);
	c->unuse(*addr);
}
// FSM: rt = g_spu_imm.fsm[ra.word3 & 0xf], a 16-byte table entry selected by the low 4 bits.
void spu_recompiler::FSM(spu_opcode_t op)
{
	const auto& entry = XmmAlloc();

	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.fsm));

	// Table index scaled by 16 (size of one entry)
	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->and_(*addr, 0xf);
	c->shl(*addr, 4);

	const auto table_slot = asmjit::x86::oword_ptr(*qw0, *addr);
	c->movdqa(entry, table_slot);
	c->movdqa(SPU_OFF_128(gpr, op.rt), entry);

	c->unuse(*addr);
	c->unuse(*qw0);
}
// FSMH: rt = g_spu_imm.fsmh[ra.word3 & 0xff], a 16-byte table entry selected by the low 8 bits.
void spu_recompiler::FSMH(spu_opcode_t op)
{
	const auto& entry = XmmAlloc();

	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.fsmh));

	// Table index scaled by 16 (size of one entry)
	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->and_(*addr, 0xff);
	c->shl(*addr, 4);

	const auto table_slot = asmjit::x86::oword_ptr(*qw0, *addr);
	c->movdqa(entry, table_slot);
	c->movdqa(SPU_OFF_128(gpr, op.rt), entry);

	c->unuse(*addr);
	c->unuse(*qw0);
}
// FSMB: rt = g_spu_imm.fsmb[ra.word3 & 0xffff], a 16-byte table entry selected by the low 16 bits.
void spu_recompiler::FSMB(spu_opcode_t op)
{
	const auto& entry = XmmAlloc();

	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.fsmb));

	// Table index scaled by 16 (size of one entry)
	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->and_(*addr, 0xffff);
	c->shl(*addr, 4);

	const auto table_slot = asmjit::x86::oword_ptr(*qw0, *addr);
	c->movdqa(entry, table_slot);
	c->movdqa(SPU_OFF_128(gpr, op.rt), entry);

	c->unuse(*addr);
	c->unuse(*qw0);
}
// FREST: reciprocal estimate of each float lane of ra, emitted as rcpps.
void spu_recompiler::FREST(spu_opcode_t op)
{
	const auto& value = XmmGet(op.ra, XmmType::Float);
	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->rcpps(value, value);
	c->movaps(dst, value);
}
// FRSQEST: reciprocal square root estimate of |ra| per float lane, emitted as rsqrtps.
void spu_recompiler::FRSQEST(spu_opcode_t op)
{
	const auto& value = XmmGet(op.ra, XmmType::Float);

	// Clear the sign bit of every lane first
	const auto abs_mask = XmmConst(_mm_set1_epi32(0x7fffffff));
	c->andps(value, abs_mask); // abs

	c->rsqrtps(value, value);

	const auto dst = SPU_OFF_128(gpr, op.rt);
	c->movaps(dst, value);
}
// LQX: loads 16 bytes from local storage at (ra + rb) & 0x3fff0 (word 3 of each register)
// into rt, reversing the byte order of the value on the way in.
void spu_recompiler::LQX(spu_opcode_t op)
{
	// Effective address, forced to a 16-byte boundary inside the 256 KiB range
	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->add(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->and_(*addr, 0x3fff0);

	const auto& value = XmmAlloc();
	const auto source = asmjit::x86::oword_ptr(*ls, *addr);
	c->movdqa(value, source);

	const auto byte_reverse = XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f));
	c->pshufb(value, byte_reverse);

	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
	c->unuse(*addr);
}
// ROTQBYBI: shuffles ra with the g_spu_imm.rldq_pshufb entry selected by bits 3..6
// of word 3 of rb and stores the result to rt.
void spu_recompiler::ROTQBYBI(spu_opcode_t op)
{
	const auto& value = XmmGet(op.ra, XmmType::Int);

	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.rldq_pshufb));

	// ((rb >> 3) & 0xf) * 16, computed as (rb & (0xf << 3)) << 1
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->and_(*addr, 0xf << 3);
	c->shl(*addr, 1);

	const auto shuffle_mask = asmjit::x86::oword_ptr(*qw0, *addr);
	c->pshufb(value, shuffle_mask);
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);

	c->unuse(*addr);
	c->unuse(*qw0);
}
// ROTQMBYBI: shuffles ra with the g_spu_imm.srdq_pshufb entry selected by
// (0 - (rb >> 3)) & 0x1f (word 3 of rb) and stores the result to rt.
void spu_recompiler::ROTQMBYBI(spu_opcode_t op)
{
	const auto& value = XmmGet(op.ra, XmmType::Int);

	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.srdq_pshufb));

	// Table index: negate the byte count, keep 5 bits, scale by 16 (size of one entry)
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->shr(*addr, 3);
	c->neg(*addr);
	c->and_(*addr, 0x1f);
	c->shl(*addr, 4);

	const auto shuffle_mask = asmjit::x86::oword_ptr(*qw0, *addr);
	c->pshufb(value, shuffle_mask);
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);

	c->unuse(*addr);
	c->unuse(*qw0);
}
// SHLQBYBI: shuffles ra with the g_spu_imm.sldq_pshufb entry selected by bits 3..7
// of word 3 of rb and stores the result to rt.
void spu_recompiler::SHLQBYBI(spu_opcode_t op)
{
	const auto& value = XmmGet(op.ra, XmmType::Int);

	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.sldq_pshufb));

	// ((rb >> 3) & 0x1f) * 16, computed as (rb & (0x1f << 3)) << 1
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->and_(*addr, 0x1f << 3);
	c->shl(*addr, 1);

	const auto shuffle_mask = asmjit::x86::oword_ptr(*qw0, *addr);
	c->pshufb(value, shuffle_mask);
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);

	c->unuse(*addr);
	c->unuse(*qw0);
}
// CBX: writes the fixed byte pattern 0x10..0x1f to rt, then overwrites the single byte at
// offset (~(ra + rb) & 0xf) within rt with 0x03 (ra/rb taken from word 3).
void spu_recompiler::CBX(spu_opcode_t op)
{
	// Byte position inside the register, computed at run time
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->not_(*addr);
	c->and_(*addr, 0xf);

	const auto& pattern = XmmAlloc();
	c->movdqa(pattern, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)));
	c->movdqa(SPU_OFF_128(gpr, op.rt), pattern);

	// Patch the selected byte directly in the stored register
	const auto patch_slot = asmjit::x86::byte_ptr(*cpu, *addr, 0, offset32(&SPUThread::gpr, op.rt));
	c->mov(patch_slot, 0x03);
	c->unuse(*addr);
}
// CHX: writes the fixed byte pattern 0x10..0x1f to rt, then overwrites the halfword at
// offset (~(ra + rb) & 0xe) within rt with 0x0203 (ra/rb taken from word 3).
void spu_recompiler::CHX(spu_opcode_t op)
{
	// Halfword position inside the register, computed at run time
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->not_(*addr);
	c->and_(*addr, 0xe);

	const auto& pattern = XmmAlloc();
	c->movdqa(pattern, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)));
	c->movdqa(SPU_OFF_128(gpr, op.rt), pattern);

	// Patch the selected halfword directly in the stored register
	const auto patch_slot = asmjit::x86::word_ptr(*cpu, *addr, 0, offset32(&SPUThread::gpr, op.rt));
	c->mov(patch_slot, 0x0203);
	c->unuse(*addr);
}
// CWX: writes the fixed byte pattern 0x10..0x1f to rt, then overwrites the word at
// offset (~(ra + rb) & 0xc) within rt with 0x00010203 (ra/rb taken from word 3).
void spu_recompiler::CWX(spu_opcode_t op)
{
	// Word position inside the register, computed at run time
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->not_(*addr);
	c->and_(*addr, 0xc);

	const auto& pattern = XmmAlloc();
	c->movdqa(pattern, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)));
	c->movdqa(SPU_OFF_128(gpr, op.rt), pattern);

	// Patch the selected word directly in the stored register
	const auto patch_slot = asmjit::x86::dword_ptr(*cpu, *addr, 0, offset32(&SPUThread::gpr, op.rt));
	c->mov(patch_slot, 0x00010203);
	c->unuse(*addr);
}
// CDX: writes the fixed byte pattern 0x10..0x1f to rt, then overwrites the doubleword at
// offset (~(ra + rb) & 0x8) within rt with 0x0001020304050607 (ra/rb taken from word 3).
void spu_recompiler::CDX(spu_opcode_t op)
{
	// Doubleword position inside the register, computed at run time
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->add(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->not_(*addr);
	c->and_(*addr, 0x8);

	const auto& pattern = XmmAlloc();
	c->movdqa(pattern, XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f)));
	c->movdqa(SPU_OFF_128(gpr, op.rt), pattern);

	// The 64-bit patch value goes through qw0 before being stored
	c->mov(*qw0, asmjit::imm_u(0x0001020304050607));
	const auto patch_slot = asmjit::x86::qword_ptr(*cpu, *addr, 0, offset32(&SPUThread::gpr, op.rt));
	c->mov(patch_slot, *qw0);

	c->unuse(*addr);
	c->unuse(*qw0);
}
void spu_recompiler::ROTQBI(spu_opcode_t op)
{
	// Rotates the 128-bit value of ra left by (rb.u32[3] & 7) bits and writes it to rt,
	// using two 64-bit double-precision shifts.
	const auto src_lo = SPU_OFF_64(gpr, op.ra, &v128::_u64, 0);
	const auto src_hi = SPU_OFF_64(gpr, op.ra, &v128::_u64, 1);
	const auto count = SPU_OFF_32(gpr, op.rb, &v128::_u32, 3);

	c->mov(*qw0, src_lo);
	c->mov(*qw1, src_hi);
	c->mov(*qw2, *qw0); // preserve the original low half; qw0 is overwritten below
	c->mov(*addr, count);
	c->and_(*addr, 7);
	c->shld(*qw0, *qw1, *addr); // low half takes its incoming bits from the high half
	c->shld(*qw1, *qw2, *addr); // high half takes its incoming bits from the original low half
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*addr);
	c->unuse(*qw0);
	c->unuse(*qw1);
	c->unuse(*qw2);
}
void spu_recompiler::ROTQMBI(spu_opcode_t op)
{
	// Shifts the 128-bit value of ra right by ((0 - rb.u32[3]) & 7) bits, filling with
	// zeros from the top, and writes the result to rt.
	const auto src_lo = SPU_OFF_64(gpr, op.ra, &v128::_u64, 0);
	const auto src_hi = SPU_OFF_64(gpr, op.ra, &v128::_u64, 1);
	const auto count = SPU_OFF_32(gpr, op.rb, &v128::_u32, 3);

	c->mov(*qw0, src_lo);
	c->mov(*qw1, src_hi);
	c->mov(*addr, count);
	c->neg(*addr);
	c->and_(*addr, 7);
	c->shrd(*qw0, *qw1, *addr); // low half receives bits shifted out of the high half
	c->shr(*qw1, *addr);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*addr);
	c->unuse(*qw0);
	c->unuse(*qw1);
}
void spu_recompiler::SHLQBI(spu_opcode_t op)
{
	// Shifts the 128-bit value of ra left by (rb.u32[3] & 7) bits, filling with zeros
	// from the bottom, and writes the result to rt.
	const auto src_lo = SPU_OFF_64(gpr, op.ra, &v128::_u64, 0);
	const auto src_hi = SPU_OFF_64(gpr, op.ra, &v128::_u64, 1);
	const auto count = SPU_OFF_32(gpr, op.rb, &v128::_u32, 3);

	c->mov(*qw0, src_lo);
	c->mov(*qw1, src_hi);
	c->mov(*addr, count);
	c->and_(*addr, 7);
	c->shld(*qw1, *qw0, *addr); // high half receives bits shifted out of the low half
	c->shl(*qw0, *addr);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*addr);
	c->unuse(*qw0);
	c->unuse(*qw1);
}
void spu_recompiler::ROTQBY(spu_opcode_t op)
{
	// Permutes the bytes of ra with a pshufb control taken from g_spu_imm.rldq_pshufb.
	// The table entry is selected by (rb.u32[3] & 0xf); entries are 16 bytes apart.
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.rldq_pshufb));
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->and_(*addr, 0xf);
	c->shl(*addr, 4); // index -> byte offset of the 16-byte entry
	c->pshufb(value, asmjit::x86::oword_ptr(*qw0, *addr));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);

	c->unuse(*addr);
	c->unuse(*qw0);
}
void spu_recompiler::ROTQMBY(spu_opcode_t op)
{
	// Permutes the bytes of ra with a pshufb control taken from g_spu_imm.srdq_pshufb.
	// The table entry is selected by ((0 - rb.u32[3]) & 0x1f); entries are 16 bytes apart.
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.srdq_pshufb));
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->neg(*addr);
	c->and_(*addr, 0x1f);
	c->shl(*addr, 4); // index -> byte offset of the 16-byte entry
	c->pshufb(value, asmjit::x86::oword_ptr(*qw0, *addr));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);

	c->unuse(*addr);
	c->unuse(*qw0);
}
void spu_recompiler::SHLQBY(spu_opcode_t op)
{
	// Permutes the bytes of ra with a pshufb control taken from g_spu_imm.sldq_pshufb.
	// The table entry is selected by (rb.u32[3] & 0x1f); entries are 16 bytes apart.
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->mov(*qw0, asmjit::imm_ptr((void*)g_spu_imm.sldq_pshufb));
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->and_(*addr, 0x1f);
	c->shl(*addr, 4); // index -> byte offset of the 16-byte entry
	c->pshufb(value, asmjit::x86::oword_ptr(*qw0, *addr));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);

	c->unuse(*addr);
	c->unuse(*qw0);
}
void spu_recompiler::ORX(spu_opcode_t op)
{
	// ORs the four 32-bit elements of ra together, stores the result in rt.u32[3]
	// and clears the remaining three elements of rt.
	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 0));

	for (u32 i = 1; i < 4; i++)
	{
		c->or_(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, i));
	}

	c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), *addr);

	// Reuse the scratch register as a zero source
	c->xor_(*addr, *addr);

	for (u32 i = 0; i < 3; i++)
	{
		c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), *addr);
	}

	c->unuse(*addr);
}
void spu_recompiler::CBD(spu_opcode_t op)
{
	// Emits: idx = ~(ra.u32[3] + i7) & 0xf, then stores the base pattern 0x10..0x1f
	// into rt and overwrites the single byte at [idx] with 0x03.

	// Disabled compile-time fast path (kept for reference):
	//if (op.ra == 1)
	//{
	//	// assuming that SP % 16 is always zero
	//	const XmmLink& vr = XmmAlloc();
	//	v128 value = v128::fromV(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f));
	//	value.u8r[op.i7 & 0xf] = 0x03;
	//	c->movdqa(vr, XmmConst(value));
	//	c->movdqa(SPU_OFF_128(gpr, op.rt), vr);
	//	return;
	//}

	const auto base_pattern = _mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));

	if (op.i7)
	{
		// A zero immediate needs no add instruction
		c->add(*addr, op.i7);
	}

	c->not_(*addr);
	c->and_(*addr, 0xf);

	const XmmLink& pattern = XmmAlloc();
	c->movdqa(pattern, XmmConst(base_pattern));
	c->movdqa(SPU_OFF_128(gpr, op.rt), pattern);
	c->mov(asmjit::x86::byte_ptr(*cpu, *addr, 0, offset32(&SPUThread::gpr, op.rt)), 0x03);
	c->unuse(*addr);
}
void spu_recompiler::CHD(spu_opcode_t op)
{
	// Emits: idx = ~(ra.u32[3] + i7) & 0xe, then stores the base pattern 0x10..0x1f
	// into rt and overwrites the 16-bit value at [idx] with 0x0203.

	// Disabled compile-time fast path (kept for reference):
	//if (op.ra == 1)
	//{
	//	// assuming that SP % 16 is always zero
	//	const XmmLink& vr = XmmAlloc();
	//	v128 value = v128::fromV(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f));
	//	value.u16r[(op.i7 >> 1) & 0x7] = 0x0203;
	//	c->movdqa(vr, XmmConst(value));
	//	c->movdqa(SPU_OFF_128(gpr, op.rt), vr);
	//	return;
	//}

	const auto base_pattern = _mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));

	if (op.i7)
	{
		// A zero immediate needs no add instruction
		c->add(*addr, op.i7);
	}

	c->not_(*addr);
	c->and_(*addr, 0xe);

	const XmmLink& pattern = XmmAlloc();
	c->movdqa(pattern, XmmConst(base_pattern));
	c->movdqa(SPU_OFF_128(gpr, op.rt), pattern);
	c->mov(asmjit::x86::word_ptr(*cpu, *addr, 0, offset32(&SPUThread::gpr, op.rt)), 0x0203);
	c->unuse(*addr);
}
void spu_recompiler::CWD(spu_opcode_t op)
{
	// Emits: idx = ~(ra.u32[3] + i7) & 0xc, then stores the base pattern 0x10..0x1f
	// into rt and overwrites the 32-bit value at [idx] with 0x00010203.

	// Disabled compile-time fast path (kept for reference):
	//if (op.ra == 1)
	//{
	//	// assuming that SP % 16 is always zero
	//	const XmmLink& vr = XmmAlloc();
	//	v128 value = v128::fromV(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f));
	//	value.u32r[(op.i7 >> 2) & 0x3] = 0x00010203;
	//	c->movdqa(vr, XmmConst(value));
	//	c->movdqa(SPU_OFF_128(gpr, op.rt), vr);
	//	return;
	//}

	const auto base_pattern = _mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));

	if (op.i7)
	{
		// A zero immediate needs no add instruction
		c->add(*addr, op.i7);
	}

	c->not_(*addr);
	c->and_(*addr, 0xc);

	const XmmLink& pattern = XmmAlloc();
	c->movdqa(pattern, XmmConst(base_pattern));
	c->movdqa(SPU_OFF_128(gpr, op.rt), pattern);
	c->mov(asmjit::x86::dword_ptr(*cpu, *addr, 0, offset32(&SPUThread::gpr, op.rt)), 0x00010203);
	c->unuse(*addr);
}
void spu_recompiler::CDD(spu_opcode_t op)
{
	// Emits: idx = ~(ra.u32[3] + i7) & 0x8, then stores the base pattern 0x10..0x1f
	// into rt and overwrites the 64-bit value at [idx] with 0x0001020304050607.

	// Disabled compile-time fast path (kept for reference):
	//if (op.ra == 1)
	//{
	//	// assuming that SP % 16 is always zero
	//	const XmmLink& vr = XmmAlloc();
	//	v128 value = v128::fromV(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f));
	//	value.u64r[(op.i7 >> 3) & 0x1] = 0x0001020304050607ull;
	//	c->movdqa(vr, XmmConst(value));
	//	c->movdqa(SPU_OFF_128(gpr, op.rt), vr);
	//	return;
	//}

	const auto base_pattern = _mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);

	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));

	if (op.i7)
	{
		// A zero immediate needs no add instruction
		c->add(*addr, op.i7);
	}

	c->not_(*addr);
	c->and_(*addr, 0x8);

	const XmmLink& pattern = XmmAlloc();
	c->movdqa(pattern, XmmConst(base_pattern));
	c->movdqa(SPU_OFF_128(gpr, op.rt), pattern);

	// The 64-bit replacement value is materialized in a scratch register before the store
	c->mov(*qw0, asmjit::imm_u(0x0001020304050607));
	c->mov(asmjit::x86::qword_ptr(*cpu, *addr, 0, offset32(&SPUThread::gpr, op.rt)), *qw0);
	c->unuse(*addr);
	c->unuse(*qw0);
}
void spu_recompiler::ROTQBII(spu_opcode_t op)
{
	// Rotates the 128-bit value of ra left by the immediate (i7 & 7) bits and writes it to rt.
	const auto bits = op.i7 & 0x7;

	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
	c->mov(*qw2, *qw0); // preserve the original low half; qw0 is overwritten below
	c->shld(*qw0, *qw1, bits);
	c->shld(*qw1, *qw2, bits);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*qw0);
	c->unuse(*qw1);
	c->unuse(*qw2);
}
void spu_recompiler::ROTQMBII(spu_opcode_t op)
{
	// Shifts the 128-bit value of ra right by ((0 - i7) & 7) bits, zero-filling from
	// the top, and writes the result to rt.
	const auto bits = (0 - op.i7) & 0x7;

	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
	c->shrd(*qw0, *qw1, bits); // low half receives bits shifted out of the high half
	c->shr(*qw1, bits);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*qw0);
	c->unuse(*qw1);
}
void spu_recompiler::SHLQBII(spu_opcode_t op)
{
	// Shifts the 128-bit value of ra left by the immediate (i7 & 7) bits, zero-filling
	// from the bottom, and writes the result to rt.
	const auto bits = op.i7 & 0x7;

	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
	c->shld(*qw1, *qw0, bits); // high half receives bits shifted out of the low half
	c->shl(*qw0, bits);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*qw0);
	c->unuse(*qw1);
}
void spu_recompiler::ROTQBYI(spu_opcode_t op)
{
	// Rotates the 16 bytes of ra by the immediate (i7 & 0xf) using palignr of the
	// register with itself, then stores the result in rt.
	const int rot = op.i7 & 0xf;
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->palignr(value, value, 16 - rot);
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::ROTQMBYI(spu_opcode_t op)
{
	// Byte-shifts ra with psrldq by ((0 - i7) & 0x1f) bytes and stores the result in rt.
	const int bytes = (0 - op.i7) & 0x1f;
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->psrldq(value, bytes);
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::SHLQBYI(spu_opcode_t op)
{
	// Byte-shifts ra with pslldq by (i7 & 0x1f) bytes and stores the result in rt.
	const int bytes = op.i7 & 0x1f;
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->pslldq(value, bytes);
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
// NOP handler: intentionally emits no code for this opcode.
void spu_recompiler::NOP(spu_opcode_t op)
{
}
void spu_recompiler::CGT(spu_opcode_t op)
{
	// rt = per-lane mask of (ra > rb), signed 32-bit compare (pcmpgtd)
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->pcmpgtd(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::XOR(spu_opcode_t op)
{
	// rt = ra ^ rb (bitwise, full 128 bits)
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->pxor(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::CGTH(spu_opcode_t op)
{
	// rt = per-lane mask of (ra > rb), signed 16-bit compare (pcmpgtw)
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->pcmpgtw(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::EQV(spu_opcode_t op)
{
	// rt = ~rb ^ ra: rb is inverted by XOR with all-ones, then XORed with ra
	const auto all_ones = _mm_set1_epi32(0xffffffff);
	const XmmLink& acc = XmmGet(op.rb, XmmType::Int);

	c->pxor(acc, XmmConst(all_ones));
	c->pxor(acc, SPU_OFF_128(gpr, op.ra));
	c->movdqa(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::CGTB(spu_opcode_t op)
{
	// rt = per-lane mask of (ra > rb), signed 8-bit compare (pcmpgtb)
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->pcmpgtb(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::SUMB(spu_opcode_t op)
{
	// Sums groups of bytes of ra and rb into 16-bit results and interleaves them into rt.
	const auto byte_ones = _mm_set1_epi8(1);
	const auto interleave = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);

	const XmmLink& sum_a = XmmGet(op.ra, XmmType::Int);
	const XmmLink& sum_b = XmmGet(op.rb, XmmType::Int);
	const XmmLink& ones = XmmAlloc();

	c->movdqa(ones, XmmConst(byte_ones));

	// Multiplying by 1 makes pmaddubsw add each adjacent byte pair into a 16-bit lane
	c->pmaddubsw(sum_a, ones);
	c->pmaddubsw(sum_b, ones);

	// Horizontal add merges pair sums: low four words come from ra, high four from rb
	c->phaddw(sum_a, sum_b);

	// Reorder the words so ra and rb sums alternate
	c->pshufb(sum_a, XmmConst(interleave));
	c->movdqa(SPU_OFF_128(gpr, op.rt), sum_a);
}
//HGT uses signed values. HLGT uses unsigned values
void spu_recompiler::HGT(spu_opcode_t op)
{
	// Signed compare of ra.s32[3] against rb.s32[3]; when greater, control leaves through
	// the `end` label with (m_pos | 0x1000000) in *addr.
	// NOTE(review): 0x1000000 is presumably a halt marker decoded at `end` - confirm.
	const auto lhs = SPU_OFF_32(gpr, op.ra, &v128::_s32, 3);
	const auto rhs = SPU_OFF_32(gpr, op.rb, &v128::_s32, 3);

	c->mov(*addr, lhs);
	c->cmp(*addr, rhs);
	c->mov(*addr, m_pos | 0x1000000); // mov leaves the flags from cmp intact
	c->jg(*end);
	c->unuse(*addr);
}
void spu_recompiler::CLZ(spu_opcode_t op)
{
	// Host helper called from the generated code: applies cntlz32 to each of the
	// four 32-bit elements of the source and writes the results to the destination.
	auto count_zeros = [](u32* dst, const u32* src) noexcept
	{
		for (u32 lane = 0; lane < 4; lane++)
		{
			dst[lane] = cntlz32(src[lane]);
		}
	};

	// Pass pointers to rt (destination) and ra (source) inside the SPU thread context
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));

	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u32*, const u32*)>(count_zeros)), asmjit::FuncSignature2<void, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);

	// Disabled inline alternative based on bsr (kept for reference):
	//c->mov(*qw0, 32 + 31);
	//for (u32 i = 0; i < 4; i++) // unrolled loop
	//{
	//	c->bsr(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, i));
	//	c->cmovz(*addr, qw0->r32());
	//	c->xor_(*addr, 31);
	//	c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, i), *addr);
	//}
}
void spu_recompiler::XSWD(spu_opcode_t op)
{
	// Sign-extends ra.s32[0] and ra.s32[2] to 64 bits and stores them as rt.s64[0] and rt.s64[1].
	const auto src_even0 = SPU_OFF_32(gpr, op.ra, &v128::_s32, 0);
	const auto src_even1 = SPU_OFF_32(gpr, op.ra, &v128::_s32, 2);

	c->movsxd(*qw0, src_even0);
	c->movsxd(*qw1, src_even1);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_s64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_s64, 1), *qw1);

	c->unuse(*qw0);
	c->unuse(*qw1);
}
void spu_recompiler::XSHW(spu_opcode_t op)
{
	// Sign-extends the low 16 bits of every 32-bit lane of ra: shift left by 16,
	// then arithmetic shift right by 16.
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->pslld(value, 16);
	c->psrad(value, 16);
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::CNTB(spu_opcode_t op)
{
	// Per-byte population count of ra: each nibble is looked up in a 16-entry
	// bit-count table with pshufb and the two nibble counts are added.
	const auto nibble_mask = _mm_set1_epi8(0xf);
	const auto nibble_bits = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);

	const XmmLink& lo = XmmGet(op.ra, XmmType::Int);
	const XmmLink& hi = XmmAlloc();
	const XmmLink& tmp = XmmAlloc();

	// Split every byte into its low nibble (lo) and high nibble (hi)
	c->movdqa(hi, lo);
	c->psrlq(hi, 4);
	c->movdqa(tmp, XmmConst(nibble_mask));
	c->pand(lo, tmp);
	c->pand(hi, tmp);

	// tmp = table[lo]
	c->movdqa(tmp, XmmConst(nibble_bits));
	c->pshufb(tmp, lo);

	// lo = table[hi]
	c->movdqa(lo, XmmConst(nibble_bits));
	c->pshufb(lo, hi);

	c->paddb(lo, tmp);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lo);
}
void spu_recompiler::XSBH(spu_opcode_t op)
{
	// Sign-extends the low 8 bits of every 16-bit lane of ra: shift left by 8,
	// then arithmetic shift right by 8.
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->psllw(value, 8);
	c->psraw(value, 8);
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::CLGT(spu_opcode_t op)
{
	// Unsigned 32-bit greater-than: both operands get their sign bit flipped so that
	// the signed pcmpgtd produces the unsigned ordering.
	const auto sign_bits = _mm_set1_epi32(0x80000000);

	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmAlloc();

	c->movdqa(rhs, XmmConst(sign_bits));
	c->pxor(lhs, rhs);                     // lhs = ra ^ sign
	c->pxor(rhs, SPU_OFF_128(gpr, op.rb)); // rhs = rb ^ sign
	c->pcmpgtd(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::ANDC(spu_opcode_t op)
{
	// rt = ra & ~rb (pandn inverts its destination operand before the AND)
	const XmmLink& inverted = XmmGet(op.rb, XmmType::Int);
	const auto kept = SPU_OFF_128(gpr, op.ra);

	c->pandn(inverted, kept);
	c->movdqa(SPU_OFF_128(gpr, op.rt), inverted);
}
void spu_recompiler::FCGT(spu_opcode_t op)
{
	// rt = mask of (ra > rb) per float lane, emitted as the swapped test (rb < ra)
	// with cmpps predicate 1 (less-than).
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Float);
	const auto lhs = SPU_OFF_128(gpr, op.ra);

	c->cmpps(rhs, lhs, 1);
	c->movaps(SPU_OFF_128(gpr, op.rt), rhs);
}
// DFCGT is not handled by this recompiler: invoking the handler throws.
void spu_recompiler::DFCGT(spu_opcode_t op)
{
fmt::throw_exception("Unexpected instruction" HERE);
}
void spu_recompiler::FA(spu_opcode_t op)
{
	// rt = ra + rb, four packed single-precision lanes
	const XmmLink& acc = XmmGet(op.ra, XmmType::Float);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->addps(acc, rhs);
	c->movaps(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::FS(spu_opcode_t op)
{
	// rt = ra - rb, four packed single-precision lanes
	const XmmLink& acc = XmmGet(op.ra, XmmType::Float);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->subps(acc, rhs);
	c->movaps(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::FM(spu_opcode_t op)
{
	// rt = ra * rb, four packed single-precision lanes
	const XmmLink& acc = XmmGet(op.ra, XmmType::Float);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->mulps(acc, rhs);
	c->movaps(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::CLGTH(spu_opcode_t op)
{
	// Unsigned 16-bit greater-than: both operands get their sign bit flipped so that
	// the signed pcmpgtw produces the unsigned ordering.
	const auto sign_bits = _mm_set1_epi16(INT16_MIN);

	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmAlloc();

	c->movdqa(rhs, XmmConst(sign_bits));
	c->pxor(lhs, rhs);                     // lhs = ra ^ sign
	c->pxor(rhs, SPU_OFF_128(gpr, op.rb)); // rhs = rb ^ sign
	c->pcmpgtw(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::ORC(spu_opcode_t op)
{
	// rt = ra | ~rb: rb is inverted by XOR with all-ones, then ORed with ra
	const auto all_ones = _mm_set1_epi32(0xffffffff);
	const XmmLink& acc = XmmGet(op.rb, XmmType::Int);

	c->pxor(acc, XmmConst(all_ones));
	c->por(acc, SPU_OFF_128(gpr, op.ra));
	c->movdqa(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::FCMGT(spu_opcode_t op)
{
	// rt = mask of (|ra| > |rb|) per float lane, emitted as the swapped test
	// (|rb| < |ra|) with cmpps predicate 1 (less-than).
	const auto abs_mask = _mm_set1_epi32(0x7fffffff);

	const XmmLink& rhs = XmmGet(op.rb, XmmType::Float);
	const XmmLink& lhs = XmmAlloc();

	c->movaps(lhs, XmmConst(abs_mask));
	c->andps(rhs, lhs);                     // rhs = |rb| (sign bit cleared)
	c->andps(lhs, SPU_OFF_128(gpr, op.ra)); // lhs = |ra|
	c->cmpps(rhs, lhs, 1);
	c->movaps(SPU_OFF_128(gpr, op.rt), rhs);
}
// DFCMGT is not handled by this recompiler: invoking the handler throws.
void spu_recompiler::DFCMGT(spu_opcode_t op)
{
fmt::throw_exception("Unexpected instruction" HERE);
}
void spu_recompiler::DFA(spu_opcode_t op)
{
	// rt = ra + rb, two packed double-precision lanes
	const XmmLink& acc = XmmGet(op.ra, XmmType::Double);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->addpd(acc, rhs);
	c->movapd(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::DFS(spu_opcode_t op)
{
	// rt = ra - rb, two packed double-precision lanes
	const XmmLink& acc = XmmGet(op.ra, XmmType::Double);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->subpd(acc, rhs);
	c->movapd(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::DFM(spu_opcode_t op)
{
	// rt = ra * rb, two packed double-precision lanes
	const XmmLink& acc = XmmGet(op.ra, XmmType::Double);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->mulpd(acc, rhs);
	c->movapd(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::CLGTB(spu_opcode_t op)
{
	// Unsigned 8-bit greater-than: both operands get their sign bit flipped so that
	// the signed pcmpgtb produces the unsigned ordering.
	const auto sign_bits = _mm_set1_epi8(INT8_MIN);

	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmAlloc();

	c->movdqa(rhs, XmmConst(sign_bits));
	c->pxor(lhs, rhs);                     // lhs = ra ^ sign
	c->pxor(rhs, SPU_OFF_128(gpr, op.rb)); // rhs = rb ^ sign
	c->pcmpgtb(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::HLGT(spu_opcode_t op)
{
	// Unsigned compare of ra.u32[3] against rb.u32[3]; when above, control leaves through
	// the `end` label with (m_pos | 0x1000000) in *addr.
	// NOTE(review): 0x1000000 is presumably a halt marker decoded at `end` - confirm.
	const auto lhs = SPU_OFF_32(gpr, op.ra, &v128::_u32, 3);
	const auto rhs = SPU_OFF_32(gpr, op.rb, &v128::_u32, 3);

	c->mov(*addr, lhs);
	c->cmp(*addr, rhs);
	c->mov(*addr, m_pos | 0x1000000); // mov leaves the flags from cmp intact
	c->ja(*end);
	c->unuse(*addr);
}
void spu_recompiler::DFMA(spu_opcode_t op)
{
	// rt = rt + ra * rb, two packed double-precision lanes (separate multiply and add)
	const XmmLink& acc = XmmGet(op.rt, XmmType::Double);
	const XmmLink& prod = XmmGet(op.ra, XmmType::Double);

	c->mulpd(prod, SPU_OFF_128(gpr, op.rb));
	c->addpd(acc, prod);
	c->movapd(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::DFMS(spu_opcode_t op)
{
	// rt = ra * rb - rt, two packed double-precision lanes
	const XmmLink& prod = XmmGet(op.ra, XmmType::Double);
	const XmmLink& addend = XmmGet(op.rt, XmmType::Double);

	c->mulpd(prod, SPU_OFF_128(gpr, op.rb));
	c->subpd(prod, addend);
	c->movapd(SPU_OFF_128(gpr, op.rt), prod);
}
void spu_recompiler::DFNMS(spu_opcode_t op)
{
	// rt = rt - ra * rb, two packed double-precision lanes
	const XmmLink& acc = XmmGet(op.rt, XmmType::Double);
	const XmmLink& prod = XmmGet(op.ra, XmmType::Double);

	c->mulpd(prod, SPU_OFF_128(gpr, op.rb));
	c->subpd(acc, prod);
	c->movapd(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::DFNMA(spu_opcode_t op)
{
	// rt = 0 - (ra * rb + rt), two packed double-precision lanes
	const XmmLink& work = XmmGet(op.ra, XmmType::Double);
	const XmmLink& sum = XmmGet(op.rt, XmmType::Double);

	c->mulpd(work, SPU_OFF_128(gpr, op.rb));
	c->addpd(sum, work);

	// Negate by subtracting from zero; `work` is recycled as the zero register
	c->xorpd(work, work);
	c->subpd(work, sum);
	c->movapd(SPU_OFF_128(gpr, op.rt), work);
}
void spu_recompiler::CEQ(spu_opcode_t op)
{
	// rt = per-lane mask of (ra == rb), 32-bit lanes (pcmpeqd)
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->pcmpeqd(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::MPYHHU(spu_opcode_t op)
{
	// Unsigned multiply of the upper 16 bits of every 32-bit lane of ra and rb,
	// giving a 32-bit product per lane assembled from pmulhuw/pmullw halves.
	const auto upper_half = _mm_set1_epi32(0xffff0000);

	const XmmLink& prod_hi = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Int);
	const XmmLink& prod_lo = XmmAlloc();

	c->movdqa(prod_lo, prod_hi);
	c->pmulhuw(prod_hi, rhs); // high 16 bits of each 16x16 product
	c->pmullw(prod_lo, rhs);  // low 16 bits of each 16x16 product

	// Keep only what belongs to the upper halfword of each lane and merge
	c->pand(prod_hi, XmmConst(upper_half));
	c->psrld(prod_lo, 16);
	c->por(prod_hi, prod_lo);
	c->movdqa(SPU_OFF_128(gpr, op.rt), prod_hi);
}
void spu_recompiler::ADDX(spu_opcode_t op)
{
	// rt = ra + rb + (rt & 1) per 32-bit lane; bit 0 of the old rt acts as carry-in
	const auto carry_bit = _mm_set1_epi32(1);
	const XmmLink& acc = XmmGet(op.rt, XmmType::Int);

	c->pand(acc, XmmConst(carry_bit));
	c->paddd(acc, SPU_OFF_128(gpr, op.ra));
	c->paddd(acc, SPU_OFF_128(gpr, op.rb));
	c->movdqa(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::SFX(spu_opcode_t op)
{
	// rt = rb - ra - (~rt & 1) per 32-bit lane; pandn produces the inverted low bit of the old rt
	const auto low_bit = _mm_set1_epi32(1);

	const XmmLink& borrow = XmmGet(op.rt, XmmType::Int);
	const XmmLink& acc = XmmGet(op.rb, XmmType::Int);

	c->pandn(borrow, XmmConst(low_bit));
	c->psubd(acc, SPU_OFF_128(gpr, op.ra));
	c->psubd(acc, borrow);
	c->movdqa(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::CGX(spu_opcode_t op) //nf
{
	// Host helper called from the generated code: for each 32-bit lane, stores the
	// carry-out (bit 32) of (dst & 1) + lhs + rhs back into dst.
	auto carry_out = [](u32* dst, const u32* lhs, const u32* rhs) noexcept
	{
		for (s32 lane = 0; lane < 4; lane++)
		{
			dst[lane] = (static_cast<u64>(dst[lane] & 1) + lhs[lane] + rhs[lane]) >> 32;
		}
	};

	// Arguments are pointers to rt, ra and rb inside the SPU thread context
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));

	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u32*, const u32*, const u32*)>(carry_out)), asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);
}
void spu_recompiler::BGX(spu_opcode_t op) //nf
{
	// Host helper called from the generated code: for each 32-bit lane, computes
	// rhs - lhs - (1 - (dst & 1)) in 64 bits and stores 1 in dst when the result is
	// non-negative, 0 otherwise.
	auto no_borrow = [](u32* dst, const u32* lhs, const u32* rhs) noexcept
	{
		for (s32 lane = 0; lane < 4; lane++)
		{
			const s64 diff = (u64)rhs[lane] - (u64)lhs[lane] - (u64)(1 - (dst[lane] & 1));
			dst[lane] = diff >= 0;
		}
	};

	// Arguments are pointers to rt, ra and rb inside the SPU thread context
	c->lea(*qw0, SPU_OFF_128(gpr, op.rt));
	c->lea(*qw1, SPU_OFF_128(gpr, op.ra));
	c->lea(*qw2, SPU_OFF_128(gpr, op.rb));

	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(u32*, const u32*, const u32*)>(no_borrow)), asmjit::FuncSignature3<void, void*, void*, void*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *qw0);
	call->setArg(1, *qw1);
	call->setArg(2, *qw2);
}
void spu_recompiler::MPYHHA(spu_opcode_t op)
{
	// rt += signed product of the upper 16 bits of every 32-bit lane of ra and rb
	const XmmLink& acc = XmmGet(op.rt, XmmType::Int);
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Int);

	// Move the upper halfwords down; the vacated upper halves contribute 0 to pmaddwd
	c->psrld(lhs, 16);
	c->psrld(rhs, 16);
	c->pmaddwd(lhs, rhs);
	c->paddd(acc, lhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::MPYHHAU(spu_opcode_t op)
{
	// rt += unsigned product of the upper 16 bits of every 32-bit lane of ra and rb;
	// the product is assembled from pmulhuw/pmullw halves and added in two steps.
	const auto upper_half = _mm_set1_epi32(0xffff0000);

	const XmmLink& acc = XmmGet(op.rt, XmmType::Int);
	const XmmLink& prod_hi = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Int);
	const XmmLink& prod_lo = XmmAlloc();

	c->movdqa(prod_lo, prod_hi);
	c->pmulhuw(prod_hi, rhs); // high 16 bits of each 16x16 product
	c->pmullw(prod_lo, rhs);  // low 16 bits of each 16x16 product
	c->pand(prod_hi, XmmConst(upper_half));
	c->psrld(prod_lo, 16);
	c->paddd(acc, prod_hi);
	c->paddd(acc, prod_lo);
	c->movdqa(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::FSCRRD(spu_opcode_t op)
{
	// Hack: no real status register is read, rt is simply cleared to zero
	const XmmLink& zero = XmmAlloc();

	c->pxor(zero, zero);
	c->movdqa(SPU_OFF_128(gpr, op.rt), zero);
}
void spu_recompiler::FESD(spu_opcode_t op)
{
	// Widens two floats of ra to doubles: floats 1 and 3 are moved into positions
	// 0 and 1, then cvtps2pd converts the low two floats.
	const XmmLink& value = XmmGet(op.ra, XmmType::Float);

	c->shufps(value, value, 0x8d); // _f[0] = _f[1]; _f[1] = _f[3];
	c->cvtps2pd(value, value);
	c->movapd(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::FRDS(spu_opcode_t op)
{
	// Narrows the two doubles of ra to floats and places them in float positions
	// 1 and 3 of rt; positions 0 and 2 receive the zeros produced by cvtpd2ps.
	const XmmLink& value = XmmGet(op.ra, XmmType::Double);

	c->cvtpd2ps(value, value);
	c->shufps(value, value, 0x72); // _f[1] = _f[0]; _f[3] = _f[1]; _f[0] = _f[2] = 0;
	c->movaps(SPU_OFF_128(gpr, op.rt), value);
}
// FSCRWR handler: the write is not implemented, so no code is emitted.
void spu_recompiler::FSCRWR(spu_opcode_t op)
{
// nop (not implemented)
}
// DFTSV is not handled by this recompiler: invoking the handler throws.
void spu_recompiler::DFTSV(spu_opcode_t op)
{
fmt::throw_exception("Unexpected instruction" HERE);
}
void spu_recompiler::FCEQ(spu_opcode_t op)
{
	// rt = mask of (rb == ra) per float lane, cmpps predicate 0 (equal)
	const XmmLink& lhs = XmmGet(op.rb, XmmType::Float);
	const auto rhs = SPU_OFF_128(gpr, op.ra);

	c->cmpps(lhs, rhs, 0);
	c->movaps(SPU_OFF_128(gpr, op.rt), lhs);
}
// DFCEQ is not handled by this recompiler: invoking the handler throws.
void spu_recompiler::DFCEQ(spu_opcode_t op)
{
fmt::throw_exception("Unexpected instruction" HERE);
}
void spu_recompiler::MPY(spu_opcode_t op)
{
	// rt = signed product of the lower 16 bits of every 32-bit lane of ra and rb
	const auto lower_half = _mm_set1_epi32(0xffff);

	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Int);
	const XmmLink& mask = XmmAlloc();

	// Clear the upper halfwords so they contribute 0 to pmaddwd's pairwise sum
	c->movdqa(mask, XmmConst(lower_half));
	c->pand(lhs, mask);
	c->pand(rhs, mask);
	c->pmaddwd(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::MPYH(spu_opcode_t op)
{
	// Per 32-bit lane: rt = (low 16 bits of (ra >> 16) * (rb & 0xffff)) << 16
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Int);

	c->psrld(lhs, 16);    // upper halfword of ra moves down, upper half becomes 0
	c->pmullw(lhs, rhs);  // low 16 bits of each 16x16 product
	c->pslld(lhs, 16);    // result goes to the upper halfword, low half cleared
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::MPYHH(spu_opcode_t op)
{
	// rt = signed product of the upper 16 bits of every 32-bit lane of ra and rb
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Int);

	// Move the upper halfwords down; the vacated upper halves contribute 0 to pmaddwd
	c->psrld(lhs, 16);
	c->psrld(rhs, 16);
	c->pmaddwd(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::MPYS(spu_opcode_t op)
{
	// Per 32-bit lane: takes the high 16 bits of the signed product of the lower
	// halfwords (pmulhw) and sign-extends that value to 32 bits.
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Int);

	c->pmulhw(lhs, rhs);
	c->pslld(lhs, 16); // discard the upper halfword's result...
	c->psrad(lhs, 16); // ...and sign-extend the lower one
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::CEQH(spu_opcode_t op)
{
	// rt = per-lane mask of (ra == rb), 16-bit lanes (pcmpeqw)
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->pcmpeqw(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::FCMEQ(spu_opcode_t op)
{
	// rt = mask of (|rb| == |ra|) per float lane
	const auto abs_mask = _mm_set1_epi32(0x7fffffff);

	const XmmLink& lhs = XmmGet(op.rb, XmmType::Float);
	const XmmLink& rhs = XmmAlloc();

	c->movaps(rhs, XmmConst(abs_mask));
	c->andps(lhs, rhs);                     // lhs = |rb| (sign bit cleared)
	c->andps(rhs, SPU_OFF_128(gpr, op.ra)); // rhs = |ra|
	c->cmpps(lhs, rhs, 0);                  // predicate 0: equal
	c->movaps(SPU_OFF_128(gpr, op.rt), lhs);
}
// DFCMEQ is not handled by this recompiler: invoking the handler throws.
void spu_recompiler::DFCMEQ(spu_opcode_t op)
{
fmt::throw_exception("Unexpected instruction" HERE);
}
void spu_recompiler::MPYU(spu_opcode_t op)
{
	// Unsigned multiply of the lower 16 bits of every 32-bit lane of ra and rb,
	// giving a 32-bit product per lane assembled from pmulhuw/pmullw halves.
	const auto lower_half = _mm_set1_epi32(0xffff);

	const XmmLink& prod_hi = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Int);
	const XmmLink& prod_lo = XmmAlloc();

	c->movdqa(prod_lo, prod_hi);
	c->pmulhuw(prod_hi, rhs); // high 16 bits of each 16x16 product
	c->pmullw(prod_lo, rhs);  // low 16 bits of each 16x16 product

	// Keep only what belongs to the lower halfword of each lane and merge
	c->pslld(prod_hi, 16);
	c->pand(prod_lo, XmmConst(lower_half));
	c->por(prod_hi, prod_lo);
	c->movdqa(SPU_OFF_128(gpr, op.rt), prod_hi);
}
void spu_recompiler::CEQB(spu_opcode_t op)
{
	// rt = per-lane mask of (ra == rb), 8-bit lanes (pcmpeqb)
	const XmmLink& lhs = XmmGet(op.ra, XmmType::Int);
	const auto rhs = SPU_OFF_128(gpr, op.rb);

	c->pcmpeqb(lhs, rhs);
	c->movdqa(SPU_OFF_128(gpr, op.rt), lhs);
}
void spu_recompiler::FI(spu_opcode_t op)
{
	// Floating Interpolate: the emitted code only copies rb into rt; ra is not used
	const XmmLink& estimate = XmmGet(op.rb, XmmType::Float);

	c->movaps(SPU_OFF_128(gpr, op.rt), estimate);
}
void spu_recompiler::HEQ(spu_opcode_t op)
{
	// Compares ra.s32[3] with rb.s32[3]; when equal, control leaves through the
	// `end` label with (m_pos | 0x1000000) in *addr.
	// NOTE(review): 0x1000000 is presumably a halt marker decoded at `end` - confirm.
	const auto lhs = SPU_OFF_32(gpr, op.ra, &v128::_s32, 3);
	const auto rhs = SPU_OFF_32(gpr, op.rb, &v128::_s32, 3);

	c->mov(*addr, lhs);
	c->cmp(*addr, rhs);
	c->mov(*addr, m_pos | 0x1000000); // mov leaves the flags from cmp intact
	c->je(*end);
	c->unuse(*addr);
}
void spu_recompiler::CFLTS(spu_opcode_t op)
{
	// Converts the floats of ra to signed 32-bit integers with truncation, after
	// scaling by 2^(173 - i8), and saturates positive overflow to 0x7fffffff.
	const XmmLink& value = XmmGet(op.ra, XmmType::Float);
	const XmmLink& overflow = XmmAlloc();

	if (op.i8 != 173)
	{
		// scale
		const auto scale = _mm_set1_ps(std::exp2(static_cast<float>(static_cast<s16>(173 - op.i8))));
		c->mulps(value, XmmConst(scale));
	}

	// overflow = all-ones mask for lanes where 2^31 <= value
	c->movaps(overflow, XmmConst(_mm_set1_ps(std::exp2(31.f))));
	c->cmpps(overflow, value, 2);
	c->cvttps2dq(value, value); // convert to ints with truncation
	c->pxor(value, overflow);   // fix result saturation (0x80000000 -> 0x7fffffff)
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::CFLTU(spu_opcode_t op)
{
	// Converts the floats of ra to unsigned 32-bit integers after scaling by
	// 2^(173 - i8). Negative inputs clamp to 0; values of 2^31 and above are handled
	// by a second conversion of (value - 2^31), and values not below 2^32 yield all-ones.
	const XmmLink& value = XmmGet(op.ra, XmmType::Float);
	const XmmLink& scaled = XmmAlloc();
	const XmmLink& excess = XmmAlloc();
	const XmmLink& limit = XmmAlloc();

	if (op.i8 != 173)
	{
		// scale
		const auto scale = _mm_set1_ps(std::exp2(static_cast<float>(static_cast<s16>(173 - op.i8))));
		c->mulps(value, XmmConst(scale));
	}

	c->maxps(value, XmmConst(_mm_set1_ps(0.0f))); // saturate
	c->movaps(scaled, value); // copy scaled value
	c->movaps(excess, value);
	c->movaps(limit, XmmConst(_mm_set1_ps(std::exp2(31.f))));
	c->subps(excess, limit);      // excess = value - 2^31
	c->cmpps(limit, scaled, 2);   // limit = mask of lanes where 2^31 <= value
	c->andps(excess, limit);      // keep the excess only for those lanes
	c->cvttps2dq(value, value);
	c->cmpps(scaled, XmmConst(_mm_set1_ps(std::exp2(32.f))), 5); // mask of lanes not below 2^32
	c->cvttps2dq(excess, excess);
	c->por(value, scaled);
	c->por(value, excess);
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::CSFLT(spu_opcode_t op)
{
	// Converts the signed 32-bit integers of ra to floats, then scales by 2^(i8 - 155)
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->cvtdq2ps(value, value); // convert to floats

	if (op.i8 != 155)
	{
		// scale
		const auto scale = _mm_set1_ps(std::exp2(static_cast<float>(static_cast<s16>(op.i8 - 155))));
		c->mulps(value, XmmConst(scale));
	}

	c->movaps(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::CUFLT(spu_opcode_t op)
{
	// Converts the unsigned 32-bit integers of ra to floats: the low 31 bits are
	// converted as a signed value and 2^31 is added back for lanes whose top bit
	// was set. The result is then scaled by 2^(i8 - 155).
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);
	const XmmLink& fixup = XmmAlloc();

	c->movdqa(fixup, value);
	c->pand(value, XmmConst(_mm_set1_epi32(0x7fffffff)));
	c->cvtdq2ps(value, value); // convert to floats
	c->psrad(fixup, 31); // generate mask from sign bit
	c->andps(fixup, XmmConst(_mm_set1_ps(std::exp2(31.f)))); // generate correction component
	c->addps(value, fixup); // add correction component

	if (op.i8 != 155)
	{
		// scale
		const auto scale = _mm_set1_ps(std::exp2(static_cast<float>(static_cast<s16>(op.i8 - 155))));
		c->mulps(value, XmmConst(scale));
	}

	c->movaps(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::BRZ(spu_opcode_t op)
{
	// Relative conditional branch, taken when rt.u32[3] == 0.
	const u32 dest = spu_branch_target(m_pos, op.i16);

	if (dest == m_pos)
	{
		fmt::throw_exception("Branch-to-self (0x%05x)" HERE, dest);
	}

	c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0);

	const auto& local = labels[dest / 4];

	if (local.isValid())
	{
		// The destination has a label in this function: branch to it directly
		c->je(local);
		return;
	}

	if (dest >= m_func->addr && dest < m_func->addr + m_func->size)
	{
		LOG_ERROR(SPU, "Local block not registered (brz 0x%x)", dest);
	}

	// No label: leave through `end` with the destination address in *addr
	c->mov(*addr, dest);
	c->je(*end);
	c->unuse(*addr);
}
void spu_recompiler::STQA(spu_opcode_t op)
{
	// Stores rt to local storage at spu_ls_target(0, i16), reversing the 16 bytes first
	const auto byte_reverse = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
	const XmmLink& value = XmmGet(op.rt, XmmType::Int);

	c->pshufb(value, XmmConst(byte_reverse));
	c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16)), value);
}
void spu_recompiler::BRNZ(spu_opcode_t op)
{
	// Relative conditional branch, taken when rt.u32[3] != 0.
	const u32 dest = spu_branch_target(m_pos, op.i16);

	if (dest == m_pos)
	{
		fmt::throw_exception("Branch-to-self (0x%05x)" HERE, dest);
	}

	c->cmp(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), 0);

	const auto& local = labels[dest / 4];

	if (local.isValid())
	{
		// The destination has a label in this function: branch to it directly
		c->jne(local);
		return;
	}

	if (dest >= m_func->addr && dest < m_func->addr + m_func->size)
	{
		LOG_ERROR(SPU, "Local block not registered (brnz 0x%x)", dest);
	}

	// No label: leave through `end` with the destination address in *addr
	c->mov(*addr, dest);
	c->jne(*end);
	c->unuse(*addr);
}
void spu_recompiler::BRHZ(spu_opcode_t op)
{
	// Relative conditional branch, taken when the 16-bit element rt.u16[6] == 0.
	const u32 dest = spu_branch_target(m_pos, op.i16);

	if (dest == m_pos)
	{
		fmt::throw_exception("Branch-to-self (0x%05x)" HERE, dest);
	}

	c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0);

	const auto& local = labels[dest / 4];

	if (local.isValid())
	{
		// The destination has a label in this function: branch to it directly
		c->je(local);
		return;
	}

	if (dest >= m_func->addr && dest < m_func->addr + m_func->size)
	{
		LOG_ERROR(SPU, "Local block not registered (brhz 0x%x)", dest);
	}

	// No label: leave through `end` with the destination address in *addr
	c->mov(*addr, dest);
	c->je(*end);
	c->unuse(*addr);
}
void spu_recompiler::BRHNZ(spu_opcode_t op)
{
	// Relative conditional branch, taken when the 16-bit element rt.u16[6] != 0.
	const u32 dest = spu_branch_target(m_pos, op.i16);

	if (dest == m_pos)
	{
		fmt::throw_exception("Branch-to-self (0x%05x)" HERE, dest);
	}

	c->cmp(SPU_OFF_16(gpr, op.rt, &v128::_u16, 6), 0);

	const auto& local = labels[dest / 4];

	if (local.isValid())
	{
		// The destination has a label in this function: branch to it directly
		c->jne(local);
		return;
	}

	if (dest >= m_func->addr && dest < m_func->addr + m_func->size)
	{
		LOG_ERROR(SPU, "Local block not registered (brhnz 0x%x)", dest);
	}

	// No label: leave through `end` with the destination address in *addr
	c->mov(*addr, dest);
	c->jne(*end);
	c->unuse(*addr);
}
void spu_recompiler::STQR(spu_opcode_t op)
{
	// Stores rt to local storage at spu_ls_target(m_pos, i16), reversing the 16 bytes first
	const auto byte_reverse = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
	const XmmLink& value = XmmGet(op.rt, XmmType::Int);

	c->pshufb(value, XmmConst(byte_reverse));
	c->movdqa(asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16)), value);
}
void spu_recompiler::BRA(spu_opcode_t op)
{
	// Unconditional branch to the absolute target spu_branch_target(0, i16).
	const u32 dest = spu_branch_target(0, op.i16);

	if (dest == m_pos)
	{
		fmt::throw_exception("Branch-to-self (0x%05x)" HERE, dest);
	}

	const auto& local = labels[dest / 4];

	if (local.isValid())
	{
		// The destination has a label in this function: jump to it directly
		c->jmp(local);
		return;
	}

	if (dest >= m_func->addr && dest < m_func->addr + m_func->size)
	{
		LOG_ERROR(SPU, "Local block not registered (bra 0x%x)", dest);
	}

	// No label: leave through `end` with the destination address in *addr
	c->mov(*addr, dest);
	c->jmp(*end);
	c->unuse(*addr);
}
void spu_recompiler::LQA(spu_opcode_t op)
{
	// Loads 16 bytes from local storage at spu_ls_target(0, i16), reverses the byte
	// order and writes the result to rt
	const auto byte_reverse = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
	const XmmLink& value = XmmAlloc();

	c->movdqa(value, asmjit::x86::oword_ptr(*ls, spu_ls_target(0, op.i16)));
	c->pshufb(value, XmmConst(byte_reverse));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::BRASL(spu_opcode_t op)
{
	// Absolute branch-and-link: rt receives the address of the next instruction in
	// its highest 32-bit element (others zero), pc is set to the target and
	// FunctionCall() emits the call sequence.
	const u32 dest = spu_branch_target(0, op.i16);

	if (dest == m_pos)
	{
		fmt::throw_exception("Branch-to-self (0x%05x)" HERE, dest);
	}

	const XmmLink& link = XmmAlloc();
	c->movdqa(link, XmmConst(_mm_set_epi32(spu_branch_target(m_pos + 4), 0, 0, 0)));
	c->movdqa(SPU_OFF_128(gpr, op.rt), link);
	c->unuse(link);

	c->mov(SPU_OFF_32(pc), dest);
	FunctionCall();
}
void spu_recompiler::BR(spu_opcode_t op)
{
	// Unconditional relative branch to spu_branch_target(m_pos, i16).
	const u32 target = spu_branch_target(m_pos, op.i16);

	if (target == m_pos)
	{
		// Branch-to-self: instead of emitting an endless loop, set the stop and ret
		// flags in the thread state and leave through `end` with (target | 0x2000000) in *addr.
		// NOTE(review): the meaning of the 0x2000000 marker is decided at `end` - confirm.
		c->mov(*addr, target | 0x2000000);
		//c->cmp(asmjit::x86::dword_ptr(*ls, m_pos), 0x32); // compare instruction opcode with BR-to-self
		//c->je(labels[target / 4]);
		c->lock().or_(SPU_OFF_32(state), static_cast<u32>(cpu_flag::stop + cpu_flag::ret));
		c->jmp(*end);
		c->unuse(*addr);
		return;
	}

	if (labels[target / 4].isValid())
	{
		// The destination has a label in this function: jump to it directly
		c->jmp(labels[target / 4]);
	}
	else
	{
		if (target >= m_func->addr && target < m_func->addr + m_func->size)
		{
			// Fixed: the message used to name "brz", a copy-paste from BRZ
			LOG_ERROR(SPU, "Local block not registered (br 0x%x)", target);
		}

		// No label: leave through `end` with the destination address in *addr
		c->mov(*addr, target);
		c->jmp(*end);
		c->unuse(*addr);
	}
}
void spu_recompiler::FSMBI(spu_opcode_t op)
{
	// rt = precomputed table entry g_spu_imm.fsmb[i16]
	const XmmLink& mask = XmmAlloc();
	const auto entry = XmmConst(g_spu_imm.fsmb[op.i16]);

	c->movdqa(mask, entry);
	c->movdqa(SPU_OFF_128(gpr, op.rt), mask);
}
void spu_recompiler::BRSL(spu_opcode_t op)
{
	// Relative branch-and-link: rt receives the address of the next instruction in
	// its highest 32-bit element (others zero), pc is set to the target and
	// FunctionCall() emits the call sequence.
	const u32 dest = spu_branch_target(m_pos, op.i16);
	const u32 next = spu_branch_target(m_pos + 4);

	if (dest == m_pos)
	{
		fmt::throw_exception("Branch-to-self (0x%05x)" HERE, dest);
	}

	const XmmLink& link = XmmAlloc();
	c->movdqa(link, XmmConst(_mm_set_epi32(next, 0, 0, 0)));
	c->movdqa(SPU_OFF_128(gpr, op.rt), link);
	c->unuse(link);

	if (dest != next)
	{
		c->mov(SPU_OFF_32(pc), dest);
		FunctionCall();
	}

	// Otherwise this is a branch-to-next: only the link register write is needed
}
void spu_recompiler::LQR(spu_opcode_t op)
{
	// Loads 16 bytes from local storage at spu_ls_target(m_pos, i16), reverses the
	// byte order and writes the result to rt
	const auto byte_reverse = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
	const XmmLink& value = XmmAlloc();

	c->movdqa(value, asmjit::x86::oword_ptr(*ls, spu_ls_target(m_pos, op.i16)));
	c->pshufb(value, XmmConst(byte_reverse));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::IL(spu_opcode_t op)
{
	// rt = si16 broadcast to all four 32-bit lanes
	const auto imm = _mm_set1_epi32(op.si16);
	const XmmLink& value = XmmAlloc();

	c->movdqa(value, XmmConst(imm));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::ILHU(spu_opcode_t op)
{
	// rt = (i16 << 16) broadcast to all four 32-bit lanes
	const auto imm = _mm_set1_epi32(op.i16 << 16);
	const XmmLink& value = XmmAlloc();

	c->movdqa(value, XmmConst(imm));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::ILH(spu_opcode_t op)
{
	// rt = i16 broadcast to all eight 16-bit lanes
	const auto imm = _mm_set1_epi16(op.i16);
	const XmmLink& value = XmmAlloc();

	c->movdqa(value, XmmConst(imm));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::IOHL(spu_opcode_t op)
{
	// rt |= i16 in every 32-bit lane
	const auto imm = _mm_set1_epi32(op.i16);
	const XmmLink& value = XmmGet(op.rt, XmmType::Int);

	c->por(value, XmmConst(imm));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::ORI(spu_opcode_t op)
{
	// rt = ra | si10 in every 32-bit lane
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	if (op.si10)
	{
		// OR with zero is a no-op, so it is only emitted for a non-zero immediate
		c->por(value, XmmConst(_mm_set1_epi32(op.si10)));
	}

	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::ORHI(spu_opcode_t op)
{
	// rt = ra | si10 in every 16-bit lane
	const auto imm = _mm_set1_epi16(op.si10);
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->por(value, XmmConst(imm));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::ORBI(spu_opcode_t op)
{
	// rt = ra | si10 in every 8-bit lane (the immediate is narrowed to a byte)
	const auto imm = _mm_set1_epi8(op.si10);
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->por(value, XmmConst(imm));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::SFI(spu_opcode_t op)
{
	// rt = si10 - ra in every 32-bit lane
	const auto imm = _mm_set1_epi32(op.si10);
	const XmmLink& result = XmmAlloc();

	c->movdqa(result, XmmConst(imm));
	c->psubd(result, SPU_OFF_128(gpr, op.ra));
	c->movdqa(SPU_OFF_128(gpr, op.rt), result);
}
void spu_recompiler::SFHI(spu_opcode_t op)
{
	// rt = si10 - ra in every 16-bit lane
	const auto imm = _mm_set1_epi16(op.si10);
	const XmmLink& result = XmmAlloc();

	c->movdqa(result, XmmConst(imm));
	c->psubw(result, SPU_OFF_128(gpr, op.ra));
	c->movdqa(SPU_OFF_128(gpr, op.rt), result);
}
void spu_recompiler::ANDI(spu_opcode_t op)
{
	// rt = ra & si10 in every 32-bit lane
	const auto imm = _mm_set1_epi32(op.si10);
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->pand(value, XmmConst(imm));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::ANDHI(spu_opcode_t op)
{
	// rt = ra & si10 in every 16-bit lane
	const auto imm = _mm_set1_epi16(op.si10);
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->pand(value, XmmConst(imm));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::ANDBI(spu_opcode_t op)
{
	// rt = ra & si10 in every 8-bit lane (the immediate is narrowed to a byte)
	const auto imm = _mm_set1_epi8(op.si10);
	const XmmLink& value = XmmGet(op.ra, XmmType::Int);

	c->pand(value, XmmConst(imm));
	c->movdqa(SPU_OFF_128(gpr, op.rt), value);
}
void spu_recompiler::AI(spu_opcode_t op)
{
	// Emit: gpr[rt] = gpr[ra] + si10 in every 32-bit lane (wrapping add).
	const XmmLink& sum = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi32(op.si10));
	c->paddd(sum, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), sum);
}
void spu_recompiler::AHI(spu_opcode_t op)
{
	// Emit: gpr[rt] = gpr[ra] + si10 in every 16-bit lane (wrapping add).
	const XmmLink& sum = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi16(op.si10));
	c->paddw(sum, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), sum);
}
void spu_recompiler::STQD(spu_opcode_t op)
{
	// Emit a 16-byte store of gpr[rt] to *ls + address, where
	// address = (element 3 of gpr[ra]'s 32-bit view + (si10 << 4)) & 0x3fff0.
	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));

	// A zero displacement needs no add instruction.
	if (op.si10)
	{
		c->add(*addr, op.si10 << 4);
	}

	// Keep the address 16-byte aligned and below 0x40000.
	c->and_(*addr, 0x3fff0);

	// The shuffle mask maps result byte i to source byte 15 - i, i.e. it reverses all 16 bytes before the store.
	const XmmLink& data = XmmGet(op.rt, XmmType::Int);
	const auto reverse_bytes = XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f));
	c->pshufb(data, reverse_bytes);
	c->movdqa(asmjit::x86::oword_ptr(*ls, *addr), data);
	c->unuse(*addr);
}
void spu_recompiler::LQD(spu_opcode_t op)
{
	// Emit a 16-byte load into gpr[rt] from *ls + address, where
	// address = (element 3 of gpr[ra]'s 32-bit view + (si10 << 4)) & 0x3fff0.
	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));

	// A zero displacement needs no add instruction.
	if (op.si10)
	{
		c->add(*addr, op.si10 << 4);
	}

	// Keep the address 16-byte aligned and below 0x40000.
	c->and_(*addr, 0x3fff0);

	// The shuffle mask maps result byte i to source byte 15 - i, i.e. it reverses all 16 loaded bytes.
	const XmmLink& data = XmmAlloc();
	c->movdqa(data, asmjit::x86::oword_ptr(*ls, *addr));
	const auto reverse_bytes = XmmConst(_mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f));
	c->pshufb(data, reverse_bytes);
	c->movdqa(SPU_OFF_128(gpr, op.rt), data);
	c->unuse(*addr);
}
void spu_recompiler::XORI(spu_opcode_t op)
{
	// Emit: gpr[rt] = gpr[ra] ^ si10 in every 32-bit lane.
	const XmmLink& acc = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi32(op.si10));
	c->pxor(acc, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::XORHI(spu_opcode_t op)
{
	// Emit: gpr[rt] = gpr[ra] ^ si10 in every 16-bit lane.
	const XmmLink& acc = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi16(op.si10));
	c->pxor(acc, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::XORBI(spu_opcode_t op)
{
	// Emit: gpr[rt] = gpr[ra] ^ imm8 in every byte, where imm8 is si10 narrowed to 8 bits.
	const XmmLink& acc = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi8(op.si10));
	c->pxor(acc, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), acc);
}
void spu_recompiler::CGTI(spu_opcode_t op)
{
	// Emit: each 32-bit lane of gpr[rt] becomes all ones if gpr[ra] > si10 (signed), otherwise zero.
	const XmmLink& cmp = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi32(op.si10));
	c->pcmpgtd(cmp, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), cmp);
}
void spu_recompiler::CGTHI(spu_opcode_t op)
{
	// Emit: each 16-bit lane of gpr[rt] becomes all ones if gpr[ra] > si10 (signed), otherwise zero.
	const XmmLink& cmp = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi16(op.si10));
	c->pcmpgtw(cmp, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), cmp);
}
void spu_recompiler::CGTBI(spu_opcode_t op)
{
	// Emit: each byte of gpr[rt] becomes all ones if gpr[ra] > imm8 (signed), otherwise zero.
	// imm8 is si10 narrowed to 8 bits.
	const XmmLink& cmp = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi8(op.si10));
	c->pcmpgtb(cmp, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), cmp);
}
void spu_recompiler::HGTI(spu_opcode_t op)
{
	// Emit a conditional jump to the `end` label, taken when element 3 of gpr[ra]'s
	// signed 32-bit view is greater than si10.
	const auto lhs = SPU_OFF_32(gpr, op.ra, &v128::_s32, 3);
	// NOTE(review): presumably the 0x1000000 bit marks this exit for whoever reads *addr after `end` — confirm.
	const auto exit_value = m_pos | 0x1000000;

	c->mov(*addr, lhs);
	c->cmp(*addr, op.si10);
	// x86 MOV leaves the flags untouched, so *addr can be reloaded between the compare and the jump.
	c->mov(*addr, exit_value);
	c->jg(*end);
	c->unuse(*addr);
}
void spu_recompiler::CLGTI(spu_opcode_t op)
{
	// Emit: each 32-bit lane of gpr[rt] becomes all ones if gpr[ra] > si10 as unsigned values, otherwise zero.
	// pcmpgtd compares signed values, so both operands get their sign bit flipped first.
	const XmmLink& cmp = XmmGet(op.ra, XmmType::Int);
	const auto sign_bit = XmmConst(_mm_set1_epi32(0x80000000));
	c->pxor(cmp, sign_bit);
	const auto biased_imm = XmmConst(_mm_set1_epi32(op.si10 - 0x80000000));
	c->pcmpgtd(cmp, biased_imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), cmp);
}
void spu_recompiler::CLGTHI(spu_opcode_t op)
{
	// Emit: each 16-bit lane of gpr[rt] becomes all ones if gpr[ra] > si10 as unsigned values, otherwise zero.
	// pcmpgtw compares signed values, so both operands are biased by 0x8000 first.
	const XmmLink& cmp = XmmGet(op.ra, XmmType::Int);
	const auto sign_bit = XmmConst(_mm_set1_epi16(INT16_MIN));
	c->pxor(cmp, sign_bit);
	const auto biased_imm = XmmConst(_mm_set1_epi16(op.si10 - 0x8000));
	c->pcmpgtw(cmp, biased_imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), cmp);
}
void spu_recompiler::CLGTBI(spu_opcode_t op)
{
	// Emit: each byte of gpr[rt] becomes all ones if gpr[ra] > imm8 as unsigned values, otherwise zero.
	// pcmpgtb compares signed values, so both operands are biased by 0x80 first
	// (a wrapping byte subtract of INT8_MIN for the register, `- 0x80` for the immediate).
	const XmmLink& cmp = XmmGet(op.ra, XmmType::Int);
	const auto bias = XmmConst(_mm_set1_epi8(INT8_MIN));
	c->psubb(cmp, bias);
	const auto biased_imm = XmmConst(_mm_set1_epi8(op.si10 - 0x80));
	c->pcmpgtb(cmp, biased_imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), cmp);
}
void spu_recompiler::HLGTI(spu_opcode_t op)
{
	// Emit a conditional jump to the `end` label, taken when element 3 of gpr[ra]'s
	// 32-bit view is above si10 in an unsigned comparison.
	const auto lhs = SPU_OFF_32(gpr, op.ra, &v128::_u32, 3);
	// NOTE(review): presumably the 0x1000000 bit marks this exit for whoever reads *addr after `end` — confirm.
	const auto exit_value = m_pos | 0x1000000;

	c->mov(*addr, lhs);
	c->cmp(*addr, op.si10);
	// x86 MOV leaves the flags untouched, so *addr can be reloaded between the compare and the jump.
	c->mov(*addr, exit_value);
	c->ja(*end);
	c->unuse(*addr);
}
void spu_recompiler::MPYI(spu_opcode_t op)
{
	// Emit: each 32-bit lane of gpr[rt] = (low 16 bits of gpr[ra] lane) * (low 16 bits of si10), both signed.
	// pmaddwd multiplies signed 16-bit pairs and adds the two products of each lane; the constant's
	// upper halfword is zero, so only the low-halfword product contributes.
	const XmmLink& prod = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi32(op.si10 & 0xffff));
	c->pmaddwd(prod, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), prod);
}
void spu_recompiler::MPYUI(spu_opcode_t op)
{
	// Emit: each 32-bit lane of gpr[rt] = (low 16 bits of gpr[ra] lane) * (si10 & 0xffff), both unsigned.
	// The 32-bit product is assembled from its 16-bit halves; the constant's upper halfword is zero,
	// so the upper halfword of both partial results is zero as well.
	const XmmLink& hi = XmmGet(op.ra, XmmType::Int);
	const XmmLink& imm = XmmAlloc();
	const XmmLink& lo = XmmAlloc();

	c->movdqa(lo, hi);
	const auto imm_const = XmmConst(_mm_set1_epi32(op.si10 & 0xffff));
	c->movdqa(imm, imm_const);
	c->pmulhuw(hi, imm); // upper 16 bits of each unsigned 16x16 product
	c->pmullw(lo, imm);  // lower 16 bits of each 16x16 product
	c->pslld(hi, 16);    // move the upper half into bits 16..31
	c->por(hi, lo);
	c->movdqa(SPU_OFF_128(gpr, op.rt), hi);
}
void spu_recompiler::CEQI(spu_opcode_t op)
{
	// Emit: each 32-bit lane of gpr[rt] becomes all ones if gpr[ra] == si10, otherwise zero.
	const XmmLink& cmp = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi32(op.si10));
	c->pcmpeqd(cmp, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), cmp);
}
void spu_recompiler::CEQHI(spu_opcode_t op)
{
	// Emit: each 16-bit lane of gpr[rt] becomes all ones if gpr[ra] == si10, otherwise zero.
	const XmmLink& cmp = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi16(op.si10));
	c->pcmpeqw(cmp, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), cmp);
}
void spu_recompiler::CEQBI(spu_opcode_t op)
{
	// Emit: each byte of gpr[rt] becomes all ones if gpr[ra] == imm8, otherwise zero.
	// imm8 is si10 narrowed to 8 bits.
	const XmmLink& cmp = XmmGet(op.ra, XmmType::Int);
	const auto imm = XmmConst(_mm_set1_epi8(op.si10));
	c->pcmpeqb(cmp, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), cmp);
}
void spu_recompiler::HEQI(spu_opcode_t op)
{
	// Emit a conditional jump to the `end` label, taken when element 3 of gpr[ra]'s
	// 32-bit view equals si10.
	const auto lhs = SPU_OFF_32(gpr, op.ra, &v128::_u32, 3);
	// NOTE(review): presumably the 0x1000000 bit marks this exit for whoever reads *addr after `end` — confirm.
	const auto exit_value = m_pos | 0x1000000;

	c->mov(*addr, lhs);
	c->cmp(*addr, op.si10);
	// x86 MOV leaves the flags untouched, so *addr can be reloaded between the compare and the jump.
	c->mov(*addr, exit_value);
	c->je(*end);
	c->unuse(*addr);
}
// Empty handler: no host code is emitted for this opcode and `op` is not read.
// NOTE(review): presumably a branch-hint instruction that needs no translation — confirm against the SPU ISA.
void spu_recompiler::HBRA(spu_opcode_t op)
{
}
// Empty handler: no host code is emitted for this opcode and `op` is not read.
// NOTE(review): presumably a branch-hint instruction that needs no translation — confirm against the SPU ISA.
void spu_recompiler::HBRR(spu_opcode_t op)
{
}
void spu_recompiler::ILA(spu_opcode_t op)
{
	// Emit: gpr[rt] = i18, replicated into all four 32-bit lanes.
	const XmmLink& dst = XmmAlloc();
	const auto imm = XmmConst(_mm_set1_epi32(op.i18));
	c->movdqa(dst, imm);
	c->movdqa(SPU_OFF_128(gpr, op.rt), dst);
}
void spu_recompiler::SELB(spu_opcode_t op)
{
	// Emit a bitwise select: gpr[rt4] = (gpr[rb] & gpr[rc]) | (gpr[ra] & ~gpr[rc]).
	// Set bits in gpr[rc] take the bit from rb, clear bits take it from ra.
	const XmmLink& picked = XmmGet(op.rb, XmmType::Int);
	const XmmLink& sel = XmmGet(op.rc, XmmType::Int);

	c->pand(picked, sel);                    // rb & rc
	c->pandn(sel, SPU_OFF_128(gpr, op.ra));  // ~rc & ra
	c->por(picked, sel);
	c->movdqa(SPU_OFF_128(gpr, op.rt4), picked);
}
// Emits a byte shuffle: each byte of gpr[rc] (the "mask") determines one byte of gpr[rt4].
//  - bit 7 clear: the low 5 bits select a source byte. After the `^ 0x10` step, mask values
//    0x10..0x1f produce in-range pshufb indices into gpr[rb] and 0x00..0x0f produce in-range
//    indices into gpr[ra]; the out-of-range side has bit 7 set in its pshufb index and so
//    contributes zero, which lets the two selections simply be OR-ed together.
//  - (mask & 0xe0) == 0xc0: the result byte is 0xff.
//  - (mask & 0xe0) == 0xe0: the result byte is 0x80.
//  - any other byte with bit 7 set: the result byte is 0x00.
// Indices are formed as 0x0f - n, so source bytes are addressed in reverse order within the
// 16-byte value.
// NOTE(review): presumably because registers are held byte-reversed relative to SPU byte
// numbering — confirm.
void spu_recompiler::SHUFB(spu_opcode_t op)
{
const XmmLink& v0 = XmmGet(op.rc, XmmType::Int); // v0 = mask
const XmmLink& v1 = XmmAlloc();
const XmmLink& v2 = XmmAlloc();
const XmmLink& v3 = XmmAlloc();
const XmmLink& v4 = XmmAlloc();
const XmmLink& vFF = XmmAlloc();
c->movdqa(v2, v0); // v2 = mask
// generate specific values:
c->movdqa(v1, XmmConst(_mm_set1_epi8(-0x20))); // v1 = 11100000
c->movdqa(v3, XmmConst(_mm_set1_epi8(-0x80))); // v3 = 10000000
c->pand(v2, v1); // filter mask v2 = mask & 11100000
c->movdqa(vFF, v2); // and copy vFF = mask & 11100000
c->movdqa(v4, XmmConst(_mm_set1_epi8(-0x40))); // v4 = 11000000
c->pcmpeqb(vFF, v4); // gen 0xff vFF = (mask & 11100000 == 11000000) ? 0xff : 0
c->movdqa(v4, v2); // copy again v4 = mask & 11100000
c->pand(v4, v3); // filter mask v4 = mask & 10000000
c->pcmpeqb(v2, v1); // v2 = (mask & 11100000 == 11100000) ? 0xff : 0
c->pcmpeqb(v4, v3); // v4 = (mask & 10000000 == 10000000) ? 0xff : 0
c->pand(v2, v3); // generate 0x80 v2 = (mask & 11100000 == 11100000) ? 0x80 : 0
c->por(vFF, v2); // merge 0xff, 0x80 vFF = (mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 0x80 : 0
c->pandn(v1, v0); // filter mask v1 = mask & 00011111
// select bytes from [op.rb]:
c->movdqa(v2, XmmConst(_mm_set1_epi8(0x0f))); // v2 = 00001111
c->pxor(v1, XmmConst(_mm_set1_epi8(0x10))); // v1 = (mask & 00011111) ^ 00010000
c->psubb(v2, v1); // v2 = 00001111 - ((mask & 00011111) ^ 00010000)
c->movdqa(v1, SPU_OFF_128(gpr, op.rb)); // v1 = op.rb
c->pshufb(v1, v2); // v1 = select(op.rb, 00001111 - ((mask & 00011111) ^ 00010000))
// select bytes from [op.ra]:
c->pxor(v2, XmmConst(_mm_set1_epi8(-0x10))); // v2 = (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000
c->movdqa(v3, SPU_OFF_128(gpr, op.ra)); // v3 = op.ra
c->pshufb(v3, v2); // v3 = select(op.ra, (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000)
c->por(v1, v3); // v1 = select(op.rb, 00001111 - ((mask & 00011111) ^ 00010000)) | (v3)
c->pandn(v4, v1); // filter result v4 = v1 & ((mask & 10000000 == 10000000) ? 0 : 0xff)
c->por(vFF, v4); // final merge vFF = (mask & 10000000 == 10000000) ? ((mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 0x80 : 0) : (v1)
c->movdqa(SPU_OFF_128(gpr, op.rt4), vFF);
}
void spu_recompiler::MPYA(spu_opcode_t op)
{
	// Emit: each 32-bit lane of gpr[rt4] = (low 16 bits of ra) * (low 16 bits of rb) + gpr[rc],
	// with the 16-bit factors treated as signed.
	const XmmLink& prod = XmmGet(op.ra, XmmType::Int);
	const XmmLink& rhs = XmmGet(op.rb, XmmType::Int);
	const XmmLink& low_half = XmmAlloc();

	const auto low_half_const = XmmConst(_mm_set1_epi32(0xffff));
	c->movdqa(low_half, low_half_const);

	// Clear the upper halfwords so pmaddwd's second product of each lane is zero.
	c->pand(prod, low_half);
	c->pand(rhs, low_half);
	c->pmaddwd(prod, rhs);
	c->paddd(prod, SPU_OFF_128(gpr, op.rc));
	c->movdqa(SPU_OFF_128(gpr, op.rt4), prod);
}
void spu_recompiler::FNMS(spu_opcode_t op)
{
	// Emit: gpr[rt4] = gpr[rc] - (a' * b') per 32-bit float lane, where a' and b' are gpr[ra] and
	// gpr[rb] with every lane whose exponent bits (0x7f800000) are all set replaced by zero.
	const XmmLink& acc = XmmGet(op.rc, XmmType::Float);
	const auto exp_bits = XmmConst(_mm_set1_epi32(0x7f800000));
	const XmmLink& lhs = XmmAlloc();
	const XmmLink& rhs = XmmAlloc();

	// lhs = a'
	c->movdqa(lhs, exp_bits);
	c->andps(lhs, SPU_OFF_128(gpr, op.ra)); // isolate the exponent bits of a
	c->cmpps(lhs, exp_bits, 4);             // predicate 4 (not-equal): all ones where the exponent is not all set
	c->andps(lhs, SPU_OFF_128(gpr, op.ra)); // keep a only in those lanes

	// rhs = b'
	c->movdqa(rhs, exp_bits);
	c->andps(rhs, SPU_OFF_128(gpr, op.rb)); // isolate the exponent bits of b
	c->cmpps(rhs, exp_bits, 4);             // predicate 4 (not-equal): all ones where the exponent is not all set
	c->andps(rhs, SPU_OFF_128(gpr, op.rb)); // keep b only in those lanes

	c->mulps(lhs, rhs);
	c->subps(acc, lhs);
	c->movaps(SPU_OFF_128(gpr, op.rt4), acc);
}
void spu_recompiler::FMA(spu_opcode_t op)
{
	// Emit: gpr[rt4] = (a' * b') + gpr[rc] per 32-bit float lane, where a' and b' are gpr[ra] and
	// gpr[rb] with every lane whose exponent bits (0x7f800000) are all set replaced by zero.
	// gpr[rc] is consumed directly as the memory operand of addps, so it is not fetched into a
	// register of its own.
	const auto mask = XmmConst(_mm_set1_epi32(0x7f800000));
	const XmmLink& tmp_a = XmmAlloc();
	const XmmLink& tmp_b = XmmAlloc();

	// tmp_a = a'
	c->movdqa(tmp_a, mask);                   // tmp_a = mask
	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra)); // tmp_a = a & mask
	c->cmpps(tmp_a, mask, 4);                 // predicate 4 (not-equal): tmp_a = tmp_a != mask
	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra)); // tmp_a = mask_a & a

	// tmp_b = b'
	c->movdqa(tmp_b, mask);                   // tmp_b = mask
	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb)); // tmp_b = b & mask
	c->cmpps(tmp_b, mask, 4);                 // predicate 4 (not-equal): tmp_b = tmp_b != mask
	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb)); // tmp_b = mask_b & b

	c->mulps(tmp_a, tmp_b);
	c->addps(tmp_a, SPU_OFF_128(gpr, op.rc));
	c->movaps(SPU_OFF_128(gpr, op.rt4), tmp_a);
}
void spu_recompiler::FMS(spu_opcode_t op)
{
	// Emit: gpr[rt4] = (a' * b') - gpr[rc] per 32-bit float lane, where a' and b' are gpr[ra] and
	// gpr[rb] with every lane whose exponent bits (0x7f800000) are all set replaced by zero.
	// gpr[rc] is consumed directly as the memory operand of subps, so it is not fetched into a
	// register of its own.
	const auto mask = XmmConst(_mm_set1_epi32(0x7f800000));
	const XmmLink& tmp_a = XmmAlloc();
	const XmmLink& tmp_b = XmmAlloc();

	// tmp_a = a'
	c->movdqa(tmp_a, mask);                   // tmp_a = mask
	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra)); // tmp_a = a & mask
	c->cmpps(tmp_a, mask, 4);                 // predicate 4 (not-equal): tmp_a = tmp_a != mask
	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra)); // tmp_a = mask_a & a

	// tmp_b = b'
	c->movdqa(tmp_b, mask);                   // tmp_b = mask
	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb)); // tmp_b = b & mask
	c->cmpps(tmp_b, mask, 4);                 // predicate 4 (not-equal): tmp_b = tmp_b != mask
	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb)); // tmp_b = mask_b & b

	c->mulps(tmp_a, tmp_b);
	c->subps(tmp_a, SPU_OFF_128(gpr, op.rc));
	c->movaps(SPU_OFF_128(gpr, op.rt4), tmp_a);
}
void spu_recompiler::UNK(spu_opcode_t op)
{
	// Fallback handler: log the current position (m_pos) and the raw opcode word while compiling,
	// then emit an x86 int3 (breakpoint) instruction in place of a translation.
	const auto raw_opcode = op.opcode;
	LOG_ERROR(SPU, "0x%05x: Unknown/Illegal opcode (0x%08x)", m_pos, raw_opcode);
	c->int3();
}