Some stuff

This commit is contained in:
Nekotekina 2014-04-20 23:36:53 +04:00
parent e1bbedd4bf
commit 525084e7cc
3 changed files with 366 additions and 210 deletions

View file

@ -181,7 +181,7 @@ private:
CPU.VSCR.VSCR = CPU.VPR[vb]._u32[0]; CPU.VSCR.VSCR = CPU.VPR[vb]._u32[0];
CPU.VSCR.X = CPU.VSCR.Y = 0; CPU.VSCR.X = CPU.VSCR.Y = 0;
} }
void VADDCUW(u32 vd, u32 va, u32 vb) void VADDCUW(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -195,7 +195,7 @@ private:
CPU.VPR[vd]._f[w] = CPU.VPR[va]._f[w] + CPU.VPR[vb]._f[w]; CPU.VPR[vd]._f[w] = CPU.VPR[va]._f[w] + CPU.VPR[vb]._f[w];
} }
} }
void VADDSBS(u32 vd, u32 va, u32 vb) void VADDSBS(u32 vd, u32 va, u32 vb) //nf
{ {
for(u32 b=0; b<16; ++b) for(u32 b=0; b<16; ++b)
{ {
@ -235,7 +235,7 @@ private:
CPU.VPR[vd]._s16[h] = result; CPU.VPR[vd]._s16[h] = result;
} }
} }
void VADDSWS(u32 vd, u32 va, u32 vb) void VADDSWS(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -335,21 +335,21 @@ private:
CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] & (~CPU.VPR[vb]._u32[w]); CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] & (~CPU.VPR[vb]._u32[w]);
} }
} }
void VAVGSB(u32 vd, u32 va, u32 vb) void VAVGSB(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint b = 0; b < 16; b++) for (uint b = 0; b < 16; b++)
{ {
CPU.VPR[vd]._s8[b] = (CPU.VPR[va]._s8[b] + CPU.VPR[vb]._s8[b] + 1) >> 1; CPU.VPR[vd]._s8[b] = (CPU.VPR[va]._s8[b] + CPU.VPR[vb]._s8[b] + 1) >> 1;
} }
} }
void VAVGSH(u32 vd, u32 va, u32 vb) void VAVGSH(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint h = 0; h < 8; h++) for (uint h = 0; h < 8; h++)
{ {
CPU.VPR[vd]._s16[h] = (CPU.VPR[va]._s16[h] + CPU.VPR[vb]._s16[h] + 1) >> 1; CPU.VPR[vd]._s16[h] = (CPU.VPR[va]._s16[h] + CPU.VPR[vb]._s16[h] + 1) >> 1;
} }
} }
void VAVGSW(u32 vd, u32 va, u32 vb) void VAVGSW(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -361,14 +361,14 @@ private:
for (uint b = 0; b < 16; b++) for (uint b = 0; b < 16; b++)
CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] + CPU.VPR[vb]._u8[b] + 1) >> 1; CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] + CPU.VPR[vb]._u8[b] + 1) >> 1;
} }
void VAVGUH(u32 vd, u32 va, u32 vb) void VAVGUH(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint h = 0; h < 8; h++) for (uint h = 0; h < 8; h++)
{ {
CPU.VPR[vd]._u16[h] = (CPU.VPR[va]._u16[h] + CPU.VPR[vb]._u16[h] + 1) >> 1; CPU.VPR[vd]._u16[h] = (CPU.VPR[va]._u16[h] + CPU.VPR[vb]._u16[h] + 1) >> 1;
} }
} }
void VAVGUW(u32 vd, u32 va, u32 vb) void VAVGUW(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -487,14 +487,14 @@ private:
CPU.CR.cr6 = all_equal | none_equal; CPU.CR.cr6 = all_equal | none_equal;
} }
void VCMPEQUH(u32 vd, u32 va, u32 vb) void VCMPEQUH(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint h = 0; h < 8; h++) for (uint h = 0; h < 8; h++)
{ {
CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] == CPU.VPR[vb]._u16[h] ? 0xffff : 0; CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] == CPU.VPR[vb]._u16[h] ? 0xffff : 0;
} }
} }
void VCMPEQUH_(u32 vd, u32 va, u32 vb) void VCMPEQUH_(u32 vd, u32 va, u32 vb) //nf
{ {
int all_equal = 0x8; int all_equal = 0x8;
int none_equal = 0x2; int none_equal = 0x2;
@ -599,7 +599,7 @@ private:
CPU.CR.cr6 = all_ge | none_ge; CPU.CR.cr6 = all_ge | none_ge;
} }
void VCMPGTSB(u32 vd, u32 va, u32 vb) void VCMPGTSB(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint b = 0; b < 16; b++) for (uint b = 0; b < 16; b++)
{ {
@ -833,7 +833,7 @@ private:
CPU.VPR[vd]._f[w] = max(CPU.VPR[va]._f[w], CPU.VPR[vb]._f[w]); CPU.VPR[vd]._f[w] = max(CPU.VPR[va]._f[w], CPU.VPR[vb]._f[w]);
} }
} }
void VMAXSB(u32 vd, u32 va, u32 vb) void VMAXSB(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint b = 0; b < 16; b++) for (uint b = 0; b < 16; b++)
CPU.VPR[vd]._s8[b] = max(CPU.VPR[va]._s8[b], CPU.VPR[vb]._s8[b]); CPU.VPR[vd]._s8[b] = max(CPU.VPR[va]._s8[b], CPU.VPR[vb]._s8[b]);
@ -918,7 +918,7 @@ private:
CPU.VPR[vd]._f[w] = min(CPU.VPR[va]._f[w], CPU.VPR[vb]._f[w]); CPU.VPR[vd]._f[w] = min(CPU.VPR[va]._f[w], CPU.VPR[vb]._f[w]);
} }
} }
void VMINSB(u32 vd, u32 va, u32 vb) void VMINSB(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint b = 0; b < 16; b++) for (uint b = 0; b < 16; b++)
{ {
@ -1021,7 +1021,7 @@ private:
CPU.VPR[vd]._u32[3 - d*2 - 1] = CPU.VPR[vb]._u32[1 - d]; CPU.VPR[vd]._u32[3 - d*2 - 1] = CPU.VPR[vb]._u32[1 - d];
} }
} }
void VMSUMMBM(u32 vd, u32 va, u32 vb, u32 vc) void VMSUMMBM(u32 vd, u32 va, u32 vb, u32 vc) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -1036,7 +1036,7 @@ private:
CPU.VPR[vd]._s32[w] = result; CPU.VPR[vd]._s32[w] = result;
} }
} }
void VMSUMSHM(u32 vd, u32 va, u32 vb, u32 vc) void VMSUMSHM(u32 vd, u32 va, u32 vb, u32 vc) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -1051,7 +1051,7 @@ private:
CPU.VPR[vd]._s32[w] = result; CPU.VPR[vd]._s32[w] = result;
} }
} }
void VMSUMSHS(u32 vd, u32 va, u32 vb, u32 vc) void VMSUMSHS(u32 vd, u32 va, u32 vb, u32 vc) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -1096,7 +1096,7 @@ private:
CPU.VPR[vd]._u32[w] = result; CPU.VPR[vd]._u32[w] = result;
} }
} }
void VMSUMUHM(u32 vd, u32 va, u32 vb, u32 vc) void VMSUMUHM(u32 vd, u32 va, u32 vb, u32 vc) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -1111,7 +1111,7 @@ private:
CPU.VPR[vd]._u32[w] = result; CPU.VPR[vd]._u32[w] = result;
} }
} }
void VMSUMUHS(u32 vd, u32 va, u32 vb, u32 vc) void VMSUMUHS(u32 vd, u32 va, u32 vb, u32 vc) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -1136,7 +1136,7 @@ private:
CPU.VPR[vd]._u32[w] = saturated; CPU.VPR[vd]._u32[w] = saturated;
} }
} }
void VMULESB(u32 vd, u32 va, u32 vb) void VMULESB(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint h = 0; h < 8; h++) for (uint h = 0; h < 8; h++)
{ {
@ -1164,7 +1164,7 @@ private:
CPU.VPR[vd]._u32[w] = (u32)CPU.VPR[va]._u16[w*2+1] * (u32)CPU.VPR[vb]._u16[w*2+1]; CPU.VPR[vd]._u32[w] = (u32)CPU.VPR[va]._u16[w*2+1] * (u32)CPU.VPR[vb]._u16[w*2+1];
} }
} }
void VMULOSB(u32 vd, u32 va, u32 vb) void VMULOSB(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint h = 0; h < 8; h++) for (uint h = 0; h < 8; h++)
{ {
@ -1243,7 +1243,7 @@ private:
CPU.VPR[vd]._u16[4 + (3 - h)] = (ab7 << 15) | (ab8 << 10) | (ab16 << 5) | ab24; CPU.VPR[vd]._u16[4 + (3 - h)] = (ab7 << 15) | (ab8 << 10) | (ab16 << 5) | ab24;
} }
} }
void VPKSHSS(u32 vd, u32 va, u32 vb) void VPKSHSS(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint b = 0; b < 8; b++) for (uint b = 0; b < 8; b++)
{ {
@ -1348,7 +1348,7 @@ private:
CPU.VPR[vd]._s16[h] = result; CPU.VPR[vd]._s16[h] = result;
} }
} }
void VPKSWUS(u32 vd, u32 va, u32 vb) void VPKSWUS(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint h = 0; h < 4; h++) for (uint h = 0; h < 4; h++)
{ {
@ -1383,7 +1383,7 @@ private:
CPU.VPR[vd]._u16[h] = result; CPU.VPR[vd]._u16[h] = result;
} }
} }
void VPKUHUM(u32 vd, u32 va, u32 vb) void VPKUHUM(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint b = 0; b < 8; b++) for (uint b = 0; b < 8; b++)
{ {
@ -1424,7 +1424,7 @@ private:
CPU.VPR[vd]._u16[h ] = CPU.VPR[vb]._u16[h*2]; CPU.VPR[vd]._u16[h ] = CPU.VPR[vb]._u16[h*2];
} }
} }
void VPKUWUS(u32 vd, u32 va, u32 vb) void VPKUWUS(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint h = 0; h < 4; h++) for (uint h = 0; h < 4; h++)
{ {
@ -1486,7 +1486,7 @@ private:
CPU.VPR[vd]._f[w] = f; CPU.VPR[vd]._f[w] = f;
} }
} }
void VRLB(u32 vd, u32 va, u32 vb) void VRLB(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint b = 0; b < 16; b++) for (uint b = 0; b < 16; b++)
{ {
@ -1495,7 +1495,7 @@ private:
CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] << nRot) | (CPU.VPR[va]._u8[b] >> (8 - nRot)); CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] << nRot) | (CPU.VPR[va]._u8[b] >> (8 - nRot));
} }
} }
void VRLH(u32 vd, u32 va, u32 vb) void VRLH(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint h = 0; h < 8; h++) for (uint h = 0; h < 8; h++)
{ {
@ -1524,7 +1524,7 @@ private:
CPU.VPR[vd]._u8[b] = (CPU.VPR[vb]._u8[b] & CPU.VPR[vc]._u8[b]) | (CPU.VPR[va]._u8[b] & (~CPU.VPR[vc]._u8[b])); CPU.VPR[vd]._u8[b] = (CPU.VPR[vb]._u8[b] & CPU.VPR[vc]._u8[b]) | (CPU.VPR[va]._u8[b] & (~CPU.VPR[vc]._u8[b]));
} }
} }
void VSL(u32 vd, u32 va, u32 vb) void VSL(u32 vd, u32 va, u32 vb) //nf
{ {
u8 sh = CPU.VPR[vb]._u8[0] & 0x7; u8 sh = CPU.VPR[vb]._u8[0] & 0x7;
@ -1648,7 +1648,7 @@ private:
CPU.VPR[vd]._u32[w] = word; CPU.VPR[vd]._u32[w] = word;
} }
} }
void VSR(u32 vd, u32 va, u32 vb) void VSR(u32 vd, u32 va, u32 vb) //nf
{ {
u8 sh = CPU.VPR[vb]._u8[0] & 0x7; u8 sh = CPU.VPR[vb]._u8[0] & 0x7;
u32 t = 1; u32 t = 1;
@ -1676,7 +1676,7 @@ private:
CPU.VPR[vd]._u32[3] = 0xCDCDCDCD; CPU.VPR[vd]._u32[3] = 0xCDCDCDCD;
} }
} }
void VSRAB(u32 vd, u32 va, u32 vb) void VSRAB(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint b = 0; b < 16; b++) for (uint b = 0; b < 16; b++)
{ {
@ -1729,7 +1729,7 @@ private:
CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] >> (CPU.VPR[vb]._u8[w*4] & 0x1f); CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] >> (CPU.VPR[vb]._u8[w*4] & 0x1f);
} }
} }
void VSUBCUW(u32 vd, u32 va, u32 vb) void VSUBCUW(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -1743,7 +1743,7 @@ private:
CPU.VPR[vd]._f[w] = CPU.VPR[va]._f[w] - CPU.VPR[vb]._f[w]; CPU.VPR[vd]._f[w] = CPU.VPR[va]._f[w] - CPU.VPR[vb]._f[w];
} }
} }
void VSUBSBS(u32 vd, u32 va, u32 vb) void VSUBSBS(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint b = 0; b < 16; b++) for (uint b = 0; b < 16; b++)
{ {
@ -1832,7 +1832,7 @@ private:
CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] - CPU.VPR[vb]._u16[h]; CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] - CPU.VPR[vb]._u16[h];
} }
} }
void VSUBUHS(u32 vd, u32 va, u32 vb) void VSUBUHS(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint h = 0; h < 8; h++) for (uint h = 0; h < 8; h++)
{ {
@ -1915,7 +1915,7 @@ private:
CPU.VPR[vd]._s32[1] = 0; CPU.VPR[vd]._s32[1] = 0;
CPU.VPR[vd]._s32[3] = 0; CPU.VPR[vd]._s32[3] = 0;
} }
void VSUM4SBS(u32 vd, u32 va, u32 vb) void VSUM4SBS(u32 vd, u32 va, u32 vb) //nf
{ {
for (uint w = 0; w < 4; w++) for (uint w = 0; w < 4; w++)
{ {
@ -2019,7 +2019,7 @@ private:
CPU.VPR[vd]._u8[(3 - w)*4 + 0] = CPU.VPR[vb]._u8[8 + w*2 + 1] & 0x1f; CPU.VPR[vd]._u8[(3 - w)*4 + 0] = CPU.VPR[vb]._u8[8 + w*2 + 1] & 0x1f;
} }
} }
void VUPKLSB(u32 vd, u32 vb) void VUPKLSB(u32 vd, u32 vb) //nf
{ {
for (uint h = 0; h < 8; h++) for (uint h = 0; h < 8; h++)
{ {

View file

@ -14,6 +14,58 @@ using namespace asmjit::host;
#define UNIMPLEMENTED() UNK(__FUNCTION__) #define UNIMPLEMENTED() UNK(__FUNCTION__)
struct g_imm_table_struct
{
u16 cntb_table[65536];
__m128i fsmb_table[65536];
__m128i fsmh_table[256];
__m128i fsm_table[16];
__m128i sldq_pshufb[32];
__m128i srdq_pshufb[32];
__m128i rldq_pshufb[16];
g_imm_table_struct()
{
static_assert(offsetof(g_imm_table_struct, cntb_table) == 0, "offsetof(cntb_table) != 0");
for (u32 i = 0; i < sizeof(cntb_table) / sizeof(cntb_table[0]); i++)
{
u32 cnt_low = 0, cnt_high = 0;
for (u32 j = 0; j < 8; j++)
{
cnt_low += (i >> j) & 1;
cnt_high += (i >> (j + 8)) & 1;
}
cntb_table[i] = (cnt_high << 8) | cnt_low;
}
for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++)
{
for (u32 j = 0; j < 4; j++) fsm_table[i].m128i_u32[j] = (i & (1 << j)) ? ~0 : 0;
}
for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++)
{
for (u32 j = 0; j < 8; j++) fsmh_table[i].m128i_u16[j] = (i & (1 << j)) ? ~0 : 0;
}
for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++)
{
for (u32 j = 0; j < 16; j++) fsmb_table[i].m128i_u8[j] = (i & (1 << j)) ? ~0 : 0;
}
for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) sldq_pshufb[i].m128i_u8[j] = (u8)(j - i);
}
for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) srdq_pshufb[i].m128i_u8[j] = (j + i > 15) ? 0xff : (u8)(j + i);
}
for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++)
{
for (u32 j = 0; j < 16; j++) rldq_pshufb[i].m128i_u8[j] = (u8)(j - i) & 0xf;
}
}
};
class SPURecompiler; class SPURecompiler;
class SPURecompilerCore : public CPUDecoder class SPURecompilerCore : public CPUDecoder
@ -57,6 +109,9 @@ public:
#define cpu_word(x) word_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 2) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 2") #define cpu_word(x) word_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 2) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 2")
#define cpu_byte(x) byte_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 1) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 1") #define cpu_byte(x) byte_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 1) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 1")
// g_imm_xmm(x): 128-bit memory operand for member x of g_imm_table_struct,
// addressed as a fixed displacement from the table base held in g_imm_var.
#define g_imm_xmm(x) oword_ptr(*g_imm_var, offsetof(g_imm_table_struct, x))
// g_imm2_xmm(x, y): same as g_imm_xmm(x), but with variable y passed as an extra operand.
// Callers in this file pre-multiply y by 16 (the size of one table entry) before using it.
// NOTE(review): the literal 0 is presumably asmjit's index shift (no scaling) - confirm against the asmjit version in use.
#define g_imm2_xmm(x, y) oword_ptr(*g_imm_var, y, 0, offsetof(g_imm_table_struct, x))
#define LOG_OPCODE(...) //ConLog.Write("Compiled "__FUNCTION__"(): "__VA_ARGS__) #define LOG_OPCODE(...) //ConLog.Write("Compiled "__FUNCTION__"(): "__VA_ARGS__)
#define LOG3_OPCODE(...) //ConLog.Write("Linked "__FUNCTION__"(): "__VA_ARGS__) #define LOG3_OPCODE(...) //ConLog.Write("Linked "__FUNCTION__"(): "__VA_ARGS__)
@ -97,12 +152,14 @@ public:
GpVar* cpu_var; GpVar* cpu_var;
GpVar* ls_var; GpVar* ls_var;
GpVar* imm_var; GpVar* imm_var;
// (input) output: GpVar* g_imm_var;
// output:
GpVar* pos_var; GpVar* pos_var;
// temporary: // temporary:
GpVar* addr; GpVar* addr;
GpVar* qw0; GpVar* qw0;
GpVar* qw1; GpVar* qw1;
GpVar* qw2;
struct XmmLink struct XmmLink
{ {
@ -578,30 +635,41 @@ private:
} }
void ROT(u32 rt, u32 ra, u32 rb) void ROT(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
CPU.GPR[rt]._u32[0] = (CPU.GPR[ra]._u32[0] << (CPU.GPR[rb]._u32[0] & 0x1f)) | (CPU.GPR[ra]._u32[0] >> (32 - (CPU.GPR[rb]._u32[0] & 0x1f))); for (u32 i = 0; i < 4; i++)
CPU.GPR[rt]._u32[1] = (CPU.GPR[ra]._u32[1] << (CPU.GPR[rb]._u32[1] & 0x1f)) | (CPU.GPR[ra]._u32[1] >> (32 - (CPU.GPR[rb]._u32[1] & 0x1f))); {
CPU.GPR[rt]._u32[2] = (CPU.GPR[ra]._u32[2] << (CPU.GPR[rb]._u32[2] & 0x1f)) | (CPU.GPR[ra]._u32[2] >> (32 - (CPU.GPR[rb]._u32[2] & 0x1f))); c.mov(qw0->r32(), cpu_dword(GPR[ra]._u32[i]));
CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[3] << (CPU.GPR[rb]._u32[3] & 0x1f)) | (CPU.GPR[ra]._u32[3] >> (32 - (CPU.GPR[rb]._u32[3] & 0x1f))); c.mov(*addr, cpu_dword(GPR[rb]._u32[i]));
WRAPPER_END(rt, ra, rb, 0); c.rol(qw0->r32(), *addr);
c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32());
}
LOG_OPCODE();
} }
void ROTM(u32 rt, u32 ra, u32 rb) void ROTM(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
CPU.GPR[rt]._u32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : 0; for (u32 i = 0; i < 4; i++)
CPU.GPR[rt]._u32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : 0; {
CPU.GPR[rt]._u32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : 0; c.mov(qw0->r32(), cpu_dword(GPR[ra]._u32[i]));
CPU.GPR[rt]._u32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : 0; c.mov(*addr, cpu_dword(GPR[rb]._u32[i]));
WRAPPER_END(rt, ra, rb, 0); c.neg(*addr);
c.shr(*qw0, *addr);
c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32());
}
LOG_OPCODE();
} }
void ROTMA(u32 rt, u32 ra, u32 rb) void ROTMA(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
CPU.GPR[rt]._i32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : CPU.GPR[ra]._i32[0] >> 31; for (u32 i = 0; i < 4; i++)
CPU.GPR[rt]._i32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : CPU.GPR[ra]._i32[1] >> 31; {
CPU.GPR[rt]._i32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : CPU.GPR[ra]._i32[2] >> 31; c.movsxd(*qw0, cpu_dword(GPR[ra]._u32[i]));
CPU.GPR[rt]._i32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : CPU.GPR[ra]._i32[3] >> 31; c.mov(*addr, cpu_dword(GPR[rb]._u32[i]));
WRAPPER_END(rt, ra, rb, 0); c.neg(*addr);
c.sar(*qw0, *addr);
c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32());
}
LOG_OPCODE();
} }
void SHL(u32 rt, u32 ra, u32 rb) void SHL(u32 rt, u32 ra, u32 rb)
{ {
@ -617,31 +685,53 @@ private:
} }
void ROTH(u32 rt, u32 ra, u32 rb) void ROTH(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
for (int h = 0; h < 8; h++) for (u32 i = 0; i < 8; i++)
CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0xf)) | (CPU.GPR[ra]._u16[h] >> (16 - (CPU.GPR[rb]._u16[h] & 0xf))); {
WRAPPER_END(rt, ra, rb, 0); c.movzx(qw0->r32(), cpu_word(GPR[ra]._u16[i]));
c.movzx(*addr, cpu_word(GPR[rb]._u16[i]));
c.rol(qw0->r16(), *addr);
c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16());
}
LOG_OPCODE();
} }
void ROTHM(u32 rt, u32 ra, u32 rb) void ROTHM(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
for (int h = 0; h < 8; h++) for (u32 i = 0; i < 8; i++)
CPU.GPR[rt]._u16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? CPU.GPR[ra]._u16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : 0; {
WRAPPER_END(rt, ra, rb, 0); c.movzx(qw0->r32(), cpu_word(GPR[ra]._u16[i]));
c.movzx(*addr, cpu_word(GPR[rb]._u16[i]));
c.neg(*addr);
c.shr(qw0->r32(), *addr);
c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16());
}
LOG_OPCODE();
} }
void ROTMAH(u32 rt, u32 ra, u32 rb) void ROTMAH(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
for (int h = 0; h < 8; h++) for (u32 i = 0; i < 8; i++)
CPU.GPR[rt]._i16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? CPU.GPR[ra]._i16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : CPU.GPR[ra]._i16[h] >> 15; {
WRAPPER_END(rt, ra, rb, 0); c.movsx(qw0->r32(), cpu_word(GPR[ra]._u16[i]));
c.movzx(*addr, cpu_word(GPR[rb]._u16[i]));
c.neg(*addr);
c.sar(qw0->r32(), *addr);
c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16());
}
LOG_OPCODE();
} }
void SHLH(u32 rt, u32 ra, u32 rb) void SHLH(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
for (int h = 0; h < 8; h++) for (u32 i = 0; i < 8; i++)
CPU.GPR[rt]._u16[h] = (CPU.GPR[rb]._u16[h] & 0x1f) > 15 ? 0 : CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0x1f); {
WRAPPER_END(rt, ra, rb, 0); c.movzx(qw0->r32(), cpu_word(GPR[ra]._u16[i]));
c.movzx(*addr, cpu_word(GPR[rb]._u16[i]));
c.shl(qw0->r32(), *addr);
c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16());
}
LOG_OPCODE();
} }
void ROTI(u32 rt, u32 ra, s32 i7) void ROTI(u32 rt, u32 ra, s32 i7)
{ {
@ -1186,27 +1276,33 @@ private:
} }
void FSM(u32 rt, u32 ra) void FSM(u32 rt, u32 ra)
{ {
WRAPPER_BEGIN(rt, ra, yy, zz); const XmmLink& vr = XmmAlloc(rt);
const u32 pref = CPU.GPR[ra]._u32[3]; c.mov(*addr, cpu_dword(GPR[ra]._u32[3]));
for (int w = 0; w < 4; w++) c.and_(*addr, 0xf);
CPU.GPR[rt]._u32[w] = (pref & (1 << w)) ? ~0 : 0; c.shl(*addr, 4);
WRAPPER_END(rt, ra, 0, 0); c.movdqa(vr.get(), g_imm2_xmm(fsm_table[0], *addr));
XmmFinalize(vr, rt);
LOG_OPCODE();
} }
void FSMH(u32 rt, u32 ra) void FSMH(u32 rt, u32 ra)
{ {
WRAPPER_BEGIN(rt, ra, yy, zz); const XmmLink& vr = XmmAlloc(rt);
const u32 pref = CPU.GPR[ra]._u32[3]; c.mov(*addr, cpu_dword(GPR[ra]._u32[3]));
for (int h = 0; h < 8; h++) c.and_(*addr, 0xff);
CPU.GPR[rt]._u16[h] = (pref & (1 << h)) ? ~0 : 0; c.shl(*addr, 4);
WRAPPER_END(rt, ra, 0, 0); c.movdqa(vr.get(), g_imm2_xmm(fsmh_table[0], *addr));
XmmFinalize(vr, rt);
LOG_OPCODE();
} }
void FSMB(u32 rt, u32 ra) void FSMB(u32 rt, u32 ra)
{ {
WRAPPER_BEGIN(rt, ra, yy, zz); const XmmLink& vr = XmmAlloc(rt);
const u32 pref = CPU.GPR[ra]._u32[3]; c.mov(*addr, cpu_dword(GPR[ra]._u32[3]));
for (int b = 0; b < 16; b++) c.and_(*addr, 0xffff);
CPU.GPR[rt]._u8[b] = (pref & (1 << b)) ? ~0 : 0; c.shl(*addr, 4);
WRAPPER_END(rt, ra, 0, 0); c.movdqa(vr.get(), g_imm2_xmm(fsmb_table[0], *addr));
XmmFinalize(vr, rt);
LOG_OPCODE();
} }
void FREST(u32 rt, u32 ra) void FREST(u32 rt, u32 ra)
{ {
@ -1247,32 +1343,35 @@ private:
} }
void ROTQBYBI(u32 rt, u32 ra, u32 rb) void ROTQBYBI(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); const XmmLink& va = XmmGet(ra, rt);
const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0xf; c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.and_(*addr, 0xf << 3);
for (int b = 0; b < 16; b++) c.shl(*addr, 1);
CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; c.pshufb(va.get(), g_imm2_xmm(rldq_pshufb[0], *addr));
WRAPPER_END(rt, ra, rb, 0); XmmFinalize(va, rt);
LOG_OPCODE();
} }
void ROTQMBYBI(u32 rt, u32 ra, u32 rb) void ROTQMBYBI(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); const XmmLink& va = XmmGet(ra, rt);
const int s = (0 - (CPU.GPR[rb]._u32[3] >> 3)) & 0x1f; c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.shr(*addr, 3);
CPU.GPR[rt].Reset(); c.neg(*addr);
for (int b = 0; b < 16 - s; b++) c.and_(*addr, 0x1f);
CPU.GPR[rt]._u8[b] = temp._u8[(b + s) & 0xf]; c.shl(*addr, 4);
WRAPPER_END(rt, ra, rb, 0); c.pshufb(va.get(), g_imm2_xmm(srdq_pshufb[0], *addr));
XmmFinalize(va, rt);
LOG_OPCODE();
} }
void SHLQBYBI(u32 rt, u32 ra, u32 rb) void SHLQBYBI(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); const XmmLink& va = XmmGet(ra, rt);
const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0x1f; c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.and_(*addr, 0x1f << 3);
CPU.GPR[rt].Reset(); c.shl(*addr, 1);
for (int b = s; b < 16; b++) c.pshufb(va.get(), g_imm2_xmm(sldq_pshufb[0], *addr));
CPU.GPR[rt]._u8[b] = temp._u8[b - s]; XmmFinalize(va, rt);
WRAPPER_END(rt, ra, rb, 0); LOG_OPCODE();
} }
void CBX(u32 rt, u32 ra, u32 rb) void CBX(u32 rt, u32 ra, u32 rb)
{ {
@ -1361,73 +1460,89 @@ private:
} }
void ROTQBI(u32 rt, u32 ra, u32 rb) void ROTQBI(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
const int t = CPU.GPR[rb]._u32[3] & 0x7; c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
CPU.GPR[rt]._u32[0] = (temp._u32[0] << t) | (temp._u32[3] >> (32 - t)); c.mov(*qw2, *qw0);
CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t)); c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t)); c.and_(*addr, 7);
CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | (temp._u32[2] >> (32 - t)); c.shld(*qw0, *qw1, *addr);
WRAPPER_END(rt, ra, rb, 0); c.shld(*qw1, *qw2, *addr);
c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
LOG_OPCODE();
} }
void ROTQMBI(u32 rt, u32 ra, u32 rb) void ROTQMBI(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
const int t = (0 - CPU.GPR[rb]._u32[3]) & 0x7; c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
CPU.GPR[rt]._u32[0] = (temp._u32[0] >> t) | (temp._u32[1] << (32 - t)); c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
CPU.GPR[rt]._u32[1] = (temp._u32[1] >> t) | (temp._u32[2] << (32 - t)); c.neg(*addr);
CPU.GPR[rt]._u32[2] = (temp._u32[2] >> t) | (temp._u32[3] << (32 - t)); c.and_(*addr, 7);
CPU.GPR[rt]._u32[3] = (temp._u32[3] >> t); c.shrd(*qw0, *qw1, *addr);
WRAPPER_END(rt, ra, rb, 0); c.shr(*qw1, *addr);
c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
LOG_OPCODE();
} }
void SHLQBI(u32 rt, u32 ra, u32 rb) void SHLQBI(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); XmmInvalidate(rt);
const int t = CPU.GPR[rb]._u32[3] & 0x7; c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
CPU.GPR[rt]._u32[0] = (temp._u32[0] << t); c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t)); c.and_(*addr, 7);
CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t)); c.shld(*qw1, *qw0, *addr);
CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | (temp._u32[2] >> (32 - t)); c.shl(*qw0, *addr);
WRAPPER_END(rt, ra, rb, 0); c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
LOG_OPCODE();
} }
void ROTQBY(u32 rt, u32 ra, u32 rb) void ROTQBY(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); const XmmLink& va = XmmGet(ra, rt);
const int s = CPU.GPR[rb]._u32[3] & 0xf; c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.and_(*addr, 0xf);
for (int b = 0; b < 16; ++b) c.shl(*addr, 4);
CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; c.pshufb(va.get(), g_imm2_xmm(rldq_pshufb[0], *addr));
WRAPPER_END(rt, ra, rb, 0); XmmFinalize(va, rt);
LOG_OPCODE();
} }
void ROTQMBY(u32 rt, u32 ra, u32 rb) void ROTQMBY(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); const XmmLink& va = XmmGet(ra, rt);
const int s = (0 - CPU.GPR[rb]._u32[3]) & 0x1f; c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.neg(*addr);
CPU.GPR[rt].Reset(); c.and_(*addr, 0x1f);
for (int b = 0; b < 16 - s; b++) c.shl(*addr, 4);
CPU.GPR[rt]._u8[b] = temp._u8[(b + s) & 0xf]; c.pshufb(va.get(), g_imm2_xmm(srdq_pshufb[0], *addr));
WRAPPER_END(rt, ra, rb, 0); XmmFinalize(va, rt);
LOG_OPCODE();
} }
void SHLQBY(u32 rt, u32 ra, u32 rb) void SHLQBY(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); const XmmLink& va = XmmGet(ra, rt);
const int s = CPU.GPR[rb]._u32[3] & 0x1f; c.mov(*addr, cpu_dword(GPR[rb]._u32[3]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.and_(*addr, 0x1f);
CPU.GPR[rt].Reset(); c.shl(*addr, 4);
for (int b = s; b < 16; b++) c.pshufb(va.get(), g_imm2_xmm(sldq_pshufb[0], *addr));
CPU.GPR[rt]._u8[b] = temp._u8[b - s]; XmmFinalize(va, rt);
WRAPPER_END(rt, ra, rb, 0); LOG_OPCODE();
} }
void ORX(u32 rt, u32 ra) void ORX(u32 rt, u32 ra)
{ {
WRAPPER_BEGIN(rt, ra, yy, zz); XmmInvalidate(rt);
CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[0] | CPU.GPR[ra]._u32[1] | CPU.GPR[ra]._u32[2] | CPU.GPR[ra]._u32[3]; c.mov(*addr, cpu_dword(GPR[ra]._u32[0]));
CPU.GPR[rt]._u32[2] = 0; c.or_(*addr, cpu_dword(GPR[ra]._u32[1]));
CPU.GPR[rt]._u64[0] = 0; c.or_(*addr, cpu_dword(GPR[ra]._u32[2]));
WRAPPER_END(rt, ra, 0, 0); c.or_(*addr, cpu_dword(GPR[ra]._u32[3]));
c.mov(cpu_dword(GPR[rt]._u32[3]), *addr);
c.xor_(*addr, *addr);
c.mov(cpu_dword(GPR[rt]._u32[0]), *addr);
c.mov(cpu_dword(GPR[rt]._u32[1]), *addr);
c.mov(cpu_dword(GPR[rt]._u32[2]), *addr);
LOG_OPCODE();
} }
void CBD(u32 rt, u32 ra, s32 i7) void CBD(u32 rt, u32 ra, s32 i7)
{ {
@ -1488,36 +1603,37 @@ private:
} }
void ROTQBII(u32 rt, u32 ra, s32 i7) void ROTQBII(u32 rt, u32 ra, s32 i7)
{ {
WRAPPER_BEGIN(rt, ra, i7, zz); XmmInvalidate(rt);
const int s = i7 & 0x7; c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
CPU.GPR[rt]._u32[0] = (temp._u32[0] << s) | (temp._u32[3] >> (32 - s)); c.mov(*qw2, *qw0);
CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s)); c.shld(*qw0, *qw1, i7 & 0x7);
CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s)); c.shld(*qw1, *qw2, i7 & 0x7);
CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> (32 - s)); c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
WRAPPER_END(rt, ra, i7, 0); c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
LOG_OPCODE();
} }
void ROTQMBII(u32 rt, u32 ra, s32 i7) void ROTQMBII(u32 rt, u32 ra, s32 i7)
{ {
WRAPPER_BEGIN(rt, ra, i7, zz); XmmInvalidate(rt);
const int s = (0 - (s32)i7) & 0x7; c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
CPU.GPR[rt]._u32[0] = (temp._u32[0] >> s) | (temp._u32[1] << (32 - s)); c.shrd(*qw0, *qw1, (0 - i7) & 0x7);
CPU.GPR[rt]._u32[1] = (temp._u32[1] >> s) | (temp._u32[2] << (32 - s)); c.shr(*qw1, (0 - i7) & 0x7);
CPU.GPR[rt]._u32[2] = (temp._u32[2] >> s) | (temp._u32[3] << (32 - s)); c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
CPU.GPR[rt]._u32[3] = (temp._u32[3] >> s); c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
WRAPPER_END(rt, ra, i7, 0); LOG_OPCODE();
} }
void SHLQBII(u32 rt, u32 ra, s32 i7) void SHLQBII(u32 rt, u32 ra, s32 i7)
{ {
WRAPPER_BEGIN(rt, ra, i7, zz); XmmInvalidate(rt);
const int s = i7 & 0x7; c.mov(*qw0, cpu_qword(GPR[ra]._u64[0]));
const SPU_GPR_hdr temp = CPU.GPR[ra]; c.mov(*qw1, cpu_qword(GPR[ra]._u64[1]));
CPU.GPR[rt]._u32[0] = (temp._u32[0] << s); c.shld(*qw1, *qw0, i7 & 0x7);
CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s)); c.shl(*qw0, i7 & 0x7);
CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s)); c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0);
CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> (32 - s)); c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1);
WRAPPER_END(rt, ra, i7, 0); LOG_OPCODE();
} }
void ROTQBYI(u32 rt, u32 ra, s32 i7) void ROTQBYI(u32 rt, u32 ra, s32 i7)
{ {
@ -1729,7 +1845,7 @@ private:
} }
void SUMB(u32 rt, u32 ra, u32 rb) void SUMB(u32 rt, u32 ra, u32 rb)
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); /*WRAPPER_BEGIN(rt, ra, rb, zz);
const SPU_GPR_hdr _a = CPU.GPR[ra]; const SPU_GPR_hdr _a = CPU.GPR[ra];
const SPU_GPR_hdr _b = CPU.GPR[rb]; const SPU_GPR_hdr _b = CPU.GPR[rb];
for (int w = 0; w < 4; w++) for (int w = 0; w < 4; w++)
@ -1737,7 +1853,46 @@ private:
CPU.GPR[rt]._u16[w*2] = _a._u8[w*4] + _a._u8[w*4 + 1] + _a._u8[w*4 + 2] + _a._u8[w*4 + 3]; CPU.GPR[rt]._u16[w*2] = _a._u8[w*4] + _a._u8[w*4 + 1] + _a._u8[w*4 + 2] + _a._u8[w*4 + 3];
CPU.GPR[rt]._u16[w*2 + 1] = _b._u8[w*4] + _b._u8[w*4 + 1] + _b._u8[w*4 + 2] + _b._u8[w*4 + 3]; CPU.GPR[rt]._u16[w*2 + 1] = _b._u8[w*4] + _b._u8[w*4 + 1] + _b._u8[w*4 + 2] + _b._u8[w*4 + 3];
} }
WRAPPER_END(rt, ra, rb, 0); WRAPPER_END(rt, ra, rb, 0);*/
const XmmLink& va = XmmGet(ra);
const XmmLink& vb = (ra == rb) ? XmmCopy(va) : XmmGet(rb);
const XmmLink& v1 = XmmCopy(vb, rt);
const XmmLink& v2 = XmmCopy(vb);
const XmmLink& vFF = XmmAlloc();
c.movdqa(vFF.get(), XmmConst(_mm_set1_epi32(0xff)));
c.pand(v1.get(), vFF.get());
c.psrld(v2.get(), 8);
c.pand(v2.get(), vFF.get());
c.paddd(v1.get(), v2.get());
c.movdqa(v2.get(), vb.get());
c.psrld(v2.get(), 16);
c.pand(v2.get(), vFF.get());
c.paddd(v1.get(), v2.get());
c.movdqa(v2.get(), vb.get());
c.psrld(v2.get(), 24);
c.paddd(v1.get(), v2.get());
c.pslld(v1.get(), 16);
c.movdqa(v2.get(), va.get());
c.pand(v2.get(), vFF.get());
c.por(v1.get(), v2.get());
c.movdqa(v2.get(), va.get());
c.psrld(v2.get(), 8);
c.pand(v2.get(), vFF.get());
c.paddd(v1.get(), v2.get());
c.movdqa(v2.get(), va.get());
c.psrld(v2.get(), 16);
c.pand(v2.get(), vFF.get());
c.paddd(v1.get(), v2.get());
c.movdqa(v2.get(), va.get());
c.psrld(v2.get(), 24);
c.paddd(v1.get(), v2.get());
XmmFinalize(vb);
XmmFinalize(va);
XmmFinalize(v1, rt);
XmmFinalize(v2);
XmmFinalize(vFF);
LOG_OPCODE();
} }
//HGT uses signed values. HLGT uses unsigned values //HGT uses signed values. HLGT uses unsigned values
void HGT(u32 rt, s32 ra, s32 rb) void HGT(u32 rt, s32 ra, s32 rb)
@ -1754,18 +1909,16 @@ private:
} }
void CLZ(u32 rt, u32 ra) void CLZ(u32 rt, u32 ra)
{ {
WRAPPER_BEGIN(rt, ra, yy, zz); XmmInvalidate(rt);
for (int w = 0; w < 4; w++) for (u32 i = 0; i < 4; i++)
{ {
int nPos; c.bsr(*addr, cpu_dword(GPR[ra]._u32[i]));
c.cmovz(*addr, dword_ptr(*g_imm_var, offsetof(g_imm_table_struct, fsmb_table[0xffff]))); // load 0xffffffff
for (nPos = 0; nPos < 32; nPos++) c.neg(*addr);
if (CPU.GPR[ra]._u32[w] & (1 << (31 - nPos))) c.add(*addr, 31);
break; c.mov(cpu_dword(GPR[rt]._u32[i]), *addr);
CPU.GPR[rt]._u32[w] = nPos;
} }
WRAPPER_END(rt, ra, 0, 0); LOG_OPCODE();
} }
void XSWD(u32 rt, u32 ra) void XSWD(u32 rt, u32 ra)
{ {
@ -1786,13 +1939,14 @@ private:
} }
void CNTB(u32 rt, u32 ra) void CNTB(u32 rt, u32 ra)
{ {
WRAPPER_BEGIN(rt, ra, yy, zz); XmmInvalidate(rt);
const SPU_GPR_hdr temp = CPU.GPR[ra]; for (u32 i = 0; i < 8; i++)
CPU.GPR[rt].Reset(); {
for (int b = 0; b < 16; b++) c.movzx(*addr, cpu_word(GPR[ra]._u16[i]));
for (int i = 0; i < 8; i++) c.movzx(*addr, word_ptr(*g_imm_var, *addr, 1, offsetof(g_imm_table_struct, cntb_table[0])));
CPU.GPR[rt]._u8[b] += (temp._u8[b] & (1 << i)) ? 1 : 0; c.mov(cpu_word(GPR[rt]._u16[i]), addr->r16());
WRAPPER_END(rt, ra, 0, 0); }
LOG_OPCODE();
} }
void XSBH(u32 rt, u32 ra) void XSBH(u32 rt, u32 ra)
{ {
@ -2228,14 +2382,14 @@ private:
XmmFinalize(vt); XmmFinalize(vt);
LOG_OPCODE(); LOG_OPCODE();
} }
void CGX(u32 rt, u32 ra, u32 rb) void CGX(u32 rt, u32 ra, u32 rb) //nf
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); WRAPPER_BEGIN(rt, ra, rb, zz);
for (int w = 0; w < 4; w++) for (int w = 0; w < 4; w++)
CPU.GPR[rt]._u32[w] = ((u64)CPU.GPR[ra]._u32[w] + (u64)CPU.GPR[rb]._u32[w] + (u64)(CPU.GPR[rt]._u32[w] & 1)) >> 32; CPU.GPR[rt]._u32[w] = ((u64)CPU.GPR[ra]._u32[w] + (u64)CPU.GPR[rb]._u32[w] + (u64)(CPU.GPR[rt]._u32[w] & 1)) >> 32;
WRAPPER_END(rt, ra, rb, 0); WRAPPER_END(rt, ra, rb, 0);
} }
void BGX(u32 rt, u32 ra, u32 rb) void BGX(u32 rt, u32 ra, u32 rb) //nf
{ {
WRAPPER_BEGIN(rt, ra, rb, zz); WRAPPER_BEGIN(rt, ra, rb, zz);
s64 nResult; s64 nResult;
@ -2299,7 +2453,7 @@ private:
{ {
UNIMPLEMENTED(); UNIMPLEMENTED();
} }
void DFTSV(u32 rt, u32 ra, s32 i7) void DFTSV(u32 rt, u32 ra, s32 i7) //nf
{ {
WRAPPER_BEGIN(rt, ra, i7, zz); WRAPPER_BEGIN(rt, ra, i7, zz);
const u64 DoubleExpMask = 0x7ff0000000000000; const u64 DoubleExpMask = 0x7ff0000000000000;
@ -2721,12 +2875,7 @@ private:
else else
{ {
const XmmLink& vr = XmmAlloc(rt); const XmmLink& vr = XmmAlloc(rt);
__m128i fsmbi_mask; c.movdqa(vr.get(), g_imm_xmm(fsmb_table[i16 & 0xffff]));
for (u32 j = 0; j < 16; j++)
{
fsmbi_mask.m128i_i8[j] = ((i16 >> j) & 0x1) ? 0xff : 0;
}
c.movdqa(vr.get(), XmmConst(fsmbi_mask));
XmmFinalize(vr, rt); XmmFinalize(vr, rt);
} }
LOG_OPCODE(); LOG_OPCODE();

View file

@ -4,6 +4,8 @@
#include "SPUInterpreter.h" #include "SPUInterpreter.h"
#include "SPURecompiler.h" #include "SPURecompiler.h"
// File-local instance of the recompiler's constant lookup tables, filled in by the
// g_imm_table_struct constructor. DecodeMemory passes its address as the fourth
// argument of the compiled function.
static const g_imm_table_struct g_imm_table;
SPURecompilerCore::SPURecompilerCore(SPUThread& cpu) SPURecompilerCore::SPURecompilerCore(SPUThread& cpu)
: m_enc(new SPURecompiler(cpu, *this)) : m_enc(new SPURecompiler(cpu, *this))
, inter(new SPUInterpreter(cpu)) , inter(new SPUInterpreter(cpu))
@ -58,16 +60,21 @@ void SPURecompilerCore::Compile(u16 pos)
compiler.alloc(imm_var); compiler.alloc(imm_var);
m_enc->imm_var = &imm_var; m_enc->imm_var = &imm_var;
GpVar pos_var(compiler, kVarTypeUInt32, "pos"); GpVar g_imm_var(compiler, kVarTypeIntPtr, "g_imm");
compiler.setArg(3, pos_var); compiler.setArg(3, g_imm_var);
m_enc->pos_var = &pos_var; compiler.alloc(g_imm_var);
m_enc->g_imm_var = &g_imm_var;
GpVar pos_var(compiler, kVarTypeUInt32, "pos");
m_enc->pos_var = &pos_var;
GpVar addr_var(compiler, kVarTypeUInt32, "addr"); GpVar addr_var(compiler, kVarTypeUInt32, "addr");
m_enc->addr = &addr_var; m_enc->addr = &addr_var;
GpVar qw0_var(compiler, kVarTypeUInt64, "qw0"); GpVar qw0_var(compiler, kVarTypeUInt64, "qw0");
m_enc->qw0 = &qw0_var; m_enc->qw0 = &qw0_var;
GpVar qw1_var(compiler, kVarTypeUInt64, "qw1"); GpVar qw1_var(compiler, kVarTypeUInt64, "qw1");
m_enc->qw1 = &qw1_var; m_enc->qw1 = &qw1_var;
GpVar qw2_var(compiler, kVarTypeUInt64, "qw2");
m_enc->qw2 = &qw2_var;
for (u32 i = 0; i < 16; i++) for (u32 i = 0; i < 16; i++)
{ {
@ -198,7 +205,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address)
return 0; return 0;
} }
typedef u32(*Func)(void* _cpu, void* _ls, const void* _imm, u32 _pos); typedef u32(*Func)(const void* _cpu, const void* _ls, const void* _imm, const void* _g_imm);
Func func = asmjit_cast<Func>(entry[pos].pointer); Func func = asmjit_cast<Func>(entry[pos].pointer);
@ -215,7 +222,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address)
} }
u32 res = pos; u32 res = pos;
res = func(cpu, &Memory[m_offset], imm_table.data(), res); res = func(cpu, &Memory[m_offset], imm_table.data(), &g_imm_table);
if (res > 0xffff) if (res > 0xffff)
{ {