PPCRec: Remove redundant FPR instructions which are no longer used

This commit is contained in:
Exzap 2025-05-06 03:53:20 +02:00
parent bb5a7ce4ff
commit 32205a2081
7 changed files with 46 additions and 999 deletions

View file

@ -609,7 +609,7 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, pp
}
else
{
debug_printf("PPCRecompilerX64Gen_imlInstruction_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation);
cemuLog_logDebug(LogType::Force, "PPCRecompilerX64Gen_imlInstruction_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation);
return false;
}
return true;
@ -635,7 +635,7 @@ bool PPCRecompilerX64Gen_imlInstruction_r_s32(PPCRecFunction_t* PPCRecFunction,
}
else
{
debug_printf("PPCRecompilerX64Gen_imlInstruction_r_s32(): Unsupported operation 0x%x\n", imlInstruction->operation);
cemuLog_logDebug(LogType::Force, "PPCRecompilerX64Gen_imlInstruction_r_s32(): Unsupported operation 0x%x\n", imlInstruction->operation);
return false;
}
return true;
@ -894,7 +894,7 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction,
}
else
{
debug_printf("PPCRecompilerX64Gen_imlInstruction_r_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation);
cemuLog_logDebug(LogType::Force, "PPCRecompilerX64Gen_imlInstruction_r_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation);
return false;
}
return true;

View file

@ -3,8 +3,6 @@
#include "BackendX64.h"
#include "Common/cpu_features.h"
#include "asm/x64util.h" // for recompiler_fres / frsqrte
uint32 _regF64(IMLReg physReg);
uint32 _regI32(IMLReg r)
@ -34,231 +32,6 @@ static x86Assembler64::GPR8_REX _reg8_from_reg32(x86Assembler64::GPR32 regId)
return (x86Assembler64::GPR8_REX)regId;
}
void PPCRecompilerX64Gen_imlInstr_gqr_generateScaleCode(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, sint32 registerXMM, bool isLoad, bool scalePS1, IMLReg registerGQR)
{
// load GQR
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, _regI32(registerGQR));
// extract scale field and multiply by 16 to get array offset
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, (isLoad?16:0)+8-4);
x64Gen_and_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, (0x3F<<4));
// multiply xmm by scale
x64Gen_add_reg64_reg64(x64GenContext, REG_RESV_TEMP, REG_RESV_RECDATA);
if (isLoad)
{
if(scalePS1)
x64Gen_mulpd_xmmReg_memReg128(x64GenContext, registerXMM, REG_RESV_TEMP, offsetof(PPCRecompilerInstanceData_t, _psq_ld_scale_ps0_ps1));
else
x64Gen_mulpd_xmmReg_memReg128(x64GenContext, registerXMM, REG_RESV_TEMP, offsetof(PPCRecompilerInstanceData_t, _psq_ld_scale_ps0_1));
}
else
{
if (scalePS1)
x64Gen_mulpd_xmmReg_memReg128(x64GenContext, registerXMM, REG_RESV_TEMP, offsetof(PPCRecompilerInstanceData_t, _psq_st_scale_ps0_ps1));
else
x64Gen_mulpd_xmmReg_memReg128(x64GenContext, registerXMM, REG_RESV_TEMP, offsetof(PPCRecompilerInstanceData_t, _psq_st_scale_ps0_1));
}
}
// generate code for PSQ load for a particular type
// if scaleGQR is -1 then a scale of 1.0 is assumed (no scale)
void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, uint8 mode, sint32 registerXMM, sint32 memReg, sint32 memRegEx, sint32 memImmS32, bool indexed, IMLReg registerGQR = IMLREG_INVALID)
{
if (mode == PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1)
{
if (indexed)
{
assert_dbg();
}
// optimized code for ps float load
x64Emit_mov_reg64_mem64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
x64GenContext->emitter->BSWAP_q(REG_RESV_TEMP);
x64Gen_rol_reg64_imm8(x64GenContext, REG_RESV_TEMP, 32); // swap upper and lower DWORD
x64Gen_movq_xmmReg_reg64(x64GenContext, registerXMM, REG_RESV_TEMP);
x64Gen_cvtps2pd_xmmReg_xmmReg(x64GenContext, registerXMM, registerXMM);
// note: floats are not scaled
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0)
{
if (indexed)
{
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg);
if (g_CPUFeatures.x86.movbe)
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32);
}
else
{
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32);
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
}
}
else
{
if (g_CPUFeatures.x86.movbe)
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
}
else
{
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
}
}
if (g_CPUFeatures.x86.avx)
{
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP);
}
else
{
x64Emit_mov_mem32_reg64(x64GenContext, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryFPR), REG_RESV_TEMP);
x64Gen_movddup_xmmReg_memReg64(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryFPR));
}
x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_FPR_TEMP);
// load constant 1.0 into lower half and upper half of temp register
x64Gen_movddup_xmmReg_memReg64(x64GenContext, registerXMM, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_constDouble1_1));
// overwrite lower half with single from memory
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, registerXMM, REG_RESV_FPR_TEMP);
// note: floats are not scaled
}
else
{
sint32 readSize;
bool isSigned = false;
if (mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1)
{
readSize = 16;
isSigned = true;
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1)
{
readSize = 16;
isSigned = false;
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1)
{
readSize = 8;
isSigned = true;
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_U8_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1)
{
readSize = 8;
isSigned = false;
}
else
assert_dbg();
bool loadPS1 = (mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1);
for (sint32 wordIndex = 0; wordIndex < 2; wordIndex++)
{
if (indexed)
{
assert_dbg();
}
// read from memory
if (wordIndex == 1 && loadPS1 == false)
{
// store constant 1
x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryGPR) + sizeof(uint32) * 1, 1);
}
else
{
uint32 memOffset = memImmS32 + wordIndex * (readSize / 8);
if (readSize == 16)
{
// half word
x64Gen_movZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memOffset);
x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8); // endian swap
if (isSigned)
x64Gen_movSignExtend_reg64Low32_reg64Low16(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
else
x64Gen_movZeroExtend_reg64Low32_reg64Low16(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
}
else if (readSize == 8)
{
// byte
x64Emit_mov_reg64b_mem8(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memOffset);
if (isSigned)
x64Gen_movSignExtend_reg64Low32_reg64Low8(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
else
x64Gen_movZeroExtend_reg64Low32_reg64Low8(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
}
// store
x64Emit_mov_mem32_reg32(x64GenContext, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryGPR) + sizeof(uint32) * wordIndex, REG_RESV_TEMP);
}
}
// convert the two integers to doubles
x64Gen_cvtpi2pd_xmmReg_mem64Reg64(x64GenContext, registerXMM, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryGPR));
// scale
if (registerGQR.IsValid())
PPCRecompilerX64Gen_imlInstr_gqr_generateScaleCode(ppcImlGenContext, x64GenContext, registerXMM, true, loadPS1, registerGQR);
}
}
void PPCRecompilerX64Gen_imlInstr_psq_load_generic(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, uint8 mode, sint32 registerXMM, sint32 memReg, sint32 memRegEx, sint32 memImmS32, bool indexed, IMLReg registerGQR)
{
bool loadPS1 = (mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1);
// load GQR
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, _regI32(registerGQR));
// extract load type field
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 16);
x64Gen_and_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 7);
// jump cases
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 4); // type 4 -> u8
sint32 jumpOffset_caseU8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 5); // type 5 -> u16
sint32 jumpOffset_caseU16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 6); // type 4 -> s8
sint32 jumpOffset_caseS8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 7); // type 5 -> s16
sint32 jumpOffset_caseS16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
// default case -> float
// generate cases
uint32 jumpOffset_endOfFloat;
uint32 jumpOffset_endOfU8;
uint32 jumpOffset_endOfU16;
uint32 jumpOffset_endOfS8;
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfFloat = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseU16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_U16_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfU8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseS16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_S16_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfU16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseU8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_U8_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfS8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseS8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_S8_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfFloat, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfU8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfU16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfS8, x64GenContext->emitter->GetWriteIndex());
}
// load from memory
bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction, bool indexed)
{
@ -269,8 +42,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
realRegisterMem2 = _regI32(imlInstruction->op_storeLoad.registerMem2);
uint8 mode = imlInstruction->op_storeLoad.mode;
if( mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0 ) // lazy hack for now. Load only one value for SINGLE_INTO_PS0
if( mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0 )
{
// load byte swapped single into temporary FPR
if( indexed )
@ -362,25 +134,6 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
}
}
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1 )
{
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, mode, realRegisterXMM, realRegisterMem, realRegisterMem2, imlInstruction->op_storeLoad.immS32, indexed);
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0)
{
PPCRecompilerX64Gen_imlInstr_psq_load_generic(ppcImlGenContext, x64GenContext, mode, realRegisterXMM, realRegisterMem, realRegisterMem2, imlInstruction->op_storeLoad.immS32, indexed, imlInstruction->op_storeLoad.registerGQR);
}
else
{
return false;
@ -388,188 +141,6 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
return true;
}
void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, uint8 mode, sint32 registerXMM, sint32 memReg, sint32 memRegEx, sint32 memImmS32, bool indexed, IMLReg registerGQR = IMLREG_INVALID)
{
bool storePS1 = (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1);
bool isFloat = mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0 || mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1;
if (registerGQR.IsValid())
{
// move to temporary xmm and update registerXMM
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
registerXMM = REG_RESV_FPR_TEMP;
// apply scale
if(isFloat == false)
PPCRecompilerX64Gen_imlInstr_gqr_generateScaleCode(ppcImlGenContext, x64GenContext, registerXMM, false, storePS1, registerGQR);
}
if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0)
{
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
if (g_CPUFeatures.x86.movbe == false)
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if (indexed)
{
cemu_assert_debug(memReg != memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx);
}
if (g_CPUFeatures.x86.movbe)
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, memReg, memImmS32, REG_RESV_TEMP);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, memReg, memImmS32, REG_RESV_TEMP);
if (indexed)
{
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx);
}
return;
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1)
{
if (indexed)
assert_dbg(); // todo
x64Gen_cvtpd2ps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
x64Gen_movq_reg64_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
x64Gen_rol_reg64_imm8(x64GenContext, REG_RESV_TEMP, 32); // swap upper and lower DWORD
x64GenContext->emitter->BSWAP_q(REG_RESV_TEMP);
x64Gen_mov_mem64Reg64PlusReg64_reg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
return;
}
// store as integer
// get limit from mode
sint32 clampMin, clampMax;
sint32 bitWriteSize;
if (mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1 )
{
clampMin = -128;
clampMax = 127;
bitWriteSize = 8;
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1 )
{
clampMin = 0;
clampMax = 255;
bitWriteSize = 8;
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1 )
{
clampMin = 0;
clampMax = 0xFFFF;
bitWriteSize = 16;
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1 )
{
clampMin = -32768;
clampMax = 32767;
bitWriteSize = 16;
}
else
{
cemu_assert(false);
}
for (sint32 valueIndex = 0; valueIndex < (storePS1?2:1); valueIndex++)
{
// todo - multiply by GQR scale
if (valueIndex == 0)
{
// convert low half (PS0) to integer
x64Gen_cvttsd2si_reg64Low_xmmReg(x64GenContext, REG_RESV_TEMP, registerXMM);
}
else
{
// load top half (PS1) into bottom half of temporary register
x64Gen_movhlps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
// convert low half to integer
x64Gen_cvttsd2si_reg64Low_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
}
// max(i, -clampMin)
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, clampMin);
sint32 jumpInstructionOffset1 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_SIGNED_GREATER_EQUAL, 0);
x64Gen_mov_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, clampMin);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset1, x64GenContext->emitter->GetWriteIndex());
// min(i, clampMax)
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, clampMax);
sint32 jumpInstructionOffset2 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_SIGNED_LESS_EQUAL, 0);
x64Gen_mov_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, clampMax);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset2, x64GenContext->emitter->GetWriteIndex());
// endian swap
if( bitWriteSize == 16)
x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8);
// write to memory
if (indexed)
assert_dbg(); // unsupported
sint32 memOffset = memImmS32 + valueIndex * (bitWriteSize/8);
if (bitWriteSize == 8)
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, memReg, memOffset, REG_RESV_TEMP);
else if (bitWriteSize == 16)
x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, memReg, memOffset, REG_RESV_TEMP);
}
}
void PPCRecompilerX64Gen_imlInstr_psq_store_generic(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, uint8 mode, sint32 registerXMM, sint32 memReg, sint32 memRegEx, sint32 memImmS32, bool indexed, IMLReg registerGQR)
{
bool storePS1 = (mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1);
// load GQR
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, _regI32(registerGQR));
// extract store type field
x64Gen_and_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 7);
// jump cases
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 4); // type 4 -> u8
sint32 jumpOffset_caseU8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 5); // type 5 -> u16
sint32 jumpOffset_caseU16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 6); // type 4 -> s8
sint32 jumpOffset_caseS8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 7); // type 5 -> s16
sint32 jumpOffset_caseS16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
// default case -> float
// generate cases
uint32 jumpOffset_endOfFloat;
uint32 jumpOffset_endOfU8;
uint32 jumpOffset_endOfU16;
uint32 jumpOffset_endOfS8;
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfFloat = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseU16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_U16_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfU8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseS16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_S16_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfU16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseU8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_U8_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfS8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseS8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_S8_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfFloat, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfU8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfU16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfS8, x64GenContext->emitter->GetWriteIndex());
}
// store to memory
bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction, bool indexed)
{
@ -646,40 +217,14 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
}
}
else if(mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1 )
{
cemu_assert_debug(imlInstruction->op_storeLoad.flags2.notExpanded == false);
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, mode, realRegisterXMM, realRegisterMem, realRegisterMem2, imlInstruction->op_storeLoad.immS32, indexed);
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0)
{
PPCRecompilerX64Gen_imlInstr_psq_store_generic(ppcImlGenContext, x64GenContext, mode, realRegisterXMM, realRegisterMem, realRegisterMem2, imlInstruction->op_storeLoad.immS32, indexed, imlInstruction->op_storeLoad.registerGQR);
}
else
{
if( indexed )
assert_dbg(); // todo
debug_printf("PPCRecompilerX64Gen_imlInstruction_fpr_store(): Unsupported mode %d\n", mode);
return false;
}
return true;
}
void _swapPS0PS1(x64GenContext_t* x64GenContext, sint32 xmmReg)
{
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, xmmReg, xmmReg, 1);
}
// FPR op FPR
void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
{
@ -701,93 +246,26 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction
uint32 regR = _regF64(imlInstruction->op_fpr_r_r.regR);
uint32 regA = _regF64(imlInstruction->op_fpr_r_r.regA);
if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP )
{
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP )
{
// VPUNPCKHQDQ
if (regR == regA)
{
// unpack top to bottom and top
x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
//else if ( hasAVXSupport )
//{
// // unpack top to bottom and top with non-destructive destination
// // update: On Ivy Bridge this causes weird stalls?
// x64Gen_avx_VUNPCKHPD_xmm_xmm_xmm(x64GenContext, registerResult, registerOperand, registerOperand);
//}
else
{
// move top to bottom
x64Gen_movhlps_xmmReg_xmmReg(x64GenContext, regR, regA);
// duplicate bottom
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, regR, regR);
}
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM )
if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM )
{
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_TOP )
{
x64Gen_unpcklpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED )
{
if( regR != regA )
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
_swapPS0PS1(x64GenContext, regR);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_TOP )
{
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, regR, regA, 2);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM )
{
// use unpckhpd here?
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, regR, regA, 3);
_swapPS0PS1(x64GenContext, regR);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM )
{
x64Gen_mulsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_MULTIPLY_PAIR )
{
x64Gen_mulpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_DIVIDE_BOTTOM )
{
x64Gen_divsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_DIVIDE_PAIR)
{
x64Gen_divpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ADD_BOTTOM )
{
x64Gen_addsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ADD_PAIR )
{
x64Gen_addpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUB_PAIR )
{
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUB_BOTTOM )
{
x64Gen_subsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_ASSIGN )
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_FCTIWZ )
{
x64Gen_cvttsd2si_xmmReg_xmmReg(x64GenContext, REG_RESV_TEMP, regA);
@ -795,58 +273,6 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction
// move to FPR register
x64Gen_movq_xmmReg_reg64(x64GenContext, regR, REG_RESV_TEMP);
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT)
{
// move register to XMM15
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
// call assembly routine to calculate accurate FRSQRTE result in XMM15
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_frsqrte);
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP);
// copy result to bottom of result register
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_NEGATE_PAIR )
{
// copy register
if( regR != regA )
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
}
// toggle sign bits
x64Gen_xorps_xmmReg_mem128Reg64(x64GenContext, regR, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_xorNegateMaskPair));
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ABS_PAIR )
{
// copy register
if( regR != regA )
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
}
// set sign bit to 0
x64Gen_andps_xmmReg_mem128Reg64(x64GenContext, regR, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_andAbsMaskPair));
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_FRES_PAIR || imlInstruction->operation == PPCREC_IML_OP_FPR_FRSQRTE_PAIR)
{
// calculate bottom half of result
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
if(imlInstruction->operation == PPCREC_IML_OP_FPR_FRES_PAIR)
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_fres);
else
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_frsqrte);
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP); // calculate fres result in xmm15
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
// calculate top half of result
// todo - this top to bottom copy can be optimized?
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, REG_RESV_FPR_TEMP, regA, 3);
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_FPR_TEMP, 1); // swap top and bottom
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP); // calculate fres result in xmm15
x64Gen_unpcklpd_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP); // copy bottom to top
}
else
{
assert_dbg();
@ -895,29 +321,6 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti
x64Gen_addsd_xmmReg_xmmReg(x64GenContext, regR, regB);
}
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_SUB_PAIR)
{
// registerResult = registerOperandA - registerOperandB
if( regR == regA )
{
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, regR, regB);
}
else if (g_CPUFeatures.x86.avx)
{
x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, regR, regA, regB);
}
else if( regR == regB )
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regB);
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
}
else
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, regR, regB);
}
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUB_BOTTOM )
{
if( regR == regA )
@ -950,39 +353,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r_r(PPCRecFunction_t* PPCRecFunc
uint32 regB = _regF64(imlInstruction->op_fpr_r_r_r_r.regB);
uint32 regC = _regF64(imlInstruction->op_fpr_r_r_r_r.regC);
if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUM0 )
{
// todo: Investigate if there are other optimizations possible if the operand registers overlap
// generic case
// 1) move frA bottom to frTemp bottom and top
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
// 2) add frB (both halfs, lower half is overwritten in the next step)
x64Gen_addpd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regB);
// 3) Interleave top of frTemp and frC
x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regC);
// todo: We can optimize the REG_RESV_FPR_TEMP -> resultReg copy operation away when the result register does not overlap with any of the operand registers
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUM1 )
{
// todo: Investigate if there are other optimizations possible if the operand registers overlap
// 1) move frA bottom to frTemp bottom and top
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
// 2) add frB (both halfs, lower half is overwritten in the next step)
x64Gen_addpd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regB);
// 3) Copy bottom from frC
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regC);
//// 4) Swap bottom and top half
//x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_FPR_TEMP, 1);
// todo: We can optimize the REG_RESV_FPR_TEMP -> resultReg copy operation away when the result register does not overlap with any of the operand registers
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
//float s0 = (float)hCPU->fpr[frC].fp0;
//float s1 = (float)(hCPU->fpr[frA].fp0 + hCPU->fpr[frB].fp1);
//hCPU->fpr[frD].fp0 = s0;
//hCPU->fpr[frD].fp1 = s1;
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SELECT_BOTTOM )
if( imlInstruction->operation == PPCREC_IML_OP_FPR_SELECT_BOTTOM )
{
x64Gen_comisd_xmmReg_mem64Reg64(x64GenContext, regA, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_constDouble0_0));
sint32 jumpInstructionOffset1 = x64GenContext->emitter->GetWriteIndex();
@ -997,38 +368,6 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r_r(PPCRecFunction_t* PPCRecFunc
// end
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset2, x64GenContext->emitter->GetWriteIndex());
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SELECT_PAIR )
{
// select bottom
x64Gen_comisd_xmmReg_mem64Reg64(x64GenContext, regA, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_constDouble0_0));
sint32 jumpInstructionOffset1_bottom = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_UNSIGNED_BELOW, 0);
// select C bottom
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, regC);
sint32 jumpInstructionOffset2_bottom = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NONE, 0);
// select B bottom
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset1_bottom, x64GenContext->emitter->GetWriteIndex());
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, regB);
// end
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset2_bottom, x64GenContext->emitter->GetWriteIndex());
// select top
x64Gen_movhlps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA); // copy top to bottom (todo: May cause stall?)
x64Gen_comisd_xmmReg_mem64Reg64(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_constDouble0_0));
sint32 jumpInstructionOffset1_top = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_UNSIGNED_BELOW, 0);
// select C top
//x64Gen_movsd_xmmReg_xmmReg(x64GenContext, registerResult, registerOperandC);
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, regR, regC, 2);
sint32 jumpInstructionOffset2_top = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NONE, 0);
// select B top
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset1_top, x64GenContext->emitter->GetWriteIndex());
//x64Gen_movsd_xmmReg_xmmReg(x64GenContext, registerResult, registerOperandB);
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, regR, regB, 2);
// end
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset2_top, x64GenContext->emitter->GetWriteIndex());
}
else
assert_dbg();
}
@ -1060,13 +399,6 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r(PPCRecFunction_t* PPCRecFunction,
// convert back to 64bit double
x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext, regR, regR);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_PAIR )
{
// convert to 32bit singles
x64Gen_cvtpd2ps_xmmReg_xmmReg(x64GenContext, regR, regR);
// convert back to 64bit doubles
x64Gen_cvtps2pd_xmmReg_xmmReg(x64GenContext, regR, regR);
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64)
{
// convert bottom to 64bit double

View file

@ -229,29 +229,11 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
// determine partially written result
switch (op_storeLoad.mode)
{
case PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0:
case PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1:
cemu_assert_debug(op_storeLoad.registerGQR.IsValid());
registersUsed->readGPR2 = op_storeLoad.registerGQR;
break;
case PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0:
// PS1 remains the same
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
registersUsed->readGPR2 = op_storeLoad.registerData;
break;
case PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0:
case PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S16_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U16_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U8_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S8_PS0:
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
break;
default:
cemu_assert_unimplemented();
@ -269,28 +251,11 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
// determine partially written result
switch (op_storeLoad.mode)
{
case PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0:
case PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1:
cemu_assert_debug(op_storeLoad.registerGQR.IsValid());
registersUsed->readGPR3 = op_storeLoad.registerGQR;
break;
case PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0:
// PS1 remains the same
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
registersUsed->readGPR3 = op_storeLoad.registerData;
break;
case PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0:
case PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S16_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U16_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U8_PS0:
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
break;
default:
cemu_assert_unimplemented();
@ -302,18 +267,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
registersUsed->readGPR1 = op_storeLoad.registerData;
if (op_storeLoad.registerMem.IsValid())
registersUsed->readGPR2 = op_storeLoad.registerMem;
// PSQ generic stores also access GQR
switch (op_storeLoad.mode)
{
case PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0:
case PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1:
cemu_assert_debug(op_storeLoad.registerGQR.IsValid());
registersUsed->readGPR3 = op_storeLoad.registerGQR;
break;
default:
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
break;
}
}
else if (type == PPCREC_IML_TYPE_FPR_STORE_INDEXED)
{
@ -324,43 +277,14 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
registersUsed->readGPR2 = op_storeLoad.registerMem;
if (op_storeLoad.registerMem2.IsValid())
registersUsed->readGPR3 = op_storeLoad.registerMem2;
// PSQ generic stores also access GQR
switch (op_storeLoad.mode)
{
case PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0:
case PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1:
cemu_assert_debug(op_storeLoad.registerGQR.IsValid());
registersUsed->readGPR4 = op_storeLoad.registerGQR;
break;
default:
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
break;
}
}
else if (type == PPCREC_IML_TYPE_FPR_R_R)
{
// fpr operation
if (operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED ||
operation == PPCREC_IML_OP_ASSIGN ||
operation == PPCREC_IML_OP_FPR_NEGATE_PAIR ||
operation == PPCREC_IML_OP_FPR_ABS_PAIR ||
operation == PPCREC_IML_OP_FPR_FRES_PAIR ||
operation == PPCREC_IML_OP_FPR_FRSQRTE_PAIR)
{
// operand read, result written
registersUsed->readGPR1 = op_fpr_r_r.regA;
registersUsed->writtenGPR1 = op_fpr_r_r.regR;
}
else if (
if (
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM ||
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM ||
operation == PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64 ||
operation == PPCREC_IML_OP_FPR_BOTTOM_FCTIWZ ||
operation == PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT
operation == PPCREC_IML_OP_FPR_BOTTOM_FCTIWZ
)
{
// operand read, result read and (partially) written
@ -369,12 +293,8 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
registersUsed->writtenGPR1 = op_fpr_r_r.regR;
}
else if (operation == PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM ||
operation == PPCREC_IML_OP_FPR_MULTIPLY_PAIR ||
operation == PPCREC_IML_OP_FPR_DIVIDE_BOTTOM ||
operation == PPCREC_IML_OP_FPR_DIVIDE_PAIR ||
operation == PPCREC_IML_OP_FPR_ADD_BOTTOM ||
operation == PPCREC_IML_OP_FPR_ADD_PAIR ||
operation == PPCREC_IML_OP_FPR_SUB_PAIR ||
operation == PPCREC_IML_OP_FPR_SUB_BOTTOM)
{
// operand read, result read and written
@ -383,14 +303,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
registersUsed->writtenGPR1 = op_fpr_r_r.regR;
}
else if (operation == PPCREC_IML_OP_FPR_FCMPU_BOTTOM ||
operation == PPCREC_IML_OP_FPR_FCMPU_TOP ||
operation == PPCREC_IML_OP_FPR_FCMPO_BOTTOM)
{
// operand read, result read
registersUsed->readGPR1 = op_fpr_r_r.regA;
registersUsed->readGPR2 = op_fpr_r_r.regR;
}
else if (operation == PPCREC_IML_OP_FPR_FLOAT_TO_INT ||
operation == PPCREC_IML_OP_FPR_INT_TO_FLOAT)
{
@ -414,8 +326,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
case PPCREC_IML_OP_FPR_SUB_BOTTOM:
registersUsed->readGPR3 = op_fpr_r_r_r.regR;
break;
case PPCREC_IML_OP_FPR_SUB_PAIR:
break;
default:
cemu_assert_unimplemented();
}
@ -433,10 +343,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
case PPCREC_IML_OP_FPR_SELECT_BOTTOM:
registersUsed->readGPR4 = op_fpr_r_r_r_r.regR;
break;
case PPCREC_IML_OP_FPR_SUM0:
case PPCREC_IML_OP_FPR_SUM1:
case PPCREC_IML_OP_FPR_SELECT_PAIR:
break;
default:
cemu_assert_unimplemented();
}
@ -448,8 +354,7 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
operation == PPCREC_IML_OP_FPR_ABS_BOTTOM ||
operation == PPCREC_IML_OP_FPR_NEGATIVE_ABS_BOTTOM ||
operation == PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64 ||
operation == PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM ||
operation == PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_PAIR)
operation == PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM)
{
registersUsed->readGPR1 = op_fpr_r.regR;
registersUsed->writtenGPR1 = op_fpr_r.regR;
@ -620,27 +525,23 @@ void IMLInstruction::RewriteGPR(const std::unordered_map<IMLRegID, IMLRegID>& tr
{
op_storeLoad.registerData = replaceRegisterIdMultiple(op_storeLoad.registerData, translationTable);
op_storeLoad.registerMem = replaceRegisterIdMultiple(op_storeLoad.registerMem, translationTable);
op_storeLoad.registerGQR = replaceRegisterIdMultiple(op_storeLoad.registerGQR, translationTable);
}
else if (type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED)
{
op_storeLoad.registerData = replaceRegisterIdMultiple(op_storeLoad.registerData, translationTable);
op_storeLoad.registerMem = replaceRegisterIdMultiple(op_storeLoad.registerMem, translationTable);
op_storeLoad.registerMem2 = replaceRegisterIdMultiple(op_storeLoad.registerMem2, translationTable);
op_storeLoad.registerGQR = replaceRegisterIdMultiple(op_storeLoad.registerGQR, translationTable);
}
else if (type == PPCREC_IML_TYPE_FPR_STORE)
{
op_storeLoad.registerData = replaceRegisterIdMultiple(op_storeLoad.registerData, translationTable);
op_storeLoad.registerMem = replaceRegisterIdMultiple(op_storeLoad.registerMem, translationTable);
op_storeLoad.registerGQR = replaceRegisterIdMultiple(op_storeLoad.registerGQR, translationTable);
}
else if (type == PPCREC_IML_TYPE_FPR_STORE_INDEXED)
{
op_storeLoad.registerData = replaceRegisterIdMultiple(op_storeLoad.registerData, translationTable);
op_storeLoad.registerMem = replaceRegisterIdMultiple(op_storeLoad.registerMem, translationTable);
op_storeLoad.registerMem2 = replaceRegisterIdMultiple(op_storeLoad.registerMem2, translationTable);
op_storeLoad.registerGQR = replaceRegisterIdMultiple(op_storeLoad.registerGQR, translationTable);
}
else if (type == PPCREC_IML_TYPE_FPR_R)
{

View file

@ -126,46 +126,22 @@ enum
PPCREC_IML_OP_SRW, // SRW (shift based on register by up to 63 bits)
PPCREC_IML_OP_CNTLZW,
// FPU
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM,
PPCREC_IML_OP_FPR_LOAD_ONE, // load constant 1.0 into register
PPCREC_IML_OP_FPR_ADD_BOTTOM,
PPCREC_IML_OP_FPR_ADD_PAIR,
PPCREC_IML_OP_FPR_SUB_PAIR,
PPCREC_IML_OP_FPR_SUB_BOTTOM,
PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM,
PPCREC_IML_OP_FPR_MULTIPLY_PAIR,
PPCREC_IML_OP_FPR_DIVIDE_BOTTOM,
PPCREC_IML_OP_FPR_DIVIDE_PAIR,
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP,
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP,
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM,
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_TOP, // leave bottom of destination untouched
PPCREC_IML_OP_FPR_COPY_TOP_TO_TOP, // leave bottom of destination untouched
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM, // leave top of destination untouched
PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED,
PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64, // expand bottom f32 to f64 in bottom and top half
PPCREC_IML_OP_FPR_FCMPO_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_TOP, // deprecated
PPCREC_IML_OP_FPR_NEGATE_BOTTOM,
PPCREC_IML_OP_FPR_NEGATE_PAIR,
PPCREC_IML_OP_FPR_ABS_BOTTOM, // abs(fp0)
PPCREC_IML_OP_FPR_ABS_PAIR,
PPCREC_IML_OP_FPR_FRES_PAIR, // 1.0/fp approx (Espresso accuracy)
PPCREC_IML_OP_FPR_FRSQRTE_PAIR, // 1.0/sqrt(fp) approx (Espresso accuracy)
PPCREC_IML_OP_FPR_NEGATIVE_ABS_BOTTOM, // -abs(fp0)
PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM, // round 64bit double to 64bit double with 32bit float precision (in bottom half of xmm register)
PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_PAIR, // round two 64bit doubles to 64bit double with 32bit float precision
PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT,
PPCREC_IML_OP_FPR_BOTTOM_FCTIWZ,
PPCREC_IML_OP_FPR_SELECT_BOTTOM, // selectively copy bottom value from operand B or C based on value in operand A
PPCREC_IML_OP_FPR_SELECT_PAIR, // selectively copy top/bottom from operand B or C based on value in top/bottom of operand A
// Conversion (FPR_R_R)
PPCREC_IML_OP_FPR_INT_TO_FLOAT, // convert integer value in gpr to floating point value in fpr
PPCREC_IML_OP_FPR_FLOAT_TO_INT, // convert floating point value in fpr to integer value in gpr
// PS
PPCREC_IML_OP_FPR_SUM0,
PPCREC_IML_OP_FPR_SUM1,
PPCREC_IML_OP_FPR_LOAD_ONE, // load constant 1.0 into register
// R_R_R only
@ -297,38 +273,13 @@ enum
{
// fpr load
PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0,
PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1,
PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0,
PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0,
PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0,
PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_S16_PS0,
PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_U16_PS0,
PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_S8_PS0,
PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_U8_PS0,
PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1,
// fpr store
PPCREC_FPR_ST_MODE_SINGLE_FROM_PS0, // store 1 single precision float from ps0
PPCREC_FPR_ST_MODE_DOUBLE_FROM_PS0, // store 1 double precision float from ps0
PPCREC_FPR_ST_MODE_UI32_FROM_PS0, // store raw low-32bit of PS0
PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0,
PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0,
PPCREC_FPR_ST_MODE_PSQ_S8_PS0,
PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_U8_PS0,
PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_U16_PS0,
PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_S16_PS0,
PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1,
};
struct IMLUsedRegisters
@ -468,7 +419,6 @@ struct IMLInstruction
IMLReg registerData;
IMLReg registerMem;
IMLReg registerMem2;
IMLReg registerGQR;
uint8 copyWidth;
struct
{
@ -476,7 +426,7 @@ struct IMLInstruction
bool signExtend : 1;
bool notExpanded : 1; // for floats
}flags2;
uint8 mode; // transfer mode (copy width, ps0/ps1 behavior)
uint8 mode; // transfer mode
sint32 immS32;
}op_storeLoad;
struct
@ -760,58 +710,48 @@ struct IMLInstruction
// FPR
// load from memory
void make_fpr_r_memory(IMLReg registerDestination, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian, IMLReg registerGQR = IMLREG_INVALID)
void make_fpr_r_memory(IMLReg registerDestination, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian)
{
if (registerGQR.IsValid())
{
if ( mode == 0)
__debugbreak();
}
this->type = PPCREC_IML_TYPE_FPR_LOAD;
this->operation = 0;
this->op_storeLoad.registerData = registerDestination;
this->op_storeLoad.registerMem = registerMemory;
this->op_storeLoad.registerGQR = registerGQR;
this->op_storeLoad.immS32 = immS32;
this->op_storeLoad.mode = mode;
this->op_storeLoad.flags2.swapEndian = switchEndian;
}
void make_fpr_r_memory_indexed(IMLReg registerDestination, IMLReg registerMemory1, IMLReg registerMemory2, uint32 mode, bool switchEndian, IMLReg registerGQR = IMLREG_INVALID)
void make_fpr_r_memory_indexed(IMLReg registerDestination, IMLReg registerMemory1, IMLReg registerMemory2, uint32 mode, bool switchEndian)
{
this->type = PPCREC_IML_TYPE_FPR_LOAD_INDEXED;
this->operation = 0;
this->op_storeLoad.registerData = registerDestination;
this->op_storeLoad.registerMem = registerMemory1;
this->op_storeLoad.registerMem2 = registerMemory2;
this->op_storeLoad.registerGQR = registerGQR;
this->op_storeLoad.immS32 = 0;
this->op_storeLoad.mode = mode;
this->op_storeLoad.flags2.swapEndian = switchEndian;
}
// store to memory
void make_fpr_memory_r(IMLReg registerSource, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian, IMLReg registerGQR = IMLREG_INVALID)
void make_fpr_memory_r(IMLReg registerSource, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian)
{
this->type = PPCREC_IML_TYPE_FPR_STORE;
this->operation = 0;
this->op_storeLoad.registerData = registerSource;
this->op_storeLoad.registerMem = registerMemory;
this->op_storeLoad.registerGQR = registerGQR;
this->op_storeLoad.immS32 = immS32;
this->op_storeLoad.mode = mode;
this->op_storeLoad.flags2.swapEndian = switchEndian;
}
void make_fpr_memory_r_indexed(IMLReg registerSource, IMLReg registerMemory1, IMLReg registerMemory2, sint32 immS32, uint32 mode, bool switchEndian, IMLReg registerGQR = IMLREG_INVALID)
void make_fpr_memory_r_indexed(IMLReg registerSource, IMLReg registerMemory1, IMLReg registerMemory2, sint32 immS32, uint32 mode, bool switchEndian)
{
this->type = PPCREC_IML_TYPE_FPR_STORE_INDEXED;
this->operation = 0;
this->op_storeLoad.registerData = registerSource;
this->op_storeLoad.registerMem = registerMemory1;
this->op_storeLoad.registerMem2 = registerMemory2;
this->op_storeLoad.registerGQR = registerGQR;
this->op_storeLoad.immS32 = immS32;
this->op_storeLoad.mode = mode;
this->op_storeLoad.flags2.swapEndian = switchEndian;

View file

@ -90,21 +90,23 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
*/
void IMLOptimizer_OptimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext)
{
for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
{
for (sint32 i = 0; i < segIt->imlList.size(); i++)
{
IMLInstruction* imlInstruction = segIt->imlList.data() + i;
if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
{
PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
}
else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
{
PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
}
}
}
cemuLog_logDebugOnce(LogType::Force, "IMLOptimizer_OptimizeDirectFloatCopies(): Currently disabled\n");
return;
// for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
// {
// for (sint32 i = 0; i < segIt->imlList.size(); i++)
// {
// IMLInstruction* imlInstruction = segIt->imlList.data() + i;
// if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
// {
// PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
// }
// else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
// {
// PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
// }
// }
// }
}
void PPCRecompiler_optimizeDirectIntegerCopiesScanForward(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, sint32 imlIndexLoad, IMLReg gprReg)
@ -224,118 +226,6 @@ bool PPCRecompiler_isUGQRValueKnown(ppcImlGenContext_t* ppcImlGenContext, sint32
return true;
}
/*
* If value of GQR can be predicted for a given PSQ load or store instruction then replace it with an optimized version
*/
void PPCRecompiler_optimizePSQLoadAndStore(ppcImlGenContext_t* ppcImlGenContext)
{
for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
{
for(IMLInstruction& instIt : segIt->imlList)
{
if (instIt.type == PPCREC_IML_TYPE_FPR_LOAD || instIt.type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED)
{
if(instIt.op_storeLoad.mode != PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0 &&
instIt.op_storeLoad.mode != PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1 )
continue;
// get GQR value
cemu_assert_debug(instIt.op_storeLoad.registerGQR.IsValid());
sint32 gqrIndex = _getGQRIndexFromRegister(ppcImlGenContext, instIt.op_storeLoad.registerGQR);
cemu_assert(gqrIndex >= 0);
if (ppcImlGenContext->tracking.modifiesGQR[gqrIndex])
continue;
uint32 gqrValue;
if (!PPCRecompiler_isUGQRValueKnown(ppcImlGenContext, gqrIndex, gqrValue))
continue;
uint32 formatType = (gqrValue >> 16) & 7;
uint32 scale = (gqrValue >> 24) & 0x3F;
if (scale != 0)
continue; // only generic handler supports scale
if (instIt.op_storeLoad.mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0)
{
if (formatType == 0)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0;
else if (formatType == 4)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_U8_PS0;
else if (formatType == 5)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_U16_PS0;
else if (formatType == 6)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_S8_PS0;
else if (formatType == 7)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_S16_PS0;
if (instIt.op_storeLoad.mode != PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0)
instIt.op_storeLoad.registerGQR = IMLREG_INVALID;
}
else if (instIt.op_storeLoad.mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1)
{
if (formatType == 0)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1;
else if (formatType == 4)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1;
else if (formatType == 5)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1;
else if (formatType == 6)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1;
else if (formatType == 7)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1;
if (instIt.op_storeLoad.mode != PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1)
instIt.op_storeLoad.registerGQR = IMLREG_INVALID;
}
}
else if (instIt.type == PPCREC_IML_TYPE_FPR_STORE || instIt.type == PPCREC_IML_TYPE_FPR_STORE_INDEXED)
{
if(instIt.op_storeLoad.mode != PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0 &&
instIt.op_storeLoad.mode != PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1)
continue;
// get GQR value
cemu_assert_debug(instIt.op_storeLoad.registerGQR.IsValid());
sint32 gqrIndex = _getGQRIndexFromRegister(ppcImlGenContext, instIt.op_storeLoad.registerGQR);
cemu_assert(gqrIndex >= 0 && gqrIndex < 8);
if (ppcImlGenContext->tracking.modifiesGQR[gqrIndex])
continue;
uint32 gqrValue;
if(!PPCRecompiler_isUGQRValueKnown(ppcImlGenContext, gqrIndex, gqrValue))
continue;
uint32 formatType = (gqrValue >> 16) & 7;
uint32 scale = (gqrValue >> 24) & 0x3F;
if (scale != 0)
continue; // only generic handler supports scale
if (instIt.op_storeLoad.mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0)
{
if (formatType == 0)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0;
else if (formatType == 4)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_U8_PS0;
else if (formatType == 5)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_U16_PS0;
else if (formatType == 6)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_S8_PS0;
else if (formatType == 7)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_S16_PS0;
if (instIt.op_storeLoad.mode != PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0)
instIt.op_storeLoad.registerGQR = IMLREG_INVALID;
}
else if (instIt.op_storeLoad.mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1)
{
if (formatType == 0)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1;
else if (formatType == 4)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1;
else if (formatType == 5)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1;
else if (formatType == 6)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1;
else if (formatType == 7)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1;
if (instIt.op_storeLoad.mode != PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1)
instIt.op_storeLoad.registerGQR = IMLREG_INVALID;
}
}
}
}
}
// analyses register dependencies across the entire function
// per segment this will generate information about which registers need to be preserved and which ones don't (e.g. are overwritten)
class IMLOptimizerRegIOAnalysis

View file

@ -311,10 +311,7 @@ bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext)
// this simplifies logic during register allocation
PPCRecompilerIML_isolateEnterableSegments(&ppcImlGenContext);
// if GQRs can be predicted, optimize PSQ load/stores
PPCRecompiler_optimizePSQLoadAndStore(&ppcImlGenContext);
// merge certain float load+store patterns (must happen before FPR register remapping)
// merge certain float load+store patterns
IMLOptimizer_OptimizeDirectFloatCopies(&ppcImlGenContext);
// delay byte swapping for certain load+store patterns
IMLOptimizer_OptimizeDirectIntegerCopies(&ppcImlGenContext);

View file

@ -12,9 +12,7 @@ IMLReg _GetRegCR(ppcImlGenContext_t* ppcImlGenContext, uint8 crReg, uint8 crBit)
#define DefinePS0(name, regIndex) IMLReg name = _GetFPRRegPS0(ppcImlGenContext, regIndex);
#define DefinePS1(name, regIndex) IMLReg name = _GetFPRRegPS1(ppcImlGenContext, regIndex);
#define DefinePSX(name, regIndex, isPS1) IMLReg name = isPS1 ? _GetFPRRegPS1(ppcImlGenContext, regIndex) : _GetFPRRegPS0(ppcImlGenContext, regIndex);
#define DefineTempFPR(name, index) IMLReg name = _GetFPRTemp(ppcImlGenContext, index);
IMLReg _GetFPRRegPS0(ppcImlGenContext_t* ppcImlGenContext, uint32 regIndex)
@ -51,17 +49,6 @@ void PPRecompilerImmGen_optionalRoundBottomFPRToSinglePrecision(ppcImlGenContext
assert_dbg();
}
/*
* Rounds pair of doubles to single precision (if single precision accuracy is emulated)
*/
void PPRecompilerImmGen_optionalRoundPairFPRToSinglePrecision(ppcImlGenContext_t* ppcImlGenContext, IMLReg fprRegister, bool flushDenormals=false)
{
cemu_assert_suspicious(); // should not be used any longer
ppcImlGenContext->emitInst().make_fpr_r(PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_PAIR, fprRegister);
if( flushDenormals )
assert_dbg();
}
bool PPCRecompilerImlGen_LFS_LFSU_LFD_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool withUpdate, bool isDouble)
{
sint32 rA, frD;
@ -250,7 +237,7 @@ bool PPCRecompilerImlGen_FMUL(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
DefinePS0(fprD, frD);
// move frA to frD (if different register)
if( frD != frA )
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprD, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprD, fprA);
// multiply bottom double of frD with bottom double of frB
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM, fprD, fprC);
return true;
@ -268,7 +255,7 @@ bool PPCRecompilerImlGen_FDIV(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
{
DefineTempFPR(fprTemp, 0);
// move frA to temporary register
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprTemp, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprTemp, fprA);
// divide bottom double of temporary register by bottom double of frB
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_DIVIDE_BOTTOM, fprTemp, fprB);
// move result to frD
@ -296,7 +283,7 @@ bool PPCRecompilerImlGen_FMADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
{
DefineTempFPR(fprTemp, 0);
// move frA to temporary register
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprTemp, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprTemp, fprA);
// multiply bottom double of temporary register with bottom double of frC
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM, fprTemp, fprC);
// add result to frD
@ -313,7 +300,7 @@ bool PPCRecompilerImlGen_FMADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
}
// move frA to frD (if different register)
if( frD != frA )
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprD, fprA); // always copy ps0 and ps1
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprD, fprA); // always copy ps0 and ps1
// multiply bottom double of frD with bottom double of frC
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM, fprD, fprC);
// add frB
@ -333,10 +320,10 @@ bool PPCRecompilerImlGen_FMSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
{
// if frB is already in frD we need a temporary register to store the product of frA*frC
DefineTempFPR(fprTemp, 0);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprTemp, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprTemp, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM, fprTemp, fprC);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_SUB_BOTTOM, fprTemp, fprB);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprD, fprTemp);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprD, fprTemp);
return false;
}
if( frD == frC )
@ -348,7 +335,7 @@ bool PPCRecompilerImlGen_FMSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
}
// move frA to frD
if( frD != frA )
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprD, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprD, fprA);
// multiply bottom double of frD with bottom double of frC
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM, fprD, fprC);
// sub frB
@ -423,7 +410,7 @@ bool PPCRecompilerImlGen_FMULS(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
DefinePS0(fprD, frD);
// move frA to frD (if different register)
if( frD != frA )
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprD, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprD, fprA);
// multiply bottom double of frD with bottom double of frB
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM, fprD, fprC);
// adjust accuracy
@ -445,7 +432,7 @@ bool PPCRecompilerImlGen_FDIVS(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
{
DefineTempFPR(fprTemp, 0);
// move frA to temporary register
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprTemp, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprTemp, fprA);
// divide bottom double of temporary register by bottom double of frB
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_DIVIDE_BOTTOM, fprTemp, fprB);
// move result to frD
@ -457,7 +444,7 @@ bool PPCRecompilerImlGen_FDIVS(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
}
// move frA to frD (if different register)
if( frD != frA )
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprD, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprD, fprA);
// subtract bottom double of frB from bottom double of frD
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_DIVIDE_BOTTOM, fprD, fprB);
// adjust accuracy
@ -483,7 +470,7 @@ bool PPCRecompilerImlGen_FADDS(ppcImlGenContext_t* ppcImlGenContext, uint32 opco
DefinePS0(fprD, frD);
// move frA to frD (if different register)
if( frD != frA )
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_ASSIGN, fprD, fprA);
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM, fprD, fprA);
// add bottom double of frD and bottom double of frB
ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_ADD_BOTTOM, fprD, fprB);
// adjust accuracy