PPCRec: Rework floating point instructions (#1554)
Some checks failed
Build check / build (push) Waiting to run
Generate translation template / generate-pot (push) Failing after 1s

This commit is contained in:
Exzap 2025-05-08 03:48:22 +02:00 committed by GitHub
parent 33d5c6d490
commit de542410c2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 1428 additions and 2879 deletions

View file

@ -49,7 +49,6 @@ add_subdirectory(audio)
add_subdirectory(util)
add_subdirectory(imgui)
add_subdirectory(resource)
add_subdirectory(asm)
add_executable(CemuBin
main.cpp

View file

@ -548,7 +548,6 @@ else()
endif()
target_link_libraries(CemuCafe PRIVATE
CemuAsm
CemuAudio
CemuCommon
CemuComponents

View file

@ -10,6 +10,18 @@ namespace Espresso
CR_BIT_INDEX_SO = 3,
};
enum class PSQ_LOAD_TYPE
{
TYPE_F32 = 0,
TYPE_UNUSED1 = 1,
TYPE_UNUSED2 = 2,
TYPE_UNUSED3 = 3,
TYPE_U8 = 4,
TYPE_U16 = 5,
TYPE_S8 = 6,
TYPE_S16 = 7,
};
enum class PrimaryOpcode
{
// underscore at the end of the name means that this instruction always updates CR0 (as if RC bit is set)

View file

@ -1,5 +1,4 @@
#include "Cafe/HW/Espresso/Const.h"
#include "asm/x64util.h"
#include "config/ActiveSettings.h"
#include "util/helpers/fspinlock.h"
#include "util/highresolutiontimer/HighResolutionTimer.h"

View file

@ -609,7 +609,7 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, pp
}
else
{
debug_printf("PPCRecompilerX64Gen_imlInstruction_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation);
cemuLog_logDebug(LogType::Force, "PPCRecompilerX64Gen_imlInstruction_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation);
return false;
}
return true;
@ -635,7 +635,7 @@ bool PPCRecompilerX64Gen_imlInstruction_r_s32(PPCRecFunction_t* PPCRecFunction,
}
else
{
debug_printf("PPCRecompilerX64Gen_imlInstruction_r_s32(): Unsupported operation 0x%x\n", imlInstruction->operation);
cemuLog_logDebug(LogType::Force, "PPCRecompilerX64Gen_imlInstruction_r_s32(): Unsupported operation 0x%x\n", imlInstruction->operation);
return false;
}
return true;
@ -894,7 +894,7 @@ bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction,
}
else
{
debug_printf("PPCRecompilerX64Gen_imlInstruction_r_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation);
cemuLog_logDebug(LogType::Force, "PPCRecompilerX64Gen_imlInstruction_r_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation);
return false;
}
return true;
@ -1204,9 +1204,11 @@ void PPCRecompilerX64Gen_imlInstruction_r_name(PPCRecFunction_t* PPCRecFunction,
else if (imlInstruction->op_r_name.regR.GetBaseFormat() == IMLRegFormat::F64)
{
auto regR = _regF64(imlInstruction->op_r_name.regR);
if (name >= PPCREC_NAME_FPR0 && name < (PPCREC_NAME_FPR0 + 32))
if (name >= PPCREC_NAME_FPR_HALF && name < (PPCREC_NAME_FPR_HALF + 64))
{
x64Gen_movupd_xmmReg_memReg128(x64GenContext, regR, REG_RESV_HCPU, offsetof(PPCInterpreter_t, fpr) + sizeof(FPR_t) * (name - PPCREC_NAME_FPR0));
sint32 regIndex = (name - PPCREC_NAME_FPR_HALF) / 2;
sint32 pairIndex = (name - PPCREC_NAME_FPR_HALF) % 2;
x64Gen_movsd_xmmReg_memReg64(x64GenContext, regR, REG_RESV_HCPU, offsetof(PPCInterpreter_t, fpr) + sizeof(FPR_t) * regIndex + pairIndex * sizeof(double));
}
else if (name >= PPCREC_NAME_TEMPORARY_FPR0 || name < (PPCREC_NAME_TEMPORARY_FPR0 + 8))
{
@ -1281,9 +1283,11 @@ void PPCRecompilerX64Gen_imlInstruction_name_r(PPCRecFunction_t* PPCRecFunction,
{
auto regR = _regF64(imlInstruction->op_r_name.regR);
uint32 name = imlInstruction->op_r_name.name;
if (name >= PPCREC_NAME_FPR0 && name < (PPCREC_NAME_FPR0 + 32))
if (name >= PPCREC_NAME_FPR_HALF && name < (PPCREC_NAME_FPR_HALF + 64))
{
x64Gen_movupd_memReg128_xmmReg(x64GenContext, regR, REG_RESV_HCPU, offsetof(PPCInterpreter_t, fpr) + sizeof(FPR_t) * (name - PPCREC_NAME_FPR0));
sint32 regIndex = (name - PPCREC_NAME_FPR_HALF) / 2;
sint32 pairIndex = (name - PPCREC_NAME_FPR_HALF) % 2;
x64Gen_movsd_memReg64_xmmReg(x64GenContext, regR, REG_RESV_HCPU, offsetof(PPCInterpreter_t, fpr) + sizeof(FPR_t) * regIndex + (pairIndex ? sizeof(double) : 0));
}
else if (name >= PPCREC_NAME_TEMPORARY_FPR0 && name < (PPCREC_NAME_TEMPORARY_FPR0 + 8))
{

View file

@ -205,6 +205,7 @@ void x64Gen_movddup_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegi
void x64Gen_movhlps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_movsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_movsd_memReg64_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_movsd_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_movlpd_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_unpcklpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
@ -230,6 +231,7 @@ void x64Gen_andps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegist
void x64Gen_pcmpeqd_xmmReg_mem128Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, uint32 memReg, uint32 memImmS32);
void x64Gen_cvttpd2dq_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_cvttsd2si_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDest, sint32 xmmRegisterSrc);
void x64Gen_cvtsi2sd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 registerSrc);
void x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_cvtpd2ps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);

View file

@ -3,8 +3,6 @@
#include "BackendX64.h"
#include "Common/cpu_features.h"
#include "asm/x64util.h" // for recompiler_fres / frsqrte
uint32 _regF64(IMLReg physReg);
uint32 _regI32(IMLReg r)
@ -34,231 +32,6 @@ static x86Assembler64::GPR8_REX _reg8_from_reg32(x86Assembler64::GPR32 regId)
return (x86Assembler64::GPR8_REX)regId;
}
void PPCRecompilerX64Gen_imlInstr_gqr_generateScaleCode(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, sint32 registerXMM, bool isLoad, bool scalePS1, IMLReg registerGQR)
{
// load GQR
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, _regI32(registerGQR));
// extract scale field and multiply by 16 to get array offset
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, (isLoad?16:0)+8-4);
x64Gen_and_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, (0x3F<<4));
// multiply xmm by scale
x64Gen_add_reg64_reg64(x64GenContext, REG_RESV_TEMP, REG_RESV_RECDATA);
if (isLoad)
{
if(scalePS1)
x64Gen_mulpd_xmmReg_memReg128(x64GenContext, registerXMM, REG_RESV_TEMP, offsetof(PPCRecompilerInstanceData_t, _psq_ld_scale_ps0_ps1));
else
x64Gen_mulpd_xmmReg_memReg128(x64GenContext, registerXMM, REG_RESV_TEMP, offsetof(PPCRecompilerInstanceData_t, _psq_ld_scale_ps0_1));
}
else
{
if (scalePS1)
x64Gen_mulpd_xmmReg_memReg128(x64GenContext, registerXMM, REG_RESV_TEMP, offsetof(PPCRecompilerInstanceData_t, _psq_st_scale_ps0_ps1));
else
x64Gen_mulpd_xmmReg_memReg128(x64GenContext, registerXMM, REG_RESV_TEMP, offsetof(PPCRecompilerInstanceData_t, _psq_st_scale_ps0_1));
}
}
// generate code for PSQ load for a particular type
// if scaleGQR is -1 then a scale of 1.0 is assumed (no scale)
void PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, uint8 mode, sint32 registerXMM, sint32 memReg, sint32 memRegEx, sint32 memImmS32, bool indexed, IMLReg registerGQR = IMLREG_INVALID)
{
if (mode == PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1)
{
if (indexed)
{
assert_dbg();
}
// optimized code for ps float load
x64Emit_mov_reg64_mem64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
x64GenContext->emitter->BSWAP_q(REG_RESV_TEMP);
x64Gen_rol_reg64_imm8(x64GenContext, REG_RESV_TEMP, 32); // swap upper and lower DWORD
x64Gen_movq_xmmReg_reg64(x64GenContext, registerXMM, REG_RESV_TEMP);
x64Gen_cvtps2pd_xmmReg_xmmReg(x64GenContext, registerXMM, registerXMM);
// note: floats are not scaled
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0)
{
if (indexed)
{
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, memReg);
if (g_CPUFeatures.x86.movbe)
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32);
}
else
{
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, REG_RESV_TEMP, memImmS32);
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
}
}
else
{
if (g_CPUFeatures.x86.movbe)
{
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
}
else
{
x64Emit_mov_reg32_mem32(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
}
}
if (g_CPUFeatures.x86.avx)
{
x64Gen_movd_xmmReg_reg64Low32(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_TEMP);
}
else
{
x64Emit_mov_mem32_reg64(x64GenContext, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryFPR), REG_RESV_TEMP);
x64Gen_movddup_xmmReg_memReg64(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryFPR));
}
x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_FPR_TEMP);
// load constant 1.0 into lower half and upper half of temp register
x64Gen_movddup_xmmReg_memReg64(x64GenContext, registerXMM, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_constDouble1_1));
// overwrite lower half with single from memory
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, registerXMM, REG_RESV_FPR_TEMP);
// note: floats are not scaled
}
else
{
sint32 readSize;
bool isSigned = false;
if (mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1)
{
readSize = 16;
isSigned = true;
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1)
{
readSize = 16;
isSigned = false;
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1)
{
readSize = 8;
isSigned = true;
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_U8_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1)
{
readSize = 8;
isSigned = false;
}
else
assert_dbg();
bool loadPS1 = (mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1);
for (sint32 wordIndex = 0; wordIndex < 2; wordIndex++)
{
if (indexed)
{
assert_dbg();
}
// read from memory
if (wordIndex == 1 && loadPS1 == false)
{
// store constant 1
x64Gen_mov_mem32Reg64_imm32(x64GenContext, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryGPR) + sizeof(uint32) * 1, 1);
}
else
{
uint32 memOffset = memImmS32 + wordIndex * (readSize / 8);
if (readSize == 16)
{
// half word
x64Gen_movZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memOffset);
x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8); // endian swap
if (isSigned)
x64Gen_movSignExtend_reg64Low32_reg64Low16(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
else
x64Gen_movZeroExtend_reg64Low32_reg64Low16(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
}
else if (readSize == 8)
{
// byte
x64Emit_mov_reg64b_mem8(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memOffset);
if (isSigned)
x64Gen_movSignExtend_reg64Low32_reg64Low8(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
else
x64Gen_movZeroExtend_reg64Low32_reg64Low8(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
}
// store
x64Emit_mov_mem32_reg32(x64GenContext, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryGPR) + sizeof(uint32) * wordIndex, REG_RESV_TEMP);
}
}
// convert the two integers to doubles
x64Gen_cvtpi2pd_xmmReg_mem64Reg64(x64GenContext, registerXMM, REG_RESV_HCPU, offsetof(PPCInterpreter_t, temporaryGPR));
// scale
if (registerGQR.IsValid())
PPCRecompilerX64Gen_imlInstr_gqr_generateScaleCode(ppcImlGenContext, x64GenContext, registerXMM, true, loadPS1, registerGQR);
}
}
void PPCRecompilerX64Gen_imlInstr_psq_load_generic(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, uint8 mode, sint32 registerXMM, sint32 memReg, sint32 memRegEx, sint32 memImmS32, bool indexed, IMLReg registerGQR)
{
bool loadPS1 = (mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1);
// load GQR
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, _regI32(registerGQR));
// extract load type field
x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, 16);
x64Gen_and_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 7);
// jump cases
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 4); // type 4 -> u8
sint32 jumpOffset_caseU8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 5); // type 5 -> u16
sint32 jumpOffset_caseU16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 6); // type 4 -> s8
sint32 jumpOffset_caseS8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 7); // type 5 -> s16
sint32 jumpOffset_caseS16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
// default case -> float
// generate cases
uint32 jumpOffset_endOfFloat;
uint32 jumpOffset_endOfU8;
uint32 jumpOffset_endOfU16;
uint32 jumpOffset_endOfS8;
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfFloat = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseU16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_U16_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfU8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseS16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_S16_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfU16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseU8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_U8_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfS8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseS8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, loadPS1 ? PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1 : PPCREC_FPR_LD_MODE_PSQ_S8_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfFloat, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfU8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfU16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfS8, x64GenContext->emitter->GetWriteIndex());
}
// load from memory
bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction, bool indexed)
{
@ -269,7 +42,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
realRegisterMem2 = _regI32(imlInstruction->op_storeLoad.registerMem2);
uint8 mode = imlInstruction->op_storeLoad.mode;
if( mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1 )
if( mode == PPCREC_FPR_LD_MODE_SINGLE )
{
// load byte swapped single into temporary FPR
if( indexed )
@ -299,10 +72,9 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
else
{
x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext, realRegisterXMM, realRegisterXMM);
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, realRegisterXMM, realRegisterXMM);
}
}
}
else if( mode == PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0 )
else if( mode == PPCREC_FPR_LD_MODE_DOUBLE )
{
if( g_CPUFeatures.x86.avx )
{
@ -361,25 +133,6 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
}
}
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1 )
{
PPCRecompilerX64Gen_imlInstr_psq_load(ppcImlGenContext, x64GenContext, mode, realRegisterXMM, realRegisterMem, realRegisterMem2, imlInstruction->op_storeLoad.immS32, indexed);
}
else if (mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1 ||
mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0)
{
PPCRecompilerX64Gen_imlInstr_psq_load_generic(ppcImlGenContext, x64GenContext, mode, realRegisterXMM, realRegisterMem, realRegisterMem2, imlInstruction->op_storeLoad.immS32, indexed, imlInstruction->op_storeLoad.registerGQR);
}
else
{
return false;
@ -387,188 +140,6 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunctio
return true;
}
void PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, uint8 mode, sint32 registerXMM, sint32 memReg, sint32 memRegEx, sint32 memImmS32, bool indexed, IMLReg registerGQR = IMLREG_INVALID)
{
bool storePS1 = (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1);
bool isFloat = mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0 || mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1;
if (registerGQR.IsValid())
{
// move to temporary xmm and update registerXMM
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
registerXMM = REG_RESV_FPR_TEMP;
// apply scale
if(isFloat == false)
PPCRecompilerX64Gen_imlInstr_gqr_generateScaleCode(ppcImlGenContext, x64GenContext, registerXMM, false, storePS1, registerGQR);
}
if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0)
{
x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
x64Gen_movd_reg64Low32_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
if (g_CPUFeatures.x86.movbe == false)
x64Gen_bswap_reg64Lower32bit(x64GenContext, REG_RESV_TEMP);
if (indexed)
{
cemu_assert_debug(memReg != memRegEx);
x64Gen_add_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx);
}
if (g_CPUFeatures.x86.movbe)
x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, memReg, memImmS32, REG_RESV_TEMP);
else
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, memReg, memImmS32, REG_RESV_TEMP);
if (indexed)
{
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, memReg, memRegEx);
}
return;
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1)
{
if (indexed)
assert_dbg(); // todo
x64Gen_cvtpd2ps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
x64Gen_movq_reg64_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
x64Gen_rol_reg64_imm8(x64GenContext, REG_RESV_TEMP, 32); // swap upper and lower DWORD
x64GenContext->emitter->BSWAP_q(REG_RESV_TEMP);
x64Gen_mov_mem64Reg64PlusReg64_reg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE, memReg, memImmS32);
return;
}
// store as integer
// get limit from mode
sint32 clampMin, clampMax;
sint32 bitWriteSize;
if (mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1 )
{
clampMin = -128;
clampMax = 127;
bitWriteSize = 8;
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1 )
{
clampMin = 0;
clampMax = 255;
bitWriteSize = 8;
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1 )
{
clampMin = 0;
clampMax = 0xFFFF;
bitWriteSize = 16;
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1 )
{
clampMin = -32768;
clampMax = 32767;
bitWriteSize = 16;
}
else
{
cemu_assert(false);
}
for (sint32 valueIndex = 0; valueIndex < (storePS1?2:1); valueIndex++)
{
// todo - multiply by GQR scale
if (valueIndex == 0)
{
// convert low half (PS0) to integer
x64Gen_cvttsd2si_reg64Low_xmmReg(x64GenContext, REG_RESV_TEMP, registerXMM);
}
else
{
// load top half (PS1) into bottom half of temporary register
x64Gen_movhlps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, registerXMM);
// convert low half to integer
x64Gen_cvttsd2si_reg64Low_xmmReg(x64GenContext, REG_RESV_TEMP, REG_RESV_FPR_TEMP);
}
// max(i, -clampMin)
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, clampMin);
sint32 jumpInstructionOffset1 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_SIGNED_GREATER_EQUAL, 0);
x64Gen_mov_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, clampMin);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset1, x64GenContext->emitter->GetWriteIndex());
// min(i, clampMax)
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, clampMax);
sint32 jumpInstructionOffset2 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_SIGNED_LESS_EQUAL, 0);
x64Gen_mov_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, clampMax);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset2, x64GenContext->emitter->GetWriteIndex());
// endian swap
if( bitWriteSize == 16)
x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8);
// write to memory
if (indexed)
assert_dbg(); // unsupported
sint32 memOffset = memImmS32 + valueIndex * (bitWriteSize/8);
if (bitWriteSize == 8)
x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, memReg, memOffset, REG_RESV_TEMP);
else if (bitWriteSize == 16)
x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, memReg, memOffset, REG_RESV_TEMP);
}
}
void PPCRecompilerX64Gen_imlInstr_psq_store_generic(ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, uint8 mode, sint32 registerXMM, sint32 memReg, sint32 memRegEx, sint32 memImmS32, bool indexed, IMLReg registerGQR)
{
bool storePS1 = (mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1);
// load GQR
x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, _regI32(registerGQR));
// extract store type field
x64Gen_and_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 7);
// jump cases
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 4); // type 4 -> u8
sint32 jumpOffset_caseU8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 5); // type 5 -> u16
sint32 jumpOffset_caseU16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 6); // type 4 -> s8
sint32 jumpOffset_caseS8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
x64Gen_cmp_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, 7); // type 5 -> s16
sint32 jumpOffset_caseS16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_far(x64GenContext, X86_CONDITION_EQUAL, 0);
// default case -> float
// generate cases
uint32 jumpOffset_endOfFloat;
uint32 jumpOffset_endOfU8;
uint32 jumpOffset_endOfU16;
uint32 jumpOffset_endOfS8;
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfFloat = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseU16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_U16_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfU8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseS16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_S16_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfU16 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseU8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_U8_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
jumpOffset_endOfS8 = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmp_imm32(x64GenContext, 0);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_caseS8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, storePS1 ? PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1 : PPCREC_FPR_ST_MODE_PSQ_S8_PS0, registerXMM, memReg, memRegEx, memImmS32, indexed, registerGQR);
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfFloat, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfU8, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfU16, x64GenContext->emitter->GetWriteIndex());
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpOffset_endOfS8, x64GenContext->emitter->GetWriteIndex());
}
// store to memory
bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction, bool indexed)
{
@ -578,7 +149,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
if( indexed )
realRegisterMem2 = _regI32(imlInstruction->op_storeLoad.registerMem2);
uint8 mode = imlInstruction->op_storeLoad.mode;
if( mode == PPCREC_FPR_ST_MODE_SINGLE_FROM_PS0 )
if( mode == PPCREC_FPR_ST_MODE_SINGLE )
{
if (imlInstruction->op_storeLoad.flags2.notExpanded)
{
@ -607,7 +178,7 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2);
}
}
else if( mode == PPCREC_FPR_ST_MODE_DOUBLE_FROM_PS0 )
else if( mode == PPCREC_FPR_ST_MODE_DOUBLE )
{
if( indexed )
{
@ -645,192 +216,61 @@ bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFuncti
x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP);
}
}
else if(mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0 ||
mode == PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1 )
{
cemu_assert_debug(imlInstruction->op_storeLoad.flags2.notExpanded == false);
PPCRecompilerX64Gen_imlInstr_psq_store(ppcImlGenContext, x64GenContext, mode, realRegisterXMM, realRegisterMem, realRegisterMem2, imlInstruction->op_storeLoad.immS32, indexed);
}
else if (mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1 ||
mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0)
{
PPCRecompilerX64Gen_imlInstr_psq_store_generic(ppcImlGenContext, x64GenContext, mode, realRegisterXMM, realRegisterMem, realRegisterMem2, imlInstruction->op_storeLoad.immS32, indexed, imlInstruction->op_storeLoad.registerGQR);
}
else
{
if( indexed )
assert_dbg(); // todo
debug_printf("PPCRecompilerX64Gen_imlInstruction_fpr_store(): Unsupported mode %d\n", mode);
return false;
}
return true;
}
void _swapPS0PS1(x64GenContext_t* x64GenContext, sint32 xmmReg)
{
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, xmmReg, xmmReg, 1);
}
// FPR op FPR
void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
{
if( imlInstruction->operation == PPCREC_IML_OP_FPR_FLOAT_TO_INT )
{
uint32 regGpr = _regI32(imlInstruction->op_fpr_r_r.regR);
uint32 regFpr = _regF64(imlInstruction->op_fpr_r_r.regA);
x64Gen_cvttsd2si_reg64Low_xmmReg(x64GenContext, regGpr, regFpr);
return;
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_INT_TO_FLOAT )
{
uint32 regFpr = _regF64(imlInstruction->op_fpr_r_r.regR);
uint32 regGpr = _regI32(imlInstruction->op_fpr_r_r.regA);
x64Gen_cvtsi2sd_xmmReg_xmmReg(x64GenContext, regFpr, regGpr);
return;
}
uint32 regR = _regF64(imlInstruction->op_fpr_r_r.regR);
uint32 regA = _regF64(imlInstruction->op_fpr_r_r.regA);
if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP )
{
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP )
{
// VPUNPCKHQDQ
if (regR == regA)
{
// unpack top to bottom and top
x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
//else if ( hasAVXSupport )
//{
// // unpack top to bottom and top with non-destructive destination
// // update: On Ivy Bridge this causes weird stalls?
// x64Gen_avx_VUNPCKHPD_xmm_xmm_xmm(x64GenContext, registerResult, registerOperand, registerOperand);
//}
else
{
// move top to bottom
x64Gen_movhlps_xmmReg_xmmReg(x64GenContext, regR, regA);
// duplicate bottom
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, regR, regR);
}
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM )
if( imlInstruction->operation == PPCREC_IML_OP_FPR_ASSIGN )
{
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_TOP )
{
x64Gen_unpcklpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED )
{
if( regR != regA )
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
_swapPS0PS1(x64GenContext, regR);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_TOP )
{
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, regR, regA, 2);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM )
{
// use unpckhpd here?
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, regR, regA, 3);
_swapPS0PS1(x64GenContext, regR);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM )
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_MULTIPLY )
{
x64Gen_mulsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_MULTIPLY_PAIR )
{
x64Gen_mulpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_DIVIDE_BOTTOM )
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_DIVIDE )
{
x64Gen_divsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_DIVIDE_PAIR)
{
x64Gen_divpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ADD_BOTTOM )
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ADD )
{
x64Gen_addsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ADD_PAIR )
{
x64Gen_addpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUB_PAIR )
{
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUB_BOTTOM )
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUB )
{
x64Gen_subsd_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_ASSIGN )
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_FCTIWZ )
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_FCTIWZ )
{
x64Gen_cvttsd2si_xmmReg_xmmReg(x64GenContext, REG_RESV_TEMP, regA);
x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
// move to FPR register
x64Gen_movq_xmmReg_reg64(x64GenContext, regR, REG_RESV_TEMP);
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT)
{
// move register to XMM15
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
// call assembly routine to calculate accurate FRSQRTE result in XMM15
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_frsqrte);
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP);
// copy result to bottom of result register
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_NEGATE_PAIR )
{
// copy register
if( regR != regA )
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
}
// toggle sign bits
x64Gen_xorps_xmmReg_mem128Reg64(x64GenContext, regR, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_xorNegateMaskPair));
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ABS_PAIR )
{
// copy register
if( regR != regA )
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
}
// set sign bit to 0
x64Gen_andps_xmmReg_mem128Reg64(x64GenContext, regR, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_andAbsMaskPair));
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_FRES_PAIR || imlInstruction->operation == PPCREC_IML_OP_FPR_FRSQRTE_PAIR)
{
// calculate bottom half of result
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
if(imlInstruction->operation == PPCREC_IML_OP_FPR_FRES_PAIR)
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_fres);
else
x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)recompiler_frsqrte);
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP); // calculate fres result in xmm15
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
// calculate top half of result
// todo - this top to bottom copy can be optimized?
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, REG_RESV_FPR_TEMP, regA, 3);
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_FPR_TEMP, 1); // swap top and bottom
x64Gen_call_reg64(x64GenContext, REG_RESV_TEMP); // calculate fres result in xmm15
x64Gen_unpcklpd_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP); // copy bottom to top
}
else
{
assert_dbg();
@ -846,7 +286,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti
uint32 regA = _regF64(imlInstruction->op_fpr_r_r_r.regA);
uint32 regB = _regF64(imlInstruction->op_fpr_r_r_r.regB);
if (imlInstruction->operation == PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM)
if (imlInstruction->operation == PPCREC_IML_OP_FPR_MULTIPLY)
{
if (regR == regA)
{
@ -862,7 +302,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti
x64Gen_mulsd_xmmReg_xmmReg(x64GenContext, regR, regB);
}
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_ADD_BOTTOM)
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_ADD)
{
// todo: Use AVX 3-operand VADDSD if available
if (regR == regA)
@ -879,30 +319,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFuncti
x64Gen_addsd_xmmReg_xmmReg(x64GenContext, regR, regB);
}
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_SUB_PAIR)
{
// registerResult = registerOperandA - registerOperandB
if( regR == regA )
{
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, regR, regB);
}
else if (g_CPUFeatures.x86.avx)
{
x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext, regR, regA, regB);
}
else if( regR == regB )
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regB);
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
}
else
{
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, regA);
x64Gen_subpd_xmmReg_xmmReg(x64GenContext, regR, regB);
}
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUB_BOTTOM )
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUB )
{
if( regR == regA )
{
@ -934,39 +351,7 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r_r(PPCRecFunction_t* PPCRecFunc
uint32 regB = _regF64(imlInstruction->op_fpr_r_r_r_r.regB);
uint32 regC = _regF64(imlInstruction->op_fpr_r_r_r_r.regC);
if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUM0 )
{
// todo: Investigate if there are other optimizations possible if the operand registers overlap
// generic case
// 1) move frA bottom to frTemp bottom and top
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
// 2) add frB (both halfs, lower half is overwritten in the next step)
x64Gen_addpd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regB);
// 3) Interleave top of frTemp and frC
x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regC);
// todo: We can optimize the REG_RESV_FPR_TEMP -> resultReg copy operation away when the result register does not overlap with any of the operand registers
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SUM1 )
{
// todo: Investigate if there are other optimizations possible if the operand registers overlap
// 1) move frA bottom to frTemp bottom and top
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA);
// 2) add frB (both halfs, lower half is overwritten in the next step)
x64Gen_addpd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regB);
// 3) Copy bottom from frC
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regC);
//// 4) Swap bottom and top half
//x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_FPR_TEMP, 1);
// todo: We can optimize the REG_RESV_FPR_TEMP -> resultReg copy operation away when the result register does not overlap with any of the operand registers
x64Gen_movaps_xmmReg_xmmReg(x64GenContext, regR, REG_RESV_FPR_TEMP);
//float s0 = (float)hCPU->fpr[frC].fp0;
//float s1 = (float)(hCPU->fpr[frA].fp0 + hCPU->fpr[frB].fp1);
//hCPU->fpr[frD].fp0 = s0;
//hCPU->fpr[frD].fp1 = s1;
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SELECT_BOTTOM )
if( imlInstruction->operation == PPCREC_IML_OP_FPR_SELECT )
{
x64Gen_comisd_xmmReg_mem64Reg64(x64GenContext, regA, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_constDouble0_0));
sint32 jumpInstructionOffset1 = x64GenContext->emitter->GetWriteIndex();
@ -981,38 +366,6 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r_r(PPCRecFunction_t* PPCRecFunc
// end
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset2, x64GenContext->emitter->GetWriteIndex());
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_SELECT_PAIR )
{
// select bottom
x64Gen_comisd_xmmReg_mem64Reg64(x64GenContext, regA, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_constDouble0_0));
sint32 jumpInstructionOffset1_bottom = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_UNSIGNED_BELOW, 0);
// select C bottom
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, regC);
sint32 jumpInstructionOffset2_bottom = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NONE, 0);
// select B bottom
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset1_bottom, x64GenContext->emitter->GetWriteIndex());
x64Gen_movsd_xmmReg_xmmReg(x64GenContext, regR, regB);
// end
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset2_bottom, x64GenContext->emitter->GetWriteIndex());
// select top
x64Gen_movhlps_xmmReg_xmmReg(x64GenContext, REG_RESV_FPR_TEMP, regA); // copy top to bottom (todo: May cause stall?)
x64Gen_comisd_xmmReg_mem64Reg64(x64GenContext, REG_RESV_FPR_TEMP, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_constDouble0_0));
sint32 jumpInstructionOffset1_top = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_UNSIGNED_BELOW, 0);
// select C top
//x64Gen_movsd_xmmReg_xmmReg(x64GenContext, registerResult, registerOperandC);
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, regR, regC, 2);
sint32 jumpInstructionOffset2_top = x64GenContext->emitter->GetWriteIndex();
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NONE, 0);
// select B top
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset1_top, x64GenContext->emitter->GetWriteIndex());
//x64Gen_movsd_xmmReg_xmmReg(x64GenContext, registerResult, registerOperandB);
x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext, regR, regB, 2);
// end
PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset2_top, x64GenContext->emitter->GetWriteIndex());
}
else
assert_dbg();
}
@ -1021,15 +374,19 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r(PPCRecFunction_t* PPCRecFunction,
{
uint32 regR = _regF64(imlInstruction->op_fpr_r.regR);
if( imlInstruction->operation == PPCREC_IML_OP_FPR_NEGATE_BOTTOM )
if( imlInstruction->operation == PPCREC_IML_OP_FPR_NEGATE )
{
x64Gen_xorps_xmmReg_mem128Reg64(x64GenContext, regR, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_xorNegateMaskBottom));
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ABS_BOTTOM )
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_LOAD_ONE )
{
x64Gen_movsd_xmmReg_memReg64(x64GenContext, regR, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_constDouble1_1));
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ABS )
{
x64Gen_andps_xmmReg_mem128Reg64(x64GenContext, regR, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_andAbsMaskBottom));
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_NEGATIVE_ABS_BOTTOM )
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_NEGATIVE_ABS )
{
x64Gen_orps_xmmReg_mem128Reg64(x64GenContext, regR, REG_RESV_RECDATA, offsetof(PPCRecompilerInstanceData_t, _x64XMM_xorNegateMaskBottom));
}
@ -1040,19 +397,10 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r(PPCRecFunction_t* PPCRecFunction,
// convert back to 64bit double
x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext, regR, regR);
}
else if( imlInstruction->operation == PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_PAIR )
{
// convert to 32bit singles
x64Gen_cvtpd2ps_xmmReg_xmmReg(x64GenContext, regR, regR);
// convert back to 64bit doubles
x64Gen_cvtps2pd_xmmReg_xmmReg(x64GenContext, regR, regR);
}
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64)
else if (imlInstruction->operation == PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64)
{
// convert bottom to 64bit double
x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext, regR, regR);
// copy to top half
x64Gen_movddup_xmmReg_xmmReg(x64GenContext, regR, regR);
}
else
{

View file

@ -213,6 +213,37 @@ void x64Gen_movsd_memReg64_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegi
}
}
void x64Gen_movsd_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE2
if( memRegister == X86_REG_RSP )
{
// MOVSD <xmm>, [RSP+<imm>]
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, 0, xmmRegister, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x10);
x64Gen_writeU8(x64GenContext, 0x84+(xmmRegister&7)*8);
x64Gen_writeU8(x64GenContext, 0x24);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else if( memRegister == 15 )
{
// MOVSD <xmm>, [R15+<imm>]
x64Gen_writeU8(x64GenContext, 0x36);
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, memRegister, xmmRegister, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x10);
x64Gen_writeU8(x64GenContext, 0x87+(xmmRegister&7)*8);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else
{
assert_dbg();
}
}
void x64Gen_movlpd_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE3
@ -561,6 +592,16 @@ void x64Gen_cvttsd2si_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 regis
x64Gen_writeU8(x64GenContext, 0xC0+(registerDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_cvtsi2sd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 registerSrc)
{
// SSE2
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, registerSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x2A);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(registerSrc&7));
}
void x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2

View file

@ -189,9 +189,13 @@ void IMLDebug_DisassembleInstruction(const IMLInstruction& inst, std::string& di
{
strOutput.addFmt("r{}", inst.op_r_name.name - PPCREC_NAME_R0);
}
else if (inst.op_r_name.name >= PPCREC_NAME_FPR0 && inst.op_r_name.name < (PPCREC_NAME_FPR0 + 999))
if (inst.op_r_name.name >= PPCREC_NAME_FPR_HALF && inst.op_r_name.name < (PPCREC_NAME_FPR_HALF + 32*2))
{
strOutput.addFmt("f{}", inst.op_r_name.name - PPCREC_NAME_FPR0);
strOutput.addFmt("f{}", inst.op_r_name.name - ((PPCREC_NAME_FPR_HALF - inst.op_r_name.name)/2));
if ((inst.op_r_name.name-PPCREC_NAME_FPR_HALF)&1)
strOutput.add(".ps1");
else
strOutput.add(".ps0");
}
else if (inst.op_r_name.name >= PPCREC_NAME_SPR0 && inst.op_r_name.name < (PPCREC_NAME_SPR0 + 999))
{

View file

@ -226,35 +226,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
// address is in gpr register
if (op_storeLoad.registerMem.IsValid())
registersUsed->readGPR1 = op_storeLoad.registerMem;
// determine partially written result
switch (op_storeLoad.mode)
{
case PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0:
case PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1:
cemu_assert_debug(op_storeLoad.registerGQR.IsValid());
registersUsed->readGPR2 = op_storeLoad.registerGQR;
break;
case PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0:
// PS1 remains the same
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
registersUsed->readGPR2 = op_storeLoad.registerData;
break;
case PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S16_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U16_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U8_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S8_PS0:
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
break;
default:
cemu_assert_unimplemented();
}
}
else if (type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED)
{
@ -265,34 +236,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
registersUsed->readGPR1 = op_storeLoad.registerMem;
if (op_storeLoad.registerMem2.IsValid())
registersUsed->readGPR2 = op_storeLoad.registerMem2;
// determine partially written result
switch (op_storeLoad.mode)
{
case PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0:
case PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1:
cemu_assert_debug(op_storeLoad.registerGQR.IsValid());
registersUsed->readGPR3 = op_storeLoad.registerGQR;
break;
case PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0:
// PS1 remains the same
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
registersUsed->readGPR3 = op_storeLoad.registerData;
break;
case PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S16_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U16_PS0:
case PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1:
case PPCREC_FPR_LD_MODE_PSQ_U8_PS0:
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
break;
default:
cemu_assert_unimplemented();
}
}
else if (type == PPCREC_IML_TYPE_FPR_STORE)
{
@ -300,18 +243,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
registersUsed->readGPR1 = op_storeLoad.registerData;
if (op_storeLoad.registerMem.IsValid())
registersUsed->readGPR2 = op_storeLoad.registerMem;
// PSQ generic stores also access GQR
switch (op_storeLoad.mode)
{
case PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0:
case PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1:
cemu_assert_debug(op_storeLoad.registerGQR.IsValid());
registersUsed->readGPR3 = op_storeLoad.registerGQR;
break;
default:
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
break;
}
}
else if (type == PPCREC_IML_TYPE_FPR_STORE_INDEXED)
{
@ -322,72 +253,34 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
registersUsed->readGPR2 = op_storeLoad.registerMem;
if (op_storeLoad.registerMem2.IsValid())
registersUsed->readGPR3 = op_storeLoad.registerMem2;
// PSQ generic stores also access GQR
switch (op_storeLoad.mode)
{
case PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0:
case PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1:
cemu_assert_debug(op_storeLoad.registerGQR.IsValid());
registersUsed->readGPR4 = op_storeLoad.registerGQR;
break;
default:
cemu_assert_debug(op_storeLoad.registerGQR.IsInvalid());
break;
}
}
else if (type == PPCREC_IML_TYPE_FPR_R_R)
{
// fpr operation
if (operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED ||
operation == PPCREC_IML_OP_ASSIGN ||
operation == PPCREC_IML_OP_FPR_NEGATE_PAIR ||
operation == PPCREC_IML_OP_FPR_ABS_PAIR ||
operation == PPCREC_IML_OP_FPR_FRES_PAIR ||
operation == PPCREC_IML_OP_FPR_FRSQRTE_PAIR)
{
// operand read, result written
registersUsed->readGPR1 = op_fpr_r_r.regA;
registersUsed->writtenGPR1 = op_fpr_r_r.regR;
}
else if (
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM ||
operation == PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_TOP ||
operation == PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM ||
operation == PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64 ||
operation == PPCREC_IML_OP_FPR_BOTTOM_FCTIWZ ||
operation == PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT
if (
operation == PPCREC_IML_OP_FPR_ASSIGN ||
operation == PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64 ||
operation == PPCREC_IML_OP_FPR_FCTIWZ
)
{
// operand read, result read and (partially) written
registersUsed->readGPR1 = op_fpr_r_r.regA;
registersUsed->readGPR2 = op_fpr_r_r.regR;
registersUsed->writtenGPR1 = op_fpr_r_r.regR;
}
else if (operation == PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM ||
operation == PPCREC_IML_OP_FPR_MULTIPLY_PAIR ||
operation == PPCREC_IML_OP_FPR_DIVIDE_BOTTOM ||
operation == PPCREC_IML_OP_FPR_DIVIDE_PAIR ||
operation == PPCREC_IML_OP_FPR_ADD_BOTTOM ||
operation == PPCREC_IML_OP_FPR_ADD_PAIR ||
operation == PPCREC_IML_OP_FPR_SUB_PAIR ||
operation == PPCREC_IML_OP_FPR_SUB_BOTTOM)
else if (operation == PPCREC_IML_OP_FPR_MULTIPLY ||
operation == PPCREC_IML_OP_FPR_DIVIDE ||
operation == PPCREC_IML_OP_FPR_ADD ||
operation == PPCREC_IML_OP_FPR_SUB)
{
// operand read, result read and written
registersUsed->readGPR1 = op_fpr_r_r.regA;
registersUsed->readGPR2 = op_fpr_r_r.regR;
registersUsed->writtenGPR1 = op_fpr_r_r.regR;
}
else if (operation == PPCREC_IML_OP_FPR_FCMPU_BOTTOM ||
operation == PPCREC_IML_OP_FPR_FCMPU_TOP ||
operation == PPCREC_IML_OP_FPR_FCMPO_BOTTOM)
else if (operation == PPCREC_IML_OP_FPR_FLOAT_TO_INT ||
operation == PPCREC_IML_OP_FPR_INT_TO_FLOAT)
{
// operand read, result read
registersUsed->writtenGPR1 = op_fpr_r_r.regR;
registersUsed->readGPR1 = op_fpr_r_r.regA;
registersUsed->readGPR2 = op_fpr_r_r.regR;
}
else
cemu_assert_unimplemented();
@ -398,19 +291,6 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
registersUsed->readGPR1 = op_fpr_r_r_r.regA;
registersUsed->readGPR2 = op_fpr_r_r_r.regB;
registersUsed->writtenGPR1 = op_fpr_r_r_r.regR;
// handle partially written result
switch (operation)
{
case PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM:
case PPCREC_IML_OP_FPR_ADD_BOTTOM:
case PPCREC_IML_OP_FPR_SUB_BOTTOM:
registersUsed->readGPR3 = op_fpr_r_r_r.regR;
break;
case PPCREC_IML_OP_FPR_SUB_PAIR:
break;
default:
cemu_assert_unimplemented();
}
}
else if (type == PPCREC_IML_TYPE_FPR_R_R_R_R)
{
@ -419,33 +299,23 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const
registersUsed->readGPR2 = op_fpr_r_r_r_r.regB;
registersUsed->readGPR3 = op_fpr_r_r_r_r.regC;
registersUsed->writtenGPR1 = op_fpr_r_r_r_r.regR;
// handle partially written result
switch (operation)
{
case PPCREC_IML_OP_FPR_SELECT_BOTTOM:
registersUsed->readGPR4 = op_fpr_r_r_r_r.regR;
break;
case PPCREC_IML_OP_FPR_SUM0:
case PPCREC_IML_OP_FPR_SUM1:
case PPCREC_IML_OP_FPR_SELECT_PAIR:
break;
default:
cemu_assert_unimplemented();
}
}
else if (type == PPCREC_IML_TYPE_FPR_R)
{
// fpr operation
if (operation == PPCREC_IML_OP_FPR_NEGATE_BOTTOM ||
operation == PPCREC_IML_OP_FPR_ABS_BOTTOM ||
operation == PPCREC_IML_OP_FPR_NEGATIVE_ABS_BOTTOM ||
operation == PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64 ||
operation == PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM ||
operation == PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_PAIR)
if (operation == PPCREC_IML_OP_FPR_NEGATE ||
operation == PPCREC_IML_OP_FPR_ABS ||
operation == PPCREC_IML_OP_FPR_NEGATIVE_ABS ||
operation == PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64 ||
operation == PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM)
{
registersUsed->readGPR1 = op_fpr_r.regR;
registersUsed->writtenGPR1 = op_fpr_r.regR;
}
else if (operation == PPCREC_IML_OP_FPR_LOAD_ONE)
{
registersUsed->writtenGPR1 = op_fpr_r.regR;
}
else
cemu_assert_unimplemented();
}
@ -608,27 +478,23 @@ void IMLInstruction::RewriteGPR(const std::unordered_map<IMLRegID, IMLRegID>& tr
{
op_storeLoad.registerData = replaceRegisterIdMultiple(op_storeLoad.registerData, translationTable);
op_storeLoad.registerMem = replaceRegisterIdMultiple(op_storeLoad.registerMem, translationTable);
op_storeLoad.registerGQR = replaceRegisterIdMultiple(op_storeLoad.registerGQR, translationTable);
}
else if (type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED)
{
op_storeLoad.registerData = replaceRegisterIdMultiple(op_storeLoad.registerData, translationTable);
op_storeLoad.registerMem = replaceRegisterIdMultiple(op_storeLoad.registerMem, translationTable);
op_storeLoad.registerMem2 = replaceRegisterIdMultiple(op_storeLoad.registerMem2, translationTable);
op_storeLoad.registerGQR = replaceRegisterIdMultiple(op_storeLoad.registerGQR, translationTable);
}
else if (type == PPCREC_IML_TYPE_FPR_STORE)
{
op_storeLoad.registerData = replaceRegisterIdMultiple(op_storeLoad.registerData, translationTable);
op_storeLoad.registerMem = replaceRegisterIdMultiple(op_storeLoad.registerMem, translationTable);
op_storeLoad.registerGQR = replaceRegisterIdMultiple(op_storeLoad.registerGQR, translationTable);
}
else if (type == PPCREC_IML_TYPE_FPR_STORE_INDEXED)
{
op_storeLoad.registerData = replaceRegisterIdMultiple(op_storeLoad.registerData, translationTable);
op_storeLoad.registerMem = replaceRegisterIdMultiple(op_storeLoad.registerMem, translationTable);
op_storeLoad.registerMem2 = replaceRegisterIdMultiple(op_storeLoad.registerMem2, translationTable);
op_storeLoad.registerGQR = replaceRegisterIdMultiple(op_storeLoad.registerGQR, translationTable);
}
else if (type == PPCREC_IML_TYPE_FPR_R)
{

View file

@ -126,46 +126,22 @@ enum
PPCREC_IML_OP_SRW, // SRW (shift based on register by up to 63 bits)
PPCREC_IML_OP_CNTLZW,
// FPU
PPCREC_IML_OP_FPR_ADD_BOTTOM,
PPCREC_IML_OP_FPR_ADD_PAIR,
PPCREC_IML_OP_FPR_SUB_PAIR,
PPCREC_IML_OP_FPR_SUB_BOTTOM,
PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM,
PPCREC_IML_OP_FPR_MULTIPLY_PAIR,
PPCREC_IML_OP_FPR_DIVIDE_BOTTOM,
PPCREC_IML_OP_FPR_DIVIDE_PAIR,
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP,
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP,
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM,
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_TOP, // leave bottom of destination untouched
PPCREC_IML_OP_FPR_COPY_TOP_TO_TOP, // leave bottom of destination untouched
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM, // leave top of destination untouched
PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED,
PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64, // expand bottom f32 to f64 in bottom and top half
PPCREC_IML_OP_FPR_FCMPO_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_BOTTOM, // deprecated
PPCREC_IML_OP_FPR_FCMPU_TOP, // deprecated
PPCREC_IML_OP_FPR_NEGATE_BOTTOM,
PPCREC_IML_OP_FPR_NEGATE_PAIR,
PPCREC_IML_OP_FPR_ABS_BOTTOM, // abs(fp0)
PPCREC_IML_OP_FPR_ABS_PAIR,
PPCREC_IML_OP_FPR_FRES_PAIR, // 1.0/fp approx (Espresso accuracy)
PPCREC_IML_OP_FPR_FRSQRTE_PAIR, // 1.0/sqrt(fp) approx (Espresso accuracy)
PPCREC_IML_OP_FPR_NEGATIVE_ABS_BOTTOM, // -abs(fp0)
PPCREC_IML_OP_FPR_ASSIGN,
PPCREC_IML_OP_FPR_LOAD_ONE, // load constant 1.0 into register
PPCREC_IML_OP_FPR_ADD,
PPCREC_IML_OP_FPR_SUB,
PPCREC_IML_OP_FPR_MULTIPLY,
PPCREC_IML_OP_FPR_DIVIDE,
PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64, // expand f32 to f64 in-place
PPCREC_IML_OP_FPR_NEGATE,
PPCREC_IML_OP_FPR_ABS, // abs(fpr)
PPCREC_IML_OP_FPR_NEGATIVE_ABS, // -abs(fpr)
PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM, // round 64bit double to 64bit double with 32bit float precision (in bottom half of xmm register)
PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_PAIR, // round two 64bit doubles to 64bit double with 32bit float precision
PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT,
PPCREC_IML_OP_FPR_BOTTOM_FCTIWZ,
PPCREC_IML_OP_FPR_SELECT_BOTTOM, // selectively copy bottom value from operand B or C based on value in operand A
PPCREC_IML_OP_FPR_SELECT_PAIR, // selectively copy top/bottom from operand B or C based on value in top/bottom of operand A
// PS
PPCREC_IML_OP_FPR_SUM0,
PPCREC_IML_OP_FPR_SUM1,
// R_R_R only
// R_R_S32 only
PPCREC_IML_OP_FPR_FCTIWZ,
PPCREC_IML_OP_FPR_SELECT, // selectively copy bottom value from operand B or C based on value in operand A
// Conversion (FPR_R_R)
PPCREC_IML_OP_FPR_INT_TO_FLOAT, // convert integer value in gpr to floating point value in fpr
PPCREC_IML_OP_FPR_FLOAT_TO_INT, // convert floating point value in fpr to integer value in gpr
// R_R_R + R_R_S32
PPCREC_IML_OP_ADD, // also R_R_R_CARRY
@ -275,7 +251,7 @@ enum // IMLName
PPCREC_NAME_TEMPORARY = 1000,
PPCREC_NAME_R0 = 2000,
PPCREC_NAME_SPR0 = 3000,
PPCREC_NAME_FPR0 = 4000,
PPCREC_NAME_FPR_HALF = 4800, // Counts PS0 and PS1 separately. E.g. fp3.ps1 is at offset 3 * 2 + 1
PPCREC_NAME_TEMPORARY_FPR0 = 5000, // 0 to 7
PPCREC_NAME_XER_CA = 6000, // carry bit from XER
PPCREC_NAME_XER_OV = 6001, // overflow bit from XER
@ -291,39 +267,14 @@ enum // IMLName
enum
{
// fpr load
PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0,
PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1,
PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0,
PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0,
PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0,
PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_S16_PS0,
PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_U16_PS0,
PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_S8_PS0,
PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_U8_PS0,
PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1,
PPCREC_FPR_LD_MODE_SINGLE,
PPCREC_FPR_LD_MODE_DOUBLE,
// fpr store
PPCREC_FPR_ST_MODE_SINGLE_FROM_PS0, // store 1 single precision float from ps0
PPCREC_FPR_ST_MODE_DOUBLE_FROM_PS0, // store 1 double precision float from ps0
PPCREC_FPR_ST_MODE_SINGLE,
PPCREC_FPR_ST_MODE_DOUBLE,
PPCREC_FPR_ST_MODE_UI32_FROM_PS0, // store raw low-32bit of PS0
PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0,
PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0,
PPCREC_FPR_ST_MODE_PSQ_S8_PS0,
PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_U8_PS0,
PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_U16_PS0,
PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_S16_PS0,
PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1,
};
struct IMLUsedRegisters
@ -463,7 +414,6 @@ struct IMLInstruction
IMLReg registerData;
IMLReg registerMem;
IMLReg registerMem2;
IMLReg registerGQR;
uint8 copyWidth;
struct
{
@ -471,7 +421,7 @@ struct IMLInstruction
bool signExtend : 1;
bool notExpanded : 1; // for floats
}flags2;
uint8 mode; // transfer mode (copy width, ps0/ps1 behavior)
uint8 mode; // transfer mode
sint32 immS32;
}op_storeLoad;
struct
@ -752,6 +702,56 @@ struct IMLInstruction
this->op_call_imm.regReturn = regReturn;
}
// FPR
// load from memory
void make_fpr_r_memory(IMLReg registerDestination, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian)
{
this->type = PPCREC_IML_TYPE_FPR_LOAD;
this->operation = 0;
this->op_storeLoad.registerData = registerDestination;
this->op_storeLoad.registerMem = registerMemory;
this->op_storeLoad.immS32 = immS32;
this->op_storeLoad.mode = mode;
this->op_storeLoad.flags2.swapEndian = switchEndian;
}
void make_fpr_r_memory_indexed(IMLReg registerDestination, IMLReg registerMemory1, IMLReg registerMemory2, uint32 mode, bool switchEndian)
{
this->type = PPCREC_IML_TYPE_FPR_LOAD_INDEXED;
this->operation = 0;
this->op_storeLoad.registerData = registerDestination;
this->op_storeLoad.registerMem = registerMemory1;
this->op_storeLoad.registerMem2 = registerMemory2;
this->op_storeLoad.immS32 = 0;
this->op_storeLoad.mode = mode;
this->op_storeLoad.flags2.swapEndian = switchEndian;
}
// store to memory
void make_fpr_memory_r(IMLReg registerSource, IMLReg registerMemory, sint32 immS32, uint32 mode, bool switchEndian)
{
this->type = PPCREC_IML_TYPE_FPR_STORE;
this->operation = 0;
this->op_storeLoad.registerData = registerSource;
this->op_storeLoad.registerMem = registerMemory;
this->op_storeLoad.immS32 = immS32;
this->op_storeLoad.mode = mode;
this->op_storeLoad.flags2.swapEndian = switchEndian;
}
void make_fpr_memory_r_indexed(IMLReg registerSource, IMLReg registerMemory1, IMLReg registerMemory2, sint32 immS32, uint32 mode, bool switchEndian)
{
this->type = PPCREC_IML_TYPE_FPR_STORE_INDEXED;
this->operation = 0;
this->op_storeLoad.registerData = registerSource;
this->op_storeLoad.registerMem = registerMemory1;
this->op_storeLoad.registerMem2 = registerMemory2;
this->op_storeLoad.immS32 = immS32;
this->op_storeLoad.mode = mode;
this->op_storeLoad.flags2.swapEndian = switchEndian;
}
void make_fpr_compare(IMLReg regA, IMLReg regB, IMLReg regR, IMLCondition cond)
{
this->type = PPCREC_IML_TYPE_FPR_COMPARE;
@ -762,6 +762,44 @@ struct IMLInstruction
this->op_fpr_compare.cond = cond;
}
void make_fpr_r(sint32 operation, IMLReg registerResult)
{
// OP (fpr)
this->type = PPCREC_IML_TYPE_FPR_R;
this->operation = operation;
this->op_fpr_r.regR = registerResult;
}
void make_fpr_r_r(sint32 operation, IMLReg registerResult, IMLReg registerOperand, sint32 crRegister=PPC_REC_INVALID_REGISTER)
{
// fpr OP fpr
this->type = PPCREC_IML_TYPE_FPR_R_R;
this->operation = operation;
this->op_fpr_r_r.regR = registerResult;
this->op_fpr_r_r.regA = registerOperand;
}
void make_fpr_r_r_r(sint32 operation, IMLReg registerResult, IMLReg registerOperand1, IMLReg registerOperand2, sint32 crRegister=PPC_REC_INVALID_REGISTER)
{
// fpr = OP (fpr,fpr)
this->type = PPCREC_IML_TYPE_FPR_R_R_R;
this->operation = operation;
this->op_fpr_r_r_r.regR = registerResult;
this->op_fpr_r_r_r.regA = registerOperand1;
this->op_fpr_r_r_r.regB = registerOperand2;
}
void make_fpr_r_r_r_r(sint32 operation, IMLReg registerResult, IMLReg registerOperandA, IMLReg registerOperandB, IMLReg registerOperandC, sint32 crRegister=PPC_REC_INVALID_REGISTER)
{
// fpr = OP (fpr,fpr,fpr)
this->type = PPCREC_IML_TYPE_FPR_R_R_R_R;
this->operation = operation;
this->op_fpr_r_r_r_r.regR = registerResult;
this->op_fpr_r_r_r_r.regA = registerOperandA;
this->op_fpr_r_r_r_r.regB = registerOperandB;
this->op_fpr_r_r_r_r.regC = registerOperandC;
}
/* X86 specific */
void make_x86_eflags_jcc(IMLCondition cond, bool invertedCondition)
{

View file

@ -34,8 +34,8 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
if (imlInstruction->IsSuffixInstruction())
break;
// check if FPR is stored
if ((imlInstruction->type == PPCREC_IML_TYPE_FPR_STORE && imlInstruction->op_storeLoad.mode == PPCREC_FPR_ST_MODE_SINGLE_FROM_PS0) ||
(imlInstruction->type == PPCREC_IML_TYPE_FPR_STORE_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_ST_MODE_SINGLE_FROM_PS0))
if ((imlInstruction->type == PPCREC_IML_TYPE_FPR_STORE && imlInstruction->op_storeLoad.mode == PPCREC_FPR_ST_MODE_SINGLE) ||
(imlInstruction->type == PPCREC_IML_TYPE_FPR_STORE_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_ST_MODE_SINGLE))
{
if (imlInstruction->op_storeLoad.registerData.GetRegID() == fprIndex)
{
@ -73,7 +73,7 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
{
// insert expand instruction after store
IMLInstruction* newExpand = PPCRecompiler_insertInstruction(imlSegment, lastStore);
PPCRecompilerImlGen_generateNewInstruction_fpr_r(ppcImlGenContext, newExpand, PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64, _FPRRegFromID(fprIndex));
newExpand->make_fpr_r(PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64, _FPRRegFromID(fprIndex));
}
}
@ -90,21 +90,23 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
*/
void IMLOptimizer_OptimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext)
{
for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
{
for (sint32 i = 0; i < segIt->imlList.size(); i++)
{
IMLInstruction* imlInstruction = segIt->imlList.data() + i;
if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
{
PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
}
else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
{
PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
}
}
}
cemuLog_logDebugOnce(LogType::Force, "IMLOptimizer_OptimizeDirectFloatCopies(): Currently disabled\n");
return;
// for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
// {
// for (sint32 i = 0; i < segIt->imlList.size(); i++)
// {
// IMLInstruction* imlInstruction = segIt->imlList.data() + i;
// if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
// {
// PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
// }
// else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1)
// {
// PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
// }
// }
// }
}
void PPCRecompiler_optimizeDirectIntegerCopiesScanForward(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, sint32 imlIndexLoad, IMLReg gprReg)
@ -207,133 +209,22 @@ sint32 _getGQRIndexFromRegister(ppcImlGenContext_t* ppcImlGenContext, IMLReg gqr
bool PPCRecompiler_isUGQRValueKnown(ppcImlGenContext_t* ppcImlGenContext, sint32 gqrIndex, uint32& gqrValue)
{
// UGQR 2 to 7 are initialized by the OS and we assume that games won't ever permanently touch those
// todo - hack - replace with more accurate solution
if (gqrIndex == 2)
gqrValue = 0x00040004;
else if (gqrIndex == 3)
gqrValue = 0x00050005;
else if (gqrIndex == 4)
gqrValue = 0x00060006;
else if (gqrIndex == 5)
gqrValue = 0x00070007;
// the default configuration is:
// UGQR0 = 0x00000000
// UGQR2 = 0x00040004
// UGQR3 = 0x00050005
// UGQR4 = 0x00060006
// UGQR5 = 0x00070007
// but games are free to modify UGQR2 to UGQR7 it seems.
// no game modifies UGQR0 so it's safe enough to optimize for the default value
// Ideally we would do some kind of runtime tracking and second recompilation to create fast paths for PSQ_L/PSQ_ST but thats todo
if (gqrIndex == 0)
gqrValue = 0x00000000;
else
return false;
return true;
}
/*
* If value of GQR can be predicted for a given PSQ load or store instruction then replace it with an optimized version
*/
void PPCRecompiler_optimizePSQLoadAndStore(ppcImlGenContext_t* ppcImlGenContext)
{
for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
{
for(IMLInstruction& instIt : segIt->imlList)
{
if (instIt.type == PPCREC_IML_TYPE_FPR_LOAD || instIt.type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED)
{
if(instIt.op_storeLoad.mode != PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0 &&
instIt.op_storeLoad.mode != PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1 )
continue;
// get GQR value
cemu_assert_debug(instIt.op_storeLoad.registerGQR.IsValid());
sint32 gqrIndex = _getGQRIndexFromRegister(ppcImlGenContext, instIt.op_storeLoad.registerGQR);
cemu_assert(gqrIndex >= 0);
if (ppcImlGenContext->tracking.modifiesGQR[gqrIndex])
continue;
uint32 gqrValue;
if (!PPCRecompiler_isUGQRValueKnown(ppcImlGenContext, gqrIndex, gqrValue))
continue;
uint32 formatType = (gqrValue >> 16) & 7;
uint32 scale = (gqrValue >> 24) & 0x3F;
if (scale != 0)
continue; // only generic handler supports scale
if (instIt.op_storeLoad.mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0)
{
if (formatType == 0)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0;
else if (formatType == 4)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_U8_PS0;
else if (formatType == 5)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_U16_PS0;
else if (formatType == 6)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_S8_PS0;
else if (formatType == 7)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_S16_PS0;
if (instIt.op_storeLoad.mode != PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0)
instIt.op_storeLoad.registerGQR = IMLREG_INVALID;
}
else if (instIt.op_storeLoad.mode == PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1)
{
if (formatType == 0)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1;
else if (formatType == 4)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1;
else if (formatType == 5)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1;
else if (formatType == 6)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1;
else if (formatType == 7)
instIt.op_storeLoad.mode = PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1;
if (instIt.op_storeLoad.mode != PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1)
instIt.op_storeLoad.registerGQR = IMLREG_INVALID;
}
}
else if (instIt.type == PPCREC_IML_TYPE_FPR_STORE || instIt.type == PPCREC_IML_TYPE_FPR_STORE_INDEXED)
{
if(instIt.op_storeLoad.mode != PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0 &&
instIt.op_storeLoad.mode != PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1)
continue;
// get GQR value
cemu_assert_debug(instIt.op_storeLoad.registerGQR.IsValid());
sint32 gqrIndex = _getGQRIndexFromRegister(ppcImlGenContext, instIt.op_storeLoad.registerGQR);
cemu_assert(gqrIndex >= 0 && gqrIndex < 8);
if (ppcImlGenContext->tracking.modifiesGQR[gqrIndex])
continue;
uint32 gqrValue;
if(!PPCRecompiler_isUGQRValueKnown(ppcImlGenContext, gqrIndex, gqrValue))
continue;
uint32 formatType = (gqrValue >> 16) & 7;
uint32 scale = (gqrValue >> 24) & 0x3F;
if (scale != 0)
continue; // only generic handler supports scale
if (instIt.op_storeLoad.mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0)
{
if (formatType == 0)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0;
else if (formatType == 4)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_U8_PS0;
else if (formatType == 5)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_U16_PS0;
else if (formatType == 6)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_S8_PS0;
else if (formatType == 7)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_S16_PS0;
if (instIt.op_storeLoad.mode != PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0)
instIt.op_storeLoad.registerGQR = IMLREG_INVALID;
}
else if (instIt.op_storeLoad.mode == PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1)
{
if (formatType == 0)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1;
else if (formatType == 4)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1;
else if (formatType == 5)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1;
else if (formatType == 6)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1;
else if (formatType == 7)
instIt.op_storeLoad.mode = PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1;
if (instIt.op_storeLoad.mode != PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1)
instIt.op_storeLoad.registerGQR = IMLREG_INVALID;
}
}
}
}
}
// analyses register dependencies across the entire function
// per segment this will generate information about which registers need to be preserved and which ones don't (e.g. are overwritten)
class IMLOptimizerRegIOAnalysis

View file

@ -2093,7 +2093,10 @@ void IMLRA_GenerateSegmentMoveInstructions2(IMLRegisterAllocatorContext& ctx, IM
cemu_assert_debug(hadSuffixInstruction == imlSegment->HasSuffixInstruction());
if (imlSegment->HasSuffixInstruction())
{
cemu_assert_debug(!currentRange); // currentRange should be NULL?
if (currentRange)
{
cemuLog_logDebug(LogType::Force, "[DEBUG] GenerateSegmentMoveInstructions() hit suffix path with non-null currentRange. Segment: {:08x}", imlSegment->ppcAddress);
}
for (auto& remainingRange : activeRanges)
{
cemu_assert_debug(!remainingRange->hasStore);

View file

@ -311,10 +311,7 @@ bool PPCRecompiler_ApplyIMLPasses(ppcImlGenContext_t& ppcImlGenContext)
// this simplifies logic during register allocation
PPCRecompilerIML_isolateEnterableSegments(&ppcImlGenContext);
// if GQRs can be predicted, optimize PSQ load/stores
PPCRecompiler_optimizePSQLoadAndStore(&ppcImlGenContext);
// merge certain float load+store patterns (must happen before FPR register remapping)
// merge certain float load+store patterns
IMLOptimizer_OptimizeDirectFloatCopies(&ppcImlGenContext);
// delay byte swapping for certain load+store patterns
IMLOptimizer_OptimizeDirectIntegerCopies(&ppcImlGenContext);

View file

@ -14,34 +14,20 @@ void PPCRecompilerIml_insertSegments(ppcImlGenContext_t* ppcImlGenContext, sint3
void PPCRecompilerIml_setSegmentPoint(IMLSegmentPoint* segmentPoint, IMLSegment* imlSegment, sint32 index);
void PPCRecompilerIml_removeSegmentPoint(IMLSegmentPoint* segmentPoint);
// GPR register management
IMLReg PPCRecompilerImlGen_loadRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName);
// Register management
IMLReg PPCRecompilerImlGen_LookupReg(ppcImlGenContext_t* ppcImlGenContext, IMLName mappedName, IMLRegFormat regFormat);
// FPR register management
IMLReg PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName, bool loadNew = false);
IMLReg PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName);
IMLReg PPCRecompilerImlGen_loadRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName);
// IML instruction generation
void PPCRecompilerImlGen_generateNewInstruction_conditional_r_s32(ppcImlGenContext_t* ppcImlGenContext, IMLInstruction* imlInstruction, uint32 operation, IMLReg registerIndex, sint32 immS32, uint32 crRegisterIndex, uint32 crBitIndex, bool bitMustBeSet);
void PPCRecompilerImlGen_generateNewInstruction_fpr_r(ppcImlGenContext_t* ppcImlGenContext, IMLInstruction* imlInstruction, sint32 operation, IMLReg registerResult);
// IML generation - FPU
bool PPCRecompilerImlGen_LFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFS_LFSU_LFD_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool withUpdate, bool isDouble);
bool PPCRecompilerImlGen_LFSX_LFSUX_LFDX_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool withUpdate, bool isDouble);
bool PPCRecompilerImlGen_STFS_STFSU_STFD_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool withUpdate, bool isDouble);
bool PPCRecompilerImlGen_STFSX_STFSUX_STFDX_STFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool hasUpdate, bool isDouble);
bool PPCRecompilerImlGen_STFIWX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FMUL(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
@ -67,22 +53,17 @@ bool PPCRecompilerImlGen_FNEG(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
bool PPCRecompilerImlGen_FSEL(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FRSQRTE(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FCTIWZ(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PSQ_LU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PSQ_ST(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PSQ_STU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MULS0(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MULS1(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MADDS0(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MADDS1(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool withUpdate);
bool PPCRecompilerImlGen_PSQ_ST(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool withUpdate);
bool PPCRecompilerImlGen_PS_MULSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool isVariant1);
bool PPCRecompilerImlGen_PS_MADDSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool isVariant1);
bool PPCRecompilerImlGen_PS_ADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_SUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MUL(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_DIV(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_NMADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_NMSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode, bool withNegative);
bool PPCRecompilerImlGen_PS_SUM0(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_SUM1(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_NEG(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
@ -102,3 +83,19 @@ bool PPCRecompilerImlGen_PS_CMPU1(ppcImlGenContext_t* ppcImlGenContext, uint32 o
// IML general
void PPCRecompilerIML_isolateEnterableSegments(ppcImlGenContext_t* ppcImlGenContext);
void PPCIMLGen_CreateSegmentBranchedPath(ppcImlGenContext_t& ppcImlGenContext, PPCBasicBlockInfo& basicBlockInfo, const std::function<void(ppcImlGenContext_t&)>& genSegmentBranchTaken, const std::function<void(ppcImlGenContext_t&)>& genSegmentBranchNotTaken);
void PPCIMLGen_CreateSegmentBranchedPath(ppcImlGenContext_t& ppcImlGenContext, PPCBasicBlockInfo& basicBlockInfo, const std::function<void(ppcImlGenContext_t&)>& genSegmentBranchNotTaken); // no else segment
void PPCIMLGen_CreateSegmentBranchedPathMultiple(ppcImlGenContext_t& ppcImlGenContext, PPCBasicBlockInfo& basicBlockInfo, IMLSegment** segmentsOut, IMLReg compareReg, sint32* compareValues, sint32 count, sint32 defaultCaseIndex);
class IMLRedirectInstOutput
{
public:
IMLRedirectInstOutput(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* outputSegment);
~IMLRedirectInstOutput();
private:
ppcImlGenContext_t* m_context;
IMLSegment* m_prevSegment;
};

View file

@ -87,8 +87,7 @@ void PPCRecompilerImlGen_generateNewInstruction_memory_r_indexed(ppcImlGenContex
}
// create and fill two segments (branch taken and branch not taken) as a follow up to the current segment and then merge flow afterwards
template<typename F1n, typename F2n>
void PPCIMLGen_CreateSegmentBranchedPath(ppcImlGenContext_t& ppcImlGenContext, PPCBasicBlockInfo& basicBlockInfo, F1n genSegmentBranchTaken, F2n genSegmentBranchNotTaken)
void PPCIMLGen_CreateSegmentBranchedPath(ppcImlGenContext_t& ppcImlGenContext, PPCBasicBlockInfo& basicBlockInfo, const std::function<void(ppcImlGenContext_t&)>& genSegmentBranchTaken, const std::function<void(ppcImlGenContext_t&)>& genSegmentBranchNotTaken)
{
IMLSegment* currentWriteSegment = basicBlockInfo.GetSegmentForInstructionAppend();
@ -118,6 +117,122 @@ void PPCIMLGen_CreateSegmentBranchedPath(ppcImlGenContext_t& ppcImlGenContext, P
basicBlockInfo.appendSegment = segMerge;
}
void PPCIMLGen_CreateSegmentBranchedPath(ppcImlGenContext_t& ppcImlGenContext, PPCBasicBlockInfo& basicBlockInfo, const std::function<void(ppcImlGenContext_t&)>& genSegmentBranchNotTaken)
{
IMLSegment* currentWriteSegment = basicBlockInfo.GetSegmentForInstructionAppend();
std::span<IMLSegment*> segments = ppcImlGenContext.InsertSegments(ppcImlGenContext.GetSegmentIndex(currentWriteSegment) + 1, 2);
IMLSegment* segBranchNotTaken = segments[0];
IMLSegment* segMerge = segments[1];
// link the segments
segMerge->SetLinkBranchTaken(currentWriteSegment->GetBranchTaken());
segMerge->SetLinkBranchNotTaken(currentWriteSegment->GetBranchNotTaken());
currentWriteSegment->SetLinkBranchTaken(segMerge);
currentWriteSegment->SetLinkBranchNotTaken(segBranchNotTaken);
segBranchNotTaken->SetLinkBranchNotTaken(segMerge);
// generate code for branch not taken segment
ppcImlGenContext.currentOutputSegment = segBranchNotTaken;
genSegmentBranchNotTaken(ppcImlGenContext);
cemu_assert_debug(ppcImlGenContext.currentOutputSegment == segBranchNotTaken);
// make merge segment the new write segment
ppcImlGenContext.currentOutputSegment = segMerge;
basicBlockInfo.appendSegment = segMerge;
}
IMLReg _GetRegTemporaryS8(ppcImlGenContext_t* ppcImlGenContext, uint32 index);
IMLRedirectInstOutput::IMLRedirectInstOutput(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* outputSegment) : m_context(ppcImlGenContext)
{
m_prevSegment = ppcImlGenContext->currentOutputSegment;
cemu_assert_debug(ppcImlGenContext->currentOutputSegment == ppcImlGenContext->currentBasicBlock->appendSegment);
if (outputSegment == ppcImlGenContext->currentOutputSegment)
{
m_prevSegment = nullptr;
return;
}
m_context->currentBasicBlock->appendSegment = outputSegment;
m_context->currentOutputSegment = outputSegment;
}
IMLRedirectInstOutput::~IMLRedirectInstOutput()
{
if (m_prevSegment)
{
m_context->currentBasicBlock->appendSegment = m_prevSegment;
m_context->currentOutputSegment = m_prevSegment;
}
}
// compare values and branch to segment with same index in segmentsOut. The last segment doesn't actually have any comparison and just is the default case. Thus compareValues is one shorter than count
void PPCIMLGen_CreateSegmentBranchedPathMultiple(ppcImlGenContext_t& ppcImlGenContext, PPCBasicBlockInfo& basicBlockInfo, IMLSegment** segmentsOut, IMLReg compareReg, sint32* compareValues, sint32 count, sint32 defaultCaseIndex)
{
IMLSegment* currentWriteSegment = basicBlockInfo.GetSegmentForInstructionAppend();
cemu_assert_debug(!currentWriteSegment->HasSuffixInstruction()); // must not already have a suffix instruction
const sint32 numBranchSegments = count + 1;
const sint32 numCaseSegments = count;
std::span<IMLSegment*> segments = ppcImlGenContext.InsertSegments(ppcImlGenContext.GetSegmentIndex(currentWriteSegment) + 1, numBranchSegments - 1 + numCaseSegments + 1);
IMLSegment** extraBranchSegments = segments.data();
IMLSegment** caseSegments = segments.data() + numBranchSegments - 1;
IMLSegment* mergeSegment = segments[numBranchSegments - 1 + numCaseSegments];
// move links to the merge segment
mergeSegment->SetLinkBranchTaken(currentWriteSegment->GetBranchTaken());
mergeSegment->SetLinkBranchNotTaken(currentWriteSegment->GetBranchNotTaken());
currentWriteSegment->SetLinkBranchTaken(nullptr);
currentWriteSegment->SetLinkBranchNotTaken(nullptr);
for (sint32 i=0; i<numCaseSegments; i++)
segmentsOut[i] = caseSegments[i];
IMLReg tmpBoolReg = _GetRegTemporaryS8(&ppcImlGenContext, 2);
// the first branch segment is the original current write segment
auto GetBranchSegment = [&](sint32 index) {
if (index == 0)
return currentWriteSegment;
else
return extraBranchSegments[index - 1];
};
// link branch segments (taken: Link to case segment. NotTaken: Link to next branch segment. For the last one use a non-conditional jump)
for (sint32 i=0; i<numBranchSegments; i++)
{
IMLSegment* seg = GetBranchSegment(i);
if (i < numBranchSegments - 1)
{
cemu_assert_debug(i < numCaseSegments);
seg->SetLinkBranchTaken(caseSegments[i]);
seg->SetLinkBranchNotTaken(GetBranchSegment(i + 1));
seg->AppendInstruction()->make_compare_s32(compareReg, compareValues[i], tmpBoolReg, IMLCondition::EQ);
seg->AppendInstruction()->make_conditional_jump(tmpBoolReg, true);
}
else
{
cemu_assert_debug(defaultCaseIndex < numCaseSegments);
seg->SetLinkBranchTaken(caseSegments[defaultCaseIndex]);
seg->AppendInstruction()->make_jump();
}
}
// link case segments
for (sint32 i=0; i<numCaseSegments; i++)
{
IMLSegment* seg = caseSegments[i];
if (i < numCaseSegments - 1)
{
seg->SetLinkBranchTaken(mergeSegment);
// -> Jumps are added after the instructions
}
else
{
seg->SetLinkBranchTaken(mergeSegment);
}
}
ppcImlGenContext.currentOutputSegment = mergeSegment;
basicBlockInfo.appendSegment = mergeSegment;
}
IMLReg PPCRecompilerImlGen_LookupReg(ppcImlGenContext_t* ppcImlGenContext, IMLName mappedName, IMLRegFormat regFormat)
{
auto it = ppcImlGenContext->mappedRegs.find(mappedName);
@ -212,32 +327,14 @@ IMLReg _GetRegTemporary(ppcImlGenContext_t* ppcImlGenContext, uint32 index)
return PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_TEMPORARY + index);
}
// get throw-away register. Only valid for the scope of a single translated instruction
// be careful to not collide with manually loaded temporary register
// get throw-away register
// be careful to not collide with other temporary register
IMLReg _GetRegTemporaryS8(ppcImlGenContext_t* ppcImlGenContext, uint32 index)
{
cemu_assert_debug(index < 4);
return PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_TEMPORARY + index);
}
/*
* Loads a PPC fpr into any of the available IML FPU registers
* If loadNew is false, it will check first if the fpr is already loaded into any IML register
*/
IMLReg PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName, bool loadNew)
{
return PPCRecompilerImlGen_LookupReg(ppcImlGenContext, mappedName, IMLRegFormat::F64);
}
/*
* Checks if a PPC fpr register is already loaded into any IML register
* If not, it will create a new undefined temporary IML FPU register and map the name (effectively overwriting the old ppc register)
*/
IMLReg PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName)
{
return PPCRecompilerImlGen_LookupReg(ppcImlGenContext, mappedName, IMLRegFormat::F64);
}
bool PPCRecompiler_canInlineFunction(MPTR functionPtr, sint32* functionInstructionCount)
{
for (sint32 i = 0; i < 6; i++)
@ -1050,15 +1147,15 @@ bool PPCRecompilerImlGen_SRAW(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
// load masked shift factor into temporary register
ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, regTmpShiftAmount, regB, 0x3F);
ppcImlGenContext->emitInst().make_compare_s32(regTmpShiftAmount, 32, regTmpCondBool, IMLCondition::UNSIGNED_GT);
ppcImlGenContext->emitInst().make_compare_s32(regTmpShiftAmount, 31, regTmpCondBool, IMLCondition::UNSIGNED_GT);
ppcImlGenContext->emitInst().make_conditional_jump(regTmpCondBool, true);
PPCIMLGen_CreateSegmentBranchedPath(*ppcImlGenContext, *ppcImlGenContext->currentBasicBlock,
[&](ppcImlGenContext_t& genCtx)
{
/* branch taken */
genCtx.emitInst().make_r_r_r(PPCREC_IML_OP_RIGHT_SHIFT_S, regA, regS, regTmpShiftAmount);
genCtx.emitInst().make_compare_s32(regA, 0, regCarry, IMLCondition::NEQ); // if the sign bit is still set it also means it was shifted out and we can set carry
/* branch taken, shift size 32 or above */
genCtx.emitInst().make_r_r_s32(PPCREC_IML_OP_RIGHT_SHIFT_S, regA, regS, 31); // shift the sign bit into all the bits
genCtx.emitInst().make_compare_s32(regA, 0, regCarry, IMLCondition::NEQ);
},
[&](ppcImlGenContext_t& genCtx)
{
@ -1073,6 +1170,8 @@ bool PPCRecompilerImlGen_SRAW(ppcImlGenContext_t* ppcImlGenContext, uint32 opcod
genCtx.emitInst().make_r_r_r(PPCREC_IML_OP_RIGHT_SHIFT_S, regA, regS, regTmpShiftAmount);
}
);
if (opcode & PPC_OPC_RC)
PPCImlGen_UpdateCR0(ppcImlGenContext, regA);
return true;
}
@ -1909,23 +2008,23 @@ bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 12: // multiply scalar
if (PPCRecompilerImlGen_PS_MULS0(ppcImlGenContext, opcode) == false)
case 12: // PS_MULS0
if (PPCRecompilerImlGen_PS_MULSX(ppcImlGenContext, opcode, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 13: // multiply scalar
if (PPCRecompilerImlGen_PS_MULS1(ppcImlGenContext, opcode) == false)
case 13: // PS_MULS1
if (PPCRecompilerImlGen_PS_MULSX(ppcImlGenContext, opcode, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 14: // multiply add scalar
if (PPCRecompilerImlGen_PS_MADDS0(ppcImlGenContext, opcode) == false)
case 14: // PS_MADDS0
if (PPCRecompilerImlGen_PS_MADDSX(ppcImlGenContext, opcode, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 15: // multiply add scalar
if (PPCRecompilerImlGen_PS_MADDS1(ppcImlGenContext, opcode) == false)
case 15: // PS_MADDS1
if (PPCRecompilerImlGen_PS_MADDSX(ppcImlGenContext, opcode, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
@ -1992,22 +2091,22 @@ bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 28: // multiply sub paired
if (PPCRecompilerImlGen_PS_MSUB(ppcImlGenContext, opcode) == false)
case 28: // PS_MSUB
if (PPCRecompilerImlGen_PS_MSUB(ppcImlGenContext, opcode, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 29: // multiply add paired
case 29: // PS_MADD
if (PPCRecompilerImlGen_PS_MADD(ppcImlGenContext, opcode) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 30: // negative multiply sub paired
if (PPCRecompilerImlGen_PS_NMSUB(ppcImlGenContext, opcode) == false)
case 30: // PS_NMSUB
if (PPCRecompilerImlGen_PS_MSUB(ppcImlGenContext, opcode, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 31: // negative multiply add paired
case 31: // PS_NMADD
if (PPCRecompilerImlGen_PS_NMADD(ppcImlGenContext, opcode) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
@ -2339,8 +2438,8 @@ bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
case 534: // LWBRX
PPCRecompilerImlGen_LOAD_INDEXED(ppcImlGenContext, opcode, 32, false, false, false);
break;
case 535:
if (PPCRecompilerImlGen_LFSX(ppcImlGenContext, opcode) == false)
case 535: // LFSX
if (PPCRecompilerImlGen_LFSX_LFSUX_LFDX_LFDUX(ppcImlGenContext, opcode, false, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
@ -2348,8 +2447,8 @@ bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
if (PPCRecompilerImlGen_SRW(ppcImlGenContext, opcode) == false)
unsupportedInstructionFound = true;
break;
case 567:
if (PPCRecompilerImlGen_LFSUX(ppcImlGenContext, opcode) == false)
case 567: // LFSUX
if (PPCRecompilerImlGen_LFSX_LFSUX_LFDX_LFDUX(ppcImlGenContext, opcode, true, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
@ -2360,13 +2459,13 @@ bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
case 598:
PPCRecompilerImlGen_SYNC(ppcImlGenContext, opcode);
break;
case 599:
if (PPCRecompilerImlGen_LFDX(ppcImlGenContext, opcode) == false)
case 599: // LFDX
if (PPCRecompilerImlGen_LFSX_LFSUX_LFDX_LFDUX(ppcImlGenContext, opcode, false, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 631:
if (PPCRecompilerImlGen_LFDUX(ppcImlGenContext, opcode) == false)
case 631: // LFDUX
if (PPCRecompilerImlGen_LFSX_LFSUX_LFDX_LFDUX(ppcImlGenContext, opcode, true, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
@ -2374,20 +2473,24 @@ bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
if (!PPCRecompilerImlGen_STORE_INDEXED(ppcImlGenContext, opcode, 32, false, false))
unsupportedInstructionFound = true;
break;
case 663:
if (PPCRecompilerImlGen_STFSX(ppcImlGenContext, opcode) == false)
case 663: // STFSX
if (PPCRecompilerImlGen_STFSX_STFSUX_STFDX_STFDUX(ppcImlGenContext, opcode, false, false) == false)
unsupportedInstructionFound = true;
break;
case 695:
if (PPCRecompilerImlGen_STFSUX(ppcImlGenContext, opcode) == false)
case 695: // STFSUX
if (PPCRecompilerImlGen_STFSX_STFSUX_STFDX_STFDUX(ppcImlGenContext, opcode, true, false) == false)
unsupportedInstructionFound = true;
break;
case 725:
if (PPCRecompilerImlGen_STSWI(ppcImlGenContext, opcode) == false)
unsupportedInstructionFound = true;
break;
case 727:
if (PPCRecompilerImlGen_STFDX(ppcImlGenContext, opcode) == false)
case 727: // STFDX
if (PPCRecompilerImlGen_STFSX_STFSUX_STFDX_STFDUX(ppcImlGenContext, opcode, false, true) == false)
unsupportedInstructionFound = true;
break;
case 759: // STFDUX
if (PPCRecompilerImlGen_STFSX_STFSUX_STFDX_STFDUX(ppcImlGenContext, opcode, true, true) == false)
unsupportedInstructionFound = true;
break;
case 790: // LHBRX
@ -2488,53 +2591,53 @@ bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
case 47:
PPCRecompilerImlGen_STMW(ppcImlGenContext, opcode);
break;
case 48:
if (PPCRecompilerImlGen_LFS(ppcImlGenContext, opcode) == false)
case 48: // LFS
if (PPCRecompilerImlGen_LFS_LFSU_LFD_LFDU(ppcImlGenContext, opcode, false, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 49:
if (PPCRecompilerImlGen_LFSU(ppcImlGenContext, opcode) == false)
case 49: // LFSU
if (PPCRecompilerImlGen_LFS_LFSU_LFD_LFDU(ppcImlGenContext, opcode, true, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 50:
if (PPCRecompilerImlGen_LFD(ppcImlGenContext, opcode) == false)
case 50: // LFD
if (PPCRecompilerImlGen_LFS_LFSU_LFD_LFDU(ppcImlGenContext, opcode, false, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 51:
if (PPCRecompilerImlGen_LFDU(ppcImlGenContext, opcode) == false)
case 51: // LFDU
if (PPCRecompilerImlGen_LFS_LFSU_LFD_LFDU(ppcImlGenContext, opcode, true, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 52:
if (PPCRecompilerImlGen_STFS(ppcImlGenContext, opcode) == false)
case 52: // STFS
if (PPCRecompilerImlGen_STFS_STFSU_STFD_STFDU(ppcImlGenContext, opcode, false, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 53:
if (PPCRecompilerImlGen_STFSU(ppcImlGenContext, opcode) == false)
case 53: // STFSU
if (PPCRecompilerImlGen_STFS_STFSU_STFD_STFDU(ppcImlGenContext, opcode, true, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 54:
if (PPCRecompilerImlGen_STFD(ppcImlGenContext, opcode) == false)
case 54: // STFD
if (PPCRecompilerImlGen_STFS_STFSU_STFD_STFDU(ppcImlGenContext, opcode, false, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 55:
if (PPCRecompilerImlGen_STFDU(ppcImlGenContext, opcode) == false)
case 55: // STFDU
if (PPCRecompilerImlGen_STFS_STFSU_STFD_STFDU(ppcImlGenContext, opcode, true, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 56:
if (PPCRecompilerImlGen_PSQ_L(ppcImlGenContext, opcode) == false)
if (PPCRecompilerImlGen_PSQ_L(ppcImlGenContext, opcode, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 57:
if (PPCRecompilerImlGen_PSQ_LU(ppcImlGenContext, opcode) == false)
if (PPCRecompilerImlGen_PSQ_L(ppcImlGenContext, opcode, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
@ -2587,12 +2690,12 @@ bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
}
break;
case 60:
if (PPCRecompilerImlGen_PSQ_ST(ppcImlGenContext, opcode) == false)
if (PPCRecompilerImlGen_PSQ_ST(ppcImlGenContext, opcode, false) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
case 61:
if (PPCRecompilerImlGen_PSQ_STU(ppcImlGenContext, opcode) == false)
if (PPCRecompilerImlGen_PSQ_ST(ppcImlGenContext, opcode, true) == false)
unsupportedInstructionFound = true;
ppcImlGenContext->hasFPUInstruction = true;
break;
@ -2702,7 +2805,6 @@ bool PPCRecompiler_decodePPCInstruction(ppcImlGenContext_t* ppcImlGenContext)
}
// returns false if code flow is not interrupted
// continueDefaultPath: Controls if
bool PPCRecompiler_CheckIfInstructionEndsSegment(PPCFunctionBoundaryTracker& boundaryTracker, uint32 instructionAddress, uint32 opcode, bool& makeNextInstEnterable, bool& continueDefaultPath, bool& hasBranchTarget, uint32& branchTarget)
{
hasBranchTarget = false;

File diff suppressed because it is too large Load diff

View file

@ -1,53 +0,0 @@
project(CemuAsm C)
if (CMAKE_OSX_ARCHITECTURES)
set(CEMU_ASM_ARCHITECTURE ${CMAKE_OSX_ARCHITECTURES})
else()
set(CEMU_ASM_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR})
endif()
if (CEMU_ASM_ARCHITECTURE MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
if (WIN32)
enable_language(C ASM_MASM)
add_library(CemuAsm x64util_masm.asm)
set_source_files_properties(x64util_masm.asm PROPERTIES LANGUAGE ASM_MASM)
# workaround for cr flag being passed to LINK.exe which considers it an input file and thus fails
# doesn't always seem to happen. The Windows CI builds were fine, but locally I would run into this problem
# possibly related to https://gitlab.kitware.com/cmake/cmake/-/issues/18889
set(CMAKE_ASM_MASM_CREATE_STATIC_LIBRARY "<CMAKE_AR> /OUT:<TARGET> <LINK_FLAGS> <OBJECTS>")
set_property(TARGET CemuAsm PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
else()
# NASM
if (APPLE)
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f macho64 --prefix _ -o <OBJECT> <SOURCE>")
else()
set(CMAKE_ASM_NASM_COMPILE_OBJECT "<CMAKE_ASM_NASM_COMPILER> -g -Fdwarf -f elf64 -o <OBJECT> <SOURCE>")
endif()
set(CMAKE_ASM_NASM_LINK_EXECUTABLE "ld <FLAGS> <CMAKE_ASM_NASM_LINK_FLAGS> <LINK_FLAGS> -fPIC <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
enable_language(C ASM_NASM)
add_library(CemuAsm x64util_nasm.asm)
set_source_files_properties(x64util_nasm.asm PROPERTIES LANGUAGE ASM_NASM)
if (APPLE)
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT macho64)
else()
set_target_properties(CemuAsm PROPERTIES NASM_OBJ_FORMAT elf64)
endif()
set_target_properties(CemuAsm PROPERTIES LINKER_LANGUAGE C)
endif()
elseif(CEMU_ASM_ARCHITECTURE MATCHES "(aarch64)|(AARCH64)|(arm64)|(ARM64)")
add_library(CemuAsm stub.cpp)
else()
message(STATUS "CemuAsm - Unsupported arch: ${CEMU_ASM_ARCHITECTURE}")
endif()

View file

@ -1 +0,0 @@

View file

@ -1,20 +0,0 @@
#pragma once
#if defined(ARCH_X86_64)
extern "C" void recompiler_fres();
extern "C" void recompiler_frsqrte();
#else
// stubbed on non-x86 for now
static void recompiler_fres()
{
cemu_assert_unimplemented();
}
static void recompiler_frsqrte()
{
cemu_assert_unimplemented();
}
#endif

View file

@ -1,233 +0,0 @@
.code
recompiler_fres PROC
; store all modified registers
push rdx
push rcx
push rax
push r8
lea r8,[asmFresLookupTable]
movq rdx, xmm15
mov rcx,rdx
shr rcx,2Fh
mov rax,rdx
and ecx,1Fh
shr rax,25h
and eax,3FFh
imul eax,dword ptr [r8+rcx*8+4]
mov r8d,dword ptr [r8+rcx*8]
mov rcx,rdx
shr rcx,34h
inc eax
shr eax,1
sub r8d,eax
and ecx,7FFh
jne fres_espresso_label3
mov rax,7FF0000000000000h
or rdx,rax
movq xmm15, rdx
pop r8
pop rax
pop rcx
pop rdx
ret
fres_espresso_label3:
cmp ecx,7FFh
jne fres_espresso_label4
mov rax,0FFFFFFFFFFFFFh
test rax,rdx
jne fres_espresso_label1
test rdx,rdx
jns fres_espresso_label2
mov rax,8000000000000000h
movq xmm15, rax
pop r8
pop rax
pop rcx
pop rdx
ret
fres_espresso_label2:
xorps xmm15,xmm15
pop r8
pop rax
pop rcx
pop rdx
ret
fres_espresso_label4:
mov eax,7FDh
sub eax,ecx
mov ecx,eax
mov rax,8000000000000000h
and rdx,rax
shl rcx,34h
mov eax,r8d
or rcx,rdx
shl rax,1Dh
add rcx,rax
movq xmm15, rcx
fres_espresso_label1:
pop r8
pop rax
pop rcx
pop rdx
ret
recompiler_fres ENDP
asmFresLookupTable:
DD 07ff800h, 03e1h
DD 0783800h, 03a7h
DD 070ea00h, 0371h
DD 06a0800h, 0340h
DD 0638800h, 0313h
DD 05d6200h, 02eah
DD 0579000h, 02c4h
DD 0520800h, 02a0h
DD 04cc800h, 027fh
DD 047ca00h, 0261h
DD 0430800h, 0245h
DD 03e8000h, 022ah
DD 03a2c00h, 0212h
DD 0360800h, 01fbh
DD 0321400h, 01e5h
DD 02e4a00h, 01d1h
DD 02aa800h, 01beh
DD 0272c00h, 01ach
DD 023d600h, 019bh
DD 0209e00h, 018bh
DD 01d8800h, 017ch
DD 01a9000h, 016eh
DD 017ae00h, 015bh
DD 014f800h, 015bh
DD 0124400h, 0143h
DD 0fbe00h, 0143h
DD 0d3800h, 012dh
DD 0ade00h, 012dh
DD 088400h, 011ah
DD 065000h, 011ah
DD 041c00h, 0108h
DD 020c00h, 0106h
recompiler_frsqrte PROC
; store all modified registers
push rdx
push rcx
push rax
push r8
push r9
movq r8, xmm15
mov rax,7FFFFFFFFFFFFFFFh
test rax,r8
jne frsqrte_espresso_label1
mov rax,0FFF0000000000000h
and r8,rax
mov rax,7FF0000000000000h
or r8,rax
movq xmm15, r8
pop r9
pop r8
pop rax
pop rcx
pop rdx
ret
frsqrte_espresso_label1:
mov r9,r8
shr r9,34h
and r9d,7FFh
cmp r9d,7FFh
jne frsqrte_espresso_label2
mov rax,0FFFFFFFFFFFFFh
test rax,r8
jne frsqrte_espresso_label3
test r8,r8
js frsqrte_espresso_label4
xorps xmm15,xmm15
pop r9
pop r8
pop rax
pop rcx
pop rdx
ret
frsqrte_espresso_label2:
test r8,r8
jns frsqrte_espresso_label5
frsqrte_espresso_label4:
mov rax,7FF8000000000000h
movq xmm15, rax
pop r9
pop r8
pop rax
pop rcx
pop rdx
ret
frsqrte_espresso_label5:
lea rdx,[asmFrsqrteLookupTable]
mov rax,r8
shr rax,30h
mov rcx,r8
shr rcx,25h
and eax,1Fh
and ecx,7FFh
imul ecx,dword ptr [rdx+rax*8+4]
mov eax,dword ptr [rdx+rax*8]
sub eax,ecx
lea ecx,[r9-3FDh]
shr ecx,1
movsxd rdx,eax
mov eax,3FFh
sub eax,ecx
shl rdx,1Ah
mov ecx,eax
mov rax,8000000000000000h
and r8,rax
shl rcx,34h
or rcx,r8
add rdx,rcx
movq xmm15, rdx
frsqrte_espresso_label3:
pop r9
pop r8
pop rax
pop rcx
pop rdx
ret
recompiler_frsqrte ENDP
asmFrsqrteLookupTable:
DD 01a7e800h, 0568h
DD 017cb800h, 04f3h
DD 01552800h, 048dh
DD 0130c000h, 0435h
DD 010f2000h, 03e7h
DD 0eff000h, 03a2h
DD 0d2e000h, 0365h
DD 0b7c000h, 032eh
DD 09e5000h, 02fch
DD 0867000h, 02d0h
DD 06ff000h, 02a8h
DD 05ab800h, 0283h
DD 046a000h, 0261h
DD 0339800h, 0243h
DD 0218800h, 0226h
DD 0105800h, 020bh
DD 03ffa000h, 07a4h
DD 03c29000h, 0700h
DD 038aa000h, 0670h
DD 03572000h, 05f2h
DD 03279000h, 0584h
DD 02fb7000h, 0524h
DD 02d26000h, 04cch
DD 02ac0000h, 047eh
DD 02881000h, 043ah
DD 02665000h, 03fah
DD 02468000h, 03c2h
DD 02287000h, 038eh
DD 020c1000h, 035eh
DD 01f12000h, 0332h
DD 01d79000h, 030ah
DD 01bf4000h, 02e6h
END

View file

@ -1,237 +0,0 @@
DEFAULT REL
SECTION .text
global udiv128
global recompiler_fres
global recompiler_frsqrte
udiv128:
mov rax, rcx
div r8
mov [r9], rdx
ret
recompiler_fres:
; store all modified registers
push rdx
push rcx
push rax
push r8
lea r8,[asmFresLookupTable]
movq rdx, xmm15
mov rcx,rdx
shr rcx,2Fh
mov rax,rdx
and ecx,1Fh
shr rax,25h
and eax,3FFh
imul eax,dword [r8+rcx*8+4]
mov r8d,dword [r8+rcx*8]
mov rcx,rdx
shr rcx,34h
inc eax
shr eax,1
sub r8d,eax
and ecx,7FFh
jne fres_espresso_label3
mov rax,7FF0000000000000h
or rdx,rax
movq xmm15, rdx
pop r8
pop rax
pop rcx
pop rdx
ret
fres_espresso_label3:
cmp ecx,7FFh
jne fres_espresso_label4
mov rax,0FFFFFFFFFFFFFh
test rax,rdx
jne fres_espresso_label1
test rdx,rdx
jns fres_espresso_label2
mov rax,8000000000000000h
movq xmm15, rax
pop r8
pop rax
pop rcx
pop rdx
ret
fres_espresso_label2:
xorps xmm15,xmm15
pop r8
pop rax
pop rcx
pop rdx
ret
fres_espresso_label4:
mov eax,7FDh
sub eax,ecx
mov ecx,eax
mov rax,8000000000000000h
and rdx,rax
shl rcx,34h
mov eax,r8d
or rcx,rdx
shl rax,1Dh
add rcx,rax
movq xmm15, rcx
fres_espresso_label1:
pop r8
pop rax
pop rcx
pop rdx
ret
asmFresLookupTable:
DD 07ff800h, 03e1h
DD 0783800h, 03a7h
DD 070ea00h, 0371h
DD 06a0800h, 0340h
DD 0638800h, 0313h
DD 05d6200h, 02eah
DD 0579000h, 02c4h
DD 0520800h, 02a0h
DD 04cc800h, 027fh
DD 047ca00h, 0261h
DD 0430800h, 0245h
DD 03e8000h, 022ah
DD 03a2c00h, 0212h
DD 0360800h, 01fbh
DD 0321400h, 01e5h
DD 02e4a00h, 01d1h
DD 02aa800h, 01beh
DD 0272c00h, 01ach
DD 023d600h, 019bh
DD 0209e00h, 018bh
DD 01d8800h, 017ch
DD 01a9000h, 016eh
DD 017ae00h, 015bh
DD 014f800h, 015bh
DD 0124400h, 0143h
DD 0fbe00h, 0143h
DD 0d3800h, 012dh
DD 0ade00h, 012dh
DD 088400h, 011ah
DD 065000h, 011ah
DD 041c00h, 0108h
DD 020c00h, 0106h
recompiler_frsqrte:
; store all modified registers
push rdx
push rcx
push rax
push r8
push r9
movq r8, xmm15
mov rax,7FFFFFFFFFFFFFFFh
test rax,r8
jne frsqrte_espresso_label1
mov rax,0FFF0000000000000h
and r8,rax
mov rax,7FF0000000000000h
or r8,rax
movq xmm15, r8
pop r9
pop r8
pop rax
pop rcx
pop rdx
ret
frsqrte_espresso_label1:
mov r9,r8
shr r9,34h
and r9d,7FFh
cmp r9d,7FFh
jne frsqrte_espresso_label2
mov rax,0FFFFFFFFFFFFFh
test rax,r8
jne frsqrte_espresso_label3
test r8,r8
js frsqrte_espresso_label4
xorps xmm15,xmm15
pop r9
pop r8
pop rax
pop rcx
pop rdx
ret
frsqrte_espresso_label2:
test r8,r8
jns frsqrte_espresso_label5
frsqrte_espresso_label4:
mov rax,7FF8000000000000h
movq xmm15, rax
pop r9
pop r8
pop rax
pop rcx
pop rdx
ret
frsqrte_espresso_label5:
lea rdx,[asmFrsqrteLookupTable]
mov rax,r8
shr rax,30h
mov rcx,r8
shr rcx,25h
and eax,1Fh
and ecx,7FFh
imul ecx,dword [rdx+rax*8+4]
mov eax,dword [rdx+rax*8]
sub eax,ecx
lea ecx,[r9-3FDh]
shr ecx,1
movsxd rdx,eax
mov eax,3FFh
sub eax,ecx
shl rdx,1Ah
mov ecx,eax
mov rax,8000000000000000h
and r8,rax
shl rcx,34h
or rcx,r8
add rdx,rcx
movq xmm15, rdx
frsqrte_espresso_label3:
pop r9
pop r8
pop rax
pop rcx
pop rdx
ret
asmFrsqrteLookupTable:
DD 01a7e800h, 0568h
DD 017cb800h, 04f3h
DD 01552800h, 048dh
DD 0130c000h, 0435h
DD 010f2000h, 03e7h
DD 0eff000h, 03a2h
DD 0d2e000h, 0365h
DD 0b7c000h, 032eh
DD 09e5000h, 02fch
DD 0867000h, 02d0h
DD 06ff000h, 02a8h
DD 05ab800h, 0283h
DD 046a000h, 0261h
DD 0339800h, 0243h
DD 0218800h, 0226h
DD 0105800h, 020bh
DD 03ffa000h, 07a4h
DD 03c29000h, 0700h
DD 038aa000h, 0670h
DD 03572000h, 05f2h
DD 03279000h, 0584h
DD 02fb7000h, 0524h
DD 02d26000h, 04cch
DD 02ac0000h, 047eh
DD 02881000h, 043ah
DD 02665000h, 03fah
DD 02468000h, 03c2h
DD 02287000h, 038eh
DD 020c1000h, 035eh
DD 01f12000h, 0332h
DD 01d79000h, 030ah
DD 01bf4000h, 02e6h