PPCRec: Reenable float copy optimization
Some checks failed
Build check / build (push) Waiting to run
Generate translation template / generate-pot (push) Failing after 1s

This commit is contained in:
Exzap 2025-05-09 02:06:08 +02:00
parent 557aff4024
commit ba09daf328
4 changed files with 84 additions and 67 deletions

View file

@ -36,6 +36,30 @@ const char* IMLDebug_GetOpcodeName(const IMLInstruction* iml)
return "MULS"; return "MULS";
else if (op == PPCREC_IML_OP_DIVIDE_SIGNED) else if (op == PPCREC_IML_OP_DIVIDE_SIGNED)
return "DIVS"; return "DIVS";
else if (op == PPCREC_IML_OP_FPR_ASSIGN)
return "FMOV";
else if (op == PPCREC_IML_OP_FPR_ADD)
return "FADD";
else if (op == PPCREC_IML_OP_FPR_SUB)
return "FSUB";
else if (op == PPCREC_IML_OP_FPR_MULTIPLY)
return "FMUL";
else if (op == PPCREC_IML_OP_FPR_DIVIDE)
return "FDIV";
else if (op == PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64)
return "F32TOF64";
else if (op == PPCREC_IML_OP_FPR_ABS)
return "FABS";
else if (op == PPCREC_IML_OP_FPR_NEGATE)
return "FNEG";
else if (op == PPCREC_IML_OP_FPR_NEGATIVE_ABS)
return "FNABS";
else if (op == PPCREC_IML_OP_FPR_FLOAT_TO_INT)
return "F2I";
else if (op == PPCREC_IML_OP_FPR_INT_TO_FLOAT)
return "I2F";
else if (op == PPCREC_IML_OP_FPR_BITCAST_INT_TO_FLOAT)
return "BITMOVE";
sprintf(_tempOpcodename, "OP0%02x_T%d", iml->operation, iml->type); sprintf(_tempOpcodename, "OP0%02x_T%d", iml->operation, iml->type);
return _tempOpcodename; return _tempOpcodename;
@ -409,19 +433,24 @@ void IMLDebug_DisassembleInstruction(const IMLInstruction& inst, std::string& di
strOutput.addFmt("{} [t{}+{}]", inst.op_storeLoad.copyWidth / 8, inst.op_storeLoad.registerMem.GetRegID(), inst.op_storeLoad.immS32); strOutput.addFmt("{} [t{}+{}]", inst.op_storeLoad.copyWidth / 8, inst.op_storeLoad.registerMem.GetRegID(), inst.op_storeLoad.immS32);
strOutput.addFmt(" = {} mode {}", IMLDebug_GetRegName(inst.op_storeLoad.registerData), inst.op_storeLoad.mode); strOutput.addFmt(" = {} mode {}", IMLDebug_GetRegName(inst.op_storeLoad.registerData), inst.op_storeLoad.mode);
} }
else if (inst.type == PPCREC_IML_TYPE_FPR_R)
{
strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst));
strOutput.addFmt("{}", IMLDebug_GetRegName(inst.op_fpr_r.regR));
}
else if (inst.type == PPCREC_IML_TYPE_FPR_R_R) else if (inst.type == PPCREC_IML_TYPE_FPR_R_R)
{ {
strOutput.addFmt("{:>6} ", IMLDebug_GetOpcodeName(&inst)); strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst));
strOutput.addFmt("{}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r.regA)); strOutput.addFmt("{}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r.regA));
} }
else if (inst.type == PPCREC_IML_TYPE_FPR_R_R_R_R) else if (inst.type == PPCREC_IML_TYPE_FPR_R_R_R_R)
{ {
strOutput.addFmt("{:>6} ", IMLDebug_GetOpcodeName(&inst)); strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst));
strOutput.addFmt("{}, {}, {}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regA), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regB), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regC)); strOutput.addFmt("{}, {}, {}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regA), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regB), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regC));
} }
else if (inst.type == PPCREC_IML_TYPE_FPR_R_R_R) else if (inst.type == PPCREC_IML_TYPE_FPR_R_R_R)
{ {
strOutput.addFmt("{:>6} ", IMLDebug_GetOpcodeName(&inst)); strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst));
strOutput.addFmt("{}, {}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r_r.regA), IMLDebug_GetRegName(inst.op_fpr_r_r_r.regB)); strOutput.addFmt("{}, {}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r_r.regA), IMLDebug_GetRegName(inst.op_fpr_r_r_r.regB));
} }
else if (inst.type == PPCREC_IML_TYPE_CJUMP_CYCLE_CHECK) else if (inst.type == PPCREC_IML_TYPE_CJUMP_CYCLE_CHECK)

View file

@ -23,7 +23,7 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
IMLInstruction* imlInstructionLoad = imlSegment->imlList.data() + imlIndexLoad; IMLInstruction* imlInstructionLoad = imlSegment->imlList.data() + imlIndexLoad;
if (imlInstructionLoad->op_storeLoad.flags2.notExpanded) if (imlInstructionLoad->op_storeLoad.flags2.notExpanded)
return; return;
boost::container::static_vector<sint32, 4> trackedMoves; // only track up to 4 copies
IMLUsedRegisters registersUsed; IMLUsedRegisters registersUsed;
sint32 scanRangeEnd = std::min<sint32>(imlIndexLoad + 25, imlSegment->imlList.size()); // don't scan too far (saves performance and also the chances we can merge the load+store become low at high distances) sint32 scanRangeEnd = std::min<sint32>(imlIndexLoad + 25, imlSegment->imlList.size()); // don't scan too far (saves performance and also the chances we can merge the load+store become low at high distances)
bool foundMatch = false; bool foundMatch = false;
@ -54,8 +54,24 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
continue; continue;
} }
} }
// if the FPR is copied then keep track of it. We can expand the copies instead of the original
// check if FPR is overwritten (we can actually ignore read operations?) if (imlInstruction->type == PPCREC_IML_TYPE_FPR_R_R && imlInstruction->operation == PPCREC_IML_OP_FPR_ASSIGN && imlInstruction->op_fpr_r_r.regA.GetRegID() == fprIndex)
{
if (imlInstruction->op_fpr_r_r.regR.GetRegID() == fprIndex)
{
// unexpected no-op
break;
}
if (trackedMoves.size() >= trackedMoves.capacity())
{
// we can't track any more moves, expand here
lastStore = i;
break;
}
trackedMoves.push_back(i);
continue;
}
// check if FPR is overwritten
imlInstruction->CheckRegisterUsage(&registersUsed); imlInstruction->CheckRegisterUsage(&registersUsed);
if (registersUsed.writtenGPR1.IsValidAndSameRegID(fprIndex) || registersUsed.writtenGPR2.IsValidAndSameRegID(fprIndex)) if (registersUsed.writtenGPR1.IsValidAndSameRegID(fprIndex) || registersUsed.writtenGPR2.IsValidAndSameRegID(fprIndex))
break; break;
@ -71,6 +87,24 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
if (foundMatch) if (foundMatch)
{ {
// insert expand instructions for each target register of a move
sint32 positionBias = 0;
for (auto& trackedMove : trackedMoves)
{
sint32 realPosition = trackedMove + positionBias;
IMLInstruction* imlMoveInstruction = imlSegment->imlList.data() + realPosition;
if (realPosition >= lastStore)
break; // expand is inserted before this move
else
lastStore++;
cemu_assert_debug(imlMoveInstruction->type == PPCREC_IML_TYPE_FPR_R_R && imlMoveInstruction->op_fpr_r_r.regA.GetRegID() == fprIndex);
cemu_assert_debug(imlMoveInstruction->op_fpr_r_r.regA.GetRegFormat() == IMLRegFormat::F64);
auto dstReg = imlMoveInstruction->op_fpr_r_r.regR;
IMLInstruction* newExpand = PPCRecompiler_insertInstruction(imlSegment, realPosition+1); // one after the move
newExpand->make_fpr_r(PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64, dstReg);
positionBias++;
}
// insert expand instruction after store // insert expand instruction after store
IMLInstruction* newExpand = PPCRecompiler_insertInstruction(imlSegment, lastStore); IMLInstruction* newExpand = PPCRecompiler_insertInstruction(imlSegment, lastStore);
newExpand->make_fpr_r(PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64, _FPRRegFromID(fprIndex)); newExpand->make_fpr_r(PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64, _FPRRegFromID(fprIndex));
@ -90,23 +124,21 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI
*/ */
void IMLOptimizer_OptimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext) void IMLOptimizer_OptimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext)
{ {
cemuLog_logDebugOnce(LogType::Force, "IMLOptimizer_OptimizeDirectFloatCopies(): Currently disabled\n"); for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
return; {
// for (IMLSegment* segIt : ppcImlGenContext->segmentList2) for (sint32 i = 0; i < segIt->imlList.size(); i++)
// { {
// for (sint32 i = 0; i < segIt->imlList.size(); i++) IMLInstruction* imlInstruction = segIt->imlList.data() + i;
// { if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE)
// IMLInstruction* imlInstruction = segIt->imlList.data() + i; {
// if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1) PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
// { }
// PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData); else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE)
// } {
// else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1) PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData);
// { }
// PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData); }
// } }
// }
// }
} }
void PPCRecompiler_optimizeDirectIntegerCopiesScanForward(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, sint32 imlIndexLoad, IMLReg gprReg) void PPCRecompiler_optimizeDirectIntegerCopiesScanForward(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, sint32 imlIndexLoad, IMLReg gprReg)

View file

@ -685,45 +685,6 @@ void PPCRecompiler_init()
PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize()); PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize());
PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize()); PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize());
// setup GQR scale tables
for (uint32 i = 0; i < 32; i++)
{
float a = 1.0f / (float)(1u << i);
float b = 0;
if (i == 0)
b = 4294967296.0f;
else
b = (float)(1u << (32u - i));
float ar = (float)(1u << i);
float br = 0;
if (i == 0)
br = 1.0f / 4294967296.0f;
else
br = 1.0f / (float)(1u << (32u - i));
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 0] = a;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 0] = b;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 0] = a;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 1] = a;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 0] = b;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 1] = b;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 0] = ar;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 0] = br;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 0] = ar;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 1] = ar;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 0] = br;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 1] = br;
}
PPCRecompiler_initPlatform(); PPCRecompiler_initPlatform();
cemuLog_log(LogType::Force, "Recompiler initialized"); cemuLog_log(LogType::Force, "Recompiler initialized");

View file

@ -136,11 +136,6 @@ typedef struct
alignas(16) float _x64XMM_constFloatMin[2]; alignas(16) float _x64XMM_constFloatMin[2];
alignas(16) uint32 _x64XMM_flushDenormalMask1[4]; alignas(16) uint32 _x64XMM_flushDenormalMask1[4];
alignas(16) uint32 _x64XMM_flushDenormalMaskResetSignBits[4]; alignas(16) uint32 _x64XMM_flushDenormalMaskResetSignBits[4];
// PSQ load/store scale tables
double _psq_ld_scale_ps0_ps1[64 * 2];
double _psq_ld_scale_ps0_1[64 * 2];
double _psq_st_scale_ps0_ps1[64 * 2];
double _psq_st_scale_ps0_1[64 * 2];
// MXCSR // MXCSR
uint32 _x64XMM_mxCsr_ftzOn; uint32 _x64XMM_mxCsr_ftzOn;
uint32 _x64XMM_mxCsr_ftzOff; uint32 _x64XMM_mxCsr_ftzOff;