diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp index f736c2a7..cd269869 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLDebug.cpp @@ -36,6 +36,30 @@ const char* IMLDebug_GetOpcodeName(const IMLInstruction* iml) return "MULS"; else if (op == PPCREC_IML_OP_DIVIDE_SIGNED) return "DIVS"; + else if (op == PPCREC_IML_OP_FPR_ASSIGN) + return "FMOV"; + else if (op == PPCREC_IML_OP_FPR_ADD) + return "FADD"; + else if (op == PPCREC_IML_OP_FPR_SUB) + return "FSUB"; + else if (op == PPCREC_IML_OP_FPR_MULTIPLY) + return "FMUL"; + else if (op == PPCREC_IML_OP_FPR_DIVIDE) + return "FDIV"; + else if (op == PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64) + return "F32TOF64"; + else if (op == PPCREC_IML_OP_FPR_ABS) + return "FABS"; + else if (op == PPCREC_IML_OP_FPR_NEGATE) + return "FNEG"; + else if (op == PPCREC_IML_OP_FPR_NEGATIVE_ABS) + return "FNABS"; + else if (op == PPCREC_IML_OP_FPR_FLOAT_TO_INT) + return "F2I"; + else if (op == PPCREC_IML_OP_FPR_INT_TO_FLOAT) + return "I2F"; + else if (op == PPCREC_IML_OP_FPR_BITCAST_INT_TO_FLOAT) + return "BITMOVE"; sprintf(_tempOpcodename, "OP0%02x_T%d", iml->operation, iml->type); return _tempOpcodename; @@ -409,19 +433,24 @@ void IMLDebug_DisassembleInstruction(const IMLInstruction& inst, std::string& di strOutput.addFmt("{} [t{}+{}]", inst.op_storeLoad.copyWidth / 8, inst.op_storeLoad.registerMem.GetRegID(), inst.op_storeLoad.immS32); strOutput.addFmt(" = {} mode {}", IMLDebug_GetRegName(inst.op_storeLoad.registerData), inst.op_storeLoad.mode); } + else if (inst.type == PPCREC_IML_TYPE_FPR_R) + { + strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst)); + strOutput.addFmt("{}", IMLDebug_GetRegName(inst.op_fpr_r.regR)); + } else if (inst.type == PPCREC_IML_TYPE_FPR_R_R) { - strOutput.addFmt("{:>6} ", IMLDebug_GetOpcodeName(&inst)); + strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst)); strOutput.addFmt("{}, {}", 
IMLDebug_GetRegName(inst.op_fpr_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r.regA)); } else if (inst.type == PPCREC_IML_TYPE_FPR_R_R_R_R) { - strOutput.addFmt("{:>6} ", IMLDebug_GetOpcodeName(&inst)); + strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst)); strOutput.addFmt("{}, {}, {}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regA), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regB), IMLDebug_GetRegName(inst.op_fpr_r_r_r_r.regC)); } else if (inst.type == PPCREC_IML_TYPE_FPR_R_R_R) { - strOutput.addFmt("{:>6} ", IMLDebug_GetOpcodeName(&inst)); + strOutput.addFmt("{:<6} ", IMLDebug_GetOpcodeName(&inst)); strOutput.addFmt("{}, {}, {}", IMLDebug_GetRegName(inst.op_fpr_r_r_r.regR), IMLDebug_GetRegName(inst.op_fpr_r_r_r.regA), IMLDebug_GetRegName(inst.op_fpr_r_r_r.regB)); } else if (inst.type == PPCREC_IML_TYPE_CJUMP_CYCLE_CHECK) diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLOptimizer.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLOptimizer.cpp index d0348e5a..d5693846 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLOptimizer.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLOptimizer.cpp @@ -23,7 +23,7 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI IMLInstruction* imlInstructionLoad = imlSegment->imlList.data() + imlIndexLoad; if (imlInstructionLoad->op_storeLoad.flags2.notExpanded) return; - + boost::container::static_vector<sint32, 4> trackedMoves; // only track up to 4 copies IMLUsedRegisters registersUsed; sint32 scanRangeEnd = std::min<sint32>(imlIndexLoad + 25, imlSegment->imlList.size()); // don't scan too far (saves performance and also the chances we can merge the load+store become low at high distances) bool foundMatch = false; @@ -54,8 +54,24 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI continue; } } - - // check if FPR is overwritten (we can actually ignore read operations?) + // if the FPR is copied then keep track of it. 
We can expand the copies instead of the original + if (imlInstruction->type == PPCREC_IML_TYPE_FPR_R_R && imlInstruction->operation == PPCREC_IML_OP_FPR_ASSIGN && imlInstruction->op_fpr_r_r.regA.GetRegID() == fprIndex) + { + if (imlInstruction->op_fpr_r_r.regR.GetRegID() == fprIndex) + { + // unexpected no-op + break; + } + if (trackedMoves.size() >= trackedMoves.capacity()) + { + // we can't track any more moves, expand here + lastStore = i; + break; + } + trackedMoves.push_back(i); + continue; + } + // check if FPR is overwritten imlInstruction->CheckRegisterUsage(&registersUsed); if (registersUsed.writtenGPR1.IsValidAndSameRegID(fprIndex) || registersUsed.writtenGPR2.IsValidAndSameRegID(fprIndex)) break; @@ -71,6 +87,24 @@ void PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI if (foundMatch) { + // insert expand instructions for each target register of a move + sint32 positionBias = 0; + for (auto& trackedMove : trackedMoves) + { + sint32 realPosition = trackedMove + positionBias; + IMLInstruction* imlMoveInstruction = imlSegment->imlList.data() + realPosition; + if (realPosition >= lastStore) + break; // expand is inserted before this move + else + lastStore++; + + cemu_assert_debug(imlMoveInstruction->type == PPCREC_IML_TYPE_FPR_R_R && imlMoveInstruction->op_fpr_r_r.regA.GetRegID() == fprIndex); + cemu_assert_debug(imlMoveInstruction->op_fpr_r_r.regA.GetRegFormat() == IMLRegFormat::F64); + auto dstReg = imlMoveInstruction->op_fpr_r_r.regR; + IMLInstruction* newExpand = PPCRecompiler_insertInstruction(imlSegment, realPosition+1); // one after the move + newExpand->make_fpr_r(PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64, dstReg); + positionBias++; + } // insert expand instruction after store IMLInstruction* newExpand = PPCRecompiler_insertInstruction(imlSegment, lastStore); newExpand->make_fpr_r(PPCREC_IML_OP_FPR_EXPAND_F32_TO_F64, _FPRRegFromID(fprIndex)); @@ -90,23 +124,21 @@ void 
PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext_t* ppcI */ void IMLOptimizer_OptimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext) { - cemuLog_logDebugOnce(LogType::Force, "IMLOptimizer_OptimizeDirectFloatCopies(): Currently disabled\n"); - return; - // for (IMLSegment* segIt : ppcImlGenContext->segmentList2) - // { - // for (sint32 i = 0; i < segIt->imlList.size(); i++) - // { - // IMLInstruction* imlInstruction = segIt->imlList.data() + i; - // if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1) - // { - // PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData); - // } - // else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1) - // { - // PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData); - // } - // } - // } + for (IMLSegment* segIt : ppcImlGenContext->segmentList2) + { + for (sint32 i = 0; i < segIt->imlList.size(); i++) + { + IMLInstruction* imlInstruction = segIt->imlList.data() + i; + if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE) + { + PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData); + } + else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED && imlInstruction->op_storeLoad.mode == PPCREC_FPR_LD_MODE_SINGLE) + { + PPCRecompiler_optimizeDirectFloatCopiesScanForward(ppcImlGenContext, segIt, i, imlInstruction->op_storeLoad.registerData); + } + } + } } void PPCRecompiler_optimizeDirectIntegerCopiesScanForward(ppcImlGenContext_t* ppcImlGenContext, IMLSegment* imlSegment, sint32 imlIndexLoad, IMLReg gprReg) diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp 
b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp index 0dbc073b..087b90f5 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.cpp @@ -685,45 +685,6 @@ void PPCRecompiler_init() PPCRecompiler_allocateRange(mmuRange_TRAMPOLINE_AREA.getBase(), mmuRange_TRAMPOLINE_AREA.getSize()); PPCRecompiler_allocateRange(mmuRange_CODECAVE.getBase(), mmuRange_CODECAVE.getSize()); - // setup GQR scale tables - - for (uint32 i = 0; i < 32; i++) - { - float a = 1.0f / (float)(1u << i); - float b = 0; - if (i == 0) - b = 4294967296.0f; - else - b = (float)(1u << (32u - i)); - - float ar = (float)(1u << i); - float br = 0; - if (i == 0) - br = 1.0f / 4294967296.0f; - else - br = 1.0f / (float)(1u << (32u - i)); - - ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 0] = a; - ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 1] = 1.0f; - ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 0] = b; - ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f; - - ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 0] = a; - ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 1] = a; - ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 0] = b; - ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 1] = b; - - ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 0] = ar; - ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 1] = 1.0f; - ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 0] = br; - ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f; - - ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 0] = ar; - ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 1] = ar; - ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 0] = br; - ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 1] = br; - } - PPCRecompiler_initPlatform(); cemuLog_log(LogType::Force, "Recompiler initialized"); diff 
--git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h index 706855d4..47902630 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompiler.h @@ -136,11 +136,6 @@ typedef struct alignas(16) float _x64XMM_constFloatMin[2]; alignas(16) uint32 _x64XMM_flushDenormalMask1[4]; alignas(16) uint32 _x64XMM_flushDenormalMaskResetSignBits[4]; - // PSQ load/store scale tables - double _psq_ld_scale_ps0_ps1[64 * 2]; - double _psq_ld_scale_ps0_1[64 * 2]; - double _psq_st_scale_ps0_ps1[64 * 2]; - double _psq_st_scale_ps0_1[64 * 2]; // MXCSR uint32 _x64XMM_mxCsr_ftzOn; uint32 _x64XMM_mxCsr_ftzOff;