From 557aff4024a475b4559be11b850f838ecb1e8896 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Thu, 8 May 2025 08:29:47 +0200 Subject: [PATCH] PPCRec: Implement PSQ scaling --- .../Recompiler/BackendX64/BackendX64FPU.cpp | 19 +++++ .../Recompiler/IML/IMLInstruction.cpp | 3 +- .../Espresso/Recompiler/IML/IMLInstruction.h | 3 + .../Recompiler/PPCRecompilerImlGenFPU.cpp | 77 +++++++++++++++---- 4 files changed, 85 insertions(+), 17 deletions(-) diff --git a/src/Cafe/HW/Espresso/Recompiler/BackendX64/BackendX64FPU.cpp b/src/Cafe/HW/Espresso/Recompiler/BackendX64/BackendX64FPU.cpp index dc07f9d0..6a8b1b97 100644 --- a/src/Cafe/HW/Espresso/Recompiler/BackendX64/BackendX64FPU.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/BackendX64/BackendX64FPU.cpp @@ -241,6 +241,25 @@ void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction x64Gen_cvtsi2sd_xmmReg_xmmReg(x64GenContext, regFpr, regGpr); return; } + else if (imlInstruction->operation == PPCREC_IML_OP_FPR_BITCAST_INT_TO_FLOAT) + { + cemu_assert_debug(imlInstruction->op_fpr_r_r.regR.GetRegFormat() == IMLRegFormat::F64); // assuming target is always F64 for now + cemu_assert_debug(imlInstruction->op_fpr_r_r.regA.GetRegFormat() == IMLRegFormat::I32); // supporting only 32bit floats as input for now + // exact operation depends on size of types. Floats are automatically promoted to double if the target is F64 + uint32 regFpr = _regF64(imlInstruction->op_fpr_r_r.regR); + if (imlInstruction->op_fpr_r_r.regA.GetRegFormat() == IMLRegFormat::I32) + { + uint32 regGpr = _regI32(imlInstruction->op_fpr_r_r.regA); + x64Gen_movq_xmmReg_reg64(x64GenContext, regFpr, regGpr); // using reg32 as reg64 param here is ok. We'll refactor later + // float to double + x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext, regFpr, regFpr); + } + else + { + cemu_assert_unimplemented(); + } + return; + } uint32 regR = _regF64(imlInstruction->op_fpr_r_r.regR); uint32 regA = _regF64(imlInstruction->op_fpr_r_r.regA); diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.cpp b/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.cpp index 60b7c6ca..997de4e9 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.cpp @@ -277,7 +277,8 @@ void IMLInstruction::CheckRegisterUsage(IMLUsedRegisters* registersUsed) const } else if (operation == PPCREC_IML_OP_FPR_FLOAT_TO_INT || - operation == PPCREC_IML_OP_FPR_INT_TO_FLOAT) + operation == PPCREC_IML_OP_FPR_INT_TO_FLOAT || + operation == PPCREC_IML_OP_FPR_BITCAST_INT_TO_FLOAT) { registersUsed->writtenGPR1 = op_fpr_r_r.regR; registersUsed->readGPR1 = op_fpr_r_r.regA; diff --git a/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h b/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h index 3b3898e9..4df2a666 100644 --- a/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h +++ b/src/Cafe/HW/Espresso/Recompiler/IML/IMLInstruction.h @@ -143,6 +143,9 @@ enum PPCREC_IML_OP_FPR_INT_TO_FLOAT, // convert integer value in gpr to floating point value in fpr PPCREC_IML_OP_FPR_FLOAT_TO_INT, // convert floating point value in fpr to integer value in gpr + // Bitcast (FPR_R_R) + PPCREC_IML_OP_FPR_BITCAST_INT_TO_FLOAT, + // R_R_R + R_R_S32 PPCREC_IML_OP_ADD, // also R_R_R_CARRY PPCREC_IML_OP_SUB, diff --git a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGenFPU.cpp b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGenFPU.cpp index 7eb8a4b6..6e602b47 100644 --- a/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGenFPU.cpp +++ b/src/Cafe/HW/Espresso/Recompiler/PPCRecompilerImlGenFPU.cpp @@ -744,7 +744,21 @@ void PPCRecompilerImlGen_ClampInteger(ppcImlGenContext_t* ppcImlGenContext, IMLR ); } -void PPCRecompilerImlGen_EmitPSQLoadCase(ppcImlGenContext_t* ppcImlGenContext, Espresso::PSQ_LOAD_TYPE loadType, bool readPS1, IMLReg gprA, sint32 imm, IMLReg fprDPS0, IMLReg fprDPS1) +void PPCRecompilerIMLGen_GetPSQScale(ppcImlGenContext_t* ppcImlGenContext, IMLReg gqrRegister, IMLReg fprRegScaleOut, bool isLoad) +{ + IMLReg gprTmp2 = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_TEMPORARY + 2); + // extract scale factor and sign extend it + ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_LEFT_SHIFT, gprTmp2, gqrRegister, 32 - ((isLoad ? 24 : 8)+7)); + ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_RIGHT_SHIFT_S, gprTmp2, gprTmp2, (32-23)-7); + ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, gprTmp2, gprTmp2, 0x1FF<<23); + if (isLoad) + ppcImlGenContext->emitInst().make_r_r(PPCREC_IML_OP_NEG, gprTmp2, gprTmp2); + ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_ADD, gprTmp2, gprTmp2, 0x7F<<23); + // gprTmp2 now holds the scale float bits, bitcast to float + ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_BITCAST_INT_TO_FLOAT, fprRegScaleOut, gprTmp2); +} + +void PPCRecompilerImlGen_EmitPSQLoadCase(ppcImlGenContext_t* ppcImlGenContext, sint32 gqrIndex, Espresso::PSQ_LOAD_TYPE loadType, bool readPS1, IMLReg gprA, sint32 imm, IMLReg fprDPS0, IMLReg fprDPS1) { if (loadType == Espresso::PSQ_LOAD_TYPE::TYPE_F32) { @@ -756,26 +770,42 @@ void PPCRecompilerImlGen_EmitPSQLoadCase(ppcImlGenContext_t* ppcImlGenContext, E } if (loadType == Espresso::PSQ_LOAD_TYPE::TYPE_U16 || loadType == Espresso::PSQ_LOAD_TYPE::TYPE_S16) { + // get scale factor + IMLReg gqrRegister = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_SPR0 + SPR_UGQR0 + gqrIndex); + IMLReg fprScaleReg = _GetFPRTemp(ppcImlGenContext, 2); + PPCRecompilerIMLGen_GetPSQScale(ppcImlGenContext, gqrRegister, fprScaleReg, true); + bool isSigned = (loadType == Espresso::PSQ_LOAD_TYPE::TYPE_S16); IMLReg gprTmp = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_TEMPORARY + 0); ppcImlGenContext->emitInst().make_r_memory(gprTmp, gprA, imm, 16, isSigned, true); ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_INT_TO_FLOAT, fprDPS0, gprTmp); + + ppcImlGenContext->emitInst().make_fpr_r_r_r(PPCREC_IML_OP_FPR_MULTIPLY, fprDPS0, fprDPS0, fprScaleReg); + if(readPS1) { ppcImlGenContext->emitInst().make_r_memory(gprTmp, gprA, imm + 2, 16, isSigned, true); ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_INT_TO_FLOAT, fprDPS1, gprTmp); + ppcImlGenContext->emitInst().make_fpr_r_r_r(PPCREC_IML_OP_FPR_MULTIPLY, fprDPS1, fprDPS1, fprScaleReg); } } else if (loadType == Espresso::PSQ_LOAD_TYPE::TYPE_U8 || loadType == Espresso::PSQ_LOAD_TYPE::TYPE_S8) { + // get scale factor + IMLReg gqrRegister = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_SPR0 + SPR_UGQR0 + gqrIndex); + IMLReg fprScaleReg = _GetFPRTemp(ppcImlGenContext, 2); + PPCRecompilerIMLGen_GetPSQScale(ppcImlGenContext, gqrRegister, fprScaleReg, true); + bool isSigned = (loadType == Espresso::PSQ_LOAD_TYPE::TYPE_S8); IMLReg gprTmp = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_TEMPORARY + 0); ppcImlGenContext->emitInst().make_r_memory(gprTmp, gprA, imm, 8, isSigned, true); ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_INT_TO_FLOAT, fprDPS0, gprTmp); + ppcImlGenContext->emitInst().make_fpr_r_r_r(PPCREC_IML_OP_FPR_MULTIPLY, fprDPS0, fprDPS0, fprScaleReg); if(readPS1) { ppcImlGenContext->emitInst().make_r_memory(gprTmp, gprA, imm + 1, 8, isSigned, true); ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_INT_TO_FLOAT, fprDPS1, gprTmp); + ppcImlGenContext->emitInst().make_fpr_r_r_r(PPCREC_IML_OP_FPR_MULTIPLY, fprDPS1, fprDPS1, fprScaleReg); } } } @@ -812,14 +842,15 @@ bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opco IMLReg gqrRegister = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_SPR0 + SPR_UGQR0 + gqrIndex); IMLReg loadTypeReg = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_TEMPORARY + 0); // extract the load type from the GQR register - ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, loadTypeReg, gqrRegister, 0x7); + ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_RIGHT_SHIFT_U, loadTypeReg, gqrRegister, 16); + ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, loadTypeReg, loadTypeReg, 0x7); IMLSegment* caseSegment[6]; sint32 compareValues[6] = {0, 4, 5, 6, 7}; PPCIMLGen_CreateSegmentBranchedPathMultiple(*ppcImlGenContext, *ppcImlGenContext->currentBasicBlock, caseSegment, loadTypeReg, compareValues, 5, 0); for (sint32 i=0; i<5; i++) { IMLRedirectInstOutput outputToCase(ppcImlGenContext, caseSegment[i]); // while this is in scope, instructions go to caseSegment[i] - PPCRecompilerImlGen_EmitPSQLoadCase(ppcImlGenContext, static_cast(compareValues[i]), readPS1, gprA, imm, fprDPS0, fprDPS1); + PPCRecompilerImlGen_EmitPSQLoadCase(ppcImlGenContext, gqrIndex, static_cast(compareValues[i]), readPS1, gprA, imm, fprDPS0, fprDPS1); // create the case jump instructions here because we need to add it last caseSegment[i]->AppendInstruction()->make_jump(); } @@ -839,11 +870,11 @@ bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opco return false; } - PPCRecompilerImlGen_EmitPSQLoadCase(ppcImlGenContext, type, readPS1, gprA, imm, fprDPS0, fprDPS1); + PPCRecompilerImlGen_EmitPSQLoadCase(ppcImlGenContext, gqrIndex, type, readPS1, gprA, imm, fprDPS0, fprDPS1); return true; } -void PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext_t* ppcImlGenContext, Espresso::PSQ_LOAD_TYPE storeType, bool storePS1, IMLReg gprA, sint32 imm, IMLReg fprDPS0, IMLReg fprDPS1) +void PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext_t* ppcImlGenContext, sint32 gqrIndex, Espresso::PSQ_LOAD_TYPE storeType, bool storePS1, IMLReg gprA, sint32 imm, IMLReg fprDPS0, IMLReg fprDPS1) { cemu_assert_debug(!storePS1 || fprDPS1.IsValid()); if (storeType == Espresso::PSQ_LOAD_TYPE::TYPE_F32) @@ -856,10 +887,18 @@ void PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext_t* ppcImlGenContext, } else if (storeType == Espresso::PSQ_LOAD_TYPE::TYPE_U16 || storeType == Espresso::PSQ_LOAD_TYPE::TYPE_S16) { + // get scale factor + IMLReg gqrRegister = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_SPR0 + SPR_UGQR0 + gqrIndex); + IMLReg fprScaleReg = _GetFPRTemp(ppcImlGenContext, 2); + PPCRecompilerIMLGen_GetPSQScale(ppcImlGenContext, gqrRegister, fprScaleReg, false); + bool isSigned = (storeType == Espresso::PSQ_LOAD_TYPE::TYPE_S16); + IMLReg fprTmp = _GetFPRTemp(ppcImlGenContext, 0); + IMLReg gprTmp = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_TEMPORARY + 0); - ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_FLOAT_TO_INT, gprTmp, fprDPS0); - // todo - scaling + ppcImlGenContext->emitInst().make_fpr_r_r_r(PPCREC_IML_OP_FPR_MULTIPLY, fprTmp, fprDPS0, fprScaleReg); + ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_FLOAT_TO_INT, gprTmp, fprTmp); + if (isSigned) PPCRecompilerImlGen_ClampInteger(ppcImlGenContext, gprTmp, -32768, 32767); else @@ -867,8 +906,8 @@ void PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext_t* ppcImlGenContext, ppcImlGenContext->emitInst().make_memory_r(gprTmp, gprA, imm, 16, true); if(storePS1) { - ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_FLOAT_TO_INT, gprTmp, fprDPS1); - // todo - scaling + ppcImlGenContext->emitInst().make_fpr_r_r_r(PPCREC_IML_OP_FPR_MULTIPLY, fprTmp, fprDPS1, fprScaleReg); + ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_FLOAT_TO_INT, gprTmp, fprTmp); if (isSigned) PPCRecompilerImlGen_ClampInteger(ppcImlGenContext, gprTmp, -32768, 32767); else @@ -878,9 +917,16 @@ void PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext_t* ppcImlGenContext, } else if (storeType == Espresso::PSQ_LOAD_TYPE::TYPE_U8 || storeType == Espresso::PSQ_LOAD_TYPE::TYPE_S8) { + // get scale factor + IMLReg gqrRegister = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_SPR0 + SPR_UGQR0 + gqrIndex); + IMLReg fprScaleReg = _GetFPRTemp(ppcImlGenContext, 2); + PPCRecompilerIMLGen_GetPSQScale(ppcImlGenContext, gqrRegister, fprScaleReg, false); + bool isSigned = (storeType == Espresso::PSQ_LOAD_TYPE::TYPE_S8); + IMLReg fprTmp = _GetFPRTemp(ppcImlGenContext, 0); IMLReg gprTmp = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_TEMPORARY + 0); - ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_FLOAT_TO_INT, gprTmp, fprDPS0); + ppcImlGenContext->emitInst().make_fpr_r_r_r(PPCREC_IML_OP_FPR_MULTIPLY, fprTmp, fprDPS0, fprScaleReg); + ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_FLOAT_TO_INT, gprTmp, fprTmp); if (isSigned) PPCRecompilerImlGen_ClampInteger(ppcImlGenContext, gprTmp, -128, 127); else @@ -888,8 +934,8 @@ void PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext_t* ppcImlGenContext, ppcImlGenContext->emitInst().make_memory_r(gprTmp, gprA, imm, 8, true); if(storePS1) { - ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_FLOAT_TO_INT, gprTmp, fprDPS1); - // todo - scaling + ppcImlGenContext->emitInst().make_fpr_r_r_r(PPCREC_IML_OP_FPR_MULTIPLY, fprTmp, fprDPS1, fprScaleReg); + ppcImlGenContext->emitInst().make_fpr_r_r(PPCREC_IML_OP_FPR_FLOAT_TO_INT, gprTmp, fprTmp); if (isSigned) PPCRecompilerImlGen_ClampInteger(ppcImlGenContext, gprTmp, -128, 127); else @@ -928,8 +974,7 @@ bool PPCRecompilerImlGen_PSQ_ST(ppcImlGenContext_t* ppcImlGenContext, uint32 opc IMLReg gqrRegister = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_SPR0 + SPR_UGQR0 + gqrIndex); IMLReg loadTypeReg = PPCRecompilerImlGen_loadRegister(ppcImlGenContext, PPCREC_NAME_TEMPORARY + 0); // extract the load type from the GQR register - ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_RIGHT_SHIFT_U, loadTypeReg, gqrRegister, 16); - ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, loadTypeReg, loadTypeReg, 0x7); + ppcImlGenContext->emitInst().make_r_r_s32(PPCREC_IML_OP_AND, loadTypeReg, gqrRegister, 0x7); IMLSegment* caseSegment[5]; sint32 compareValues[5] = {0, 4, 5, 6, 7}; @@ -937,7 +982,7 @@ bool PPCRecompilerImlGen_PSQ_ST(ppcImlGenContext_t* ppcImlGenContext, uint32 opc for (sint32 i=0; i<5; i++) { IMLRedirectInstOutput outputToCase(ppcImlGenContext, caseSegment[i]); // while this is in scope, instructions go to caseSegment[i] - PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext, static_cast(compareValues[i]), storePS1, gprA, imm, fprDPS0, fprDPS1); + PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext, gqrIndex, static_cast(compareValues[i]), storePS1, gprA, imm, fprDPS0, fprDPS1); ppcImlGenContext->emitInst().make_jump(); // finalize case } return true; @@ -954,7 +999,7 @@ bool PPCRecompilerImlGen_PSQ_ST(ppcImlGenContext_t* ppcImlGenContext, uint32 opc return false; } - PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext, type, storePS1, gprA, imm, fprDPS0, fprDPS1); + PPCRecompilerImlGen_EmitPSQStoreCase(ppcImlGenContext, gqrIndex, type, storePS1, gprA, imm, fprDPS0, fprDPS1); return true; }