mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-07-10 17:01:17 +12:00
Removes the -DPUBLIC_RELEASE flag. Cemu's debug asserts are now only enabled if the build configuration is Debug. Similarly, on Windows the console is only shown for Debug builds.
890 lines
25 KiB
C++
890 lines
25 KiB
C++
#include "Cafe/HW/Latte/ISA/RegDefines.h"
|
|
#include "Cafe/HW/Latte/Core/Latte.h"
|
|
#include "Cafe/HW/Latte/Core/LatteConst.h"
|
|
#include "Cafe/HW/Latte/Core/LatteShaderAssembly.h"
|
|
#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h"
|
|
#include "Cafe/HW/Latte/Core/FetchShader.h"
|
|
|
|
#include "Cafe/HW/Latte/Core/LattePM4.h"
|
|
|
|
#define GPU7_ENDIAN_8IN32 2
|
|
|
|
typedef struct
|
|
{
|
|
union
|
|
{
|
|
float f[4];
|
|
uint32 u32[4];
|
|
sint32 s32[4];
|
|
};
|
|
}LatteReg_t;
|
|
|
|
#define REG_AR (128)
|
|
|
|
typedef struct
|
|
{
|
|
LatteReg_t reg[128+1];
|
|
union
|
|
{
|
|
uint32 u32[5];
|
|
sint32 s32[5];
|
|
float f[5];
|
|
}pvps;
|
|
union
|
|
{
|
|
uint32 u32[5];
|
|
sint32 s32[5];
|
|
float f[5];
|
|
}pvpsUpdate;
|
|
void* cfilePtr;
|
|
LatteReg_t* literalPtr;
|
|
// cbank
|
|
LatteReg_t* cbank0;
|
|
LatteReg_t* cbank1;
|
|
// vertex shader exports
|
|
LatteReg_t export_pos;
|
|
|
|
uint32* shaderBase;
|
|
sint32 shaderSize;
|
|
// shaders
|
|
LatteFetchShader* fetchShader;
|
|
}LatteSoftwareExecContext_t;
|
|
|
|
LatteSoftwareExecContext_t LatteSWCtx;
|
|
|
|
char _tempStringBuf[2048];
|
|
sint32 tempStringIndex = 0;
|
|
|
|
char* getTempString()
|
|
{
|
|
tempStringIndex = (tempStringIndex + 1) % 10;
|
|
return _tempStringBuf + tempStringIndex * (sizeof(_tempStringBuf) / 10);
|
|
}
|
|
|
|
const char* _getSrcName(uint32 srcSel, uint32 srcChan)
|
|
{
|
|
if (GPU7_ALU_SRC_IS_GPR(srcSel))
|
|
{
|
|
char* tempStr = getTempString();
|
|
sprintf(tempStr, "R%d", srcSel & 0x7F);
|
|
if (srcChan == 0)
|
|
strcat(tempStr, ".x");
|
|
else if (srcChan == 1)
|
|
strcat(tempStr, ".y");
|
|
else if (srcChan == 2)
|
|
strcat(tempStr, ".z");
|
|
else
|
|
strcat(tempStr, ".w");
|
|
return tempStr;
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CFILE(srcSel))
|
|
{
|
|
return "CFILE";
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_PV(srcSel))
|
|
{
|
|
return "PV";
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_PS(srcSel))
|
|
{
|
|
return "PS";
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CBANK0(srcSel))
|
|
{
|
|
return "CBANK0";
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CBANK1(srcSel))
|
|
{
|
|
return "CBANK1";
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CONST_0F(srcSel))
|
|
{
|
|
return "0.0";
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CONST_1F(srcSel))
|
|
{
|
|
return "1.0";
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CONST_0_5F(srcSel))
|
|
{
|
|
return "0.5";
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_LITERAL(srcSel))
|
|
{
|
|
return "LITERAL";
|
|
}
|
|
else
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
return "UKN";
|
|
}
|
|
|
|
sint32 _getSrc_genericS32(uint32 srcSel, uint32 srcChan, uint32 srcRel, uint32 indexMode)
|
|
{
|
|
sint32 v = 0;
|
|
if (GPU7_ALU_SRC_IS_GPR(srcSel))
|
|
{
|
|
cemu_assert_debug(srcRel == 0);
|
|
v = LatteSWCtx.reg[GPU7_ALU_SRC_GET_GPR_INDEX(srcSel)].s32[srcChan];
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CFILE(srcSel))
|
|
{
|
|
if (srcRel)
|
|
{
|
|
if (indexMode <= GPU7_INDEX_AR_W)
|
|
{
|
|
v = ((sint32*)LatteSWCtx.cfilePtr)[LatteSWCtx.reg[REG_AR].s32[indexMode] * 4 + GPU7_ALU_SRC_GET_CFILE_INDEX(srcSel) * 4 + srcChan];
|
|
}
|
|
else
|
|
cemu_assert_debug(false);
|
|
}
|
|
else
|
|
{
|
|
v = ((sint32*)LatteSWCtx.cfilePtr)[GPU7_ALU_SRC_GET_CFILE_INDEX(srcSel) * 4 + srcChan];
|
|
}
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_PV(srcSel))
|
|
{
|
|
cemu_assert_debug(srcRel == 0);
|
|
v = LatteSWCtx.pvps.s32[srcChan];
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_PS(srcSel))
|
|
{
|
|
cemu_assert_debug(srcRel == 0);
|
|
v = LatteSWCtx.pvps.s32[4];
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CBANK0(srcSel))
|
|
{
|
|
if (srcRel)
|
|
{
|
|
if (indexMode <= GPU7_INDEX_AR_W)
|
|
{
|
|
v = LatteSWCtx.cbank0[LatteSWCtx.reg[REG_AR].s32[indexMode] + GPU7_ALU_SRC_GET_CBANK0_INDEX(srcSel)].s32[srcChan];
|
|
}
|
|
else
|
|
assert_dbg();
|
|
}
|
|
else
|
|
{
|
|
v = LatteSWCtx.cbank0[GPU7_ALU_SRC_GET_CBANK0_INDEX(srcSel)].s32[srcChan];
|
|
}
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CBANK1(srcSel))
|
|
{
|
|
if (srcRel)
|
|
{
|
|
if (indexMode <= GPU7_INDEX_AR_W)
|
|
{
|
|
v = LatteSWCtx.cbank1[LatteSWCtx.reg[REG_AR].s32[indexMode] + GPU7_ALU_SRC_GET_CBANK1_INDEX(srcSel)].s32[srcChan];
|
|
}
|
|
else
|
|
assert_dbg();
|
|
}
|
|
else
|
|
{
|
|
v = LatteSWCtx.cbank1[GPU7_ALU_SRC_GET_CBANK1_INDEX(srcSel)].s32[srcChan];
|
|
}
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CONST_0F(srcSel))
|
|
{
|
|
cemu_assert_debug(srcRel == 0);
|
|
v = 0; // 0.0f
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CONST_1F(srcSel))
|
|
{
|
|
cemu_assert_debug(srcRel == 0);
|
|
v = 0x3f800000; // 1.0f
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_CONST_0_5F(srcSel))
|
|
{
|
|
cemu_assert_debug(srcRel == 0);
|
|
v = 0x3f000000; // 0.5f
|
|
}
|
|
else if (GPU7_ALU_SRC_IS_LITERAL(srcSel))
|
|
{
|
|
v = LatteSWCtx.literalPtr->s32[srcChan];
|
|
}
|
|
else
|
|
assert_dbg();
|
|
return v;
|
|
}
|
|
|
|
sint32 _getSrc_s32(uint32 srcSel, uint32 srcChan, uint32 srcNeg, uint32 srcAbs, uint32 srcRel, uint32 indexMode)
|
|
{
|
|
sint32 v = _getSrc_genericS32(srcSel, srcChan, srcRel, indexMode);
|
|
cemu_assert_debug(srcNeg == 0);
|
|
cemu_assert_debug(srcAbs == 0);
|
|
return v;
|
|
}
|
|
|
|
float _getSrc_f(uint32 srcSel, uint32 srcChan, uint32 srcNeg, uint32 srcAbs, uint32 srcRel, uint32 indexMode)
|
|
{
|
|
float v = 0;
|
|
*(sint32*)&v = _getSrc_genericS32(srcSel, srcChan, srcRel, indexMode);
|
|
if (srcAbs) // todo - how does this interact with srcNeg
|
|
v = fabs(v);
|
|
if (srcNeg)
|
|
v = -v;
|
|
return v;
|
|
}
|
|
|
|
#define _updateGPR_S32(__gprIdx,__channel,__v) {gprUpdate[updateQueueLength].gprIndex = __gprIdx; gprUpdate[updateQueueLength].channel = __channel; gprUpdate[updateQueueLength].s32 = __v; updateQueueLength++;}
|
|
#define _updateGPR_F(__gprIdx,__channel,__v) {gprUpdate[updateQueueLength].gprIndex = __gprIdx; gprUpdate[updateQueueLength].channel = __channel; gprUpdate[updateQueueLength].f = __v; updateQueueLength++;}
|
|
|
|
#define _updatePVPS_S32(__pvIndex, __v) {LatteSWCtx.pvpsUpdate.s32[__pvIndex] = __v;}
|
|
#define _updatePVPS_F(__pvIndex, __v) {LatteSWCtx.pvpsUpdate.f[__pvIndex] = __v;}
|
|
|
|
float LatteSoftware_omod(uint32 omod, float f)
|
|
{
|
|
switch (omod)
|
|
{
|
|
case ALU_OMOD_NONE:
|
|
return f;
|
|
case ALU_OMOD_MUL2:
|
|
return f * 2.0f;
|
|
case ALU_OMOD_MUL4:
|
|
return f * 4.0f;
|
|
case ALU_OMOD_DIV2:
|
|
return f / 2.0f;
|
|
}
|
|
cemu_assert_debug(false);
|
|
return 0.0f;
|
|
}
|
|
|
|
#ifdef CEMU_DEBUG_ASSERT
|
|
#define _clamp(__v) if(destClamp != 0) cemu_assert_unimplemented()
|
|
#else
|
|
#define _clamp(__v) // todo
|
|
#endif
|
|
|
|
#define _omod(__v) __v = LatteSoftware_omod(omod, __v)
|
|
|
|
bool LatteDecompiler_IsALUTransInstruction(bool isOP3, uint32 opcode);
|
|
|
|
void LatteSoftware_setupCBankPointers(uint32 cBank0Index, uint32 cBank1Index, uint32 cBank0AddrBase, uint32 cBank1AddrBase)
|
|
{
|
|
MPTR cBank0Ptr = LatteGPUState.contextRegister[mmSQ_VTX_UNIFORM_BLOCK_START + cBank0Index * 7 + 0];
|
|
MPTR cBank1Ptr = LatteGPUState.contextRegister[mmSQ_VTX_UNIFORM_BLOCK_START + cBank1Index * 7 + 0];
|
|
LatteSWCtx.cbank0 = (LatteReg_t*)memory_getPointerFromPhysicalOffset(cBank0Ptr + cBank0AddrBase);
|
|
LatteSWCtx.cbank1 = (LatteReg_t*)memory_getPointerFromPhysicalOffset(cBank1Ptr + cBank1AddrBase);
|
|
}
|
|
|
|
void LatteSoftware_executeALUClause(uint32 cfType, uint32 addr, uint32 count, uint32 cBank0Index, uint32 cBank1Index, uint32 cBank0AddrBase, uint32 cBank1AddrBase)
|
|
{
|
|
cemu_assert_debug(cfType == GPU7_CF_INST_ALU); // todo - handle other ALU clauses
|
|
uint32* aluWordPtr = LatteSWCtx.shaderBase + addr * 2;
|
|
uint32* aluWordPtrEnd = aluWordPtr + count * 2;
|
|
|
|
LatteSoftware_setupCBankPointers(cBank0Index, cBank1Index, cBank0AddrBase, cBank1AddrBase);
|
|
|
|
struct
|
|
{
|
|
sint16 gprIndex;
|
|
sint16 channel;
|
|
union
|
|
{
|
|
float f;
|
|
uint32 u32;
|
|
sint32 s32;
|
|
};
|
|
}gprUpdate[16];
|
|
|
|
sint32 updateQueueLength = 0;
|
|
uint32 aluUnitWriteMask = 0;
|
|
uint8 literalAccessMask = 0;
|
|
while (aluWordPtr < aluWordPtrEnd)
|
|
{
|
|
// calculate number of instructions in group
|
|
sint32 groupInstructionCount;
|
|
for (sint32 i = 0; i < 6; i++)
|
|
{
|
|
if (aluWordPtr[i * 2] & 0x80000000)
|
|
{
|
|
groupInstructionCount = i + 1;
|
|
break;
|
|
}
|
|
cemu_assert_debug(i < 5);
|
|
}
|
|
LatteSWCtx.literalPtr = (LatteReg_t*)(aluWordPtr + groupInstructionCount*2);
|
|
// process group
|
|
bool hasReductionInstruction = false;
|
|
float reductionResult = 0.0f;
|
|
for (sint32 s = 0; s < groupInstructionCount; s++)
|
|
{
|
|
uint32 aluWord0 = aluWordPtr[0];
|
|
uint32 aluWord1 = aluWordPtr[1];
|
|
aluWordPtr += 2;
|
|
uint32 alu_inst13_5 = (aluWord1 >> 13) & 0x1F;
|
|
// parameters from ALU word 0 (shared for ALU OP2 and OP3)
|
|
uint32 src0Sel = (aluWord0 >> 0) & 0x1FF; // source selection
|
|
uint32 src1Sel = (aluWord0 >> 13) & 0x1FF;
|
|
uint32 src0Rel = (aluWord0 >> 9) & 0x1; // relative addressing mode
|
|
uint32 src1Rel = (aluWord0 >> 22) & 0x1;
|
|
uint32 src0Chan = (aluWord0 >> 10) & 0x3; // component selection x/y/z/w
|
|
uint32 src1Chan = (aluWord0 >> 23) & 0x3;
|
|
uint32 src0Neg = (aluWord0 >> 12) & 0x1; // negate input
|
|
uint32 src1Neg = (aluWord0 >> 25) & 0x1;
|
|
uint32 indexMode = (aluWord0 >> 26) & 7;
|
|
uint32 predSel = (aluWord0 >> 29) & 3;
|
|
if (GPU7_ALU_SRC_IS_LITERAL(src0Sel))
|
|
literalAccessMask |= (1 << src0Chan);
|
|
if (GPU7_ALU_SRC_IS_LITERAL(src1Sel))
|
|
literalAccessMask |= (1 << src1Chan);
|
|
|
|
if (alu_inst13_5 >= 0x8)
|
|
{
|
|
// op3
|
|
// parameters from ALU word 1
|
|
uint32 src2Sel = (aluWord1 >> 0) & 0x1FF; // source selection
|
|
uint32 src2Rel = (aluWord1 >> 9) & 0x1; // relative addressing mode
|
|
uint32 src2Chan = (aluWord1 >> 10) & 0x3; // component selection x/y/z/w
|
|
uint32 src2Neg = (aluWord1 >> 12) & 0x1; // negate input
|
|
if (GPU7_ALU_SRC_IS_LITERAL(src2Sel))
|
|
literalAccessMask |= (1 << src2Chan);
|
|
|
|
uint32 destGpr = (aluWord1 >> 21) & 0x7F;
|
|
uint32 destRel = (aluWord1 >> 28) & 1;
|
|
uint32 destElem = (aluWord1 >> 29) & 3;
|
|
uint32 destClamp = (aluWord1 >> 31) & 1;
|
|
|
|
uint32 aluUnit = destElem;
|
|
if (aluUnitWriteMask&(1 << destElem))
|
|
{
|
|
aluUnit = 4; // PV
|
|
}
|
|
else
|
|
aluUnitWriteMask |= (1 << destElem);
|
|
|
|
switch (alu_inst13_5)
|
|
{
|
|
case ALU_OP3_INST_CMOVE:
|
|
{
|
|
float f = _getSrc_f(src0Sel, src0Chan, src0Neg, 0, src0Rel, indexMode);
|
|
sint32 result;
|
|
if (f == 0.0f)
|
|
result = _getSrc_s32(src1Sel, src1Chan, src1Neg, 0, src1Rel, indexMode);
|
|
else
|
|
result = _getSrc_s32(src2Sel, src2Chan, src2Neg, 0, src2Rel, indexMode);
|
|
cemu_assert_debug(destClamp == 0);
|
|
_updateGPR_S32(destGpr, destElem, result);
|
|
_updatePVPS_S32(aluUnit, result);
|
|
break;
|
|
}
|
|
case ALU_OP3_INST_CMOVGT:
|
|
{
|
|
float f = _getSrc_f(src0Sel, src0Chan, src0Neg, 0, src0Rel, indexMode);
|
|
sint32 result;
|
|
if (f > 0.0f)
|
|
result = _getSrc_s32(src1Sel, src1Chan, src1Neg, 0, src1Rel, indexMode);
|
|
else
|
|
result = _getSrc_s32(src2Sel, src2Chan, src2Neg, 0, src2Rel, indexMode);
|
|
cemu_assert_debug(destClamp == 0);
|
|
_updateGPR_S32(destGpr, destElem, result);
|
|
_updatePVPS_S32(aluUnit, result);
|
|
break;
|
|
}
|
|
case ALU_OP3_INST_MULADD:
|
|
{
|
|
float f0 = _getSrc_f(src0Sel, src0Chan, src0Neg, 0, src0Rel, indexMode);
|
|
float f1 = _getSrc_f(src1Sel, src1Chan, src1Neg, 0, src1Rel, indexMode);
|
|
float f2 = _getSrc_f(src2Sel, src2Chan, src2Neg, 0, src2Rel, indexMode);
|
|
float f = f0*f1+f2;
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
_updatePVPS_F(aluUnit, f);
|
|
break;
|
|
}
|
|
default:
|
|
cemu_assert_debug(false);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
uint32 alu_inst7_11 = (aluWord1 >> 7) & 0x7FF;
|
|
|
|
uint32 src0Abs = (aluWord1 >> 0) & 1;
|
|
uint32 src1Abs = (aluWord1 >> 1) & 1;
|
|
uint32 updateExecuteMask = (aluWord1 >> 2) & 1;
|
|
uint32 updatePredicate = (aluWord1 >> 3) & 1;
|
|
uint32 writeMask = (aluWord1 >> 4) & 1;
|
|
uint32 omod = (aluWord1 >> 5) & 3;
|
|
|
|
uint32 destGpr = (aluWord1 >> 21) & 0x7F;
|
|
uint32 destRel = (aluWord1 >> 28) & 1;
|
|
uint32 destElem = (aluWord1 >> 29) & 3;
|
|
uint32 destClamp = (aluWord1 >> 31) & 1;
|
|
|
|
uint32 aluUnit = destElem;
|
|
if (LatteDecompiler_IsALUTransInstruction(false, alu_inst7_11))
|
|
aluUnit = 4;
|
|
|
|
if (aluUnitWriteMask&(1 << destElem))
|
|
{
|
|
aluUnit = 4; // PV
|
|
}
|
|
else
|
|
aluUnitWriteMask |= (1 << destElem);
|
|
|
|
switch (alu_inst7_11)
|
|
{
|
|
case ALU_OP2_INST_NOP:
|
|
{
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_MOV:
|
|
{
|
|
if (src0Neg || src0Abs || omod != 0)
|
|
{
|
|
float v = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
_omod(v);
|
|
_clamp(f);
|
|
if (writeMask)
|
|
_updateGPR_F(destGpr, destElem, v);
|
|
_updatePVPS_F(aluUnit, v);
|
|
}
|
|
else
|
|
{
|
|
sint32 v = _getSrc_s32(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
// todo - omod/clamp for float moves
|
|
if (writeMask)
|
|
_updateGPR_S32(destGpr, destElem, v);
|
|
_updatePVPS_S32(aluUnit, v);
|
|
}
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_ADD:
|
|
{
|
|
float f0 = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
float f1 = _getSrc_f(src1Sel, src1Chan, src1Neg, src1Abs, src1Rel, indexMode);
|
|
float f = f0 + f1;
|
|
_omod(f);
|
|
_clamp(f);
|
|
if (writeMask)
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
_updatePVPS_F(aluUnit, f);
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_MUL:
|
|
case ALU_OP2_INST_MUL_IEEE:
|
|
{
|
|
float f0 = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
float f1 = _getSrc_f(src1Sel, src1Chan, src1Neg, src1Abs, src1Rel, indexMode);
|
|
float f = f0 * f1;
|
|
if (f0 == 0.0f || f1 == 0.0f)
|
|
f = 0.0f;
|
|
_omod(f);
|
|
_clamp(f);
|
|
if (writeMask)
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
_updatePVPS_F(aluUnit, f);
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_TRUNC:
|
|
{
|
|
float f0 = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
float f = truncf(f0);
|
|
_omod(f);
|
|
_clamp(f);
|
|
if (writeMask)
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
_updatePVPS_F(aluUnit, f);
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_RECIP_IEEE:
|
|
{
|
|
float f0 = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
float f = 1.0f / f0;
|
|
_omod(f);
|
|
_clamp(f);
|
|
if (writeMask)
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
_updatePVPS_F(aluUnit, f);
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_SETGT:
|
|
{
|
|
float f0 = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
float f1 = _getSrc_f(src1Sel, src1Chan, src1Neg, src1Abs, src1Rel, indexMode);
|
|
float f = (f0 > f1) ? 1.0f : 0.0f;
|
|
_omod(f);
|
|
_clamp(f);
|
|
if (writeMask)
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
_updatePVPS_F(aluUnit, f);
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_SETGE:
|
|
{
|
|
float f0 = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
float f1 = _getSrc_f(src1Sel, src1Chan, src1Neg, src1Abs, src1Rel, indexMode);
|
|
float f = (f0 >= f1) ? 1.0f : 0.0f;
|
|
_omod(f);
|
|
_clamp(f);
|
|
if (writeMask)
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
_updatePVPS_F(aluUnit, f);
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_INT_TO_FLOAT:
|
|
{
|
|
sint32 v = _getSrc_s32(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
float f = (float)v;
|
|
_omod(f);
|
|
_clamp(f);
|
|
if (writeMask)
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
_updatePVPS_F(aluUnit, f);
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_FLT_TO_INT:
|
|
{
|
|
float v = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
sint32 f = (sint32)v;
|
|
if (writeMask)
|
|
_updateGPR_S32(destGpr, destElem, f);
|
|
_updatePVPS_S32(aluUnit, f);
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_MOVA_FLOOR:
|
|
{
|
|
float f = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
f = floor(f);
|
|
f = std::min(std::max(f, -256.0f), 255.0f);
|
|
// omod, clamp?
|
|
_updateGPR_S32(REG_AR, destElem, (sint32)f);
|
|
if (writeMask)
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
_updatePVPS_F(aluUnit, f);
|
|
break;
|
|
}
|
|
case ALU_OP2_INST_DOT4:
|
|
{
|
|
float f0 = _getSrc_f(src0Sel, src0Chan, src0Neg, src0Abs, src0Rel, indexMode);
|
|
float f1 = _getSrc_f(src1Sel, src1Chan, src1Neg, src1Abs, src1Rel, indexMode);
|
|
float f = f0 * f1;
|
|
reductionResult += f;
|
|
_omod(f);
|
|
_clamp(f);
|
|
if (writeMask)
|
|
{
|
|
_updateGPR_F(destGpr, destElem, f);
|
|
}
|
|
_updatePVPS_F(aluUnit, f);
|
|
hasReductionInstruction = true;
|
|
break;
|
|
}
|
|
default:
|
|
cemu_assert_debug(false);
|
|
}
|
|
}
|
|
}
|
|
// apply updates
|
|
if (hasReductionInstruction == false)
|
|
{
|
|
for (sint32 i = 0; i < updateQueueLength; i++)
|
|
{
|
|
LatteSWCtx.reg[gprUpdate[i].gprIndex].s32[gprUpdate[i].channel] = gprUpdate[i].s32;
|
|
}
|
|
LatteSWCtx.pvps.s32[0] = LatteSWCtx.pvpsUpdate.s32[0];
|
|
LatteSWCtx.pvps.s32[1] = LatteSWCtx.pvpsUpdate.s32[1];
|
|
LatteSWCtx.pvps.s32[2] = LatteSWCtx.pvpsUpdate.s32[2];
|
|
LatteSWCtx.pvps.s32[3] = LatteSWCtx.pvpsUpdate.s32[3];
|
|
LatteSWCtx.pvps.s32[4] = LatteSWCtx.pvpsUpdate.s32[4];
|
|
}
|
|
else
|
|
{
|
|
for (sint32 i = 0; i < updateQueueLength; i++)
|
|
{
|
|
LatteSWCtx.reg[gprUpdate[i].gprIndex].f[gprUpdate[i].channel] = reductionResult;
|
|
}
|
|
LatteSWCtx.pvps.f[0] = reductionResult;
|
|
LatteSWCtx.pvps.f[1] = reductionResult;
|
|
LatteSWCtx.pvps.f[2] = reductionResult;
|
|
LatteSWCtx.pvps.f[3] = reductionResult;
|
|
}
|
|
updateQueueLength = 0;
|
|
// skip literals
|
|
if (literalAccessMask&(3 << 0))
|
|
aluWordPtr += 2;
|
|
if (literalAccessMask&(3 << 2))
|
|
{
|
|
cemu_assert_debug((literalAccessMask &3) != 0);
|
|
aluWordPtr += 2;
|
|
}
|
|
// reset group state tracking variables
|
|
aluUnitWriteMask = 0;
|
|
literalAccessMask = 0;
|
|
}
|
|
}
|
|
|
|
sint32 _getRegValueByCompSel(uint32 gprIndex, uint32 compSel)
|
|
{
|
|
if (compSel < 4)
|
|
return LatteSWCtx.reg[gprIndex].s32[compSel];
|
|
cemu_assert_unimplemented();
|
|
return 0;
|
|
}
|
|
|
|
void LatteSoftware_singleRun()
|
|
{
|
|
uint32* cfWords = LatteSWCtx.shaderBase;
|
|
sint32 instructionIndex = 0;
|
|
while (true)
|
|
{
|
|
uint32 cfWord0 = cfWords[instructionIndex + 0];
|
|
uint32 cfWord1 = cfWords[instructionIndex + 1];
|
|
instructionIndex += 2;
|
|
|
|
uint32 cf_inst23_7 = (cfWord1 >> 23) & 0x7F;
|
|
if (cf_inst23_7 < 0x40) // starting at 0x40 the bits overlap with the ALU instruction encoding
|
|
{
|
|
bool isEndOfProgram = ((cfWord1 >> 21) & 1) != 0;
|
|
uint32 addr = cfWord0 & 0xFFFFFFFF;
|
|
uint32 count = (cfWord1 >> 10) & 7;
|
|
if (((cfWord1 >> 19) & 1) != 0)
|
|
count |= 0x8;
|
|
count++;
|
|
if (cf_inst23_7 == GPU7_CF_INST_CALL_FS)
|
|
{
|
|
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_NOP)
|
|
{
|
|
// nop
|
|
if (((cfWord1 >> 0) & 7) != 0)
|
|
cemu_assert_debug(false); // pop count is not zero
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_EXPORT || cf_inst23_7 == GPU7_CF_INST_EXPORT_DONE)
|
|
{
|
|
// export
|
|
uint32 edType = (cfWord0 >> 13) & 0x3;
|
|
uint32 edIndexGpr = (cfWord0 >> 23) & 0x7F;
|
|
uint32 edRWRel = (cfWord0 >> 22) & 1;
|
|
if (edRWRel != 0 || edIndexGpr != 0)
|
|
cemu_assert_debug(false);
|
|
|
|
uint8 exportComponentSel[4];
|
|
exportComponentSel[0] = (cfWord1 >> 0) & 0x7;
|
|
exportComponentSel[1] = (cfWord1 >> 3) & 0x7;
|
|
exportComponentSel[2] = (cfWord1 >> 6) & 0x7;
|
|
exportComponentSel[3] = (cfWord1 >> 9) & 0x7;
|
|
uint32 exportArrayBase = (cfWord0 >> 0) & 0x1FFF;
|
|
uint32 exportBurstCount = (cfWord1 >> 17) & 0xF;
|
|
uint32 exportSourceGPR = (cfWord0 >> 15) & 0x7F;
|
|
uint32 memWriteElemSize = (cfWord0>>29)&3; // unused
|
|
|
|
cemu_assert_debug(exportBurstCount == 0);
|
|
if (edType == 1 && exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_BASE_POSITION)
|
|
{
|
|
LatteSWCtx.export_pos.s32[0] = _getRegValueByCompSel(exportSourceGPR, exportComponentSel[0]);
|
|
LatteSWCtx.export_pos.s32[1] = _getRegValueByCompSel(exportSourceGPR, exportComponentSel[1]);
|
|
LatteSWCtx.export_pos.s32[2] = _getRegValueByCompSel(exportSourceGPR, exportComponentSel[2]);
|
|
LatteSWCtx.export_pos.s32[3] = _getRegValueByCompSel(exportSourceGPR, exportComponentSel[3]);
|
|
}
|
|
else
|
|
{
|
|
// unhandled export
|
|
cemu_assert_unimplemented();
|
|
}
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_TEX)
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_ELSE ||
|
|
cf_inst23_7 == GPU7_CF_INST_POP)
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_JUMP)
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_LOOP_START_DX10 || cf_inst23_7 == GPU7_CF_INST_LOOP_END)
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_LOOP_BREAK)
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_MEM_STREAM0_WRITE ||
|
|
cf_inst23_7 == GPU7_CF_INST_MEM_STREAM1_WRITE)
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_MEM_RING_WRITE)
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
else if (cf_inst23_7 == GPU7_CF_INST_EMIT_VERTEX)
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
else
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
if (isEndOfProgram)
|
|
{
|
|
return;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// ALU instruction
|
|
uint32 cf_inst26_4 = ((cfWord1 >> 26) & 0xF) | GPU7_CF_INST_ALU_MASK;
|
|
if (cf_inst26_4 == GPU7_CF_INST_ALU || cf_inst26_4 == GPU7_CF_INST_ALU_PUSH_BEFORE || cf_inst26_4 == GPU7_CF_INST_ALU_POP_AFTER || cf_inst26_4 == GPU7_CF_INST_ALU_POP2_AFTER || cf_inst26_4 == GPU7_CF_INST_ALU_BREAK || cf_inst26_4 == GPU7_CF_INST_ALU_ELSE_AFTER)
|
|
{
|
|
uint32 addr = (cfWord0 >> 0) & 0x3FFFFF;
|
|
uint32 count = ((cfWord1 >> 18) & 0x7F) + 1;
|
|
uint32 cBank0Index = (cfWord0 >> 22) & 0xF;
|
|
uint32 cBank1Index = (cfWord0 >> 26) & 0xF;
|
|
uint32 cBank0AddrBase = ((cfWord1 >> 2) & 0xFF) * 16;
|
|
uint32 cBank1AddrBase = ((cfWord1 >> 10) & 0xFF) * 16;
|
|
LatteSoftware_executeALUClause(cf_inst26_4, addr, count, cBank0Index, cBank1Index, cBank0AddrBase, cBank1AddrBase);
|
|
}
|
|
else
|
|
{
|
|
cemu_assert_unimplemented();
|
|
}
|
|
}
|
|
}
|
|
cemu_assert_debug(false);
|
|
}
|
|
|
|
template<int endianMode>
|
|
uint32 _readVtxU32(void* ptr)
|
|
{
|
|
uint32 v = *(uint32*)ptr;
|
|
if constexpr (endianMode == GPU7_ENDIAN_8IN32)
|
|
v = _swapEndianU32(v);
|
|
return v;
|
|
}
|
|
|
|
template<int endianMode, int nfa>
|
|
void _readAttr_FLOAT_32_32(void* ptr, LatteReg_t& output)
|
|
{
|
|
output.s32[0] = _readVtxU32<endianMode>((uint32*)ptr);
|
|
output.s32[1] = _readVtxU32<endianMode>((uint32*)ptr + 1);
|
|
output.s32[2] = 0;
|
|
output.s32[3] = 0;
|
|
}
|
|
|
|
template<int endianMode, int nfa>
|
|
void _readAttr_FLOAT_32_32_32(void* ptr, LatteReg_t& output)
|
|
{
|
|
output.s32[0] = _readVtxU32<endianMode>((uint32*)ptr);
|
|
output.s32[1] = _readVtxU32<endianMode>((uint32*)ptr + 1);
|
|
output.s32[2] = _readVtxU32<endianMode>((uint32*)ptr + 2);
|
|
output.s32[3] = 0;
|
|
}
|
|
|
|
template<int endianMode, int nfa>
|
|
void _readAttr_FLOAT_32_32_32_32(void* ptr, LatteReg_t& output)
|
|
{
|
|
output.s32[0] = _readVtxU32<endianMode>((uint32*)ptr);
|
|
output.s32[1] = _readVtxU32<endianMode>((uint32*)ptr + 1);
|
|
output.s32[2] = _readVtxU32<endianMode>((uint32*)ptr + 2);
|
|
output.s32[3] = _readVtxU32<endianMode>((uint32*)ptr + 3);
|
|
}
|
|
|
|
#define _fmtKey(__fmt, __endianSwap, __nfa, __isSigned) ((__endianSwap)|((__nfa)<<2)|((__isSigned)<<4)|((__fmt)<<5))
|
|
|
|
void LatteSoftware_loadVertexAttributes(sint32 index)
|
|
{
|
|
LatteSWCtx.reg[0].s32[0] = index;
|
|
for (auto& bufferGroup : LatteSWCtx.fetchShader->bufferGroups)
|
|
{
|
|
for (sint32 f = 0; f < bufferGroup.attribCount; f++)
|
|
{
|
|
auto attrib = bufferGroup.attrib + f;
|
|
// calculate element index
|
|
sint32 elementIndex = index;
|
|
// todo - handle instance index and attr divisor
|
|
|
|
// get buffer
|
|
uint32 bufferIndex = attrib->attributeBufferIndex;
|
|
if (bufferIndex >= 0x10)
|
|
{
|
|
continue;
|
|
}
|
|
uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7;
|
|
MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0];
|
|
uint32 bufferSize = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 1] + 1;
|
|
uint32 bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF;
|
|
if (bufferAddress == MPTR_NULL)
|
|
{
|
|
debug_printf("Warning: Attribute uses NULL buffer during software emulation\n");
|
|
continue;
|
|
}
|
|
|
|
// translate semanticId to gpr index
|
|
uint32 gprIndex = 0xFFFFFFFF;
|
|
for (sint32 f = 0; f < 32; f++)
|
|
{
|
|
if (LatteGPUState.contextRegister[mmSQ_VTX_SEMANTIC_0 + f] == attrib->semanticId)
|
|
{
|
|
gprIndex = f;
|
|
break;
|
|
}
|
|
}
|
|
if (gprIndex == 0xFFFFFFFF)
|
|
continue; // attribute is not mapped to VS input
|
|
gprIndex = gprIndex + 1;
|
|
|
|
sint32 formatKey = _fmtKey((sint32)attrib->format, (sint32)attrib->endianSwap, (sint32)attrib->nfa, (sint32)attrib->isSigned);
|
|
|
|
void* inputData = memory_getPointerFromPhysicalOffset(bufferAddress + elementIndex * bufferStride);
|
|
|
|
LatteReg_t attrData;
|
|
|
|
switch (formatKey)
|
|
{
|
|
case _fmtKey(FMT_32_32_FLOAT, GPU7_ENDIAN_8IN32, LATTE_NFA_2, LATTE_VTX_UNSIGNED):
|
|
_readAttr_FLOAT_32_32<GPU7_ENDIAN_8IN32, LATTE_NFA_2>(inputData, attrData);
|
|
break;
|
|
case _fmtKey(FMT_32_32_32_FLOAT, GPU7_ENDIAN_8IN32, LATTE_NFA_2, LATTE_VTX_UNSIGNED):
|
|
_readAttr_FLOAT_32_32_32<GPU7_ENDIAN_8IN32, LATTE_NFA_2>(inputData, attrData);
|
|
break;
|
|
case _fmtKey(FMT_32_32_32_32_FLOAT, GPU7_ENDIAN_8IN32, LATTE_NFA_2, LATTE_VTX_UNSIGNED):
|
|
_readAttr_FLOAT_32_32_32_32<GPU7_ENDIAN_8IN32, LATTE_NFA_2>(inputData, attrData);
|
|
break;
|
|
default:
|
|
cemu_assert_debug(false);
|
|
}
|
|
|
|
LatteReg_t* gprOutput = LatteSWCtx.reg+gprIndex;
|
|
for (uint32 f = 0; f < 4; f++)
|
|
{
|
|
if (attrib->ds[f] < 4)
|
|
gprOutput->s32[f] = attrData.s32[f];
|
|
else if (attrib->ds[f] == 4)
|
|
gprOutput->s32[f] = 0;
|
|
else if (attrib->ds[f] == 5)
|
|
gprOutput->f[f] = 1.0;
|
|
else
|
|
cemu_assert_debug(false);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
float* LatteSoftware_getPositionExport()
|
|
{
|
|
return LatteSWCtx.export_pos.f;
|
|
}
|
|
|
|
void LatteSoftware_executeVertex(sint32 index)
|
|
{
|
|
LatteSoftware_loadVertexAttributes(index);
|
|
LatteSoftware_singleRun();
|
|
}
|
|
|
|
void LatteSoftware_setupVertexShader(LatteFetchShader* fetchShader, void* shaderPtr, sint32 size)
|
|
{
|
|
LatteSWCtx.fetchShader = fetchShader;
|
|
LatteSWCtx.shaderBase = (uint32*)shaderPtr;
|
|
LatteSWCtx.shaderSize = size;
|
|
LatteSWCtx.cfilePtr = (void*)(LatteGPUState.contextRegister + LATTE_REG_BASE_ALU_CONST + 0x400);
|
|
}
|