#include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Core/LatteShaderAssembly.h" #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "Cafe/OS/libs/gx2/GX2.h" // todo - remove this dependency #include "Cafe/HW/Latte/Core/Latte.h" #include "Cafe/HW/Latte/Core/LatteDraw.h" #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h" #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h" #include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h" #include "Cafe/HW/Latte/Core/FetchShader.h" #include "Cafe/HW/Latte/Core/LatteShader.h" #include "Cafe/HW/Latte/Renderer/Renderer.h" /* * Return index of used color attachment based on shader pixel export index (0-7) */ sint32 LatteDecompiler_getColorOutputIndexFromExportIndex(LatteDecompilerShaderContext* shaderContext, sint32 exportIndex) { sint32 colorOutputIndex = -1; sint32 outputCounter = 0; uint32 cbShaderMask = shaderContext->contextRegisters[mmCB_SHADER_MASK]; uint32 cbShaderControl = shaderContext->contextRegisters[mmCB_SHADER_CONTROL]; for(sint32 m=0; m<8; m++) { uint32 outputMask = (cbShaderMask>>(m*4))&0xF; if( outputMask == 0 ) continue; cemu_assert_debug(outputMask == 0xF); // mask is unsupported if( outputCounter == exportIndex ) { colorOutputIndex = m; break; } outputCounter++; } cemu_assert_debug(colorOutputIndex != -1); // real outputs and outputs defined via mask do not match up return colorOutputIndex; } void _remapUniformAccess(LatteDecompilerShaderContext* shaderContext, bool isRegisterUniform, uint32 kcacheBankId, uint32 uniformIndex) { auto& list_uniformMapping = shaderContext->shader->list_remappedUniformEntries; for(uint32 i=0; iisRegister == true && ufMapping->index == uniformIndex ) { return; } } else { if( ufMapping->isRegister == false && ufMapping->kcacheBankId == kcacheBankId && ufMapping->index == uniformIndex ) { return; } } } // add new mapping LatteDecompilerRemappedUniformEntry_t newMapping = {0}; if( isRegisterUniform ) { newMapping.isRegister = true; newMapping.index = uniformIndex; newMapping.mappedIndex = (uint32)list_uniformMapping.size(); } else { newMapping.isRegister = false; newMapping.kcacheBankId = kcacheBankId; newMapping.index = uniformIndex; newMapping.mappedIndex = (uint32)list_uniformMapping.size(); } list_uniformMapping.emplace_back(newMapping); } /* * Returns true if the instruction takes integer operands or returns a integer value */ bool _isIntegerInstruction(const LatteDecompilerALUInstruction& aluInstruction) { if (aluInstruction.isOP3 == false) { // OP2 switch (aluInstruction.opcode) { case ALU_OP2_INST_ADD: case ALU_OP2_INST_MUL: case ALU_OP2_INST_MUL_IEEE: case ALU_OP2_INST_MAX: case ALU_OP2_INST_MIN: case ALU_OP2_INST_FLOOR: case ALU_OP2_INST_FRACT: case ALU_OP2_INST_TRUNC: case ALU_OP2_INST_MOV: case ALU_OP2_INST_NOP: case ALU_OP2_INST_DOT4: case ALU_OP2_INST_DOT4_IEEE: case ALU_OP2_INST_CUBE: case ALU_OP2_INST_EXP_IEEE: case ALU_OP2_INST_LOG_CLAMPED: case ALU_OP2_INST_LOG_IEEE: case ALU_OP2_INST_SQRT_IEEE: case ALU_OP2_INST_SIN: case ALU_OP2_INST_COS: case ALU_OP2_INST_RNDNE: case ALU_OP2_INST_MAX_DX10: case ALU_OP2_INST_MIN_DX10: case ALU_OP2_INST_SETGT: case ALU_OP2_INST_SETGE: case ALU_OP2_INST_SETNE: case ALU_OP2_INST_SETE: case ALU_OP2_INST_PRED_SETE: case ALU_OP2_INST_PRED_SETGT: case ALU_OP2_INST_PRED_SETGE: case ALU_OP2_INST_PRED_SETNE: case ALU_OP2_INST_KILLE: case ALU_OP2_INST_KILLGT: case ALU_OP2_INST_KILLGE: case ALU_OP2_INST_RECIP_FF: case ALU_OP2_INST_RECIP_IEEE: case ALU_OP2_INST_RECIPSQRT_CLAMPED: case 
		case ALU_OP2_INST_FLT_TO_INT:
		case ALU_OP2_INST_INT_TO_FLOAT:
		case ALU_OP2_INST_UINT_TO_FLOAT:
		case ALU_OP2_INST_ASHR_INT:
		case ALU_OP2_INST_LSHR_INT:
		case ALU_OP2_INST_LSHL_INT:
		case ALU_OP2_INST_MULLO_INT:
		case ALU_OP2_INST_MULLO_UINT:
		case ALU_OP2_INST_FLT_TO_UINT:
		case ALU_OP2_INST_AND_INT:
		case ALU_OP2_INST_OR_INT:
		case ALU_OP2_INST_XOR_INT:
		case ALU_OP2_INST_NOT_INT:
		case ALU_OP2_INST_ADD_INT:
		case ALU_OP2_INST_SUB_INT:
		case ALU_OP2_INST_MAX_INT:
		case ALU_OP2_INST_MIN_INT:
		case ALU_OP2_INST_SETE_INT:
		case ALU_OP2_INST_SETGT_INT:
		case ALU_OP2_INST_SETGE_INT:
		case ALU_OP2_INST_SETNE_INT:
		case ALU_OP2_INST_SETGT_UINT:
		case ALU_OP2_INST_SETGE_UINT:
		case ALU_OP2_INST_PRED_SETE_INT:
		case ALU_OP2_INST_PRED_SETGT_INT:
		case ALU_OP2_INST_PRED_SETGE_INT:
		case ALU_OP2_INST_PRED_SETNE_INT:
		case ALU_OP2_INST_KILLE_INT:
		case ALU_OP2_INST_KILLGT_INT:
		case ALU_OP2_INST_KILLNE_INT:
		case ALU_OP2_INST_MOVA_FLOOR:
		case ALU_OP2_INST_MOVA_INT:
			return true;
		// these return an integer result but are usually used only for conditionals
		case ALU_OP2_INST_SETE_DX10:
		case ALU_OP2_INST_SETGT_DX10:
		case ALU_OP2_INST_SETGE_DX10:
		case ALU_OP2_INST_SETNE_DX10:
			return true;
		default:
#ifdef CEMU_DEBUG_ASSERT
			debug_printf("_isIntegerInstruction(): OP3=%s opcode=%02x\n", aluInstruction.isOP3 ? "true" : "false", aluInstruction.opcode);
			cemu_assert_debug(false);
#endif
			break;
		}
	}
	else
	{
		// OP3
		switch (aluInstruction.opcode)
		{
		case ALU_OP3_INST_MULADD:
		case ALU_OP3_INST_MULADD_D2:
		case ALU_OP3_INST_MULADD_M2:
		case ALU_OP3_INST_MULADD_M4:
		case ALU_OP3_INST_MULADD_IEEE:
		case ALU_OP3_INST_CMOVE:
		case ALU_OP3_INST_CMOVGT:
		case ALU_OP3_INST_CMOVGE:
			return false;
		case ALU_OP3_INST_CNDE_INT:
		case ALU_OP3_INST_CNDGT_INT:
		case ALU_OP3_INST_CMOVGE_INT:
			return true;
		default:
#ifdef CEMU_DEBUG_ASSERT
			debug_printf("_isIntegerInstruction(): OP3=%s opcode=%02x\n", aluInstruction.isOP3 ? "true" : "false", aluInstruction.opcode);
#endif
			break;
		}
	}
	return false;
}

/*
 * Analyze ALU CF instruction and all instructions within the ALU clause
 */
void LatteDecompiler_analyzeALUClause(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction)
{
	// check if this shader has any clause that potentially modifies the pixel execution state
	if( cfInstruction->type == GPU7_CF_INST_ALU_PUSH_BEFORE ||
		cfInstruction->type == GPU7_CF_INST_ALU_POP_AFTER ||
		cfInstruction->type == GPU7_CF_INST_ALU_POP2_AFTER ||
		cfInstruction->type == GPU7_CF_INST_ALU_BREAK ||
		cfInstruction->type == GPU7_CF_INST_ALU_ELSE_AFTER )
	{
		shaderContext->analyzer.modifiesPixelActiveState = true;
	}
	// analyze ALU instructions
	for(auto& aluInstruction : cfInstruction->instructionsALU)
	{
		// ignore NOP instruction
		if( !aluInstruction.isOP3 && aluInstruction.opcode == ALU_OP2_INST_NOP )
			continue;
		// check for CUBE instruction
		if( !aluInstruction.isOP3 && aluInstruction.opcode == ALU_OP2_INST_CUBE )
		{
			shaderContext->analyzer.hasRedcCUBE = true;
		}
		// check for integer instruction
		if (_isIntegerInstruction(aluInstruction))
			shaderContext->analyzer.usesIntegerValues = true;
		// process all available operands (inputs)
		for(sint32 f=0; f<3; f++)
		{
			// check input for uniform access
			if( aluInstruction.sourceOperand[f].sel == 0xFFFFFFFF )
				continue; // source operand not set/used
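			// sel selects the operand source; the GPU7_ALU_SRC_IS_* checks below
			// distinguish the constant file (uniform registers), the two constant
			// banks (uniform buffers) and GPRs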
			// about uniform register and buffer access tracking:
			// for absolute indices we can determine a maximum size that is accessed
			// relative accesses are tricky because the upper bound of accessed indices is unknown
			// worst case we have to load the full file (256 * 16 byte entries) or for buffers an arbitrary upper bound (64KB in our case)
			if( GPU7_ALU_SRC_IS_CFILE(aluInstruction.sourceOperand[f].sel) )
			{
				if (aluInstruction.sourceOperand[f].rel)
				{
					shaderContext->analyzer.uniformRegisterAccessTracker.TrackAccess(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), true);
				}
				else
				{
					_remapUniformAccess(shaderContext, true, 0, GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel));
					shaderContext->analyzer.uniformRegisterAccessTracker.TrackAccess(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), false);
				}
			}
			else if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction.sourceOperand[f].sel) )
			{
				// uniform bank 0 (uniform buffer with index cfInstruction->cBank0Index)
				uint32 uniformBufferIndex = cfInstruction->cBank0Index;
				cemu_assert(uniformBufferIndex < LATTE_NUM_MAX_UNIFORM_BUFFERS);
				uint32 offset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank0AddrBase;
				_remapUniformAccess(shaderContext, false, uniformBufferIndex, offset);
				shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(offset, aluInstruction.sourceOperand[f].rel);
			}
			else if( GPU7_ALU_SRC_IS_CBANK1(aluInstruction.sourceOperand[f].sel) )
			{
				// uniform bank 1 (uniform buffer with index cfInstruction->cBank1Index)
				uint32 uniformBufferIndex = cfInstruction->cBank1Index;
				cemu_assert(uniformBufferIndex < LATTE_NUM_MAX_UNIFORM_BUFFERS);
				uint32 offset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank1AddrBase;
				_remapUniformAccess(shaderContext, false, uniformBufferIndex, offset);
				shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(offset, aluInstruction.sourceOperand[f].rel);
			}
			else if( GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel) )
			{
				sint32 gprIndex = GPU7_ALU_SRC_GET_GPR_INDEX(aluInstruction.sourceOperand[f].sel);
				shaderContext->analyzer.gprUseMask[gprIndex/8] |= (1<<(gprIndex%8));
				if( aluInstruction.sourceOperand[f].rel != 0 )
				{
					// if indexed register access is used, all possibly referenced registers are stored to a separate array at the beginning of the group
					shaderContext->analyzer.usesRelativeGPRRead = true;
					continue;
				}
			}
		}
		if( aluInstruction.destRel != 0 )
			shaderContext->analyzer.usesRelativeGPRWrite = true;
		shaderContext->analyzer.gprUseMask[aluInstruction.destGpr/8] |= (1<<(aluInstruction.destGpr%8));
	}
}
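// note: the texture unit mask, sampler assignments and texel-coordinate usage
// collected by the TEX clause analysis below are what later drives texture
// binding point assignment (see _initTextureBindingPointsGL/VK further down)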
// analyze TEX CF instruction and all instructions within the TEX clause
void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction)
{
	LatteDecompilerShader* shader = shaderContext->shader;
	for(auto& texInstruction : cfInstruction->instructionsTEX)
	{
		if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE ||
			texInstruction.opcode == GPU7_TEX_INST_SAMPLE_L ||
			texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LB ||
			texInstruction.opcode == GPU7_TEX_INST_SAMPLE_LZ ||
			texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C ||
			texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L ||
			texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ ||
			texInstruction.opcode == GPU7_TEX_INST_FETCH4 ||
			texInstruction.opcode == GPU7_TEX_INST_SAMPLE_G ||
			texInstruction.opcode == GPU7_TEX_INST_LD )
		{
			if (texInstruction.textureFetch.textureIndex < 0 || texInstruction.textureFetch.textureIndex >= LATTE_NUM_MAX_TEX_UNITS)
			{
				cemuLog_logDebug(LogType::Force, "Shader {:16x} has out of bounds texture access (texture {})", shaderContext->shader->baseHash, (sint32)texInstruction.textureFetch.textureIndex);
				continue;
			}
			if( texInstruction.textureFetch.samplerIndex < 0 || texInstruction.textureFetch.samplerIndex >= 0x12 )
				cemu_assert_debug(false);
			if( shaderContext->output->textureUnitMask[texInstruction.textureFetch.textureIndex] &&
				shader->textureUnitSamplerAssignment[texInstruction.textureFetch.textureIndex] != texInstruction.textureFetch.samplerIndex &&
				shader->textureUnitSamplerAssignment[texInstruction.textureFetch.textureIndex] != LATTE_DECOMPILER_SAMPLER_NONE )
			{
				cemu_assert_debug(false);
			}
			shaderContext->output->textureUnitMask[texInstruction.textureFetch.textureIndex] = true;
			shader->textureUnitSamplerAssignment[texInstruction.textureFetch.textureIndex] = texInstruction.textureFetch.samplerIndex;
			if( texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_L || texInstruction.opcode == GPU7_TEX_INST_SAMPLE_C_LZ )
				shader->textureUsesDepthCompare[texInstruction.textureFetch.textureIndex] = true;
			bool useTexelCoords = false;
			if (texInstruction.opcode == GPU7_TEX_INST_SAMPLE && (texInstruction.textureFetch.unnormalized[0] && texInstruction.textureFetch.unnormalized[1] && texInstruction.textureFetch.unnormalized[2] && texInstruction.textureFetch.unnormalized[3]))
				useTexelCoords = true;
			else if (texInstruction.opcode == GPU7_TEX_INST_LD)
				useTexelCoords = true;
			if (useTexelCoords)
			{
				shaderContext->analyzer.texUnitUsesTexelCoordinates.set(texInstruction.textureFetch.textureIndex);
			}
		}
		else if( texInstruction.opcode == GPU7_TEX_INST_GET_COMP_TEX_LOD || texInstruction.opcode == GPU7_TEX_INST_GET_TEXTURE_RESINFO )
		{
			if( texInstruction.textureFetch.textureIndex < 0 || texInstruction.textureFetch.textureIndex >= LATTE_NUM_MAX_TEX_UNITS )
				debugBreakpoint();
			if( texInstruction.textureFetch.samplerIndex != 0 )
				debugBreakpoint(); // sampler is ignored and should be 0
			shaderContext->output->textureUnitMask[texInstruction.textureFetch.textureIndex] = true;
		}
		else if( texInstruction.opcode == GPU7_TEX_INST_SET_CUBEMAP_INDEX )
		{
			// no analysis required
		}
		else if (texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_H || texInstruction.opcode == GPU7_TEX_INST_GET_GRADIENTS_V)
		{
			// no analysis required
		}
		else if (texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_H || texInstruction.opcode == GPU7_TEX_INST_SET_GRADIENTS_V)
		{
			shaderContext->analyzer.hasGradientLookup = true;
		}
		else if( texInstruction.opcode == GPU7_TEX_INST_VFETCH )
		{
			// VFETCH is used to access uniform buffers dynamically
			if( texInstruction.textureFetch.textureIndex >= 0x80 && texInstruction.textureFetch.textureIndex <= 0x8F )
			{
				uint32 uniformBufferIndex = texInstruction.textureFetch.textureIndex - 0x80;
				shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(0, true);
			}
			else if( texInstruction.textureFetch.textureIndex == 0x9F && shader->shaderType == LatteConst::ShaderType::Geometry )
			{
				// instruction to read geometry shader input from ringbuffer
			}
			else
				debugBreakpoint();
		}
		else if (texInstruction.opcode == GPU7_TEX_INST_MEM)
		{
			// SSBO access
			shaderContext->analyzer.hasSSBORead = true;
		}
		else
			debugBreakpoint();
		// mark read and written registers as used
		if(texInstruction.dstGpr < LATTE_NUM_GPR)
			shaderContext->analyzer.gprUseMask[texInstruction.dstGpr/8] |= (1<<(texInstruction.dstGpr%8));
		if(texInstruction.srcGpr < LATTE_NUM_GPR)
			shaderContext->analyzer.gprUseMask[texInstruction.srcGpr/8] |= (1<<(texInstruction.srcGpr%8));
	}
}
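// export handling below: for pixel shaders exportType 0 is a color output write
// (exportArrayBase = export index, arrayBase 61 = depth); for vertex shaders
// exportType 2 marks output parameters and exportType 1 with arrayBase ==
// GPU7_DECOMPILER_CF_EXPORT_POINT_SIZE flags a point size write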
/*
 * Analyze export CF instruction
 */
void LatteDecompiler_analyzeExport(LatteDecompilerShaderContext* shaderContext, LatteDecompilerCFInstruction* cfInstruction)
{
	LatteDecompilerShader* shader = shaderContext->shader;
	if( shader->shaderType == LatteConst::ShaderType::Pixel )
	{
		if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase < 8 )
		{
			// remember color outputs that are written
			for(uint32 i=0; i<(cfInstruction->exportBurstCount+1); i++)
			{
				sint32 colorOutputIndex = LatteDecompiler_getColorOutputIndexFromExportIndex(shaderContext, cfInstruction->exportArrayBase+i);
				shader->pixelColorOutputMask |= (1<<colorOutputIndex);
			}
		}
		else if( cfInstruction->exportType == 0 && cfInstruction->exportArrayBase == 61 )
		{
			// writes pixel depth
		}
		else
			debugBreakpoint();
	}
	else if (shader->shaderType == LatteConst::ShaderType::Vertex)
	{
		if (cfInstruction->exportType == 2 && cfInstruction->exportArrayBase < 32)
		{
			shaderContext->shader->outputParameterMask |= (1<<cfInstruction->exportArrayBase);
		}
		else if (cfInstruction->exportType == 1 && cfInstruction->exportArrayBase == GPU7_DECOMPILER_CF_EXPORT_POINT_SIZE)
		{
			shaderContext->analyzer.writesPointSize = true;
		}
	}
	// mark input GPRs as used
	for(uint32 i=0; i<(cfInstruction->exportBurstCount+1); i++)
	{
		shaderContext->analyzer.gprUseMask[(cfInstruction->exportSourceGPR+i)/8] |= (1<<((cfInstruction->exportSourceGPR+i)%8));
	}
}

void LatteDecompiler_analyzeSubroutine(LatteDecompilerShaderContext* shaderContext, uint32 cfAddr)
{
	// analyze CF and clauses up to RET statement
	// todo - find cfInstruction index from cfAddr
	cemu_assert_debug(false);
	for(auto& cfInstruction : shaderContext->cfInstructions)
	{
		if (cfInstruction.type == GPU7_CF_INST_ALU || cfInstruction.type == GPU7_CF_INST_ALU_PUSH_BEFORE ||
			cfInstruction.type == GPU7_CF_INST_ALU_POP_AFTER || cfInstruction.type == GPU7_CF_INST_ALU_POP2_AFTER ||
			cfInstruction.type == GPU7_CF_INST_ALU_BREAK || cfInstruction.type == GPU7_CF_INST_ALU_ELSE_AFTER)
		{
			LatteDecompiler_analyzeALUClause(shaderContext, &cfInstruction);
		}
		else if (cfInstruction.type == GPU7_CF_INST_TEX)
		{
			LatteDecompiler_analyzeTEXClause(shaderContext, &cfInstruction);
		}
		else if (cfInstruction.type == GPU7_CF_INST_EXPORT || cfInstruction.type == GPU7_CF_INST_EXPORT_DONE)
		{
			LatteDecompiler_analyzeExport(shaderContext, &cfInstruction);
		}
		else if (cfInstruction.type == GPU7_CF_INST_ELSE || cfInstruction.type == GPU7_CF_INST_POP)
		{
			shaderContext->analyzer.modifiesPixelActiveState = true;
		}
		else if (cfInstruction.type == GPU7_CF_INST_LOOP_START_DX10 || cfInstruction.type == GPU7_CF_INST_LOOP_END)
		{
			shaderContext->analyzer.modifiesPixelActiveState = true;
		}
		else if (cfInstruction.type == GPU7_CF_INST_LOOP_BREAK)
		{
			shaderContext->analyzer.modifiesPixelActiveState = true;
		}
		else if (cfInstruction.type == GPU7_CF_INST_EMIT_VERTEX)
		{
			// nothing to analyze
		}
		else if (cfInstruction.type == GPU7_CF_INST_CALL)
		{
			cemu_assert_debug(false); // CALLs inside subroutines are still todo
		}
		else
		{
			cemu_assert_unimplemented();
		}
	}
}
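// binding model used below: OpenGL binding points are derived from the texture
// unit / buffer index plus a fixed per-stage base, while Vulkan assigns
// consecutive binding points per stage through currentBindingPointVK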
namespace LatteDecompiler
{
	void _initTextureBindingPointsGL(LatteDecompilerShaderContext* decompilerContext)
	{
		// for OpenGL we use the relative texture unit index
		for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++)
		{
			if (!decompilerContext->output->textureUnitMask[i])
				continue;
			sint32 textureBindingPoint;
			if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex)
				textureBindingPoint = i + CEMU_VS_TEX_UNIT_BASE;
			else if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)
				textureBindingPoint = i + CEMU_GS_TEX_UNIT_BASE;
			else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel)
				textureBindingPoint = i + CEMU_PS_TEX_UNIT_BASE;
			decompilerContext->output->resourceMappingGL.textureUnitToBindingPoint[i] = textureBindingPoint;
		}
	}

	void _initTextureBindingPointsVK(LatteDecompilerShaderContext* decompilerContext)
	{
		// for Vulkan we use consecutive indices
		for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++)
		{
			if (!decompilerContext->output->textureUnitMask[i])
				continue;
			decompilerContext->output->resourceMappingVK.textureUnitToBindingPoint[i] = decompilerContext->currentBindingPointVK;
			decompilerContext->currentBindingPointVK++;
		}
	}

	void _initHasUniformVarBlock(LatteDecompilerShaderContext* decompilerContext)
	{
		decompilerContext->hasUniformVarBlock = false;
		if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED)
			decompilerContext->hasUniformVarBlock = true;
		else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE)
			decompilerContext->hasUniformVarBlock = true;

		bool hasAnyViewportScaleDisabled =
			!decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() ||
			!decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() ||
			!decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA();
		// we currently only support all on/off. Individual component scaling is not supported
		cemu_assert_debug(decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_SCALE_ENA() == !hasAnyViewportScaleDisabled);
		cemu_assert_debug(decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_SCALE_ENA() == !hasAnyViewportScaleDisabled);
		cemu_assert_debug(decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_SCALE_ENA() == !hasAnyViewportScaleDisabled);
		cemu_assert_debug(decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA() == !hasAnyViewportScaleDisabled);
		cemu_assert_debug(decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Y_OFFSET_ENA() == !hasAnyViewportScaleDisabled);
		cemu_assert_debug(decompilerContext->contextRegistersNew->PA_CL_VTE_CNTL.get_VPORT_Z_OFFSET_ENA() == !hasAnyViewportScaleDisabled);
		if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && hasAnyViewportScaleDisabled)
			decompilerContext->hasUniformVarBlock = true; // uf_windowSpaceToClipSpaceTransform
		bool alphaTestEnable = decompilerContext->contextRegistersNew->SX_ALPHA_TEST_CONTROL.get_ALPHA_TEST_ENABLE();
		if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel && alphaTestEnable != 0)
			decompilerContext->hasUniformVarBlock = true; // uf_alphaTestRef
		if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel)
			decompilerContext->hasUniformVarBlock = true; // uf_fragCoordScale
		if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->analyzer.outputPointSize && decompilerContext->analyzer.writesPointSize == false)
			decompilerContext->hasUniformVarBlock = true; // uf_pointSize
		if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry && decompilerContext->analyzer.outputPointSize && decompilerContext->analyzer.writesPointSize == false)
			decompilerContext->hasUniformVarBlock = true; // uf_pointSize
		if ((decompilerContext->analyzer.useSSBOForStreamout && (decompilerContext->shaderType == LatteConst::ShaderType::Vertex && !decompilerContext->options->usesGeometryShader)) ||
			(decompilerContext->shaderType == LatteConst::ShaderType::Geometry))
		{
			decompilerContext->hasUniformVarBlock = true; // uf_verticesPerInstance and uf_streamoutBufferBase*
		}
	}
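	// the uniform var block is the single generated uniform block that holds
	// remapped/full-cfile uniforms plus the helper uniforms (uf_alphaTestRef,
	// uf_fragCoordScale, uf_pointSize, ...) referenced above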
	void _initUniformBindingPoints(LatteDecompilerShaderContext* decompilerContext)
	{
		// check if uniform vars block has at least one variable
		_initHasUniformVarBlock(decompilerContext);
		if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel)
		{
			for (sint32 t = 0; t < LATTE_NUM_MAX_TEX_UNITS; t++)
			{
				if (decompilerContext->analyzer.texUnitUsesTexelCoordinates.test(t) == false)
					continue;
				decompilerContext->hasUniformVarBlock = true; // uf_tex%dScale
			}
		}
		// assign binding point to uniform var block
		decompilerContext->output->resourceMappingGL.uniformVarsBufferBindingPoint = -1; // OpenGL currently doesn't use a uniform block
		if (decompilerContext->hasUniformVarBlock)
		{
			decompilerContext->output->resourceMappingVK.uniformVarsBufferBindingPoint = decompilerContext->currentBindingPointVK;
			decompilerContext->currentBindingPointVK++;
		}
		else
			decompilerContext->output->resourceMappingVK.uniformVarsBufferBindingPoint = -1;
		// assign binding points to uniform buffers
		if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK)
		{
			// for Vulkan we use consecutive indices
			for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
			{
				if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess())
					continue;
				// (the per-stage offset computed here is left unused in the Vulkan path; binding points are assigned consecutively below)
				sint32 uniformBindingPoint = i;
				if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)
					uniformBindingPoint += 64;
				else if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex)
					uniformBindingPoint += 0;
				else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel)
					uniformBindingPoint += 32;
				decompilerContext->output->resourceMappingVK.uniformBuffersBindingPoint[i] = decompilerContext->currentBindingPointVK;
				decompilerContext->currentBindingPointVK++;
			}
			// for OpenGL we use the relative buffer index
			for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
			{
				if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess())
					continue;
				sint32 uniformBindingPoint = i;
				if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)
					uniformBindingPoint += 64;
				else if (decompilerContext->shaderType == LatteConst::ShaderType::Vertex)
					uniformBindingPoint += 0;
				else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel)
					uniformBindingPoint += 32;
				decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] = uniformBindingPoint;
			}
		}
		// shader storage buffer for alternative transform feedback path
		if (decompilerContext->analyzer.useSSBOForStreamout)
		{
			decompilerContext->output->resourceMappingVK.tfStorageBindingPoint = decompilerContext->currentBindingPointVK;
			decompilerContext->currentBindingPointVK++;
		}
	}

	void _initAttributeBindingPoints(LatteDecompilerShaderContext* decompilerContext)
	{
		if (decompilerContext->shaderType != LatteConst::ShaderType::Vertex)
			return;
		// create input attribute binding mapping
		// OpenGL and Vulkan use consecutive indices starting at 0
		sint8 bindingIndex = 0;
		for (sint32 i = 0; i < LATTE_NUM_MAX_ATTRIBUTE_LOCATIONS; i++)
		{
			if (decompilerContext->analyzer.inputAttributSemanticMask[i])
			{
				decompilerContext->output->resourceMappingGL.attributeMapping[i] = bindingIndex;
				decompilerContext->output->resourceMappingVK.attributeMapping[i] = bindingIndex;
				bindingIndex++;
			}
		}
	}
}

/*
 * Analyze the shader program
 * This will help to determine:
 * 1) Uniform usage
 * 2) Texture usage
 * 3) Data types
 * 4) CF stack and execution flow
 */
void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteDecompilerShader* shader)
{
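	// analysis runs in several passes: render state, vertex input attributes,
	// CF instructions and their clauses, subroutines, uniform access mode,
	// texture metadata, CF stack depth and finally resource binding points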
	// analyze render state
	shaderContext->analyzer.isPointsPrimitive = shaderContext->contextRegistersNew->VGT_PRIMITIVE_TYPE.get_PRIMITIVE_MODE() == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::POINTS;
	shaderContext->analyzer.hasStreamoutEnable = shaderContext->contextRegisters[mmVGT_STRMOUT_EN] != 0; // set if the shader is used for transform feedback operations
	if (shaderContext->shaderType == LatteConst::ShaderType::Vertex && !shaderContext->options->usesGeometryShader)
		shaderContext->analyzer.outputPointSize = shaderContext->analyzer.isPointsPrimitive;
	else if (shaderContext->shaderType == LatteConst::ShaderType::Geometry)
	{
		uint32 gsOutPrimType = shaderContext->contextRegisters[mmVGT_GS_OUT_PRIM_TYPE];
		if (gsOutPrimType == 0) // points
			shaderContext->analyzer.outputPointSize = true;
	}
	// analyze input attributes for vertex/geometry shader
	if (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)
	{
		if(shaderContext->fetchShader)
		{
			LatteFetchShader* parsedFetchShader = shaderContext->fetchShader;
			for(auto& bufferGroup : parsedFetchShader->bufferGroups)
			{
				for (sint32 i = 0; i < bufferGroup.attribCount; i++)
				{
					uint8 semanticId = bufferGroup.attrib[i].semanticId;
					if (semanticId == 0xFF)
					{
						// unused attribute? Found in Hot Wheels: World's best driver
						continue;
					}
					cemu_assert_debug(semanticId < 0x80);
					shaderContext->analyzer.inputAttributSemanticMask[semanticId] = true;
				}
			}
		}
	}
	// list of subroutines (call destinations)
	std::vector<uint32> list_subroutineAddrs;
	// analyze CF and clauses
	for(auto& cfInstruction : shaderContext->cfInstructions)
	{
		if (cfInstruction.type == GPU7_CF_INST_ALU || cfInstruction.type == GPU7_CF_INST_ALU_PUSH_BEFORE ||
			cfInstruction.type == GPU7_CF_INST_ALU_POP_AFTER || cfInstruction.type == GPU7_CF_INST_ALU_POP2_AFTER ||
			cfInstruction.type == GPU7_CF_INST_ALU_BREAK || cfInstruction.type == GPU7_CF_INST_ALU_ELSE_AFTER)
		{
			LatteDecompiler_analyzeALUClause(shaderContext, &cfInstruction);
		}
		else if (cfInstruction.type == GPU7_CF_INST_TEX)
		{
			LatteDecompiler_analyzeTEXClause(shaderContext, &cfInstruction);
		}
		else if (cfInstruction.type == GPU7_CF_INST_EXPORT || cfInstruction.type == GPU7_CF_INST_EXPORT_DONE)
		{
			LatteDecompiler_analyzeExport(shaderContext, &cfInstruction);
		}
		else if (cfInstruction.type == GPU7_CF_INST_ELSE || cfInstruction.type == GPU7_CF_INST_POP)
		{
			shaderContext->analyzer.modifiesPixelActiveState = true;
		}
		else if (cfInstruction.type == GPU7_CF_INST_LOOP_START_DX10 || cfInstruction.type == GPU7_CF_INST_LOOP_END)
		{
			shaderContext->analyzer.modifiesPixelActiveState = true;
			shaderContext->analyzer.hasLoops = true;
		}
		else if (cfInstruction.type == GPU7_CF_INST_LOOP_BREAK)
		{
			shaderContext->analyzer.modifiesPixelActiveState = true;
			shaderContext->analyzer.hasLoops = true;
		}
		else if (cfInstruction.type == GPU7_CF_INST_MEM_STREAM0_WRITE || cfInstruction.type == GPU7_CF_INST_MEM_STREAM1_WRITE)
		{
			uint32 streamoutBufferIndex;
			if (cfInstruction.type == GPU7_CF_INST_MEM_STREAM0_WRITE)
				streamoutBufferIndex = 0;
			else if (cfInstruction.type == GPU7_CF_INST_MEM_STREAM1_WRITE)
				streamoutBufferIndex = 1;
			else
				cemu_assert_debug(false);
			shaderContext->analyzer.hasStreamoutWrite = true;
			cemu_assert(streamoutBufferIndex < shaderContext->output->streamoutBufferWriteMask.size());
			shaderContext->output->streamoutBufferWriteMask[streamoutBufferIndex] = true;
			uint32 vectorWriteSize = 0;
			for (sint32 f = 0; f < 4; f++)
			{
				if ((cfInstruction.memWriteCompMask & (1 << f)) != 0)
					vectorWriteSize = (f + 1) * 4;
				shaderContext->output->streamoutBufferStride[f] = shaderContext->contextRegisters[mmVGT_STRMOUT_VTX_STRIDE_0 + f * 4] << 2;
			}
			cemu_assert_debug((cfInstruction.exportArrayBase * 4 + vectorWriteSize) <= shaderContext->output->streamoutBufferStride[streamoutBufferIndex]);
		}
		else if (cfInstruction.type == GPU7_CF_INST_MEM_RING_WRITE)
		{
			// track number of parameters that are output (simplified by just tracking the offset of the last one)
			if (cfInstruction.memWriteElemSize != 3)
				debugBreakpoint();
			if (cfInstruction.exportBurstCount != 0 && cfInstruction.memWriteElemSize != 3)
			{
				debugBreakpoint();
			}
			uint32 dwordWriteCount = (cfInstruction.exportBurstCount + 1) * 4;
			uint32 numRingParameter = (cfInstruction.exportArrayBase + dwordWriteCount) / 4;
			shader->ringParameterCount = std::max(shader->ringParameterCount, numRingParameter);
			// mark input GPRs as used
			for (uint32 i = 0; i < (cfInstruction.exportBurstCount + 1); i++)
			{
				shaderContext->analyzer.gprUseMask[(cfInstruction.exportSourceGPR + i) / 8] |= (1 << ((cfInstruction.exportSourceGPR + i) % 8));
			}
		}
		else if (cfInstruction.type == GPU7_CF_INST_EMIT_VERTEX)
		{
			shaderContext->analyzer.numEmitVertex++;
		}
		else if (cfInstruction.type == GPU7_CF_INST_CALL)
		{
			// CALL instruction does not need analyzing
			// and subroutines are analyzed separately
		}
		else
			cemu_assert_unimplemented();
	}
	// analyze subroutines
	for (auto subroutineAddr : list_subroutineAddrs)
	{
		LatteDecompiler_analyzeSubroutine(shaderContext, subroutineAddr);
	}
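	// uniform access mode: dynamically indexed buffer accesses force binding the
	// full uniform banks (FULL_CBANK), dynamically indexed register accesses force
	// the full constant file (FULL_CFILE); otherwise only the statically accessed
	// uniforms are packed into a remapped block (REMAPPED)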
	// decide which uniform mode to use
	bool hasAnyDynamicBufferAccess = false;
	bool hasAnyBufferAccess = false;
	for(auto& it : shaderContext->analyzer.uniformBufferAccessTracker)
	{
		if( it.HasRelativeAccess() )
			hasAnyDynamicBufferAccess = true;
		if( it.HasAccess() )
			hasAnyBufferAccess = true;
	}
	if (hasAnyDynamicBufferAccess)
	{
		shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK;
	}
	else if( shaderContext->analyzer.uniformRegisterAccessTracker.HasRelativeAccess() )
	{
		shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE;
	}
	else if( hasAnyBufferAccess || shaderContext->analyzer.uniformRegisterAccessTracker.HasAccess() )
	{
		shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED;
	}
	else
	{
		shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_NONE;
	}
	// generate compact list of uniform buffers (for faster access)
	cemu_assert_debug(shader->list_quickBufferList.empty());
	for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
	{
		if( !shaderContext->analyzer.uniformBufferAccessTracker[i].HasAccess() )
			continue;
		LatteDecompilerShader::QuickBufferEntry entry;
		entry.index = i;
		entry.size = shaderContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE) * 16;
		shader->list_quickBufferList.push_back(entry);
	}
	// get dimension of each used texture
	_LatteRegisterSetTextureUnit* texRegs = nullptr;
	if( shader->shaderType == LatteConst::ShaderType::Vertex )
		texRegs = shaderContext->contextRegistersNew->SQ_TEX_START_VS;
	else if( shader->shaderType == LatteConst::ShaderType::Pixel )
		texRegs = shaderContext->contextRegistersNew->SQ_TEX_START_PS;
	else if( shader->shaderType == LatteConst::ShaderType::Geometry )
		texRegs = shaderContext->contextRegistersNew->SQ_TEX_START_GS;
	for(sint32 i=0; i<LATTE_NUM_MAX_TEX_UNITS; i++)
	{
		if (!shaderContext->output->textureUnitMask[i])
		{
			// texture unit not used
			shader->textureUnitDim[i] = (Latte::E_DIM)0xFF;
			continue;
		}
		auto& texUnit = texRegs[i];
		auto dim = texUnit.word0.get_DIM();
		shader->textureUnitDim[i] = dim;
		if(dim == Latte::E_DIM::DIM_CUBEMAP)
			shaderContext->analyzer.hasCubeMapTexture = true;
		shader->textureIsIntegerFormat[i] = texUnit.word4.get_NUM_FORM_ALL() == Latte::LATTE_SQ_TEX_RESOURCE_WORD4_N::E_NUM_FORMAT_ALL::NUM_FORMAT_INT;
	}
	// generate list of used texture units
	shader->textureUnitListCount = 0;
	for (sint32 i = 0; i < LATTE_NUM_MAX_TEX_UNITS; i++)
	{
		if (shaderContext->output->textureUnitMask[i])
		{
			shader->textureUnitList[shader->textureUnitListCount] = i;
			shader->textureUnitListCount++;
		}
	}
	// for geometry shaders check the copy shader for stream writes
	if (shader->shaderType == LatteConst::ShaderType::Geometry && shaderContext->parsedGSCopyShader->list_streamWrites.empty() == false)
	{
		shaderContext->analyzer.hasStreamoutWrite = true;
		if (shaderContext->contextRegisters[mmVGT_STRMOUT_EN] != 0)
			shaderContext->analyzer.hasStreamoutEnable = true;
		for (auto& it : shaderContext->parsedGSCopyShader->list_streamWrites)
		{
			shaderContext->output->streamoutBufferWriteMask[it.bufferIndex] = true;
			uint32 vectorWriteSize = 0;
			for (sint32 f = 0; f < 4; f++)
			{
				if ((it.memWriteCompMask&(1 << f)) != 0)
					vectorWriteSize = (f + 1) * 4;
			}
			shaderContext->output->streamoutBufferStride[it.bufferIndex] = std::max(shaderContext->output->streamoutBufferStride[it.bufferIndex], it.exportArrayBase * 4 + vectorWriteSize);
		}
	}
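	// a relative (indexed) GPR read can address any input register, so the exact
	// set of consumed inputs is unknown; conservatively mark every attribute input
	// register (VS/GS) or pixel shader input as used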
	// analyze input attributes again (if shader has relative GPR read)
	if( shaderContext->analyzer.usesRelativeGPRRead && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry) )
	{
		if(shaderContext->fetchShader)
		{
			LatteFetchShader* parsedFetchShader = shaderContext->fetchShader;
			for(auto& bufferGroup : parsedFetchShader->bufferGroups)
			{
				for (sint32 i = 0; i < bufferGroup.attribCount; i++)
				{
					uint32 registerIndex;
					// get register index based on vtx semantic table
					uint32 attributeShaderLoc = 0xFFFFFFFF;
					for (sint32 f = 0; f < 32; f++)
					{
						if (shaderContext->contextRegisters[mmSQ_VTX_SEMANTIC_0 + f] == bufferGroup.attrib[i].semanticId)
						{
							attributeShaderLoc = f;
							break;
						}
					}
					if (attributeShaderLoc == 0xFFFFFFFF)
						continue; // attribute is not mapped to VS input
					registerIndex = attributeShaderLoc + 1;
					shaderContext->analyzer.gprUseMask[registerIndex / 8] |= (1 << (registerIndex % 8));
				}
			}
		}
	}
	else if (shaderContext->analyzer.usesRelativeGPRRead && shader->shaderType == LatteConst::ShaderType::Pixel)
	{
		// mark pixel shader inputs as used if there is any relative GPR access
		LatteShaderPSInputTable* psInputTable = LatteSHRC_GetPSInputTable();
		for (sint32 i = 0; i < psInputTable->count; i++)
		{
			shaderContext->analyzer.gprUseMask[i / 8] |= (1 << (i % 8));
		}
	}
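	// the CF stack tracking below records, per CF instruction, the depth of the
	// pixel active-state stack at the point the instruction executes
	// (PUSH_BEFORE increases it, POP/POP2 decrease it); the maximum depth is
	// presumably used by the code emitter to size the emulated stack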
	// analyze CF stack
	sint32 cfCurrentStackDepth = 0;
	sint32 cfCurrentMaxStackDepth = 0;
	for(auto& cfInstruction : shaderContext->cfInstructions)
	{
		if (cfInstruction.type == GPU7_CF_INST_ALU)
		{
			// no effect on stack depth
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_ALU_PUSH_BEFORE)
		{
			cfCurrentStackDepth++;
			cfCurrentMaxStackDepth = std::max(cfCurrentMaxStackDepth, cfCurrentStackDepth);
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_ALU_POP_AFTER)
		{
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
			cfCurrentStackDepth--;
		}
		else if (cfInstruction.type == GPU7_CF_INST_ALU_POP2_AFTER)
		{
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
			cfCurrentStackDepth -= 2;
		}
		else if (cfInstruction.type == GPU7_CF_INST_ALU_BREAK)
		{
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_ALU_ELSE_AFTER)
		{
			if (cfInstruction.popCount != 0)
				debugBreakpoint();
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_ELSE)
		{
			//if (cfInstruction.popCount != 0)
			//	debugBreakpoint(); -> Only relevant when ELSE jump is taken
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_POP)
		{
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
			cfCurrentStackDepth -= cfInstruction.popCount;
			if (cfCurrentStackDepth < 0)
				debugBreakpoint();
		}
		else if (cfInstruction.type == GPU7_CF_INST_LOOP_START_DX10 || cfInstruction.type == GPU7_CF_INST_LOOP_END)
		{
			// no effect on stack depth
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_LOOP_BREAK)
		{
			// since we assume that the break is not taken (for all pixels), we also don't need to worry about the stack depth adjustment
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_TEX)
		{
			// no effect on stack depth
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_EXPORT || cfInstruction.type == GPU7_CF_INST_EXPORT_DONE)
		{
			// no effect on stack depth
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_MEM_STREAM0_WRITE || cfInstruction.type == GPU7_CF_INST_MEM_STREAM1_WRITE)
		{
			// no effect on stack depth
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_MEM_RING_WRITE)
		{
			// no effect on stack depth
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_EMIT_VERTEX)
		{
			// no effect on stack depth
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else if (cfInstruction.type == GPU7_CF_INST_CALL)
		{
			// no effect on stack depth
			cfInstruction.activeStackDepth = cfCurrentStackDepth;
		}
		else
		{
			cemu_assert_debug(false);
		}
	}
	shaderContext->analyzer.activeStackMaxDepth = cfCurrentMaxStackDepth;
	if (cfCurrentStackDepth != 0)
	{
		debug_printf("cfCurrentStackDepth is not zero after all CF instructions. depth is %d\n", cfCurrentStackDepth);
		cemu_assert_debug(false);
	}
	if(list_subroutineAddrs.empty() == false)
		cemuLog_logDebug(LogType::Force, "Todo - analyze shader subroutine CF stack");
	// TF mode
	if (shaderContext->options->useTFViaSSBO && shaderContext->output->streamoutBufferWriteMask.any())
	{
		shaderContext->analyzer.useSSBOForStreamout = true;
	}
	// assign binding points
	if (shaderContext->shaderType == LatteConst::ShaderType::Vertex)
		shaderContext->output->resourceMappingVK.setIndex = 0;
	else if (shaderContext->shaderType == LatteConst::ShaderType::Pixel)
		shaderContext->output->resourceMappingVK.setIndex = 1;
	else if (shaderContext->shaderType == LatteConst::ShaderType::Geometry)
		shaderContext->output->resourceMappingVK.setIndex = 2;
	LatteDecompiler::_initTextureBindingPointsGL(shaderContext);
	LatteDecompiler::_initTextureBindingPointsVK(shaderContext);
	LatteDecompiler::_initUniformBindingPoints(shaderContext);
	LatteDecompiler::_initAttributeBindingPoints(shaderContext);
}