Latte: Bound uniform buffers based on access patterns within the shader

This commit is contained in:
Exzap 2023-09-23 22:53:57 +02:00
parent 4d6b72b353
commit 3e925b7707
6 changed files with 114 additions and 93 deletions

View file

@ -230,47 +230,39 @@ void LatteDecompiler_analyzeALUClause(LatteDecompilerShaderContext* shaderContex
// check input for uniform access
if( aluInstruction.sourceOperand[f].sel == 0xFFFFFFFF )
continue; // source operand not set/used
// about uniform register and buffer access tracking:
// for absolute indices we can determine a maximum size that is accessed
// relative accesses are tricky because the upper bound of accessed indices is unknown
// worst case we have to load the full file (256 * 16 byte entries) or for buffers an arbitrary upper bound (64KB in our case)
if( GPU7_ALU_SRC_IS_CFILE(aluInstruction.sourceOperand[f].sel) )
{
// uniform register access
// relative register file accesses are tricky because the range of possible indices is unknown
// worst case we have to load the full file (256 * 16 byte entries)
// by tracking the accessed base indices the shader analyzer can determine bounds for the potentially accessed ranges
shaderContext->analyzer.uniformRegisterAccess = true;
if (aluInstruction.sourceOperand[f].rel)
{
shaderContext->analyzer.uniformRegisterDynamicAccess = true;
shaderContext->analyzer.uniformRegisterAccessIndices.emplace_back(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), true);
shaderContext->analyzer.uniformRegisterAccessTracker.TrackAccess(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), true);
}
else
{
_remapUniformAccess(shaderContext, true, 0, GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel));
shaderContext->analyzer.uniformRegisterAccessIndices.emplace_back(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), false);
shaderContext->analyzer.uniformRegisterAccessTracker.TrackAccess(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), false);
}
}
else if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction.sourceOperand[f].sel) )
{
// uniform bank 0 (uniform buffer with index cfInstruction->cBank0Index)
uint32 uniformBufferIndex = cfInstruction->cBank0Index;
if( uniformBufferIndex >= LATTE_NUM_MAX_UNIFORM_BUFFERS)
debugBreakpoint();
shaderContext->analyzer.uniformBufferAccessMask |= (1<<uniformBufferIndex);
if( aluInstruction.sourceOperand[f].rel )
shaderContext->analyzer.uniformBufferDynamicAccessMask |= (1<<uniformBufferIndex);
_remapUniformAccess(shaderContext, false, uniformBufferIndex, GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank0AddrBase);
cemu_assert(uniformBufferIndex < LATTE_NUM_MAX_UNIFORM_BUFFERS);
uint32 offset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank0AddrBase;
_remapUniformAccess(shaderContext, false, uniformBufferIndex, offset);
shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(offset, aluInstruction.sourceOperand[f].rel);
}
else if( GPU7_ALU_SRC_IS_CBANK1(aluInstruction.sourceOperand[f].sel) )
{
// uniform bank 1 (uniform buffer with index cfInstruction->cBank1Index)
uint32 uniformBufferIndex = cfInstruction->cBank1Index;
if( uniformBufferIndex >= LATTE_NUM_MAX_UNIFORM_BUFFERS)
debugBreakpoint();
shaderContext->analyzer.uniformBufferAccessMask |= (1<<uniformBufferIndex);
if( aluInstruction.sourceOperand[f].rel )
shaderContext->analyzer.uniformBufferDynamicAccessMask |= (1<<uniformBufferIndex);
_remapUniformAccess(shaderContext, false, uniformBufferIndex, GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank1AddrBase);
cemu_assert(uniformBufferIndex < LATTE_NUM_MAX_UNIFORM_BUFFERS);
uint32 offset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank1AddrBase;
_remapUniformAccess(shaderContext, false, uniformBufferIndex, offset);
shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(offset, aluInstruction.sourceOperand[f].rel);
}
else if( GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel) )
{
@ -360,8 +352,7 @@ void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContex
if( texInstruction.textureFetch.textureIndex >= 0x80 && texInstruction.textureFetch.textureIndex <= 0x8F )
{
uint32 uniformBufferIndex = texInstruction.textureFetch.textureIndex - 0x80;
shaderContext->analyzer.uniformBufferAccessMask |= (1<<uniformBufferIndex);
shaderContext->analyzer.uniformBufferDynamicAccessMask |= (1<<uniformBufferIndex);
shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(0, true);
}
else if( texInstruction.textureFetch.textureIndex == 0x9F && shader->shaderType == LatteConst::ShaderType::Geometry )
{
@ -576,7 +567,7 @@ namespace LatteDecompiler
// for Vulkan we use consecutive indices
for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
{
if ((decompilerContext->analyzer.uniformBufferAccessMask&(1 << i)) == 0)
if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess())
continue;
sint32 uniformBindingPoint = i;
if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)
@ -592,7 +583,7 @@ namespace LatteDecompiler
// for OpenGL we use the relative buffer index
for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
{
if ((decompilerContext->analyzer.uniformBufferAccessMask&(1 << i)) == 0)
if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess())
continue;
sint32 uniformBindingPoint = i;
if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry)
@ -765,17 +756,24 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD
LatteDecompiler_analyzeSubroutine(shaderContext, subroutineAddr);
}
// decide which uniform mode to use
if(shaderContext->analyzer.uniformBufferAccessMask != 0 && shaderContext->analyzer.uniformRegisterAccess )
debugBreakpoint(); // not allowed
if(shaderContext->analyzer.uniformBufferDynamicAccessMask != 0 )
bool hasAnyDynamicBufferAccess = false;
bool hasAnyBufferAccess = false;
for(auto& it : shaderContext->analyzer.uniformBufferAccessTracker)
{
if( it.HasRelativeAccess() )
hasAnyDynamicBufferAccess = true;
if( it.HasAccess() )
hasAnyBufferAccess = true;
}
if (hasAnyDynamicBufferAccess)
{
shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK;
}
else if(shaderContext->analyzer.uniformRegisterDynamicAccess )
else if(shaderContext->analyzer.uniformRegisterAccessTracker.HasRelativeAccess() )
{
shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE;
}
else if(shaderContext->analyzer.uniformBufferAccessMask != 0 || shaderContext->analyzer.uniformRegisterAccess != 0 )
else if(hasAnyBufferAccess || shaderContext->analyzer.uniformRegisterAccessTracker.HasAccess() )
{
shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED;
}
@ -783,16 +781,18 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD
{
shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_NONE;
}
// generate list of uniform buffers based on uniformBufferAccessMask (for faster access)
shader->uniformBufferListCount = 0;
// generate compact list of uniform buffers (for faster access)
cemu_assert_debug(shader->list_quickBufferList.empty());
for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++)
{
if( !HAS_FLAG(shaderContext->analyzer.uniformBufferAccessMask, (1<<i)) )
if( !shaderContext->analyzer.uniformBufferAccessTracker[i].HasAccess() )
continue;
shader->uniformBufferList[shader->uniformBufferListCount] = i;
shader->uniformBufferListCount++;
LatteDecompilerShader::QuickBufferEntry entry;
entry.index = i;
entry.size = shaderContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE) * 16;
shader->list_quickBufferList.push_back(entry);
}
// get dimension of each used textures
// get dimension of each used texture
_LatteRegisterSetTextureUnit* texRegs = nullptr;
if( shader->shaderType == LatteConst::ShaderType::Vertex )
texRegs = shaderContext->contextRegistersNew->SQ_TEX_START_VS;