mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-07-15 19:28:29 +12:00
Latte: Implement better index caching (#1443)
This commit is contained in:
parent
1923b7a7c4
commit
8dd809d725
16 changed files with 526 additions and 191 deletions
|
@ -141,6 +141,14 @@ private:
|
|||
|
||||
void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx);
|
||||
|
||||
// Hook that drops all cached index data by calling LatteIndices_invalidateAll().
// In the visible hunks it is invoked before wait-style command processor operations:
// IT_WAIT_REG_MEM handling, the wait branch of the memory semaphore packet, and the
// HLE scanbuffer-swap / wait-for-flip packets.
// called whenever the GPU runs out of commands or hits a wait condition (semaphores, HLE waits)
|
||||
void LatteCP_signalEnterWait()
|
||||
{
|
||||
// based on the assumption that games won't do a rugpull and swap out buffer data in the middle of an uninterrupted sequence of drawcalls,
|
||||
// we only flush caches when the GPU goes idle or has to wait for any operation
|
||||
// (i.e. cached index allocations are kept across consecutive drawcalls and released here)
|
||||
LatteIndices_invalidateAll();
|
||||
}
|
||||
|
||||
/*
|
||||
* Read a U32 from the command buffer
|
||||
* If no data is available then wait in a busy loop
|
||||
|
@ -466,6 +474,8 @@ LatteCMDPtr LatteCP_itWaitRegMem(LatteCMDPtr cmd, uint32 nWords)
|
|||
const uint32 GPU7_WAIT_MEM_OP_GREATER = 6;
|
||||
const uint32 GPU7_WAIT_MEM_OP_NEVER = 7;
|
||||
|
||||
LatteCP_signalEnterWait();
|
||||
|
||||
bool stalls = false;
|
||||
if ((word0 & 0x10) != 0)
|
||||
{
|
||||
|
@ -594,6 +604,7 @@ LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
|
|||
else if(SEM_SIGNAL == 7)
|
||||
{
|
||||
// wait
|
||||
LatteCP_signalEnterWait();
|
||||
size_t loopCount = 0;
|
||||
while (true)
|
||||
{
|
||||
|
@ -1305,11 +1316,13 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
|
|||
}
|
||||
case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
|
||||
{
|
||||
LatteCP_signalEnterWait();
|
||||
LatteCP_itHLESwapScanBuffer(cmdData, nWords);
|
||||
break;
|
||||
}
|
||||
case IT_HLE_WAIT_FOR_FLIP:
|
||||
{
|
||||
LatteCP_signalEnterWait();
|
||||
LatteCP_itHLEWaitForFlip(cmdData, nWords);
|
||||
break;
|
||||
}
|
||||
|
@ -1594,12 +1607,14 @@ void LatteCP_ProcessRingbuffer()
|
|||
}
|
||||
case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
|
||||
{
|
||||
LatteCP_signalEnterWait();
|
||||
LatteCP_itHLESwapScanBuffer(cmd, nWords);
|
||||
timerRecheck += CP_TIMER_RECHECK / 64;
|
||||
break;
|
||||
}
|
||||
case IT_HLE_WAIT_FOR_FLIP:
|
||||
{
|
||||
LatteCP_signalEnterWait();
|
||||
LatteCP_itHLEWaitForFlip(cmd, nWords);
|
||||
timerRecheck += CP_TIMER_RECHECK / 1;
|
||||
break;
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#include "Cafe/HW/Latte/Core/LatteConst.h"
|
||||
#include "Cafe/HW/Latte/Renderer/Renderer.h"
|
||||
#include "Cafe/HW/Latte/ISA/RegDefines.h"
|
||||
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
|
||||
#include "Common/cpu_features.h"
|
||||
|
||||
#if defined(ARCH_X86_64) && defined(__GNUC__)
|
||||
|
@ -9,32 +10,53 @@
|
|||
|
||||
struct
|
||||
{
|
||||
const void* lastPtr;
|
||||
uint32 lastCount;
|
||||
LattePrimitiveMode lastPrimitiveMode;
|
||||
LatteIndexType lastIndexType;
|
||||
// output
|
||||
uint32 indexMin;
|
||||
uint32 indexMax;
|
||||
Renderer::INDEX_TYPE renderIndexType;
|
||||
uint32 outputCount;
|
||||
uint32 indexBufferOffset;
|
||||
uint32 indexBufferIndex;
|
||||
struct CacheEntry
|
||||
{
|
||||
// input data
|
||||
const void* lastPtr;
|
||||
uint32 lastCount;
|
||||
LattePrimitiveMode lastPrimitiveMode;
|
||||
LatteIndexType lastIndexType;
|
||||
uint64 lastUsed;
|
||||
// output
|
||||
uint32 indexMin;
|
||||
uint32 indexMax;
|
||||
Renderer::INDEX_TYPE renderIndexType;
|
||||
uint32 outputCount;
|
||||
Renderer::IndexAllocation indexAllocation;
|
||||
};
|
||||
std::array<CacheEntry, 8> entry;
|
||||
uint64 currentUsageCounter{0};
|
||||
}LatteIndexCache{};
|
||||
|
||||
void LatteIndices_invalidate(const void* memPtr, uint32 size)
|
||||
{
|
||||
if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
|
||||
for(auto& entry : LatteIndexCache.entry)
|
||||
{
|
||||
LatteIndexCache.lastPtr = nullptr;
|
||||
LatteIndexCache.lastCount = 0;
|
||||
if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) )
|
||||
{
|
||||
if(entry.lastPtr != nullptr)
|
||||
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
|
||||
entry.lastPtr = nullptr;
|
||||
entry.lastCount = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void LatteIndices_invalidateAll()
|
||||
{
|
||||
LatteIndexCache.lastPtr = nullptr;
|
||||
LatteIndexCache.lastCount = 0;
|
||||
for(auto& entry : LatteIndexCache.entry)
|
||||
{
|
||||
if (entry.lastPtr != nullptr)
|
||||
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
|
||||
entry.lastPtr = nullptr;
|
||||
entry.lastCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the current value of LatteIndexCache.currentUsageCounter and then advances it by one
// (post-increment), so each call yields a larger value than the previous one.
// LatteIndices_decode stores the result in CacheEntry::lastUsed on cache hits and on insertion;
// the entry with the smallest lastUsed is the one picked for replacement.
uint64 LatteIndices_GetNextUsageIndex()
|
||||
{
|
||||
return LatteIndexCache.currentUsageCounter++;
|
||||
}
|
||||
|
||||
uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)
|
||||
|
@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn
|
|||
}
|
||||
}
|
||||
|
||||
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
|
||||
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation)
|
||||
{
|
||||
// what this should do:
|
||||
// [x] use fast SIMD-based index decoding
|
||||
|
@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
|
|||
// [ ] better cache implementation, allow to cache across frames
|
||||
|
||||
// reuse from cache if data didn't change
|
||||
if (LatteIndexCache.lastPtr == indexData &&
|
||||
LatteIndexCache.lastCount == count &&
|
||||
LatteIndexCache.lastPrimitiveMode == primitiveMode &&
|
||||
LatteIndexCache.lastIndexType == indexType)
|
||||
auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry)
|
||||
{
|
||||
indexMin = LatteIndexCache.indexMin;
|
||||
indexMax = LatteIndexCache.indexMax;
|
||||
renderIndexType = LatteIndexCache.renderIndexType;
|
||||
outputCount = LatteIndexCache.outputCount;
|
||||
indexBufferOffset = LatteIndexCache.indexBufferOffset;
|
||||
indexBufferIndex = LatteIndexCache.indexBufferIndex;
|
||||
return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType;
|
||||
});
|
||||
if (cacheEntry != LatteIndexCache.entry.end())
|
||||
{
|
||||
indexMin = cacheEntry->indexMin;
|
||||
indexMax = cacheEntry->indexMax;
|
||||
renderIndexType = cacheEntry->renderIndexType;
|
||||
outputCount = cacheEntry->outputCount;
|
||||
indexAllocation = cacheEntry->indexAllocation;
|
||||
cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex();
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
|
|||
indexMin = 0;
|
||||
indexMax = std::max(count, 1u)-1;
|
||||
renderIndexType = Renderer::INDEX_TYPE::NONE;
|
||||
indexAllocation = {};
|
||||
return; // no indices
|
||||
}
|
||||
// query index buffer from renderer
|
||||
void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
|
||||
indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize);
|
||||
void* indexOutputPtr = indexAllocation.mem;
|
||||
|
||||
// decode indices
|
||||
indexMin = std::numeric_limits<uint32>::max();
|
||||
|
@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
|
|||
// recalculate index range but filter out primitive restart index
|
||||
LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
|
||||
}
|
||||
g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize);
|
||||
g_renderer->indexData_uploadIndexMemory(indexAllocation);
|
||||
performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize;
|
||||
// get least recently used cache entry
|
||||
auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b)
|
||||
{
|
||||
return a.lastUsed < b.lastUsed;
|
||||
});
|
||||
// invalidate previous allocation
|
||||
if(lruEntry->lastPtr != nullptr)
|
||||
g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation);
|
||||
// update cache
|
||||
LatteIndexCache.lastPtr = indexData;
|
||||
LatteIndexCache.lastCount = count;
|
||||
LatteIndexCache.lastPrimitiveMode = primitiveMode;
|
||||
LatteIndexCache.lastIndexType = indexType;
|
||||
LatteIndexCache.indexMin = indexMin;
|
||||
LatteIndexCache.indexMax = indexMax;
|
||||
LatteIndexCache.renderIndexType = renderIndexType;
|
||||
LatteIndexCache.outputCount = outputCount;
|
||||
LatteIndexCache.indexBufferOffset = indexBufferOffset;
|
||||
LatteIndexCache.indexBufferIndex = indexBufferIndex;
|
||||
lruEntry->lastPtr = indexData;
|
||||
lruEntry->lastCount = count;
|
||||
lruEntry->lastPrimitiveMode = primitiveMode;
|
||||
lruEntry->lastIndexType = indexType;
|
||||
lruEntry->indexMin = indexMin;
|
||||
lruEntry->indexMax = indexMax;
|
||||
lruEntry->renderIndexType = renderIndexType;
|
||||
lruEntry->outputCount = outputCount;
|
||||
lruEntry->indexAllocation = indexAllocation;
|
||||
lruEntry->lastUsed = LatteIndices_GetNextUsageIndex();
|
||||
}
|
||||
|
|
|
@ -4,4 +4,4 @@
|
|||
|
||||
void LatteIndices_invalidate(const void* memPtr, uint32 size);
|
||||
void LatteIndices_invalidateAll();
|
||||
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex);
|
||||
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation);
|
|
@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
|
|||
ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal);
|
||||
|
||||
if (config.overlay.debug)
|
||||
{
|
||||
// general debug info
|
||||
ImGui::Text("--- Debug info ---");
|
||||
ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024);
|
||||
// backend specific info
|
||||
g_renderer->AppendOverlayDebugInfo();
|
||||
}
|
||||
|
||||
position.y += (ImGui::GetWindowSize().y + 10.0f) * direction;
|
||||
}
|
||||
|
|
|
@ -74,7 +74,6 @@ void LattePerformanceMonitor_frameEnd()
|
|||
uniformBankDataUploadedPerFrame /= 1024ULL;
|
||||
uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames);
|
||||
uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames);
|
||||
indexDataUploadPerFrame /= 1024ULL;
|
||||
|
||||
double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
|
||||
uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
|
||||
|
@ -82,7 +81,7 @@ void LattePerformanceMonitor_frameEnd()
|
|||
uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
|
||||
uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime);
|
||||
// set stats
|
||||
|
||||
performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame;
|
||||
// next counter cycle
|
||||
sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
|
||||
performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;
|
||||
|
|
|
@ -132,6 +132,12 @@ typedef struct
|
|||
LattePerfStatCounter numDrawBarriersPerFrame;
|
||||
LattePerfStatCounter numBeginRenderpassPerFrame;
|
||||
}vk;
|
||||
|
||||
// calculated stats (per frame)
|
||||
struct
|
||||
{
|
||||
uint32 indexDataUploadPerFrame;
|
||||
}stats;
|
||||
}performanceMonitor_t;
|
||||
|
||||
extern performanceMonitor_t performanceMonitor;
|
||||
|
|
|
@ -11,7 +11,6 @@
|
|||
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
|
||||
#include "Cafe/GraphicPack/GraphicPack2.h"
|
||||
#include "config/ActiveSettings.h"
|
||||
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
|
||||
#include "gui/guiWrapper.h"
|
||||
#include "Cafe/OS/libs/erreula/erreula.h"
|
||||
#include "input/InputManager.h"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue