From 13979d490f88c33c8d3bc98dff5a4bacbc93e374 Mon Sep 17 00:00:00 2001
From: Exzap <13877693+Exzap@users.noreply.github.com>
Date: Sat, 23 Nov 2024 18:25:58 +0100
Subject: [PATCH] Latte/Vulkan: Add multiple entry LRU cache support for indices
---
 src/Cafe/HW/Latte/Core/LatteIndices.cpp | 112 ++++++++-----
 src/Cafe/HW/Latte/Core/LatteIndices.h | 2 +-
 src/Cafe/HW/Latte/Core/LatteOverlay.cpp | 6 +
 .../HW/Latte/Core/LattePerformanceMonitor.cpp | 3 +-
 .../HW/Latte/Core/LattePerformanceMonitor.h | 6 +
 src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp | 1 -
 .../HW/Latte/Renderer/OpenGL/OpenGLRenderer.h | 17 +-
 src/Cafe/HW/Latte/Renderer/Renderer.h | 11 +-
 .../Renderer/Vulkan/VKRMemoryManager.cpp | 147 +++++++++++++++--
 .../Latte/Renderer/Vulkan/VKRMemoryManager.h | 154 +++++++++++++++---
 .../Latte/Renderer/Vulkan/VulkanRenderer.cpp | 6 +-
 .../HW/Latte/Renderer/Vulkan/VulkanRenderer.h | 5 +-
 .../Renderer/Vulkan/VulkanRendererCore.cpp | 29 ++--
 13 files changed, 395 insertions(+), 104 deletions(-)
diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.cpp b/src/Cafe/HW/Latte/Core/LatteIndices.cpp
index 6e1d7455..aec51725 100644
--- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp
@@ -1,6 +1,7 @@
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
+#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Common/cpu_features.h"
#if defined(ARCH_X86_64) && defined(__GNUC__)
struct
{
- const void* lastPtr;
- uint32 lastCount;
- LattePrimitiveMode lastPrimitiveMode;
- LatteIndexType lastIndexType;
- // output
- uint32 indexMin;
- uint32 indexMax;
- Renderer::INDEX_TYPE renderIndexType;
- uint32 outputCount;
- uint32 indexBufferOffset;
- uint32 indexBufferIndex;
+ struct CacheEntry
+ {
+ // input data
+ const void* lastPtr;
+ uint32 lastCount;
+ LattePrimitiveMode lastPrimitiveMode;
+ LatteIndexType lastIndexType;
+ uint64 lastUsed;
+ // output
+ uint32 indexMin;
+ uint32 indexMax;
+ Renderer::INDEX_TYPE renderIndexType;
+ uint32 outputCount;
+ Renderer::IndexAllocation indexAllocation;
+ };
+ std::array<CacheEntry, 8> entry;
+ uint64 currentUsageCounter{0};
}LatteIndexCache{};
void LatteIndices_invalidate(const void* memPtr, uint32 size)
{
- if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
+ for(auto& entry : LatteIndexCache.entry)
 {
- LatteIndexCache.lastPtr = nullptr;
- LatteIndexCache.lastCount = 0;
+ if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) )
+ {
+ if(entry.lastPtr != nullptr)
+ g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
+ entry.lastPtr = nullptr;
+ entry.lastCount = 0;
+ }
 }
}
void LatteIndices_invalidateAll()
{
- LatteIndexCache.lastPtr = nullptr;
- LatteIndexCache.lastCount = 0;
+ for(auto& entry : LatteIndexCache.entry)
+ {
+ if (entry.lastPtr != nullptr)
+ g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
+ entry.lastPtr = nullptr;
+ entry.lastCount = 0;
+ }
+}
+
+uint64 LatteIndices_GetNextUsageIndex()
+{
+ return LatteIndexCache.currentUsageCounter++;
}
uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)
@@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn
}
}
-void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE&
renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
+void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation)
{
// what this should do:
// [x] use fast SIMD-based index decoding
@@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
// [ ] better cache implementation, allow to cache across frames
// reuse from cache if data didn't change
- if (LatteIndexCache.lastPtr == indexData &&
- LatteIndexCache.lastCount == count &&
- LatteIndexCache.lastPrimitiveMode == primitiveMode &&
- LatteIndexCache.lastIndexType == indexType)
+ auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry)
 {
- indexMin = LatteIndexCache.indexMin;
- indexMax = LatteIndexCache.indexMax;
- renderIndexType = LatteIndexCache.renderIndexType;
- outputCount = LatteIndexCache.outputCount;
- indexBufferOffset = LatteIndexCache.indexBufferOffset;
- indexBufferIndex = LatteIndexCache.indexBufferIndex;
+ return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType;
+ });
+ if (cacheEntry != LatteIndexCache.entry.end())
+ {
+ indexMin = cacheEntry->indexMin;
+ indexMax = cacheEntry->indexMax;
+ renderIndexType = cacheEntry->renderIndexType;
+ outputCount = cacheEntry->outputCount;
+ indexAllocation = cacheEntry->indexAllocation;
+ cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex();
 return;
 }
@@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 indexMin = 0;
 indexMax = std::max(count, 1u)-1;
 renderIndexType = Renderer::INDEX_TYPE::NONE;
+ indexAllocation = {};
 return; // no indices
 }
 // query index buffer from renderer
- void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
+ indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize);
+ void* indexOutputPtr = indexAllocation.mem;
 // decode indices
 indexMin = std::numeric_limits<uint32>::max();
@@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 // recalculate index range but filter out primitive restart index
 LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
 }
- g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize);
+ g_renderer->indexData_uploadIndexMemory(indexAllocation);
+ performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize;
+ // get least recently used cache entry
+ auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b)
+ {
+ return a.lastUsed < b.lastUsed;
+ });
+ // invalidate previous allocation
+ if(lruEntry->lastPtr != nullptr)
+ g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation);
 // update cache
- LatteIndexCache.lastPtr = indexData;
- LatteIndexCache.lastCount = count;
- LatteIndexCache.lastPrimitiveMode = primitiveMode;
- LatteIndexCache.lastIndexType = indexType;
- LatteIndexCache.indexMin = indexMin;
- LatteIndexCache.indexMax = indexMax;
- LatteIndexCache.renderIndexType = renderIndexType;
- LatteIndexCache.outputCount = outputCount;
- LatteIndexCache.indexBufferOffset = indexBufferOffset; - LatteIndexCache.indexBufferIndex = indexBufferIndex; + lruEntry->lastPtr = indexData; + lruEntry->lastCount = count; + lruEntry->lastPrimitiveMode = primitiveMode; + lruEntry->lastIndexType = indexType; + lruEntry->indexMin = indexMin; + lruEntry->indexMax = indexMax; + lruEntry->renderIndexType = renderIndexType; + lruEntry->outputCount = outputCount; + lruEntry->indexAllocation = indexAllocation; + lruEntry->lastUsed = LatteIndices_GetNextUsageIndex(); } diff --git a/src/Cafe/HW/Latte/Core/LatteIndices.h b/src/Cafe/HW/Latte/Core/LatteIndices.h index 917d7991..8aace24e 100644 --- a/src/Cafe/HW/Latte/Core/LatteIndices.h +++ b/src/Cafe/HW/Latte/Core/LatteIndices.h @@ -4,4 +4,4 @@ void LatteIndices_invalidate(const void* memPtr, uint32 size); void LatteIndices_invalidateAll(); -void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex); \ No newline at end of file +void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation); \ No newline at end of file diff --git a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp index 238f85e8..e6edb904 100644 --- a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp +++ b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp @@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal); if (config.overlay.debug) + { + // general debug info + ImGui::Text("--- Debug info ---"); + ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024); + // backend specific info g_renderer->AppendOverlayDebugInfo(); + } position.y += (ImGui::GetWindowSize().y + 10.0f) * direction; } diff --git a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp index f2767446..14dfe9a9 100644 --- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp +++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp @@ -74,7 +74,6 @@ void LattePerformanceMonitor_frameEnd() uniformBankDataUploadedPerFrame /= 1024ULL; uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames); uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames); - indexDataUploadPerFrame /= 1024ULL; double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS; uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames; @@ -82,7 +81,7 @@ void LattePerformanceMonitor_frameEnd() uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime); uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime); // set stats - + performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame; // next counter cycle sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES; performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0; diff --git a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h index 713e094e..7252e673 100644 --- 
a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h +++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h @@ -131,6 +131,12 @@ typedef struct LattePerfStatCounter numDrawBarriersPerFrame; LattePerfStatCounter numBeginRenderpassPerFrame; }vk; + + // calculated stats (per frame) + struct + { + uint32 indexDataUploadPerFrame; + }stats; }performanceMonitor_t; extern performanceMonitor_t performanceMonitor; diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index 3bb6c7e3..2efef5bf 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -11,7 +11,6 @@ #include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h" #include "Cafe/GraphicPack/GraphicPack2.h" #include "config/ActiveSettings.h" -#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "gui/guiWrapper.h" #include "Cafe/OS/libs/erreula/erreula.h" #include "input/InputManager.h" diff --git a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h index 313ea3c0..e29e9d4c 100644 --- a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h @@ -102,16 +102,21 @@ public: static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor); static void SetArrayElementBuffer(GLuint arrayElementBuffer); - // index - void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override + // index (not used by OpenGL renderer yet) + IndexAllocation indexData_reserveIndexMemory(uint32 size) override { - assert_dbg(); - return nullptr; + cemu_assert_unimplemented(); + return {}; } - void indexData_uploadIndexMemory(uint32 offset, uint32 size) override + void indexData_releaseIndexMemory(IndexAllocation& allocation) override { - assert_dbg(); + cemu_assert_unimplemented(); + } + + void indexData_uploadIndexMemory(IndexAllocation& allocation) override + { + cemu_assert_unimplemented(); } // uniform diff --git a/src/Cafe/HW/Latte/Renderer/Renderer.h b/src/Cafe/HW/Latte/Renderer/Renderer.h index 0b694bb9..77d588b9 100644 --- a/src/Cafe/HW/Latte/Renderer/Renderer.h +++ b/src/Cafe/HW/Latte/Renderer/Renderer.h @@ -138,8 +138,15 @@ public: virtual void draw_endSequence() = 0; // index - virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0; - virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0; + struct IndexAllocation + { + void* mem; // pointer to index data inside buffer + void* rendererInternal; // for renderer use + }; + + virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0; + virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0; + virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0; // occlusion queries virtual LatteQueryObject* occlusionQuery_create() = 0; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp index c4f47a2b..33af3651 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp @@ -23,11 +23,11 @@ void VKRSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeReq AllocatorBuffer_t newBuffer{}; newBuffer.writeIndex = 0; newBuffer.basePtr = nullptr; - if (m_bufferType == BUFFER_TYPE::STAGING) + if (m_bufferType == VKR_BUFFER_TYPE::STAGING) m_vkrMemMgr->CreateBuffer(bufferAllocSize, 
VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, newBuffer.vk_buffer, newBuffer.vk_mem); - else if (m_bufferType == BUFFER_TYPE::INDEX) + else if (m_bufferType == VKR_BUFFER_TYPE::INDEX) m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem); - else if (m_bufferType == BUFFER_TYPE::STRIDE) + else if (m_bufferType == VKR_BUFFER_TYPE::STRIDE) m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem); else cemu_assert_debug(false); @@ -53,7 +53,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato uint32 distanceToSyncPoint; if (!itr.queue_syncPoints.empty()) { - if(itr.queue_syncPoints.front().offset < itr.writeIndex) + if (itr.queue_syncPoints.front().offset < itr.writeIndex) distanceToSyncPoint = 0xFFFFFFFF; else distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex; @@ -100,7 +100,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato void VKRSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation) { - cemu_assert_debug(m_bufferType == BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent + cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent // todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant) VkMappedMemoryRange flushedRange{}; flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; @@ -167,6 +167,70 @@ void VKRSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBuf } } +/* VKRSynchronizedHeapAllocator */ + +VKRSynchronizedHeapAllocator::VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize) + : m_vkrMemMgr(vkMemoryManager), m_chunkedHeap(bufferType, minimumBufferAllocSize) {}; + +VKRSynchronizedHeapAllocator::AllocatorReservation* VKRSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) +{ + CHAddr addr = m_chunkedHeap.alloc(size, alignment); + m_activeAllocations.emplace_back(addr); + AllocatorReservation* res = m_poolAllocatorReservation.allocObj(); + res->bufferIndex = addr.chunkIndex; + res->bufferOffset = addr.offset; + res->size = size; + res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset; + m_chunkedHeap.GetChunkVkMemInfo(addr.chunkIndex, res->vkBuffer, res->vkMem); + return res; +} + +void VKRSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation) +{ + // put the allocation on a delayed release queue for the current command buffer + uint64 currentCommandBufferId = VulkanRenderer::GetInstance()->GetCurrentCommandBufferId(); + auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset; }); + cemu_assert_debug(it != m_activeAllocations.end()); + m_releaseQueue[currentCommandBufferId].emplace_back(it->allocation); + m_activeAllocations.erase(it); + m_poolAllocatorReservation.freeObj(uploadReservation); +} + +void VKRSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation) +{ + if 
(m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex)) + { + VkMappedMemoryRange flushedRange{}; + flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + flushedRange.memory = uploadReservation->vkMem; + flushedRange.offset = uploadReservation->bufferOffset; + flushedRange.size = uploadReservation->size; + vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &flushedRange); + } +} + +void VKRSynchronizedHeapAllocator::CleanupBuffer(uint64 latestFinishedCommandBufferId) +{ + auto it = m_releaseQueue.begin(); + while (it != m_releaseQueue.end()) + { + if (it->first <= latestFinishedCommandBufferId) + { + // release allocations + for(auto& addr : it->second) + m_chunkedHeap.free(addr); + it = m_releaseQueue.erase(it); + continue; + } + it++; + } +} + +void VKRSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const +{ + m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize); +} + /* VkTextureChunkedHeap */ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) @@ -175,7 +239,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA m_list_chunkInfo.resize(m_list_chunkInfo.size() + 1); // pad minimumAllocationSize to 32KB alignment - minimumAllocationSize = (minimumAllocationSize + (32*1024-1)) & ~(32 * 1024 - 1); + minimumAllocationSize = (minimumAllocationSize + (32 * 1024 - 1)) & ~(32 * 1024 - 1); uint32 allocationSize = 1024 * 1024 * 128; if (chunkIndex == 0) @@ -189,8 +253,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA std::vector deviceLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); std::vector hostLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, 0); // remove device local memory types from host local vector - auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) ->bool - { + auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) -> bool { return std::find(deviceLocalMemoryTypeIndices.begin(), deviceLocalMemoryTypeIndices.end(), v) != deviceLocalMemoryTypeIndices.end(); }; hostLocalMemoryTypeIndices.erase(std::remove_if(hostLocalMemoryTypeIndices.begin(), hostLocalMemoryTypeIndices.end(), pred), hostLocalMemoryTypeIndices.end()); @@ -206,7 +269,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA allocInfo.memoryTypeIndex = memType; VkDeviceMemory imageMemory; - VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory); + VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory); if (r != VK_SUCCESS) continue; m_list_chunkInfo[chunkIndex].mem = imageMemory; @@ -221,7 +284,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA allocInfo.memoryTypeIndex = memType; VkDeviceMemory imageMemory; - VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory); + VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory); if (r != VK_SUCCESS) continue; m_list_chunkInfo[chunkIndex].mem = imageMemory; @@ -238,6 +301,66 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA return 0; } +/* VkBufferChunkedHeap */ + +VKRBuffer* VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties) +{ + auto* memMgr = 
VulkanRenderer::GetInstance()->GetMemoryManager(); + VkBuffer buffer; + VkDeviceMemory bufferMemory; + bool allocSuccess; + if (bufferType == VKR_BUFFER_TYPE::STAGING) + allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, properties, buffer, bufferMemory); + else if (bufferType == VKR_BUFFER_TYPE::INDEX) + allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, properties, buffer, bufferMemory); + else if (bufferType == VKR_BUFFER_TYPE::STRIDE) + allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, properties, buffer, bufferMemory); + else + cemu_assert_debug(false); + if (!allocSuccess) + return nullptr; + + VKRBuffer* bufferObj = new VKRBuffer(buffer, bufferMemory); + // if host visible, then map buffer + void* data = nullptr; + if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) + { + vkMapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), bufferMemory, 0, bufferSize, 0, &data); + bufferObj->m_requiresFlush = !HAS_FLAG(properties, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + } + bufferObj->m_mappedMemory = (uint8*)data; + return bufferObj; +} + +VKRBuffer::~VKRBuffer() +{ + if(m_mappedMemory) + vkUnmapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory); + vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr); + vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr); +} + +VkBufferChunkedHeap::~VkBufferChunkedHeap() +{ + for (auto& chunk : m_chunkBuffers) + delete chunk; +} + +uint32 VkBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) +{ + size_t allocationSize = std::max(m_minimumBufferAllocationSize, minimumAllocationSize); + VKRBuffer* buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + if(!buffer) + buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + if(!buffer) + VulkanRenderer::GetInstance()->UnrecoverableError("Failed to allocate buffer memory for VkBufferChunkedHeap"); + cemu_assert_debug(buffer); + cemu_assert_debug(m_chunkBuffers.size() == chunkIndex); + m_chunkBuffers.emplace_back(buffer); + // todo - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT might be worth it? + return allocationSize; +} + uint32_t VKRMemoryManager::FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) const { VkPhysicalDeviceMemoryProperties memProperties; @@ -423,7 +546,7 @@ bool VKRMemoryManager::CreateBufferFromHostMemory(void* hostPointer, VkDeviceSiz importHostMem.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT; importHostMem.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT; importHostMem.pHostPointer = hostPointer; - // VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or + // VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or // VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT // whats the difference ? 
@@ -469,7 +592,7 @@ VkImageMemAllocation* VKRMemoryManager::imageMemoryAllocate(VkImage image)
 auto it = map_textureHeap.find(typeFilter);
 if (it == map_textureHeap.end())
 {
- texHeap = new VkTextureChunkedHeap(this, typeFilter, m_vkr->GetLogicalDevice());
+ texHeap = new VkTextureChunkedHeap(this, typeFilter);
 map_textureHeap.emplace(typeFilter, texHeap);
 }
 else
diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
index bf2d919b..ecf53996 100644
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
@@ -2,6 +2,36 @@
 #include "Cafe/HW/Latte/Renderer/Renderer.h"
 #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
 #include "util/ChunkedHeap/ChunkedHeap.h"
+#include "util/helpers/MemoryPool.h"
+
+enum class VKR_BUFFER_TYPE
+{
+ STAGING, // staging upload buffer
+ INDEX, // buffer for index data
+ STRIDE, // buffer for stride-adjusted vertex data
+};
+
+class VKRBuffer
+{
+ public:
+ static VKRBuffer* Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties);
+ ~VKRBuffer();
+
+ VkBuffer GetVkBuffer() const { return m_buffer; }
+ VkDeviceMemory GetVkBufferMemory() const { return m_bufferMemory; }
+
+ uint8* GetPtr() const { return m_mappedMemory; }
+
+ bool RequiresFlush() const { return m_requiresFlush; }
+
+ private:
+ VKRBuffer(VkBuffer buffer, VkDeviceMemory bufferMem) : m_buffer(buffer), m_bufferMemory(bufferMem) { };
+
+ VkBuffer m_buffer;
+ VkDeviceMemory m_bufferMemory;
+ uint8* m_mappedMemory;
+ bool m_requiresFlush{false};
+};
 struct VkImageMemAllocation
 {
@@ -17,15 +47,13 @@ struct VkImageMemAllocation
 class VkTextureChunkedHeap : private ChunkedHeap
 {
 public:
- VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter, VkDevice device) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter), m_device(device) { };
+ VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { };
 struct ChunkInfo
 {
 VkDeviceMemory mem;
 };
- uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
-
 CHAddr allocMem(uint32 size, uint32 alignment)
 {
 if (alignment < 4)
@@ -43,11 +71,6 @@ public:
 this->free(addr);
 }
- void setDevice(VkDevice dev)
- {
- m_device = dev;
- }
-
 VkDeviceMemory getChunkMem(uint32 index)
 {
 if (index >= m_list_chunkInfo.size())
@@ -61,24 +84,69 @@ public:
 allocatedBytes = numAllocatedBytes;
 }
- VkDevice m_device;
+ private:
+ uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
+
 uint32 m_typeFilter{ 0xFFFFFFFF };
 class VKRMemoryManager* m_vkrMemoryManager;
 std::vector<ChunkInfo> m_list_chunkInfo;
 };
+class VkBufferChunkedHeap : private ChunkedHeap
+{
+ public:
+ VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { };
+ ~VkBufferChunkedHeap();
+
+ using ChunkedHeap::alloc;
+ using ChunkedHeap::free;
+
+ uint8* GetChunkPtr(uint32 index) const
+ {
+ if (index >= m_chunkBuffers.size())
+ return nullptr;
+ return m_chunkBuffers[index]->GetPtr();
+ }
+
+ void GetChunkVkMemInfo(uint32 index, VkBuffer& buffer, VkDeviceMemory& mem)
+ {
+ if (index >= m_chunkBuffers.size())
+ {
+ buffer = VK_NULL_HANDLE;
+ mem = VK_NULL_HANDLE;
+ return;
+ }
+ buffer = m_chunkBuffers[index]->GetVkBuffer();
+ mem = m_chunkBuffers[index]->GetVkBufferMemory();
+ }
+
+ void
GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
+ {
+ numBuffers = m_chunkBuffers.size();
+ totalBufferSize = numHeapBytes;
+ freeBufferSize = numHeapBytes - numAllocatedBytes;
+ }
+
+ bool RequiresFlush(uint32 index) const
+ {
+ if (index >= m_chunkBuffers.size())
+ return false;
+ return m_chunkBuffers[index]->RequiresFlush();
+ }
+
+ private:
+ uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
+
+ VKR_BUFFER_TYPE m_bufferType;
+ std::vector<VKRBuffer*> m_chunkBuffers;
+ size_t m_minimumBufferAllocationSize;
+};
+
 // a circular ring-buffer which tracks and releases memory per command-buffer
 class VKRSynchronizedRingAllocator
 {
 public:
- enum class BUFFER_TYPE
- {
- STAGING, // staging upload buffer
- INDEX, // buffer for index data
- STRIDE, // buffer for stride-adjusted vertex data
- };
-
- VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
+ VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
 VKRSynchronizedRingAllocator(const VKRSynchronizedRingAllocator&) = delete; // disallow copy
 struct BufferSyncPoint_t
@@ -126,13 +194,53 @@ private:
 const class VulkanRenderer* m_vkr;
 const class VKRMemoryManager* m_vkrMemMgr;
- const BUFFER_TYPE m_bufferType;
+ const VKR_BUFFER_TYPE m_bufferType;
 const uint32 m_minimumBufferAllocSize;
 std::vector<AllocatorBuffer_t> m_buffers;
 };
+// heap style allocator with released memory being freed after the current command buffer finishes
+class VKRSynchronizedHeapAllocator
+{
+ struct TrackedAllocation
+ {
+ TrackedAllocation(CHAddr allocation) : allocation(allocation) {};
+ CHAddr allocation;
+ };
+
+ public:
+ VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize);
+ VKRSynchronizedHeapAllocator(const VKRSynchronizedHeapAllocator&) = delete; // disallow copy
+
+ struct AllocatorReservation
+ {
+ VkBuffer vkBuffer;
+ VkDeviceMemory vkMem;
+ uint8* memPtr;
+ uint32 bufferOffset;
+ uint32 size;
+ uint32 bufferIndex;
+ };
+
+ AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment);
+ void FreeReservation(AllocatorReservation* uploadReservation);
+ void FlushReservation(AllocatorReservation* uploadReservation);
+
+ void CleanupBuffer(uint64 latestFinishedCommandBufferId);
+
+ void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const;
+ private:
+ const class VKRMemoryManager* m_vkrMemMgr;
+ VkBufferChunkedHeap m_chunkedHeap;
+ // allocations
+ std::vector<TrackedAllocation> m_activeAllocations;
+ MemoryPool<AllocatorReservation> m_poolAllocatorReservation{32};
+ // release queue
+ std::unordered_map<uint64, std::vector<CHAddr>> m_releaseQueue;
+};
+
 void LatteIndices_invalidateAll();
 class VKRMemoryManager
 {
 friend class VKRSynchronizedRingAllocator;
 public:
 VKRMemoryManager(class VulkanRenderer* renderer) :
- m_stagingBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
- m_indexBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
- m_vertexStrideMetalBuffer(renderer, this,
VKRSynchronizedRingAllocator::BUFFER_TYPE::STRIDE, 4u * 1024 * 1024) + m_stagingBuffer(renderer, this, VKR_BUFFER_TYPE::STAGING, 32u * 1024 * 1024), + m_indexBuffer(this, VKR_BUFFER_TYPE::INDEX, 4u * 1024 * 1024), + m_vertexStrideMetalBuffer(renderer, this, VKR_BUFFER_TYPE::STRIDE, 4u * 1024 * 1024) { m_vkr = renderer; } @@ -167,7 +275,7 @@ public: } VKRSynchronizedRingAllocator& getStagingAllocator() { return m_stagingBuffer; }; // allocator for texture/attribute/uniform uploads - VKRSynchronizedRingAllocator& getIndexAllocator() { return m_indexBuffer; }; // allocator for index data + VKRSynchronizedHeapAllocator& GetIndexAllocator() { return m_indexBuffer; }; // allocator for index data VKRSynchronizedRingAllocator& getMetalStrideWorkaroundAllocator() { return m_vertexStrideMetalBuffer; }; // allocator for stride-adjusted vertex data void cleanupBuffers(uint64 latestFinishedCommandBufferId) @@ -202,6 +310,6 @@ public: private: class VulkanRenderer* m_vkr; VKRSynchronizedRingAllocator m_stagingBuffer; - VKRSynchronizedRingAllocator m_indexBuffer; + VKRSynchronizedHeapAllocator m_indexBuffer; VKRSynchronizedRingAllocator m_vertexStrideMetalBuffer; }; diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 37432eeb..20163987 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -3699,7 +3699,7 @@ void VulkanRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uin void VulkanRenderer::AppendOverlayDebugInfo() { - ImGui::Text("--- Vulkan info ---"); + ImGui::Text("--- Vulkan debug info ---"); ImGui::Text("GfxPipelines %u", performanceMonitor.vk.numGraphicPipelines.get()); ImGui::Text("DescriptorSets %u", performanceMonitor.vk.numDescriptorSets.get()); ImGui::Text("DS ImgSamplers %u", performanceMonitor.vk.numDescriptorSamplerTextures.get()); @@ -3716,7 +3716,7 @@ void VulkanRenderer::AppendOverlayDebugInfo() ImGui::Text("BeginRP/f %u", performanceMonitor.vk.numBeginRenderpassPerFrame.get()); ImGui::Text("Barriers/f %u", performanceMonitor.vk.numDrawBarriersPerFrame.get()); - ImGui::Text("--- Cache info ---"); + ImGui::Text("--- Cache debug info ---"); uint32 bufferCacheHeapSize = 0; uint32 bufferCacheAllocationSize = 0; @@ -3736,7 +3736,7 @@ void VulkanRenderer::AppendOverlayDebugInfo() ImGui::SameLine(60.0f); ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); - memoryManager->getIndexAllocator().GetStats(numBuffers, totalSize, freeSize); + memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize); ImGui::Text("Index"); ImGui::SameLine(60.0f); ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers); diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index 52c1c6ed..5ef4558d 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -328,8 +328,9 @@ public: RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) override; - void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; - void indexData_uploadIndexMemory(uint32 offset, uint32 size) override; + 
IndexAllocation indexData_reserveIndexMemory(uint32 size) override; + void indexData_releaseIndexMemory(IndexAllocation& allocation) override; + void indexData_uploadIndexMemory(IndexAllocation& allocation) override; // externally callable void GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, sint32 width, sint32 height, FormatInfoVK* formatInfoOut); diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp index 3a684072..a72b093b 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp @@ -357,18 +357,20 @@ PipelineInfo* VulkanRenderer::draw_getOrCreateGraphicsPipeline(uint32 indexCount return draw_createGraphicsPipeline(indexCount); } -void* VulkanRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) +Renderer::IndexAllocation VulkanRenderer::indexData_reserveIndexMemory(uint32 size) { - auto& indexAllocator = this->memoryManager->getIndexAllocator(); - auto resv = indexAllocator.AllocateBufferMemory(size, 32); - offset = resv.bufferOffset; - bufferIndex = resv.bufferIndex; - return resv.memPtr; + VKRSynchronizedHeapAllocator::AllocatorReservation* resv = memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 32); + return { resv->memPtr, resv }; } -void VulkanRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size) +void VulkanRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation) { - // does nothing since the index buffer memory is coherent + memoryManager->GetIndexAllocator().FreeReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal); +} + +void VulkanRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation) +{ + memoryManager->GetIndexAllocator().FlushReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal); } float s_vkUniformData[512 * 4]; @@ -1415,14 +1417,15 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 hostIndexCount; uint32 indexMin = 0; uint32 indexMax = 0; - uint32 indexBufferOffset = 0; - uint32 indexBufferIndex = 0; - LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); - + Renderer::IndexAllocation indexAllocation; + LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation); + VKRSynchronizedHeapAllocator::AllocatorReservation* indexReservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)indexAllocation.rendererInternal; // update index binding bool isPrevIndexData = false; if (hostIndexType != INDEX_TYPE::NONE) { + uint32 indexBufferIndex = indexReservation->bufferIndex; + uint32 indexBufferOffset = indexReservation->bufferOffset; if (m_state.activeIndexBufferOffset != indexBufferOffset || m_state.activeIndexBufferIndex != indexBufferIndex || m_state.activeIndexType != hostIndexType) { m_state.activeIndexType = hostIndexType; @@ -1435,7 +1438,7 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 vkType = VK_INDEX_TYPE_UINT32; else cemu_assert(false); - vkCmdBindIndexBuffer(m_state.currentCommandBuffer, memoryManager->getIndexAllocator().GetBufferByIndex(indexBufferIndex), indexBufferOffset, vkType); + 
vkCmdBindIndexBuffer(m_state.currentCommandBuffer, indexReservation->vkBuffer, indexBufferOffset, vkType); } else isPrevIndexData = true;
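Note (illustrative, not part of the patch): the sketch below shows the lookup/evict pattern that the new multi-entry index cache in LatteIndices_decode follows. Entries are matched on the full input key, a hit bumps a monotonically increasing usage counter, and a miss recycles the least recently used slot (the real code first releases that slot's allocation via indexData_releaseIndexMemory). The type names, the simplified key and the slot count here are placeholders, not values taken verbatim from the patch.

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>

// Simplified stand-in for the cache entry in LatteIndices.cpp; the real entry
// also stores primitive mode, index type, min/max and the renderer allocation.
struct CacheEntry
{
	const void* ptr = nullptr; // key: pointer to the source index data
	uint32_t count = 0;        // key: index count
	uint64_t lastUsed = 0;     // recency value used for LRU selection
};

struct IndexCacheSketch
{
	std::array<CacheEntry, 8> entry{}; // fixed slot count (placeholder value)
	uint64_t usageCounter = 0;

	// Returns the matching slot on a hit, otherwise recycles the LRU slot.
	CacheEntry& lookupOrEvict(const void* ptr, uint32_t count, bool& hit)
	{
		auto it = std::find_if(entry.begin(), entry.end(),
			[&](const CacheEntry& e) { return e.ptr == ptr && e.count == count; });
		if (it != entry.end())
		{
			hit = true;
			it->lastUsed = usageCounter++; // refresh recency on a hit
			return *it;
		}
		hit = false;
		// Miss: pick the least recently used slot. The real decode path would
		// release the old entry's index allocation before reusing the slot.
		auto lru = std::min_element(entry.begin(), entry.end(),
			[](const CacheEntry& a, const CacheEntry& b) { return a.lastUsed < b.lastUsed; });
		lru->ptr = ptr;
		lru->count = count;
		lru->lastUsed = usageCounter++;
		return *lru;
	}
};

int main()
{
	IndexCacheSketch cache;
	int a = 0, b = 0;
	bool hit = false;
	cache.lookupOrEvict(&a, 100, hit); // miss, fills a slot
	std::printf("hit=%d\n", hit);
	cache.lookupOrEvict(&a, 100, hit); // hit, recency refreshed
	std::printf("hit=%d\n", hit);
	cache.lookupOrEvict(&b, 200, hit); // miss, evicts the least recently used slot
	std::printf("hit=%d\n", hit);
	return 0;
}

This is also why LatteIndices_invalidate and LatteIndices_invalidateAll now walk every slot: each cached entry owns a live Renderer::IndexAllocation that must be handed back to the renderer before the slot is dropped or overwritten.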