Merge pull request #19 from SamoZ256/metal-allocator-rework

Allocator rework
SamoZ256 2025-01-18 19:18:24 +01:00 committed by GitHub
commit f0cf61461c
9 changed files with 445 additions and 450 deletions


@ -555,6 +555,7 @@ if(ENABLE_METAL)
HW/Latte/Renderer/Metal/CachedFBOMtl.h
HW/Latte/Renderer/Metal/MetalAttachmentsInfo.cpp
HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h
HW/Latte/Renderer/Metal/MetalBufferAllocator.cpp
HW/Latte/Renderer/Metal/MetalBufferAllocator.h
HW/Latte/Renderer/Metal/MetalMemoryManager.cpp
HW/Latte/Renderer/Metal/MetalMemoryManager.h


@ -0,0 +1,217 @@
#include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h"
MetalBufferChunkedHeap::~MetalBufferChunkedHeap()
{
for (auto& chunk : m_chunkBuffers)
chunk->release();
}
uint32 MetalBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
{
size_t allocationSize = std::max<size_t>(m_minimumBufferAllocationSize, minimumAllocationSize);
MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options);
cemu_assert_debug(buffer);
cemu_assert_debug(m_chunkBuffers.size() == chunkIndex);
m_chunkBuffers.emplace_back(buffer);
return allocationSize;
}
void MetalSynchronizedRingAllocator::addUploadBufferSyncPoint(AllocatorBuffer_t& buffer, uint32 offset)
{
auto commandBuffer = m_mtlr->GetCurrentCommandBuffer();
if (commandBuffer == buffer.lastSyncpointCommandBuffer)
return;
buffer.lastSyncpointCommandBuffer = commandBuffer;
buffer.queue_syncPoints.emplace(commandBuffer, offset);
}
void MetalSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeRequiredForAlloc)
{
// calculate buffer size, should be a multiple of bufferAllocSize that is at least as large as sizeRequiredForAlloc
uint32 bufferAllocSize = m_minimumBufferAllocSize;
while (bufferAllocSize < sizeRequiredForAlloc)
bufferAllocSize += m_minimumBufferAllocSize;
AllocatorBuffer_t newBuffer{};
newBuffer.writeIndex = 0;
newBuffer.basePtr = nullptr;
newBuffer.mtlBuffer = m_mtlr->GetDevice()->newBuffer(bufferAllocSize, m_options);
newBuffer.basePtr = (uint8*)newBuffer.mtlBuffer->contents();
newBuffer.size = bufferAllocSize;
newBuffer.index = (uint32)m_buffers.size();
m_buffers.push_back(newBuffer);
}
MetalSynchronizedRingAllocator::AllocatorReservation_t MetalSynchronizedRingAllocator::AllocateBufferMemory(uint32 size, uint32 alignment)
{
if (alignment < 128)
alignment = 128;
size = (size + 127) & ~127;
for (auto& itr : m_buffers)
{
// align pointer
uint32 alignmentPadding = (alignment - (itr.writeIndex % alignment)) % alignment;
uint32 distanceToSyncPoint;
if (!itr.queue_syncPoints.empty())
{
if (itr.queue_syncPoints.front().offset < itr.writeIndex)
distanceToSyncPoint = 0xFFFFFFFF;
else
distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex;
}
else
distanceToSyncPoint = 0xFFFFFFFF;
uint32 spaceNeeded = alignmentPadding + size;
if (spaceNeeded > distanceToSyncPoint)
continue; // not enough space in current buffer
if ((itr.writeIndex + spaceNeeded) > itr.size)
{
// wrap-around
spaceNeeded = size;
alignmentPadding = 0;
// check if there is enough space in current buffer after wrap-around
if (!itr.queue_syncPoints.empty())
{
distanceToSyncPoint = itr.queue_syncPoints.front().offset - 0;
if (spaceNeeded > distanceToSyncPoint)
continue;
}
else if (spaceNeeded > itr.size)
continue;
itr.writeIndex = 0;
}
addUploadBufferSyncPoint(itr, itr.writeIndex);
itr.writeIndex += alignmentPadding;
uint32 offset = itr.writeIndex;
itr.writeIndex += size;
itr.cleanupCounter = 0;
MetalSynchronizedRingAllocator::AllocatorReservation_t res;
res.mtlBuffer = itr.mtlBuffer;
res.memPtr = itr.basePtr + offset;
res.bufferOffset = offset;
res.size = size;
res.bufferIndex = itr.index;
return res;
}
// allocate new buffer
allocateAdditionalUploadBuffer(size);
return AllocateBufferMemory(size, alignment);
}
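// Worked example with illustrative numbers (not from the commit): take a 4 MiB buffer
// with writeIndex at 3.9 MiB and the oldest pending sync point at offset 1 MiB. A 256 KiB
// request no longer fits before the end of the buffer, so the allocator wraps: the
// alignment padding is dropped, writeIndex is reset to 0, and the request is accepted only
// because 256 KiB fits below the 1 MiB sync point. If no existing buffer can satisfy a
// request, allocateAdditionalUploadBuffer() creates a new (possibly larger) buffer and the
// allocation is retried recursively.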
void MetalSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation)
{
if (RequiresFlush())
{
uploadReservation.mtlBuffer->didModifyRange(NS::Range(uploadReservation.bufferOffset, uploadReservation.size));
}
}
void MetalSynchronizedRingAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer)
{
for (auto& itr : m_buffers)
{
while (!itr.queue_syncPoints.empty() && latestFinishedCommandBuffer == itr.queue_syncPoints.front().commandBuffer)
{
itr.queue_syncPoints.pop();
}
if (itr.queue_syncPoints.empty())
itr.cleanupCounter++;
}
// check if last buffer is available for deletion
if (m_buffers.size() >= 2)
{
auto& lastBuffer = m_buffers.back();
if (lastBuffer.cleanupCounter >= 1000)
{
// release buffer
lastBuffer.mtlBuffer->release();
m_buffers.pop_back();
}
}
}
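// Note on the shrink heuristic above: only the most recently allocated buffer is ever
// released, and only after it has had no pending sync points for 1000 consecutive
// CleanupBuffer() calls (cleanupCounter is reset whenever the buffer is allocated from
// again), so the pool shrinks slowly and never drops below one buffer.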
MTL::Buffer* MetalSynchronizedRingAllocator::GetBufferByIndex(uint32 index) const
{
return m_buffers[index].mtlBuffer;
}
void MetalSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
{
numBuffers = (uint32)m_buffers.size();
totalBufferSize = 0;
freeBufferSize = 0;
for (auto& itr : m_buffers)
{
totalBufferSize += itr.size;
// calculate free space in buffer
uint32 distanceToSyncPoint;
if (!itr.queue_syncPoints.empty())
{
if (itr.queue_syncPoints.front().offset < itr.writeIndex)
distanceToSyncPoint = (itr.size - itr.writeIndex) + itr.queue_syncPoints.front().offset; // size with wrap-around
else
distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex;
}
else
distanceToSyncPoint = itr.size;
freeBufferSize += distanceToSyncPoint;
}
}
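// The reported free size is an estimate: per buffer it is the distance from the write
// cursor to the oldest pending sync point (counting the wrap-around), or the whole buffer
// when no sync points are pending.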
/* MetalSynchronizedHeapAllocator */
MetalSynchronizedHeapAllocator::AllocatorReservation* MetalSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment)
{
CHAddr addr = m_chunkedHeap.alloc(size, alignment);
m_activeAllocations.emplace_back(addr);
AllocatorReservation* res = m_poolAllocatorReservation.allocObj();
res->bufferIndex = addr.chunkIndex;
res->bufferOffset = addr.offset;
res->size = size;
res->mtlBuffer = m_chunkedHeap.GetBufferByIndex(addr.chunkIndex);
res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset;
return res;
}
void MetalSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation)
{
// put the allocation on a delayed release queue for the current command buffer
MTL::CommandBuffer* currentCommandBuffer = m_mtlr->GetCurrentCommandBuffer();
auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset; });
cemu_assert_debug(it != m_activeAllocations.end());
m_releaseQueue[currentCommandBuffer].emplace_back(it->allocation);
m_activeAllocations.erase(it);
m_poolAllocatorReservation.freeObj(uploadReservation);
}
void MetalSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation)
{
if (m_chunkedHeap.RequiresFlush())
{
uploadReservation->mtlBuffer->didModifyRange(NS::Range(uploadReservation->bufferOffset, uploadReservation->size));
}
}
void MetalSynchronizedHeapAllocator::CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer)
{
auto it = m_releaseQueue.find(latestFinishedCommandBuffer);
if (it == m_releaseQueue.end())
return;
// release allocations
for (auto& addr : it->second)
m_chunkedHeap.free(addr);
m_releaseQueue.erase(it);
}
void MetalSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
{
m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize);
}
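// Illustrative usage sketch, not part of this commit: the intended call pattern for
// MetalSynchronizedRingAllocator as used later in this diff for staging and uniform data.
// UploadBytes is a hypothetical helper, the vertex-buffer binding is only an example,
// and the usual Cemu/metal-cpp headers are assumed.
void UploadBytes(MetalSynchronizedRingAllocator& staging, MTL::RenderCommandEncoder* encoder, const void* srcData, uint32 srcSize, uint32 bindIndex)
{
// Reserve space; offsets are aligned to at least 128 bytes internally
auto reservation = staging.AllocateBufferMemory(srcSize, 1);
// Fill the CPU-visible pointer, then flush (a no-op unless the storage mode is managed)
memcpy(reservation.memPtr, srcData, srcSize);
staging.FlushReservation(reservation);
// Bind the backing MTL::Buffer at the reserved offset
encoder->setVertexBuffer(reservation.mtlBuffer, reservation.bufferOffset, bindIndex);
}
// Once a command buffer is known to have finished executing, the renderer calls
// staging.CleanupBuffer(finishedCommandBuffer) so that ring space behind its recorded
// sync points becomes reusable.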


@ -1,354 +1,163 @@
#pragma once
#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h"
#include "Common/precompiled.h"
#include "Metal/MTLResource.hpp"
#include "util/ChunkedHeap/ChunkedHeap.h"
#include "util/helpers/MemoryPool.h"
#include <utility>
// --- removed in this PR: old free-range allocators (MetalBufferAllocator, MetalTemporaryBufferAllocator) ---
struct MetalBufferRange
{
size_t offset;
size_t size;
};
constexpr size_t BASE_ALLOCATION_SIZE = 8 * 1024 * 1024; // 8 MB
constexpr size_t MAX_ALLOCATION_SIZE = 64 * 1024 * 1024; // 64 MB
template<typename BufferT>
class MetalBufferAllocator
{
public:
struct Buffer
{
MTL::Buffer* m_buffer;
std::vector<MetalBufferRange> m_freeRanges;
BufferT m_data;
};
MetalBufferAllocator(class MetalRenderer* metalRenderer, MTL::ResourceOptions storageMode) : m_mtlr{metalRenderer} {
m_isCPUAccessible = (storageMode == MTL::ResourceStorageModeShared) || (storageMode == MTL::ResourceStorageModeManaged);
m_options = storageMode;
if (m_isCPUAccessible)
m_options |= MTL::ResourceCPUCacheModeWriteCombined;
}
~MetalBufferAllocator()
{
for (auto buffer : m_buffers)
{
buffer.m_buffer->release();
}
}
void ResetAllocations()
{
for (uint32 i = 0; i < m_buffers.size(); i++)
FreeBuffer(i);
}
MTL::Buffer* GetBuffer(uint32 bufferIndex)
{
return m_buffers[bufferIndex].m_buffer;
}
MetalBufferAllocation GetBufferAllocation(size_t size)
{
// Align the size
size = Align(size, 128);
// First, try to find a free range
for (uint32 i = 0; i < m_buffers.size(); i++)
{
auto& buffer = m_buffers[i];
for (uint32 j = 0; j < buffer.m_freeRanges.size(); j++)
{
auto& range = buffer.m_freeRanges[j];
if (size <= range.size)
{
MetalBufferAllocation allocation;
allocation.bufferIndex = i;
allocation.offset = range.offset;
allocation.size = size;
allocation.data = (m_isCPUAccessible ? (uint8*)buffer.m_buffer->contents() + range.offset : nullptr);
range.offset += size;
range.size -= size;
if (range.size == 0)
{
buffer.m_freeRanges.erase(buffer.m_freeRanges.begin() + j);
}
return allocation;
}
}
}
// If no free range was found, allocate a new buffer
size_t allocationSize = BASE_ALLOCATION_SIZE * (1u << m_buffers.size());
allocationSize = std::min(allocationSize, MAX_ALLOCATION_SIZE); // Limit the allocation size
allocationSize = std::max(allocationSize, size);
MTL::Buffer* mtlBuffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options);
#ifdef CEMU_DEBUG_ASSERT
mtlBuffer->setLabel(GetLabel("Buffer from buffer allocator", mtlBuffer));
#endif
MetalBufferAllocation allocation;
allocation.bufferIndex = m_buffers.size();
allocation.offset = 0;
allocation.size = size;
allocation.data = (m_isCPUAccessible ? mtlBuffer->contents() : nullptr);
m_buffers.push_back({mtlBuffer});
auto& buffer = m_buffers.back();
// If the buffer is larger than the requested size, add the remaining space to the free buffer ranges
if (size < allocationSize)
{
MetalBufferRange range;
range.offset = size;
range.size = allocationSize - size;
buffer.m_freeRanges.push_back(range);
}
// Debug
m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory += allocationSize;
return allocation;
}
void FreeAllocation(MetalBufferAllocation& allocation)
{
MetalBufferRange range;
range.offset = allocation.offset;
range.size = allocation.size;
allocation.offset = INVALID_OFFSET;
// Find the correct position to insert the free range
auto& buffer = m_buffers[allocation.bufferIndex];
for (uint32 i = 0; i < buffer.m_freeRanges.size(); i++)
{
auto& freeRange = buffer.m_freeRanges[i];
if (freeRange.offset + freeRange.size == range.offset)
{
freeRange.size += range.size;
return;
}
}
buffer.m_freeRanges.push_back(range);
}
protected:
class MetalRenderer* m_mtlr;
// TODO: make these template arguments
bool m_isCPUAccessible;
MTL::ResourceOptions m_options;
std::vector<Buffer> m_buffers;
void FreeBuffer(uint32 bufferIndex)
{
auto& buffer = m_buffers[bufferIndex];
buffer.m_freeRanges.clear();
buffer.m_freeRanges.push_back({0, buffer.m_buffer->length()});
}
};
struct Empty {};
typedef MetalBufferAllocator<Empty> MetalDefaultBufferAllocator;
struct MetalSyncedBuffer
{
uint32 m_commandBufferCount = 0;
MTL::CommandBuffer* m_lastCommandBuffer = nullptr;
uint32 m_lock = 0;
bool IsLocked() const
{
return (m_lock != 0);
}
};
constexpr uint16 BUFFER_RELEASE_FRAME_TRESHOLD = 1024;
class MetalTemporaryBufferAllocator : public MetalBufferAllocator<MetalSyncedBuffer>
{
public:
MetalTemporaryBufferAllocator(class MetalRenderer* metalRenderer) : MetalBufferAllocator<MetalSyncedBuffer>(metalRenderer, MTL::ResourceStorageModeShared) {}
void LockBuffer(uint32 bufferIndex)
{
m_buffers[bufferIndex].m_data.m_lock++;
}
void UnlockBuffer(uint32 bufferIndex)
{
auto& buffer = m_buffers[bufferIndex];
buffer.m_data.m_lock--;
// Release the buffer if it wasn't released due to the lock
if (!buffer.m_data.IsLocked() && buffer.m_data.m_commandBufferCount == 0)
FreeBuffer(bufferIndex);
}
void EndFrame()
{
// Unlock all buffers
for (uint32_t i = 0; i < m_buffers.size(); i++)
{
auto& buffer = m_buffers[i];
if (buffer.m_data.IsLocked())
{
if (buffer.m_data.m_commandBufferCount == 0)
FreeBuffer(i);
buffer.m_data.m_lock = 0;
}
}
// TODO: do this for other buffer allocators as well?
// Track how many frames have passed since the last access to the back buffer
if (!m_buffers.empty())
{
auto& backBuffer = m_buffers.back();
if (backBuffer.m_data.m_commandBufferCount == 0)
{
// Release the back buffer if it hasn't been accessed for a while
if (m_framesSinceBackBufferAccess >= BUFFER_RELEASE_FRAME_TRESHOLD)
{
// Debug
m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory -= backBuffer.m_buffer->length();
backBuffer.m_buffer->release();
m_buffers.pop_back();
m_framesSinceBackBufferAccess = 0;
}
else
{
m_framesSinceBackBufferAccess++;
}
}
else
{
m_framesSinceBackBufferAccess = 0;
}
}
}
void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer)
{
m_activeCommandBuffer = commandBuffer;
if (commandBuffer)
{
auto result = m_executingCommandBuffers.emplace(std::make_pair(m_activeCommandBuffer, std::vector<uint32>{}));
cemu_assert_debug(result.second);
m_activeCommandBufferIt = result.first;
}
else
{
m_activeCommandBufferIt = m_executingCommandBuffers.end();
}
}
void CommandBufferFinished(MTL::CommandBuffer* commandBuffer)
{
auto it = m_executingCommandBuffers.find(commandBuffer);
for (auto bufferIndex : it->second)
{
auto& buffer = m_buffers[bufferIndex];
buffer.m_data.m_commandBufferCount--;
// TODO: is this neccessary?
if (!buffer.m_data.IsLocked() && buffer.m_data.m_commandBufferCount == 0)
FreeBuffer(bufferIndex);
}
m_executingCommandBuffers.erase(it);
}
MTL::Buffer* GetBuffer(uint32 bufferIndex)
{
cemu_assert_debug(m_activeCommandBuffer);
auto& buffer = m_buffers[bufferIndex];
if (buffer.m_data.m_commandBufferCount == 0 || buffer.m_data.m_lastCommandBuffer != m_activeCommandBuffer)
{
m_activeCommandBufferIt->second.push_back(bufferIndex);
buffer.m_data.m_commandBufferCount++;
buffer.m_data.m_lastCommandBuffer = m_activeCommandBuffer;
}
return buffer.m_buffer;
}
MTL::Buffer* GetBufferOutsideOfCommandBuffer(uint32 bufferIndex)
{
return m_buffers[bufferIndex].m_buffer;
}
/*
MetalBufferAllocation GetBufferAllocation(size_t size)
{
if (!m_activeCommandBuffer)
throw std::runtime_error("No active command buffer when allocating a buffer!");
auto allocation = MetalBufferAllocator<MetalSyncedBuffer>::GetBufferAllocation(size);
auto& buffer = m_buffers[allocation.bufferIndex];
if (buffer.m_commandBuffers.empty() || buffer.m_commandBuffers.back() != m_activeCommandBuffer)
buffer.m_commandBuffers.push_back(m_activeCommandBuffer);
return allocation;
}
*/
// For debugging
/*
void LogInfo()
{
debug_printf("BUFFERS:\n");
for (auto& buffer : m_buffers)
{
debug_printf(" %p -> size: %lu, command buffers: %zu\n", buffer.m_buffer, buffer.m_buffer->length(), buffer.m_data.m_commandBuffers.size());
uint32 same = 0;
uint32 completed = 0;
for (uint32 i = 0; i < buffer.m_data.m_commandBuffers.size(); i++)
{
if (m_mtlr->CommandBufferCompleted(buffer.m_data.m_commandBuffers[i]))
completed++;
for (uint32 j = 0; j < buffer.m_data.m_commandBuffers.size(); j++)
{
if (i != j && buffer.m_data.m_commandBuffers[i] == buffer.m_data.m_commandBuffers[j])
same++;
}
}
debug_printf(" same: %u\n", same);
debug_printf(" completed: %u\n", completed);
debug_printf(" FREE RANGES:\n");
for (auto& range : buffer.m_freeRanges)
{
debug_printf(" offset: %zu, size: %zu\n", range.offset, range.size);
}
}
}
*/
private:
MTL::CommandBuffer* m_activeCommandBuffer = nullptr;
std::map<MTL::CommandBuffer*, std::vector<uint32>> m_executingCommandBuffers;
std::map<MTL::CommandBuffer*, std::vector<uint32>>::iterator m_activeCommandBufferIt;
uint16 m_framesSinceBackBufferAccess = 0;
};
// --- added in this PR: chunked-heap based allocators (MetalBufferChunkedHeap, MetalSynchronizedRingAllocator, MetalSynchronizedHeapAllocator) ---
inline MTL::ResourceOptions GetResourceOptions(MTL::ResourceOptions options)
{
if (options & MTL::ResourceStorageModeShared || options & MTL::ResourceStorageModeManaged)
options |= MTL::ResourceCPUCacheModeWriteCombined;
return options;
}
void LatteIndices_invalidateAll();
class MetalBufferChunkedHeap : private ChunkedHeap<>
{
public:
MetalBufferChunkedHeap(const class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, size_t minimumBufferAllocationSize) : m_mtlr(mtlRenderer), m_options(GetResourceOptions(options)), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { };
~MetalBufferChunkedHeap();
using ChunkedHeap::alloc;
using ChunkedHeap::free;
uint8* GetChunkPtr(uint32 index) const
{
if (index >= m_chunkBuffers.size())
return nullptr;
return (uint8*)m_chunkBuffers[index]->contents();
}
MTL::Buffer* GetBufferByIndex(uint32 index) const
{
cemu_assert_debug(index < m_chunkBuffers.size());
return m_chunkBuffers[index];
}
bool RequiresFlush() const
{
return m_options & MTL::ResourceStorageModeManaged;
}
void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
{
numBuffers = m_chunkBuffers.size();
totalBufferSize = m_numHeapBytes;
freeBufferSize = m_numHeapBytes - m_numAllocatedBytes;
}
private:
uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
const class MetalRenderer* m_mtlr;
MTL::ResourceOptions m_options;
size_t m_minimumBufferAllocationSize;
std::vector<MTL::Buffer*> m_chunkBuffers;
};
// a circular ring-buffer which tracks and releases memory per command-buffer
class MetalSynchronizedRingAllocator
{
public:
MetalSynchronizedRingAllocator(class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, uint32 minimumBufferAllocSize) : m_mtlr(mtlRenderer), m_options(GetResourceOptions(options)), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
MetalSynchronizedRingAllocator(const MetalSynchronizedRingAllocator&) = delete; // disallow copy
struct BufferSyncPoint_t
{
// todo - modularize sync point
MTL::CommandBuffer* commandBuffer;
uint32 offset;
BufferSyncPoint_t(MTL::CommandBuffer* _commandBuffer, uint32 _offset) : commandBuffer(_commandBuffer), offset(_offset) {};
};
struct AllocatorBuffer_t
{
MTL::Buffer* mtlBuffer;
uint8* basePtr;
uint32 size;
uint32 writeIndex;
std::queue<BufferSyncPoint_t> queue_syncPoints;
MTL::CommandBuffer* lastSyncpointCommandBuffer{ nullptr };
uint32 index;
uint32 cleanupCounter{ 0 }; // increased by one every time CleanupBuffer() is called if there is no sync point. If it reaches 1000 then the buffer is released
};
struct AllocatorReservation_t
{
MTL::Buffer* mtlBuffer;
uint8* memPtr;
uint32 bufferOffset;
uint32 size;
uint32 bufferIndex;
};
AllocatorReservation_t AllocateBufferMemory(uint32 size, uint32 alignment);
void FlushReservation(AllocatorReservation_t& uploadReservation);
void CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer);
MTL::Buffer* GetBufferByIndex(uint32 index) const;
bool RequiresFlush() const
{
return m_options & MTL::ResourceStorageModeManaged;
}
void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const;
private:
void allocateAdditionalUploadBuffer(uint32 sizeRequiredForAlloc);
void addUploadBufferSyncPoint(AllocatorBuffer_t& buffer, uint32 offset);
const class MetalRenderer* m_mtlr;
MTL::ResourceOptions m_options;
const uint32 m_minimumBufferAllocSize;
std::vector<AllocatorBuffer_t> m_buffers;
};
// heap style allocator with released memory being freed after the current command buffer finishes
class MetalSynchronizedHeapAllocator
{
struct TrackedAllocation
{
TrackedAllocation(CHAddr allocation) : allocation(allocation) {};
CHAddr allocation;
};
public:
MetalSynchronizedHeapAllocator(class MetalRenderer* mtlRenderer, MTL::ResourceOptions options, size_t minimumBufferAllocSize) : m_mtlr(mtlRenderer), m_chunkedHeap(m_mtlr, options, minimumBufferAllocSize) {}
MetalSynchronizedHeapAllocator(const MetalSynchronizedHeapAllocator&) = delete; // disallow copy
struct AllocatorReservation
{
MTL::Buffer* mtlBuffer;
uint8* memPtr;
uint32 bufferOffset;
uint32 size;
uint32 bufferIndex;
};
AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment);
void FreeReservation(AllocatorReservation* uploadReservation);
void FlushReservation(AllocatorReservation* uploadReservation);
void CleanupBuffer(MTL::CommandBuffer* latestFinishedCommandBuffer);
void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const;
private:
const class MetalRenderer* m_mtlr;
MetalBufferChunkedHeap m_chunkedHeap;
// allocations
std::vector<TrackedAllocation> m_activeAllocations;
MemoryPool<AllocatorReservation> m_poolAllocatorReservation{32};
// release queue
std::unordered_map<MTL::CommandBuffer*, std::vector<CHAddr>> m_releaseQueue;
};
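// Illustrative sketch, not part of this commit: intended lifetime of an index-data
// reservation from MetalSynchronizedHeapAllocator. DrawWithDecodedIndices is a
// hypothetical helper; the real flow goes through LatteIndices_decode and the
// indexData_* callbacks changed later in this diff.
void DrawWithDecodedIndices(MetalSynchronizedHeapAllocator& indexAllocator, MTL::RenderCommandEncoder* encoder, MTL::PrimitiveType primitive, const uint16* indices, uint32 indexCount)
{
uint32 byteSize = indexCount * sizeof(uint16);
auto* reservation = indexAllocator.AllocateBufferMemory(byteSize, 128);
memcpy(reservation->memPtr, indices, byteSize);
indexAllocator.FlushReservation(reservation); // no-op unless the heap uses managed storage
encoder->drawIndexedPrimitives(primitive, indexCount, MTL::IndexTypeUInt16, reservation->mtlBuffer, reservation->bufferOffset);
// The GPU may still read this memory; FreeReservation() only queues it on the release
// queue keyed by the current command buffer. It is returned to the chunked heap when
// CleanupBuffer(finishedCommandBuffer) later runs for that command buffer.
indexAllocator.FreeReservation(reservation);
}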


@ -73,20 +73,15 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si
if (m_bufferCacheMode == BufferCacheMode::DevicePrivate)
{
// --- removed in this PR (old temporary-buffer upload path) ---
auto allocation = m_tempBufferAllocator.GetBufferAllocation(size);
auto buffer = m_tempBufferAllocator.GetBufferOutsideOfCommandBuffer(allocation.bufferIndex);
memcpy((uint8*)buffer->contents() + allocation.offset, data, size);
// Lock the buffer to make sure it's not deallocated before the copy is done
m_tempBufferAllocator.LockBuffer(allocation.bufferIndex);
m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES);
// Make sure the buffer has the right command buffer
m_tempBufferAllocator.GetBuffer(allocation.bufferIndex); // TODO: make a helper function for this
// We can now safely unlock the buffer
m_tempBufferAllocator.UnlockBuffer(allocation.bufferIndex);
// --- added in this PR (staging-allocator upload path) ---
auto blitCommandEncoder = m_mtlr->GetBlitCommandEncoder();
auto allocation = m_stagingAllocator.AllocateBufferMemory(size, 1);
memcpy(allocation.memPtr, data, size);
m_stagingAllocator.FlushReservation(allocation);
blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, m_bufferCache, offset, size);
//m_mtlr->CopyBufferToBuffer(allocation.mtlBuffer, allocation.bufferOffset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES);
}
else
{


@ -7,22 +7,17 @@
class MetalMemoryManager
{
public:
MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer) {}
MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_stagingAllocator(m_mtlr, m_mtlr->GetOptimalBufferStorageMode(), 32u * 1024 * 1024), m_indexAllocator(m_mtlr, m_mtlr->GetOptimalBufferStorageMode(), 4u * 1024 * 1024) {}
~MetalMemoryManager();
// --- removed in this PR ---
MetalDefaultBufferAllocator& GetBufferAllocator()
{
return m_bufferAllocator;
}
MetalDefaultBufferAllocator& GetFramePersistentBufferAllocator()
{
return m_framePersistentBufferAllocator;
}
MetalTemporaryBufferAllocator& GetTemporaryBufferAllocator()
{
return m_tempBufferAllocator;
}
// --- added in this PR ---
MetalSynchronizedRingAllocator& GetStagingAllocator()
{
return m_stagingAllocator;
}
MetalSynchronizedHeapAllocator& GetIndexAllocator()
{
return m_indexAllocator;
}
MTL::Buffer* GetBufferCache()
@ -30,6 +25,12 @@ public:
return m_bufferCache;
}
void CleanupBuffers(MTL::CommandBuffer* latestFinishedCommandBuffer)
{
m_stagingAllocator.CleanupBuffer(latestFinishedCommandBuffer);
m_indexAllocator.CleanupBuffer(latestFinishedCommandBuffer);
}
// Texture upload buffer
void* AcquireTextureUploadBuffer(size_t size);
void ReleaseTextureUploadBuffer(uint8* mem);
@ -65,9 +66,8 @@ private:
std::vector<uint8> m_textureUploadBuffer;
MetalDefaultBufferAllocator m_bufferAllocator;
MetalDefaultBufferAllocator m_framePersistentBufferAllocator;
MetalTemporaryBufferAllocator m_tempBufferAllocator;
MetalSynchronizedRingAllocator m_stagingAllocator;
MetalSynchronizedHeapAllocator m_indexAllocator;
MTL::Buffer* m_bufferCache = nullptr;
BufferCacheMode m_bufferCacheMode;


@ -3,8 +3,6 @@
class MetalPerformanceMonitor
{
public:
size_t m_bufferAllocatorMemory = 0;
// Per frame data
uint32 m_commandBuffers = 0;
uint32 m_renderPasses = 0;


@ -16,13 +16,12 @@
#include "Cafe/HW/Latte/Core/LatteShader.h"
#include "Cafe/HW/Latte/Core/LatteIndices.h"
#include "Cemu/Logging/CemuDebugLogging.h"
#include "Cafe/HW/Latte/Core/LatteBufferCache.h"
#include "Cemu/Logging/CemuLogging.h"
#include "Cafe/HW/Latte/Core/FetchShader.h"
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Common/precompiled.h"
#include "HW/Latte/Renderer/Metal/MetalBufferAllocator.h"
#include "HW/Latte/Renderer/Metal/MetalCommon.h"
#include "Metal/MTLCaptureManager.hpp"
#include "config/CemuConfig.h"
#include "gui/guiWrapper.h"
@ -191,6 +190,7 @@ MetalRenderer::MetalRenderer()
utilityLibrary->release();
// HACK: for some reason, this variable ends up being initialized to some garbage data, even though its declared as bool m_captureFrame = false;
m_occlusionQuery.m_lastCommandBuffer = nullptr;
m_captureFrame = false;
}
@ -302,12 +302,6 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC)
// Reset the command buffers (they are released by TemporaryBufferAllocator)
CommitCommandBuffer();
// Release frame persistent buffers
m_memoryManager->GetFramePersistentBufferAllocator().ResetAllocations();
// Unlock all temporary buffers
m_memoryManager->GetTemporaryBufferAllocator().EndFrame();
// Debug
m_performanceMonitor.ResetPerFrameData();
@ -593,7 +587,6 @@ void MetalRenderer::AppendOverlayDebugInfo()
ImGui::Text("--- Metal info ---");
ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize());
ImGui::Text("Buffer allocator memory %zuMB", m_performanceMonitor.m_bufferAllocatorMemory / 1024 / 1024);
ImGui::Text("--- Metal info (per frame) ---");
ImGui::Text("Command buffers %u", m_performanceMonitor.m_commandBuffers);
@ -601,6 +594,31 @@ void MetalRenderer::AppendOverlayDebugInfo()
ImGui::Text("Clears %u", m_performanceMonitor.m_clears);
ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws);
ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans);
ImGui::Text("--- Cache debug info ---");
uint32 bufferCacheHeapSize = 0;
uint32 bufferCacheAllocationSize = 0;
uint32 bufferCacheNumAllocations = 0;
LatteBufferCache_getStats(bufferCacheHeapSize, bufferCacheAllocationSize, bufferCacheNumAllocations);
ImGui::Text("Buffer");
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Allocs: %u", (uint32)(bufferCacheAllocationSize + 1023) / 1024, ((uint32)bufferCacheHeapSize + 1023) / 1024, (uint32)bufferCacheNumAllocations);
uint32 numBuffers;
size_t totalSize, freeSize;
m_memoryManager->GetStagingAllocator().GetStats(numBuffers, totalSize, freeSize);
ImGui::Text("Staging");
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
m_memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
ImGui::Text("Index");
ImGui::SameLine(60.0f);
ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
}
void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ)
@ -682,17 +700,17 @@ void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, s
auto blitCommandEncoder = GetBlitCommandEncoder();
// Allocate a temporary buffer
auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator();
auto allocation = bufferAllocator.GetBufferAllocation(compressedImageSize);
auto buffer = bufferAllocator.GetBuffer(allocation.bufferIndex);
auto& bufferAllocator = m_memoryManager->GetStagingAllocator();
auto allocation = bufferAllocator.AllocateBufferMemory(compressedImageSize, 1);
bufferAllocator.FlushReservation(allocation);
// Copy the data to the temporary buffer
memcpy(allocation.data, pixelData, compressedImageSize);
memcpy(allocation.memPtr, pixelData, compressedImageSize);
//buffer->didModifyRange(NS::Range(allocation.offset, allocation.size));
// TODO: specify blit options when copying to a depth stencil texture?
// Copy the data from the temporary buffer to the texture
blitCommandEncoder->copyFromBuffer(buffer, allocation.offset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ));
blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ));
//}
}
@ -1067,9 +1085,9 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
uint32 hostIndexCount;
uint32 indexMin = 0;
uint32 indexMax = 0;
uint32 indexBufferOffset = 0;
uint32 indexBufferIndex = 0;
LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex);
Renderer::IndexAllocation indexAllocation;
LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation);
auto indexAllocationMtl = static_cast<MetalSynchronizedHeapAllocator::AllocatorReservation*>(indexAllocation.rendererInternal);
// Buffer cache
if (m_memoryManager->UseHostMemoryForCache())
@ -1308,20 +1326,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
BindStageResources(renderCommandEncoder, pixelShader, usesGeometryShader);
// Draw
MTL::Buffer* indexBuffer = nullptr;
if (hostIndexType != INDEX_TYPE::NONE)
{
auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator();
indexBuffer = bufferAllocator.GetBuffer(indexBufferIndex);
// We have already retrieved the buffer, no need for it to be locked anymore
bufferAllocator.UnlockBuffer(indexBufferIndex);
}
if (usesGeometryShader)
{
if (indexBuffer)
SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexBuffer, indexBufferOffset, vertexShader->resourceMapping.indexBufferBinding);
if (hostIndexType != INDEX_TYPE::NONE)
SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, vertexShader->resourceMapping.indexBufferBinding);
uint8 hostIndexTypeU8 = (uint8)hostIndexType;
renderCommandEncoder->setObjectBytes(&hostIndexTypeU8, sizeof(hostIndexTypeU8), vertexShader->resourceMapping.indexTypeBinding);
@ -1349,10 +1357,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
}
else
{
if (indexBuffer)
if (hostIndexType != INDEX_TYPE::NONE)
{
auto mtlIndexType = GetMtlIndexType(hostIndexType);
renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexBuffer, indexBufferOffset, instanceCount, baseVertex, baseInstance);
renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, instanceCount, baseVertex, baseInstance);
}
else
{
@ -1492,29 +1500,21 @@ void MetalRenderer::draw_handleSpecialState5()
renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3));
}
// --- removed in this PR (old index memory callbacks) ---
void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex)
{
auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator();
auto allocation = bufferAllocator.GetBufferAllocation(size);
offset = allocation.offset;
bufferIndex = allocation.bufferIndex;
// Lock the buffer so that it doesn't get released
bufferAllocator.LockBuffer(allocation.bufferIndex);
return allocation.data;
}
void MetalRenderer::indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size)
{
// Do nothing
/*
if (!HasUnifiedMemory())
{
auto buffer = m_memoryManager->GetTemporaryBufferAllocator().GetBufferOutsideOfCommandBuffer(bufferIndex);
buffer->didModifyRange(NS::Range(offset, size));
}
*/
}
// --- added in this PR (new index memory callbacks) ---
Renderer::IndexAllocation MetalRenderer::indexData_reserveIndexMemory(uint32 size)
{
auto allocation = m_memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 128);
return {allocation->memPtr, allocation};
}
void MetalRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation)
{
m_memoryManager->GetIndexAllocator().FreeReservation(static_cast<MetalSynchronizedHeapAllocator::AllocatorReservation*>(allocation.rendererInternal));
}
void MetalRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation)
{
m_memoryManager->GetIndexAllocator().FlushReservation(static_cast<MetalSynchronizedHeapAllocator::AllocatorReservation*>(allocation.rendererInternal));
}
LatteQueryObject* MetalRenderer::occlusionQuery_create() {
@ -1652,9 +1652,6 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer()
m_recordedDrawcalls = 0;
m_commitTreshold = m_defaultCommitTreshlod;
// Notify memory manager about the new command buffer
m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer);
// Debug
m_performanceMonitor.m_commandBuffers++;
@ -1835,8 +1832,6 @@ void MetalRenderer::CommitCommandBuffer()
m_executingCommandBuffers.push_back(mtlCommandBuffer);
m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(nullptr);
// Debug
//m_commandQueue->insertDebugCaptureBoundary();
}
@ -1845,26 +1840,20 @@ void MetalRenderer::CommitCommandBuffer()
void MetalRenderer::ProcessFinishedCommandBuffers()
{
// Check for finished command buffers
bool atLeastOneCompleted = false;
for (auto it = m_executingCommandBuffers.begin(); it != m_executingCommandBuffers.end();)
{
auto commandBuffer = *it;
if (CommandBufferCompleted(commandBuffer))
{
m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer);
m_memoryManager->CleanupBuffers(commandBuffer);
commandBuffer->release();
it = m_executingCommandBuffers.erase(it);
atLeastOneCompleted = true;
}
else
{
++it;
}
}
// Invalidate indices if at least one command buffer has completed
if (atLeastOneCompleted)
LatteIndices_invalidateAll();
}
bool MetalRenderer::AcquireDrawable(bool mainWindow)
@ -2102,15 +2091,13 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE
}
}
auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator();
size_t size = shader->uniform.uniformRangeSize;
auto supportBuffer = bufferAllocator.GetBufferAllocation(size);
memcpy(supportBuffer.data, supportBufferData, size);
auto buffer = bufferAllocator.GetBuffer(supportBuffer.bufferIndex);
//if (!HasUnifiedMemory())
// buffer->didModifyRange(NS::Range(supportBuffer.offset, size));
auto& bufferAllocator = m_memoryManager->GetStagingAllocator();
auto allocation = bufferAllocator.AllocateBufferMemory(size, 1);
memcpy(allocation.memPtr, supportBufferData, size);
bufferAllocator.FlushReservation(allocation);
SetBuffer(renderCommandEncoder, mtlShaderType, buffer, supportBuffer.offset, shader->resourceMapping.uniformVarsBufferBindingPoint);
SetBuffer(renderCommandEncoder, mtlShaderType, allocation.mtlBuffer, allocation.bufferOffset, shader->resourceMapping.uniformVarsBufferBindingPoint);
}
// Uniform buffers


@ -7,19 +7,6 @@
#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalAttachmentsInfo.h"
struct MetalBufferAllocation
{
void* data;
uint32 bufferIndex;
size_t offset = INVALID_OFFSET;
size_t size;
bool IsValid() const
{
return offset != INVALID_OFFSET;
}
};
enum MetalGeneralShaderType
{
METAL_GENERAL_SHADER_TYPE_VERTEX,
@ -271,8 +258,9 @@ public:
void draw_handleSpecialState5();
// index
void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override;
void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override;
IndexAllocation indexData_reserveIndexMemory(uint32 size) override;
void indexData_releaseIndexMemory(IndexAllocation& allocation) override;
void indexData_uploadIndexMemory(IndexAllocation& allocation) override;
// occlusion queries
LatteQueryObject* occlusionQuery_create() override;
@ -294,14 +282,14 @@ public:
return (m_currentCommandBuffer.m_commandBuffer && !m_currentCommandBuffer.m_commited);
}
MTL::CommandBuffer* GetCurrentCommandBuffer()
MTL::CommandBuffer* GetCurrentCommandBuffer() const
{
cemu_assert_debug(m_currentCommandBuffer.m_commandBuffer);
return m_currentCommandBuffer.m_commandBuffer;
}
MTL::CommandBuffer* GetAndRetainCurrentCommandBufferIfNotCompleted()
MTL::CommandBuffer* GetAndRetainCurrentCommandBufferIfNotCompleted() const
{
// The command buffer has been commited and has finished execution
if (m_currentCommandBuffer.m_commited && m_executingCommandBuffers.size() == 0)


@ -15,8 +15,8 @@
#define METAL_AIR_CACHE_BLOCK_COUNT (METAL_AIR_CACHE_SIZE / 512)
static bool s_isLoadingShadersMtl{false};
static bool s_hasRAMFilesystem{false};
class FileCache* s_airCache{nullptr};
//static bool s_hasRAMFilesystem{false};
//class FileCache* s_airCache{nullptr};
extern std::atomic_int g_compiled_shaders_total;
extern std::atomic_int g_compiled_shaders_async;
@ -190,6 +190,7 @@ void RendererShaderMtl::ShaderCacheLoading_end()
void RendererShaderMtl::ShaderCacheLoading_Close()
{
// Close the AIR cache
/*
if (s_airCache)
{
delete s_airCache;
@ -197,7 +198,6 @@ void RendererShaderMtl::ShaderCacheLoading_Close()
}
// Close RAM filesystem
/*
if (s_hasRAMFilesystem)
executeCommand("diskutil eject {}", METAL_AIR_CACHE_PATH);
*/