// Cemu/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp

#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "util/ChunkedHeap/ChunkedHeap.h"
#include "util/helpers/fspinlock.h"
#include "config/ActiveSettings.h"
#define CACHE_PAGE_SIZE 0x400
#define CACHE_PAGE_SIZE_M1 (CACHE_PAGE_SIZE-1)
uint32 g_currentCacheChronon = 0;
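// IntervalTree2 maintains a set of non-overlapping [begin, end) ranges, each associated with a TNodeObject.
// It is built on top of std::map with a comparator that treats overlapping ranges as equivalent,
// so lookups by range or by point return a stored range that overlaps the query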
template<typename TRangeData, typename TNodeObject>
class IntervalTree2
{
// TNodeObject will be interfaced with via callbacks to static methods
// static TNodeObject* Create(TRangeData rangeBegin, TRangeData rangeEnd, std::span<TNodeObject*> overlappingObjects)
// Create a new node with the given range. overlappingObjects contains all the nodes that are replaced by this operation. The callee takes ownership and must delete all objects in overlappingObjects (the Delete callback won't be invoked for them)
// static void Delete(TNodeObject* nodeObject)
// Delete a node object. Replacement operations won't trigger this callback and instead pass the objects to Create()
// static void Resize(TNodeObject* nodeObject, TRangeData rangeBegin, TRangeData rangeEnd)
// Shrink or extend an existing range
// static TNodeObject* Split(TNodeObject* nodeObject, TRangeData firstRangeBegin, TRangeData firstRangeEnd, TRangeData secondRangeBegin, TRangeData secondRangeEnd)
// Cut a hole into an existing range and split it in two. Should return the newly created node object after the hole
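// Illustrative sketch (hypothetical, not used anywhere in this file): a minimal TNodeObject
// satisfying this interface could look like the following.
//   struct ExampleNode
//   {
//       static ExampleNode* Create(uint32 begin, uint32 end, std::span<ExampleNode*> overlapping)
//       {
//           for (ExampleNode* o : overlapping)
//               delete o; // Create() takes ownership of the replaced nodes
//           return new ExampleNode();
//       }
//       static void Delete(ExampleNode* obj) { delete obj; }
//       static void Resize(ExampleNode* obj, uint32 begin, uint32 end) { /* adjust backing data */ }
//       static ExampleNode* Split(ExampleNode* obj, uint32 firstBegin, uint32 firstEnd, uint32 secondBegin, uint32 secondEnd)
//       {
//           return new ExampleNode(); // node covering [secondBegin, secondEnd)
//       }
//   };
//   IntervalTree2<uint32, ExampleNode> tree;
//   tree.addRange(0x000, 0x800);
//   tree.removeRange(0x400, 0x800);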
static_assert(std::is_pointer<TNodeObject>::value == false, "TNodeObject must be a non-pointer type");
struct InternalRange
{
InternalRange() {};
InternalRange(TRangeData _rangeBegin, TRangeData _rangeEnd) : rangeBegin(_rangeBegin), rangeEnd(_rangeEnd) { cemu_assert_debug(_rangeBegin < _rangeEnd); };
TRangeData rangeBegin;
TRangeData rangeEnd;
bool operator<(const InternalRange& rhs) const
{
// use <= instead of < because ranges are allowed to touch (e.g. 10-20 and 20-30 don't get merged)
return this->rangeEnd <= rhs.rangeBegin;
}
};
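// stored ranges never overlap each other; due to the comparator above, find() on a query range returns an entry that overlaps the query (if any)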
std::map<InternalRange, TNodeObject*> m_map;
std::vector<TNodeObject*> m_tempObjectArray;
public:
TNodeObject* getRange(TRangeData rangeBegin, TRangeData rangeEnd)
{
auto itr = m_map.find(InternalRange(rangeBegin, rangeEnd));
if (itr == m_map.cend())
return nullptr;
if (rangeBegin < (*itr).first.rangeBegin)
return nullptr;
if (rangeEnd > (*itr).first.rangeEnd)
return nullptr;
return (*itr).second;
}
TNodeObject* getRangeByPoint(TRangeData rangeOffset)
{
auto itr = m_map.find(InternalRange(rangeOffset, rangeOffset+1)); // todo - better to use custom comparator instead of +1?
if (itr == m_map.cend())
return nullptr;
cemu_assert_debug(rangeOffset >= (*itr).first.rangeBegin);
cemu_assert_debug(rangeOffset < (*itr).first.rangeEnd);
return (*itr).second;
}
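// insert [rangeBegin, rangeEnd); any existing ranges that overlap it are merged with it into a single new node via the Create() callback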
void addRange(TRangeData rangeBegin, TRangeData rangeEnd)
{
if (rangeEnd == rangeBegin)
return;
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
if (itr == m_map.cend())
{
// new entry
m_map.emplace(range, TNodeObject::Create(rangeBegin, rangeEnd, std::span<TNodeObject*>()));
}
else
{
// overlap detected
if (rangeBegin >= (*itr).first.rangeBegin && rangeEnd <= (*itr).first.rangeEnd)
return; // do nothing if added range is already covered
rangeBegin = (std::min)(rangeBegin, (*itr).first.rangeBegin);
// DEBUG - verify that itr is the first entry of the merge process (the previous entry must not reach into the merged range)
#ifndef PUBLIC_RELEASE
if (itr != m_map.cbegin())
{
// check previous result
auto itrCopy = itr;
--itrCopy;
if ((*itrCopy).first.rangeEnd > rangeBegin)
{
assert_dbg(); // n-1 entry is also overlapping
rangeBegin = (std::min)(rangeBegin, (*itrCopy).first.rangeBegin);
}
}
#endif
// DEBUG - END
// collect and remove all overlapping ranges
size_t count = 0;
while (itr != m_map.cend() && (*itr).first.rangeBegin < rangeEnd)
{
rangeEnd = (std::max)(rangeEnd, (*itr).first.rangeEnd);
if (m_tempObjectArray.size() <= count)
m_tempObjectArray.resize(count + 8);
m_tempObjectArray[count] = (*itr).second;
count++;
auto tempItr = itr;
++itr;
m_map.erase(tempItr);
}
// create callback
TNodeObject* newObject = TNodeObject::Create(rangeBegin, rangeEnd, std::span<TNodeObject*>(m_tempObjectArray.data(), count));
m_map.emplace(InternalRange(rangeBegin, rangeEnd), newObject);
}
}
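// remove [rangeBegin, rangeEnd) from the tree. Fully covered nodes are deleted, a node fully containing the range is split in two, partially overlapping nodes are trimmed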
void removeRange(TRangeData rangeBegin, TRangeData rangeEnd)
{
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
if (itr == m_map.cend())
return;
cemu_assert_debug(itr == m_map.lower_bound(range));
while (itr != m_map.cend() && (*itr).first.rangeBegin < rangeEnd)
{
if ((*itr).first.rangeBegin >= rangeBegin && (*itr).first.rangeEnd <= rangeEnd)
{
// delete entire range
auto itrCopy = itr;
TNodeObject* t = (*itr).second;
++itr;
m_map.erase(itrCopy);
TNodeObject::Delete(t);
continue;
}
if (rangeBegin > (*itr).first.rangeBegin && rangeEnd < (*itr).first.rangeEnd)
{
// cut hole into existing range
TRangeData firstRangeBegin = (*itr).first.rangeBegin;
TRangeData firstRangeEnd = rangeBegin;
TRangeData secondRangeBegin = rangeEnd;
TRangeData secondRangeEnd = (*itr).first.rangeEnd;
TNodeObject* newObject = TNodeObject::Split((*itr).second, firstRangeBegin, firstRangeEnd, secondRangeBegin, secondRangeEnd);
// modify key
auto nh = m_map.extract(itr);
nh.key().rangeBegin = firstRangeBegin;
nh.key().rangeEnd = firstRangeEnd;
m_map.insert(std::move(nh));
// insert new object after hole
m_map.emplace(InternalRange(secondRangeBegin, secondRangeEnd), newObject);
return; // done
}
// shrink (trim either beginning or end)
TRangeData newRangeBegin;
TRangeData newRangeEnd;
if ((rangeBegin <= (*itr).first.rangeBegin && rangeEnd < (*itr).first.rangeEnd))
{
// trim from beginning
newRangeBegin = (std::max)((*itr).first.rangeBegin, rangeEnd);
newRangeEnd = (*itr).first.rangeEnd;
}
else if ((rangeBegin > (*itr).first.rangeBegin && rangeEnd >= (*itr).first.rangeEnd))
{
// trim from end
newRangeBegin = (*itr).first.rangeBegin;
newRangeEnd = (std::min)((*itr).first.rangeEnd, rangeBegin);
}
else
{
assert_dbg(); // should not happen
}
TNodeObject::Resize((*itr).second, newRangeBegin, newRangeEnd);
// modify key and increment iterator
auto itrCopy = itr;
++itr;
auto nh = m_map.extract(itrCopy);
nh.key().rangeBegin = newRangeBegin;
nh.key().rangeEnd = newRangeEnd;
m_map.insert(std::move(nh));
}
}
// remove existing range that matches given begin and end
void removeRangeSingle(TRangeData rangeBegin, TRangeData rangeEnd)
{
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
cemu_assert_debug(itr != m_map.cend());
if (itr == m_map.cend())
return;
cemu_assert_debug((*itr).first.rangeBegin == rangeBegin && (*itr).first.rangeEnd == rangeEnd);
// delete entire range
TNodeObject* t = (*itr).second;
m_map.erase(itr);
TNodeObject::Delete(t);
}
// remove existing range that matches given begin and end without calling delete callback
void removeRangeSingleWithoutCallback(TRangeData rangeBegin, TRangeData rangeEnd)
{
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
cemu_assert_debug(itr != m_map.cend());
if (itr == m_map.cend())
return;
cemu_assert_debug((*itr).first.rangeBegin == rangeBegin && (*itr).first.rangeEnd == rangeEnd);
// remove the range from the map without invoking the Delete callback
m_map.erase(itr);
}
void splitRange(TRangeData rangeOffset)
{
// not well tested
removeRange(rangeOffset, rangeOffset+1);
}
template<typename TFunc>
void forEachOverlapping(TRangeData rangeBegin, TRangeData rangeEnd, TFunc f)
{
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
if (itr == m_map.cend())
return;
cemu_assert_debug(itr == m_map.lower_bound(range));
while (itr != m_map.cend() && (*itr).first.rangeBegin < rangeEnd)
{
f((*itr).second, rangeBegin, rangeEnd);
++itr;
}
}
void validate()
{
if (m_map.empty())
return;
auto itr = m_map.begin();
if ((*itr).first.rangeBegin > (*itr).first.rangeEnd)
assert_dbg();
TRangeData currentLoc = (*itr).first.rangeEnd;
++itr;
while (itr != m_map.end())
{
if ((*itr).first.rangeBegin >= (*itr).first.rangeEnd)
assert_dbg(); // negative or zero size ranges are not allowed
if (currentLoc > (*itr).first.rangeBegin)
assert_dbg(); // stored ranges must not overlap
currentLoc = (*itr).first.rangeEnd;
++itr;
}
}
const std::map<InternalRange, TNodeObject*>& getAll() const { return m_map; };
};
std::unique_ptr<VHeap> g_gpuBufferHeap = nullptr;
std::vector<uint8> s_pageUploadBuffer;
std::vector<class BufferCacheNode*> s_allCacheNodes;
void LatteBufferCache_removeSingleNodeFromTree(BufferCacheNode* node);
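// A BufferCacheNode represents a contiguous, page-aligned range of physical memory that is mirrored in the GPU-side buffer cache.
// Modifications are detected by hashing each 1KiB page; pages written by GPU streamout are stamped with signature blocks so that
// CPU-side uploads don't overwrite the GPU-only data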
class BufferCacheNode
{
static inline constexpr uint64 c_streamoutSig0 = 0xF0F0F0F0155C5B6Aull;
static inline constexpr uint64 c_streamoutSig1 = 0x8BE6336411814F4Full;
public:
// returns false if not enough space is available
bool allocateCacheMemory()
{
cemu_assert_debug(m_hasCacheAlloc == false);
cemu_assert_debug(m_rangeEnd > m_rangeBegin);
m_hasCacheAlloc = g_gpuBufferHeap->allocOffset(m_rangeEnd - m_rangeBegin, CACHE_PAGE_SIZE, m_cacheOffset);
return m_hasCacheAlloc;
}
void ReleaseCacheMemoryImmediately()
{
if (m_hasCacheAlloc)
{
cemu_assert_debug(isInUse() == false);
g_gpuBufferHeap->freeOffset(m_cacheOffset);
m_hasCacheAlloc = false;
}
}
uint32 getBufferOffset(MPTR physAddr) const
{
cemu_assert_debug(m_hasCacheAlloc);
cemu_assert_debug(physAddr >= m_rangeBegin);
cemu_assert_debug(physAddr < m_rangeEnd);
uint32 relOffset = physAddr - m_rangeBegin;
return m_cacheOffset + relOffset;
}
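// mark [rangeBegin, rangeEnd) as containing streamout (GPU-written) data and stamp the corresponding RAM pages with signature blocks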
void writeStreamout(MPTR rangeBegin, MPTR rangeEnd)
{
if ((rangeBegin & 0xF))
{
forceLogDebug_printf("writeStreamout(): RangeBegin not aligned to 16. Begin %08x End %08x", rangeBegin, rangeEnd);
rangeBegin = (rangeBegin + 0xF) & ~0xF;
rangeEnd = std::max(rangeBegin, rangeEnd);
}
if (rangeEnd & 0xF)
{
// todo - add support for 4 byte granularity for streamout writes and cache
// used by Affordable Space Adventures and YWW Level 1-8
// also used by CoD Ghosts (8 byte granularity)
//forceLogDebug_printf("Streamout write size is not aligned to 16 bytes");
rangeEnd &= ~0xF;
}
//cemu_assert_debug((rangeEnd & 0xF) == 0);
rangeBegin = std::max(rangeBegin, m_rangeBegin);
rangeEnd = std::min(rangeEnd, m_rangeEnd);
if (rangeBegin >= rangeEnd)
return;
sint32 numPages = getPageCountFromRange(rangeBegin, rangeEnd);
sint32 pageIndex = getPageIndexFromAddr(rangeBegin);
cemu_assert_debug((m_rangeBegin + pageIndex * CACHE_PAGE_SIZE) <= rangeBegin);
cemu_assert_debug((m_rangeBegin + (pageIndex + numPages) * CACHE_PAGE_SIZE) >= rangeEnd);
for (sint32 i = 0; i < numPages; i++)
{
pageWriteStreamoutSignatures(pageIndex, rangeBegin, rangeEnd);
pageIndex++;
//pageInfo->hasStreamoutData = true;
//pageInfo++;
}
if (numPages > 0)
m_hasStreamoutData = true;
}
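// hash every page in [rangeBegin, rangeEnd) and (if uploadData is set) upload the pages whose hash changed; pages flagged as streamout are uploaded with 16-byte filtering so GPU-written blocks are preserved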
void checkAndSyncModifications(MPTR rangeBegin, MPTR rangeEnd, bool uploadData)
{
cemu_assert_debug(rangeBegin >= m_rangeBegin);
cemu_assert_debug(rangeEnd <= m_rangeEnd);
cemu_assert_debug(rangeBegin < m_rangeEnd);
cemu_assert_debug((rangeBegin % CACHE_PAGE_SIZE) == 0);
cemu_assert_debug((rangeEnd % CACHE_PAGE_SIZE) == 0);
sint32 basePageIndex = getPageIndexFromAddrAligned(rangeBegin);
sint32 numPages = getPageCountFromRangeAligned(rangeBegin, rangeEnd);
uint8* pagePtr = memory_getPointerFromPhysicalOffset(rangeBegin);
sint32 uploadPageBegin = -1;
CachePageInfo* pageInfo = m_pageInfo.data() + basePageIndex;
for (sint32 i = 0; i < numPages; i++)
{
if (pageInfo->hasStreamoutData)
{
// first upload any pending sequence of pages
if (uploadPageBegin != -1)
{
// upload range
if (uploadData)
uploadPages(uploadPageBegin, basePageIndex + i);
uploadPageBegin = -1;
}
// check if hash changed
uint64 pageHash = hashPage(pagePtr);
if (pageInfo->hash != pageHash)
{
pageInfo->hash = pageHash;
// for pages that contain streamout data we do uploads with a much smaller granularity
// and skip uploading any data that is marked with streamout filler bytes
if (!uploadPageWithStreamoutFiltered(basePageIndex + i))
pageInfo->hasStreamoutData = false; // all streamout data was replaced
}
pagePtr += CACHE_PAGE_SIZE;
pageInfo++;
continue;
}
uint64 pageHash = hashPage(pagePtr);
pagePtr += CACHE_PAGE_SIZE;
if (pageInfo->hash != pageHash)
{
if (uploadPageBegin == -1)
uploadPageBegin = i + basePageIndex;
pageInfo->hash = pageHash;
}
else
{
if (uploadPageBegin != -1)
{
// upload range
if (uploadData)
uploadPages(uploadPageBegin, basePageIndex + i);
uploadPageBegin = -1;
}
}
pageInfo++;
}
if (uploadPageBegin != -1)
{
if (uploadData)
uploadPages(uploadPageBegin, basePageIndex + numPages);
}
}
void checkAndSyncModifications(bool uploadData)
{
checkAndSyncModifications(m_rangeBegin, m_rangeEnd, uploadData);
m_lastModifyCheckCronon = g_currentCacheChronon;
m_hasInvalidation = false;
}
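// g_currentCacheChronon acts as a coherency epoch. If it changed since the last check the whole range is re-hashed and synced,
// otherwise only the pending invalidation range (clamped to the reserved range where possible) is synced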
void checkAndSyncModificationsIfChrononChanged(MPTR reservePhysAddress, uint32 reserveSize)
{
if (m_lastModifyCheckCronon != g_currentCacheChronon)
{
m_lastModifyCheckCronon = g_currentCacheChronon;
checkAndSyncModifications(m_rangeBegin, m_rangeEnd, true);
m_hasInvalidation = false;
}
if (m_hasInvalidation)
{
// ideally we would only upload the pages that intersect both the reserve range and the invalidation range
// but this would require complex per-page tracking of invalidation. Since this is on a hot path we do a cheap approximation
// where we only track one continuous invalidation range
// try to bound uploads to the reserve range within the invalidation
uint32 resRangeBegin = reservePhysAddress & ~CACHE_PAGE_SIZE_M1;
uint32 resRangeEnd = ((reservePhysAddress + reserveSize) + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
uint32 uploadBegin = std::max(m_invalidationRangeBegin, resRangeBegin);
uint32 uploadEnd = std::min(resRangeEnd, m_invalidationRangeEnd);
if (uploadBegin >= uploadEnd)
return; // reserve range not within invalidation or range is zero sized
if (uploadBegin == m_invalidationRangeBegin)
{
m_invalidationRangeBegin = uploadEnd;
checkAndSyncModifications(uploadBegin, uploadEnd, true);
}
if (uploadEnd == m_invalidationRangeEnd)
{
m_invalidationRangeEnd = uploadBegin;
checkAndSyncModifications(uploadBegin, uploadEnd, true);
}
else
{
// upload all of invalidation
checkAndSyncModifications(m_invalidationRangeBegin, m_invalidationRangeEnd, true);
m_invalidationRangeBegin = m_invalidationRangeEnd;
}
if(m_invalidationRangeEnd <= m_invalidationRangeBegin)
m_hasInvalidation = false;
//if (resRangeBegin <= m_invalidationRangeBegin)
//{
// // shrink/replace invalidation range from the bottom
// uint32 uploadBegin = m_invalidationRangeBegin;//std::max(m_invalidationRangeBegin, resRangeBegin);
// uint32 uploadEnd = std::min(resRangeEnd, m_invalidationRangeEnd);
// cemu_assert_debug(uploadEnd >= uploadBegin);
// if (uploadBegin != uploadEnd)
// checkAndSyncModifications(uploadBegin, uploadEnd, true);
// m_invalidationRangeBegin = uploadEnd;
// cemu_assert_debug(m_invalidationRangeBegin <= m_invalidationRangeEnd);
// if (m_invalidationRangeBegin >= m_invalidationRangeEnd)
// m_hasInvalidation = false;
//}
//else if (resRangeEnd >= m_invalidationRangeEnd)
//{
// // shrink/replace invalidation range from the top
// uint32 uploadBegin = std::max(m_invalidationRangeBegin, resRangeBegin);
// uint32 uploadEnd = m_invalidationRangeEnd;// std::min(resRangeEnd, m_invalidationRangeEnd);
// cemu_assert_debug(uploadEnd >= uploadBegin);
// if (uploadBegin != uploadEnd)
// checkAndSyncModifications(uploadBegin, uploadEnd, true);
// m_invalidationRangeEnd = uploadBegin;
// cemu_assert_debug(m_invalidationRangeBegin <= m_invalidationRangeEnd);
// if (m_invalidationRangeBegin >= m_invalidationRangeEnd)
// m_hasInvalidation = false;
//}
//else
//{
// // since we cant cut holes into the range upload it in it's entirety
// cemu_assert_debug(m_invalidationRangeEnd <= m_rangeEnd);
// cemu_assert_debug(m_invalidationRangeBegin >= m_rangeBegin);
// cemu_assert_debug(m_invalidationRangeBegin < m_invalidationRangeEnd);
// checkAndSyncModifications(m_invalidationRangeBegin, m_invalidationRangeEnd, true);
// m_hasInvalidation = false;
//}
// todo - don't re-upload the whole range immediately
// under ideal circumstances we would only upload the data range requested for the current draw call
// but this is a hot path so we can't check
}
}
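// record a pending CPU-side modification of [rangeBegin, rangeEnd); overlapping invalidations are merged into a single page-aligned range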
void invalidate(MPTR rangeBegin, MPTR rangeEnd)
{
rangeBegin = std::max(rangeBegin, m_rangeBegin);
rangeEnd = std::min(rangeEnd, m_rangeEnd);
if (rangeBegin >= rangeEnd)
return;
if (m_hasInvalidation)
{
m_invalidationRangeBegin = std::min(m_invalidationRangeBegin, rangeBegin);
m_invalidationRangeEnd = std::max(m_invalidationRangeEnd, rangeEnd);
}
else
{
m_invalidationRangeBegin = rangeBegin;
m_invalidationRangeEnd = rangeEnd;
m_hasInvalidation = true;
}
cemu_assert_debug(m_invalidationRangeBegin >= m_rangeBegin);
cemu_assert_debug(m_invalidationRangeEnd <= m_rangeEnd);
cemu_assert_debug(m_invalidationRangeBegin < m_invalidationRangeEnd);
m_invalidationRangeBegin = m_invalidationRangeBegin & ~CACHE_PAGE_SIZE_M1;
m_invalidationRangeEnd = (m_invalidationRangeEnd + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
}
void flagInUse()
{
m_lastDrawcall = LatteGPUState.drawCallCounter;
m_lastFrame = LatteGPUState.frameCounter;
}
bool isInUse() const
{
return m_lastDrawcall == LatteGPUState.drawCallCounter;
}
// returns true if the range does not contain any GPU-cache-only data and can be fully restored from RAM
bool isRAMOnly() const
{
return !m_hasStreamoutData;
}
MPTR GetRangeBegin() const { return m_rangeBegin; }
MPTR GetRangeEnd() const { return m_rangeEnd; }
uint32 GetDrawcallAge() const { return LatteGPUState.drawCallCounter - m_lastDrawcall; };
uint32 GetFrameAge() const { return LatteGPUState.frameCounter - m_lastFrame; };
bool HasStreamoutData() const { return m_hasStreamoutData; };
private:
struct CachePageInfo
{
uint64 hash{ 0 };
bool hasStreamoutData{ false };
};
MPTR m_rangeBegin;
MPTR m_rangeEnd; // (exclusive)
bool m_hasCacheAlloc{ false };
uint32 m_cacheOffset{ 0 };
// usage
uint32 m_lastDrawcall;
uint32 m_lastFrame;
uint32 m_arrayIndex;
// state tracking
uint32 m_lastModifyCheckCronon{ g_currentCacheChronon - 1 };
std::vector<CachePageInfo> m_pageInfo;
bool m_hasStreamoutData{ false };
// invalidation
bool m_hasInvalidation{false};
MPTR m_invalidationRangeBegin;
MPTR m_invalidationRangeEnd;
BufferCacheNode(MPTR rangeBegin, MPTR rangeEnd): m_rangeBegin(rangeBegin), m_rangeEnd(rangeEnd)
{
flagInUse();
cemu_assert_debug(rangeBegin < rangeEnd);
size_t numPages = getPageCountFromRangeAligned(rangeBegin, rangeEnd);
m_pageInfo.resize(numPages);
// append to array
m_arrayIndex = (uint32)s_allCacheNodes.size();
s_allCacheNodes.emplace_back(this);
};
~BufferCacheNode()
{
if (m_hasCacheAlloc)
g_deallocateQueue.emplace_back(m_cacheOffset); // release after current drawcall
// remove from array
auto temp = s_allCacheNodes.back();
s_allCacheNodes.pop_back();
if (this != temp)
{
s_allCacheNodes[m_arrayIndex] = temp;
temp->m_arrayIndex = m_arrayIndex;
}
}
uint32 getPageIndexFromAddrAligned(uint32 offset) const
{
cemu_assert_debug((offset % CACHE_PAGE_SIZE) == 0);
return (offset - m_rangeBegin) / CACHE_PAGE_SIZE;
}
uint32 getPageIndexFromAddr(uint32 offset) const
{
offset &= ~CACHE_PAGE_SIZE_M1;
return (offset - m_rangeBegin) / CACHE_PAGE_SIZE;
}
uint32 getPageCountFromRangeAligned(MPTR rangeBegin, MPTR rangeEnd) const
{
cemu_assert_debug((rangeBegin % CACHE_PAGE_SIZE) == 0);
cemu_assert_debug((rangeEnd % CACHE_PAGE_SIZE) == 0);
cemu_assert_debug(rangeBegin <= rangeEnd);
return (rangeEnd - rangeBegin) / CACHE_PAGE_SIZE;
}
uint32 getPageCountFromRange(MPTR rangeBegin, MPTR rangeEnd) const
{
rangeEnd = (rangeEnd + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
rangeBegin &= ~CACHE_PAGE_SIZE_M1;
cemu_assert_debug(rangeBegin <= rangeEnd);
return (rangeEnd - rangeBegin) / CACHE_PAGE_SIZE;
}
void syncFromRAM(MPTR rangeBegin, MPTR rangeEnd)
{
cemu_assert_debug(rangeBegin >= m_rangeBegin);
cemu_assert_debug(rangeEnd <= m_rangeEnd);
cemu_assert_debug(rangeEnd > rangeBegin);
cemu_assert_debug(m_hasCacheAlloc);
// reset write tracking
checkAndSyncModifications(rangeBegin, rangeEnd, false);
g_renderer->bufferCache_upload(memory_getPointerFromPhysicalOffset(rangeBegin), rangeEnd - rangeBegin, getBufferOffset(rangeBegin));
}
void syncFromNode(BufferCacheNode* srcNode)
{
// get shared range
MPTR rangeBegin = std::max(m_rangeBegin, srcNode->m_rangeBegin);
MPTR rangeEnd = std::min(m_rangeEnd, srcNode->m_rangeEnd);
cemu_assert_debug(rangeBegin < rangeEnd);
g_renderer->bufferCache_copy(srcNode->getBufferOffset(rangeBegin), this->getBufferOffset(rangeBegin), rangeEnd - rangeBegin);
// copy page checksums and information
sint32 numPages = getPageCountFromRangeAligned(rangeBegin, rangeEnd);
CachePageInfo* pageInfoDst = this->m_pageInfo.data() + this->getPageIndexFromAddrAligned(rangeBegin);
CachePageInfo* pageInfoSrc = srcNode->m_pageInfo.data() + srcNode->getPageIndexFromAddrAligned(rangeBegin);
for (sint32 i = 0; i < numPages; i++)
{
pageInfoDst[i] = pageInfoSrc[i];
if (pageInfoSrc[i].hasStreamoutData)
m_hasStreamoutData = true;
}
}
void uploadPages(uint32 firstPage, uint32 lastPagePlusOne)
{
cemu_assert_debug(lastPagePlusOne > firstPage);
uint32 uploadRangeBegin = m_rangeBegin + firstPage * CACHE_PAGE_SIZE;
uint32 uploadRangeEnd = m_rangeBegin + lastPagePlusOne * CACHE_PAGE_SIZE;
cemu_assert_debug(uploadRangeEnd > uploadRangeBegin);
// make sure uploaded pages and hashes match
uint32 numPages = lastPagePlusOne - firstPage;
if (s_pageUploadBuffer.size() < (numPages * CACHE_PAGE_SIZE))
s_pageUploadBuffer.resize(numPages * CACHE_PAGE_SIZE);
// todo - improve performance by merging memcpy + hashPage() ?
memcpy(s_pageUploadBuffer.data(), memory_getPointerFromPhysicalOffset(uploadRangeBegin), numPages * CACHE_PAGE_SIZE);
for (uint32 i = 0; i < numPages; i++)
{
m_pageInfo[firstPage + i].hash = hashPage(s_pageUploadBuffer.data() + i * CACHE_PAGE_SIZE);
}
g_renderer->bufferCache_upload(s_pageUploadBuffer.data(), uploadRangeEnd - uploadRangeBegin, getBufferOffset(uploadRangeBegin));
}
// upload only non-streamout data of a single page
// returns true if at least one streamout 16-byte block is present
// also updates the page hash to match the uploaded data (strict match)
bool uploadPageWithStreamoutFiltered(uint32 pageIndex)
{
uint8 pageCopy[CACHE_PAGE_SIZE];
memcpy(pageCopy, memory_getPointerFromPhysicalOffset(m_rangeBegin + pageIndex * CACHE_PAGE_SIZE), CACHE_PAGE_SIZE);
MPTR pageBase = m_rangeBegin + pageIndex * CACHE_PAGE_SIZE;
sint32 blockBegin = -1;
uint64* pagePtrU64 = (uint64*)pageCopy;
m_pageInfo[pageIndex].hash = hashPage(pageCopy);
bool hasStreamoutBlocks = false;
for (sint32 i = 0; i < CACHE_PAGE_SIZE / 16; i++)
{
if (pagePtrU64[0] == c_streamoutSig0 && pagePtrU64[1] == c_streamoutSig1)
{
hasStreamoutBlocks = true;
if (blockBegin != -1)
{
uint32 uploadRelRangeBegin = blockBegin * 16;
uint32 uploadRelRangeEnd = i * 16;
cemu_assert_debug(uploadRelRangeEnd > uploadRelRangeBegin);
g_renderer->bufferCache_upload(pageCopy + uploadRelRangeBegin, uploadRelRangeEnd - uploadRelRangeBegin, getBufferOffset(pageBase + uploadRelRangeBegin));
blockBegin = -1;
}
pagePtrU64 += 2;
continue;
}
else if (blockBegin == -1)
blockBegin = i;
pagePtrU64 += 2;
}
if (blockBegin != -1)
{
uint32 uploadRelRangeBegin = blockBegin * 16;
uint32 uploadRelRangeEnd = CACHE_PAGE_SIZE;
cemu_assert_debug(uploadRelRangeEnd > uploadRelRangeBegin);
g_renderer->bufferCache_upload(pageCopy + uploadRelRangeBegin, uploadRelRangeEnd - uploadRelRangeBegin, getBufferOffset(pageBase + uploadRelRangeBegin));
blockBegin = -1;
}
return hasStreamoutBlocks;
}
void shrink(MPTR newRangeBegin, MPTR newRangeEnd)
{
cemu_assert_debug(newRangeBegin >= m_rangeBegin);
cemu_assert_debug(newRangeEnd >= m_rangeEnd);
cemu_assert_debug(newRangeEnd > m_rangeBegin);
assert_dbg(); // todo (resize page array)
m_rangeBegin = newRangeBegin;
m_rangeEnd = newRangeEnd;
}
static uint64 hashPage(uint8* mem)
{
// note - this algorithm is/was also baked into pageWriteStreamoutSignatures()
uint64 h = 0;
uint64* memU64 = (uint64*)mem;
for (uint32 i = 0; i < CACHE_PAGE_SIZE / 8; i++)
{
//h = _rotr64(h, 7);
//h ^= *memU64;
//memU64++;
h = std::rotr<uint64>(h, 7);
h += (*memU64 + (uint64)i);
memU64++;
}
return h;
}
// flag the page as having streamout data and write streamout signature blocks to the page memory
// for fully covered pages the hash is set to the precomputed all-signature hash, so the signature write itself does not trigger a re-upload
// partially covered pages get their hash reset, which forces a streamout-filtered re-upload on the next modification check
void pageWriteStreamoutSignatures(uint32 pageIndex, MPTR rangeBegin, MPTR rangeEnd)
{
uint32 pageRangeBegin = m_rangeBegin + pageIndex * CACHE_PAGE_SIZE;
uint32 pageRangeEnd = pageRangeBegin + CACHE_PAGE_SIZE;
rangeBegin = std::max(pageRangeBegin, rangeBegin);
rangeEnd = std::min(pageRangeEnd, rangeEnd);
cemu_assert_debug(rangeEnd > rangeBegin);
cemu_assert_debug(rangeBegin >= pageRangeBegin);
cemu_assert_debug(rangeEnd <= pageRangeEnd);
cemu_assert_debug((rangeBegin & 0xF) == 0);
cemu_assert_debug((rangeEnd & 0xF) == 0);
auto pageInfo = m_pageInfo.data() + pageIndex;
pageInfo->hasStreamoutData = true;
// if the whole page is replaced we can use a cached hash
if (pageRangeBegin == rangeBegin && pageRangeEnd == rangeEnd)
{
uint64* pageMem = (uint64*)memory_getPointerFromPhysicalOffset(rangeBegin);
uint32 numBlocks = (rangeEnd - rangeBegin) / 16;
for (uint32 i = 0; i < numBlocks; i++)
{
pageMem[0] = c_streamoutSig0;
pageMem[1] = c_streamoutSig1;
pageMem += 2;
}
pageInfo->hash = c_fullStreamoutPageHash;
return;
}
uint64* pageMem = (uint64*)memory_getPointerFromPhysicalOffset(rangeBegin);
uint32 numBlocks = (rangeEnd - rangeBegin) / 16;
for (uint32 i = 0; i < numBlocks; i++)
{
pageMem[0] = c_streamoutSig0;
pageMem[1] = c_streamoutSig1;
pageMem += 2;
}
pageInfo->hash = 0; // reset hash
}
static uint64 genStreamoutPageHash()
{
uint8 pageMem[CACHE_PAGE_SIZE];
uint64* pageMemU64 = (uint64*)pageMem;
for (uint32 i = 0; i < sizeof(pageMem) / sizeof(uint64) / 2; i++)
{
pageMemU64[0] = c_streamoutSig0;
pageMemU64[1] = c_streamoutSig1;
pageMemU64 += 2;
}
return hashPage(pageMem);
}
static inline uint64 c_fullStreamoutPageHash = genStreamoutPageHash();
static std::vector<uint32> g_deallocateQueue;
public:
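// free the heap allocations of nodes destroyed since the last call; deallocation is deferred so the memory stays valid for the current drawcall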
static void ProcessDeallocations()
{
for(auto& itr : g_deallocateQueue)
g_gpuBufferHeap->freeOffset(itr);
g_deallocateQueue.clear();
}
// drops every range from the cache that is not in use, contains no streamout (GPU-only) data and does not overlap the excluded range
static void CleanupCacheAggressive(MPTR excludedRangeBegin, MPTR excludedRangeEnd)
{
size_t i = 0;
while (i < s_allCacheNodes.size())
{
BufferCacheNode* node = s_allCacheNodes[i];
if (node->isInUse())
{
i++;
continue;
}
if(!node->isRAMOnly())
{
i++;
continue;
}
if(node->GetRangeBegin() < excludedRangeEnd && node->GetRangeEnd() > excludedRangeBegin)
{
i++;
continue;
}
// delete range
node->ReleaseCacheMemoryImmediately();
LatteBufferCache_removeSingleNodeFromTree(node);
delete node;
}
}
/* callbacks from IntervalTree */
static BufferCacheNode* Create(MPTR rangeBegin, MPTR rangeEnd, std::span<BufferCacheNode*> overlappingObjects)
{
auto newRange = new BufferCacheNode(rangeBegin, rangeEnd);
if (!newRange->allocateCacheMemory())
{
// not enough memory available, try to drop ram-only ranges from the ones we replace
for (size_t i = 0; i < overlappingObjects.size(); i++)
{
BufferCacheNode* nodeItr = overlappingObjects[i];
if (!nodeItr->isInUse() && nodeItr->isRAMOnly())
{
nodeItr->ReleaseCacheMemoryImmediately();
delete nodeItr;
overlappingObjects[i] = nullptr;
}
}
// retry allocation
if (!newRange->allocateCacheMemory())
{
forceLog_printf("Out-of-memory in GPU buffer (trying to allocate: %dKB) Cleaning up cache...", (rangeEnd - rangeBegin + 1023) / 1024);
CleanupCacheAggressive(rangeBegin, rangeEnd);
if (!newRange->allocateCacheMemory())
{
forceLog_printf("Failed to free enough memory in GPU buffer");
cemu_assert(false);
}
}
}
newRange->syncFromRAM(rangeBegin, rangeEnd); // possible small optimization: only load the ranges from RAM which are not overwritten by ->syncFromNode()
for (auto itr : overlappingObjects)
{
if(itr == nullptr)
continue;
newRange->syncFromNode(itr);
delete itr;
}
return newRange;
}
static void Delete(BufferCacheNode* nodeObject)
{
delete nodeObject;
}
static void Resize(BufferCacheNode* nodeObject, MPTR rangeBegin, MPTR rangeEnd)
{
nodeObject->shrink(rangeBegin, rangeEnd);
}
static BufferCacheNode* Split(BufferCacheNode* nodeObject, MPTR firstRangeBegin, MPTR firstRangeEnd, MPTR secondRangeBegin, MPTR secondRangeEnd)
{
auto newRange = new BufferCacheNode(secondRangeBegin, secondRangeEnd);
// todo - add support for splitting BufferCacheNode memory allocations, then we dont need to do a separate allocation
if (!newRange->allocateCacheMemory())
{
forceLog_printf("Out-of-memory in GPU buffer during split operation");
cemu_assert(false);
}
newRange->syncFromNode(nodeObject);
nodeObject->shrink(firstRangeBegin, firstRangeEnd);
return newRange;
}
};
std::vector<uint32> BufferCacheNode::g_deallocateQueue;
IntervalTree2<MPTR, BufferCacheNode> g_gpuBufferCache;
void LatteBufferCache_removeSingleNodeFromTree(BufferCacheNode* node)
{
g_gpuBufferCache.removeRangeSingleWithoutCallback(node->GetRangeBegin(), node->GetRangeEnd());
}
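// return the cache node covering the page-aligned range around [physAddress, physAddress+size), creating (and merging) nodes as needed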
BufferCacheNode* LatteBufferCache_reserveRange(MPTR physAddress, uint32 size)
{
MPTR rangeStart = physAddress - (physAddress % CACHE_PAGE_SIZE);
MPTR rangeEnd = (physAddress + size + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
auto range = g_gpuBufferCache.getRange(rangeStart, rangeEnd);
if (!range)
{
g_gpuBufferCache.addRange(rangeStart, rangeEnd);
range = g_gpuBufferCache.getRange(rangeStart, rangeEnd);
cemu_assert_debug(range);
}
cemu_assert_debug(range->GetRangeBegin() <= physAddress);
cemu_assert_debug(range->GetRangeEnd() >= (physAddress + size));
return range;
}
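// make sure [physAddress, physAddress+size) is present and up to date in the GPU buffer cache and return its offset within the cache buffer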
uint32 LatteBufferCache_retrieveDataInCache(MPTR physAddress, uint32 size)
{
auto range = LatteBufferCache_reserveRange(physAddress, size);
range->flagInUse();
range->checkAndSyncModificationsIfChrononChanged(physAddress, size);
return range->getBufferOffset(physAddress);
}
void LatteBufferCache_copyStreamoutDataToCache(MPTR physAddress, uint32 size, uint32 streamoutBufferOffset)
{
if (size == 0)
return;
cemu_assert_debug(size >= 16);
auto range = LatteBufferCache_reserveRange(physAddress, size);
range->flagInUse();
g_renderer->bufferCache_copyStreamoutToMainBuffer(streamoutBufferOffset, range->getBufferOffset(physAddress), size);
// write streamout signatures, flag affected pages
range->writeStreamout(physAddress, (physAddress + size));
}
void LatteBufferCache_invalidate(MPTR physAddress, uint32 size)
{
if (size == 0)
return;
g_gpuBufferCache.forEachOverlapping(physAddress, physAddress + size, [](BufferCacheNode* node, MPTR invalidationRangeBegin, MPTR invalidationRangeEnd)
{
node->invalidate(invalidationRangeBegin, invalidationRangeEnd);
}
);
}
// optimized version of LatteBufferCache_invalidate() if physAddress points to the beginning of a page
void LatteBufferCache_invalidatePage(MPTR physAddress)
{
cemu_assert_debug((physAddress & CACHE_PAGE_SIZE_M1) == 0);
BufferCacheNode* node = g_gpuBufferCache.getRangeByPoint(physAddress);
if (node)
node->invalidate(physAddress, physAddress+CACHE_PAGE_SIZE);
}
void LatteBufferCache_processDeallocations()
{
BufferCacheNode::ProcessDeallocations();
}
void LatteBufferCache_init(size_t bufferSize)
{
g_gpuBufferHeap.reset(new VHeap(nullptr, (uint32)bufferSize));
g_renderer->bufferCache_init((uint32)bufferSize);
}
void LatteBufferCache_getStats(uint32& heapSize, uint32& allocationSize, uint32& allocNum)
{
g_gpuBufferHeap->getStats(heapSize, allocationSize, allocNum);
}
FSpinlock g_spinlockDCFlushQueue;
std::unordered_set<uint32>* g_DCFlushQueue = new std::unordered_set<uint32>(); // queued pages
std::unordered_set<uint32>* g_DCFlushQueueAlternate = new std::unordered_set<uint32>();
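// queue the pages touched by a data cache flush for invalidation; the queue is drained by LatteBufferCache_processDCFlushQueue().
// Presumably called from outside the GPU thread, hence the spinlock-protected double-buffered queue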
void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
{
if (address == 0 || size == 0xFFFFFFFF)
return; // global flushes are ignored for now
uint32 firstPage = address / CACHE_PAGE_SIZE;
uint32 lastPage = (address + size - 1) / CACHE_PAGE_SIZE;
g_spinlockDCFlushQueue.acquire();
for (uint32 i = firstPage; i <= lastPage; i++)
g_DCFlushQueue->emplace(i);
g_spinlockDCFlushQueue.release();
}
void LatteBufferCache_processDCFlushQueue()
{
if (g_DCFlushQueue->empty()) // accessing this outside of the lock is technically undefined/unsafe behavior but on all known implementations this is fine and we can avoid the spinlock
return;
g_spinlockDCFlushQueue.acquire();
std::swap(g_DCFlushQueue, g_DCFlushQueueAlternate);
g_spinlockDCFlushQueue.release();
for (auto& itr : *g_DCFlushQueueAlternate)
LatteBufferCache_invalidatePage(itr * CACHE_PAGE_SIZE);
g_DCFlushQueueAlternate->clear();
}
void LatteBufferCache_notifyDrawDone()
{
}
void LatteBufferCache_notifySwapTVScanBuffer()
{
if( ActiveSettings::FlushGPUCacheOnSwap() )
g_currentCacheChronon++;
}
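// called periodically; each call inspects one cache node (round-robin) and frees it if its last use is older than a frame-age threshold that scales with heap utilization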
void LatteBufferCache_incrementalCleanup()
{
static uint32 s_counter = 0;
if (s_allCacheNodes.empty())
return;
s_counter++;
s_counter %= (uint32)s_allCacheNodes.size();
auto range = s_allCacheNodes[s_counter];
if (range->HasStreamoutData())
{
// currently we never delete streamout ranges
// todo - check if streamout pages got overwritten + if the range would lose the hasStreamoutData flag
return;
}
uint32 heapSize;
uint32 allocationSize;
uint32 allocNum;
g_gpuBufferHeap->getStats(heapSize, allocationSize, allocNum);
if (allocationSize >= (heapSize * 4 / 5))
{
// heap is at least 80% filled
if (range->GetFrameAge() >= 2)
{
g_gpuBufferCache.removeRangeSingle(range->GetRangeBegin(), range->GetRangeEnd());
}
}
else if (allocationSize >= (heapSize * 3 / 4))
{
// heap is 75-80% filled
if (range->GetFrameAge() >= 4)
{
g_gpuBufferCache.removeRangeSingle(range->GetRangeBegin(), range->GetRangeEnd());
}
}
else if (allocationSize >= (heapSize / 2))
{
// heap is 50-75% filled
if (range->GetFrameAge() >= 20)
{
g_gpuBufferCache.removeRangeSingle(range->GetRangeBegin(), range->GetRangeEnd());
}
}
else
{
// heap is under 50% capacity
if (range->GetFrameAge() >= 500)
{
g_gpuBufferCache.removeRangeSingle(range->GetRangeBegin(), range->GetRangeEnd());
}
}
}