// Cemu/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp

#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "util/ChunkedHeap/ChunkedHeap.h"
#include "util/helpers/fspinlock.h"
#include "config/ActiveSettings.h"
#define CACHE_PAGE_SIZE 0x400
#define CACHE_PAGE_SIZE_M1 (CACHE_PAGE_SIZE-1)
uint32 g_currentCacheChronon = 0;
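// IntervalTree2 maintains a set of non-overlapping [begin, end) ranges, each associated with a TNodeObject.
// It is built on top of std::map with a comparator that treats overlapping ranges as equivalent,
// so lookups by range or by point return a stored range that overlaps the query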
template<typename TRangeData, typename TNodeObject>
class IntervalTree2
{
// TNodeObject will be interfaced with via callbacks to static methods
// static TNodeObject* Create(TRangeData rangeBegin, TRangeData rangeEnd, std::span<TNodeObject*> overlappingObjects)
// Create a new node with the given range. overlappingObjects contains all the nodes that are replaced by this operation. The callee takes ownership and must delete all objects in overlappingObjects (the Delete callback won't be invoked for them)
// static void Delete(TNodeObject* nodeObject)
// Delete a node object. Replacement operations won't trigger this callback and instead pass the objects to Create()
// static void Resize(TNodeObject* nodeObject, TRangeData rangeBegin, TRangeData rangeEnd)
// Shrink or extend an existing range
// static TNodeObject* Split(TNodeObject* nodeObject, TRangeData firstRangeBegin, TRangeData firstRangeEnd, TRangeData secondRangeBegin, TRangeData secondRangeEnd)
// Cut a hole into an existing range and split it in two. Should return the newly created node object after the hole
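// Illustrative sketch (hypothetical, not used anywhere in this file): a minimal TNodeObject
// satisfying this interface could look like the following.
//   struct ExampleNode
//   {
//       static ExampleNode* Create(uint32 begin, uint32 end, std::span<ExampleNode*> overlapping)
//       {
//           for (ExampleNode* o : overlapping)
//               delete o; // Create() takes ownership of the replaced nodes
//           return new ExampleNode();
//       }
//       static void Delete(ExampleNode* obj) { delete obj; }
//       static void Resize(ExampleNode* obj, uint32 begin, uint32 end) { /* adjust backing data */ }
//       static ExampleNode* Split(ExampleNode* obj, uint32 firstBegin, uint32 firstEnd, uint32 secondBegin, uint32 secondEnd)
//       {
//           return new ExampleNode(); // node covering [secondBegin, secondEnd)
//       }
//   };
//   IntervalTree2<uint32, ExampleNode> tree;
//   tree.addRange(0x000, 0x800);
//   tree.removeRange(0x400, 0x800);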
static_assert(std::is_pointer<TNodeObject>::value == false, "TNodeObject must be a non-pointer type");
struct InternalRange
{
InternalRange() {};
InternalRange(TRangeData _rangeBegin, TRangeData _rangeEnd) : rangeBegin(_rangeBegin), rangeEnd(_rangeEnd) { cemu_assert_debug(_rangeBegin < _rangeEnd); };
TRangeData rangeBegin;
TRangeData rangeEnd;
bool operator<(const InternalRange& rhs) const
{
// use <= instead of < because ranges are allowed to touch (e.g. 10-20 and 20-30 don't get merged)
return this->rangeEnd <= rhs.rangeBegin;
}
};
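// stored ranges never overlap each other; due to the comparator above, find() on a query range returns an entry that overlaps the query (if any)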
std::map<InternalRange, TNodeObject*> m_map;
std::vector<TNodeObject*> m_tempObjectArray;
public:
TNodeObject* getRange(TRangeData rangeBegin, TRangeData rangeEnd)
{
auto itr = m_map.find(InternalRange(rangeBegin, rangeEnd));
if (itr == m_map.cend())
return nullptr;
if (rangeBegin < (*itr).first.rangeBegin)
return nullptr;
if (rangeEnd > (*itr).first.rangeEnd)
return nullptr;
return (*itr).second;
}
TNodeObject* getRangeByPoint(TRangeData rangeOffset)
{
auto itr = m_map.find(InternalRange(rangeOffset, rangeOffset+1)); // todo - better to use custom comparator instead of +1?
if (itr == m_map.cend())
return nullptr;
cemu_assert_debug(rangeOffset >= (*itr).first.rangeBegin);
cemu_assert_debug(rangeOffset < (*itr).first.rangeEnd);
return (*itr).second;
}
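// insert [rangeBegin, rangeEnd); any existing ranges that overlap it are merged with it into a single new node via the Create() callback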
void addRange(TRangeData rangeBegin, TRangeData rangeEnd)
{
if (rangeEnd == rangeBegin)
return;
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
if (itr == m_map.cend())
{
// new entry
m_map.emplace(range, TNodeObject::Create(rangeBegin, rangeEnd, std::span<TNodeObject*>()));
}
else
{
// overlap detected
if (rangeBegin >= (*itr).first.rangeBegin && rangeEnd <= (*itr).first.rangeEnd)
return; // do nothing if added range is already covered
rangeBegin = (std::min)(rangeBegin, (*itr).first.rangeBegin);
// DEBUG - verify that itr is the first entry of the merge process (the previous entry must not reach into the merged range)
#ifndef PUBLIC_RELEASE
if (itr != m_map.cbegin())
{
// check previous result
auto itrCopy = itr;
--itrCopy;
if ((*itrCopy).first.rangeEnd > rangeBegin)
{
assert_dbg(); // n-1 entry is also overlapping
rangeBegin = (std::min)(rangeBegin, (*itrCopy).first.rangeBegin);
}
}
#endif
// DEBUG - END
// collect and remove all overlapping ranges
size_t count = 0;
while (itr != m_map.cend() && (*itr).first.rangeBegin < rangeEnd)
{
rangeEnd = (std::max)(rangeEnd, (*itr).first.rangeEnd);
if (m_tempObjectArray.size() <= count)
m_tempObjectArray.resize(count + 8);
m_tempObjectArray[count] = (*itr).second;
count++;
auto tempItr = itr;
++itr;
m_map.erase(tempItr);
}
// create callback
TNodeObject* newObject = TNodeObject::Create(rangeBegin, rangeEnd, std::span<TNodeObject*>(m_tempObjectArray.data(), count));
m_map.emplace(InternalRange(rangeBegin, rangeEnd), newObject);
}
}
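// remove [rangeBegin, rangeEnd) from the tree. Fully covered nodes are deleted, a node fully containing the range is split in two, partially overlapping nodes are trimmed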
void removeRange(TRangeData rangeBegin, TRangeData rangeEnd)
{
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
if (itr == m_map.cend())
return;
cemu_assert_debug(itr == m_map.lower_bound(range));
while (itr != m_map.cend() && (*itr).first.rangeBegin < rangeEnd)
{
if ((*itr).first.rangeBegin >= rangeBegin && (*itr).first.rangeEnd <= rangeEnd)
{
// delete entire range
auto itrCopy = itr;
TNodeObject* t = (*itr).second;
++itr;
m_map.erase(itrCopy);
TNodeObject::Delete(t);
continue;
}
if (rangeBegin > (*itr).first.rangeBegin && rangeEnd < (*itr).first.rangeEnd)
{
// cut hole into existing range
TRangeData firstRangeBegin = (*itr).first.rangeBegin;
TRangeData firstRangeEnd = rangeBegin;
TRangeData secondRangeBegin = rangeEnd;
TRangeData secondRangeEnd = (*itr).first.rangeEnd;
TNodeObject* newObject = TNodeObject::Split((*itr).second, firstRangeBegin, firstRangeEnd, secondRangeBegin, secondRangeEnd);
// modify key
auto nh = m_map.extract(itr);
nh.key().rangeBegin = firstRangeBegin;
nh.key().rangeEnd = firstRangeEnd;
m_map.insert(std::move(nh));
// insert new object after hole
m_map.emplace(InternalRange(secondRangeBegin, secondRangeEnd), newObject);
return; // done
}
// shrink (trim either beginning or end)
TRangeData newRangeBegin;
TRangeData newRangeEnd;
if ((rangeBegin <= (*itr).first.rangeBegin && rangeEnd < (*itr).first.rangeEnd))
{
// trim from beginning
newRangeBegin = (std::max)((*itr).first.rangeBegin, rangeEnd);
newRangeEnd = (*itr).first.rangeEnd;
}
else if ((rangeBegin > (*itr).first.rangeBegin && rangeEnd >= (*itr).first.rangeEnd))
{
// trim from end
newRangeBegin = (*itr).first.rangeBegin;
newRangeEnd = (std::min)((*itr).first.rangeEnd, rangeBegin);
}
else
{
assert_dbg(); // should not happen
}
TNodeObject::Resize((*itr).second, newRangeBegin, newRangeEnd);
// modify key and increment iterator
auto itrCopy = itr;
++itr;
auto nh = m_map.extract(itrCopy);
nh.key().rangeBegin = newRangeBegin;
nh.key().rangeEnd = newRangeEnd;
m_map.insert(std::move(nh));
}
}
// remove existing range that matches given begin and end
void removeRangeSingle(TRangeData rangeBegin, TRangeData rangeEnd)
{
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
cemu_assert_debug(itr != m_map.cend());
if (itr == m_map.cend())
return;
cemu_assert_debug((*itr).first.rangeBegin == rangeBegin && (*itr).first.rangeEnd == rangeEnd);
// delete entire range
TNodeObject* t = (*itr).second;
m_map.erase(itr);
TNodeObject::Delete(t);
}
// remove existing range that matches given begin and end without calling delete callback
void removeRangeSingleWithoutCallback(TRangeData rangeBegin, TRangeData rangeEnd)
{
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
cemu_assert_debug(itr != m_map.cend());
if (itr == m_map.cend())
return;
cemu_assert_debug((*itr).first.rangeBegin == rangeBegin && (*itr).first.rangeEnd == rangeEnd);
// remove the range from the map without invoking the Delete callback
m_map.erase(itr);
}
void splitRange(TRangeData rangeOffset)
{
// not well tested
removeRange(rangeOffset, rangeOffset+1);
}
template<typename TFunc>
void forEachOverlapping(TRangeData rangeBegin, TRangeData rangeEnd, TFunc f)
{
InternalRange range(rangeBegin, rangeEnd);
auto itr = m_map.find(range);
if (itr == m_map.cend())
return;
cemu_assert_debug(itr == m_map.lower_bound(range));
while (itr != m_map.cend() && (*itr).first.rangeBegin < rangeEnd)
{
f((*itr).second, rangeBegin, rangeEnd);
++itr;
}
}
void validate()
{
if (m_map.empty())
return;
auto itr = m_map.begin();
if ((*itr).first.rangeBegin > (*itr).first.rangeEnd)
assert_dbg();
TRangeData currentLoc = (*itr).first.rangeEnd;
++itr;
while (itr != m_map.end())
{
if ((*itr).first.rangeBegin >= (*itr).first.rangeEnd)
assert_dbg(); // negative or zero size ranges are not allowed
if (currentLoc > (*itr).first.rangeBegin)
assert_dbg(); // stored ranges must not overlap
currentLoc = (*itr).first.rangeEnd;
++itr;
}
}
const std::map<InternalRange, TNodeObject*>& getAll() const { return m_map; };
};
std::unique_ptr<VHeap> g_gpuBufferHeap = nullptr;
std::vector<uint8> s_pageUploadBuffer;
std::vector<class BufferCacheNode*> s_allCacheNodes;
void LatteBufferCache_removeSingleNodeFromTree(BufferCacheNode* node);
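// A BufferCacheNode represents a contiguous, page-aligned range of physical memory that is mirrored in the GPU-side buffer cache.
// Modifications are detected by hashing each 1KiB page; pages written by GPU streamout are stamped with signature blocks so that
// CPU-side uploads don't overwrite the GPU-only data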
class BufferCacheNode
{
static inline constexpr uint64 c_streamoutSig0 = 0xF0F0F0F0155C5B6Aull;
static inline constexpr uint64 c_streamoutSig1 = 0x8BE6336411814F4Full;
public:
// returns false if not enough space is available
bool allocateCacheMemory()
{
cemu_assert_debug(m_hasCacheAlloc == false);
cemu_assert_debug(m_rangeEnd > m_rangeBegin);
m_hasCacheAlloc = g_gpuBufferHeap->allocOffset(m_rangeEnd - m_rangeBegin, CACHE_PAGE_SIZE, m_cacheOffset);
return m_hasCacheAlloc;
}
void ReleaseCacheMemoryImmediately()
{
if (m_hasCacheAlloc)
{
cemu_assert_debug(isInUse() == false);
g_gpuBufferHeap->freeOffset(m_cacheOffset);
m_hasCacheAlloc = false;
}
}
uint32 getBufferOffset(MPTR physAddr) const
{
cemu_assert_debug(m_hasCacheAlloc);
cemu_assert_debug(physAddr >= m_rangeBegin);
cemu_assert_debug(physAddr < m_rangeEnd);
uint32 relOffset = physAddr - m_rangeBegin;
return m_cacheOffset + relOffset;
}
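// mark [rangeBegin, rangeEnd) as containing streamout (GPU-written) data and stamp the corresponding RAM pages with signature blocks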
void writeStreamout(MPTR rangeBegin, MPTR rangeEnd)
{
if ((rangeBegin & 0xF))
{
forceLogDebug_printf("writeStreamout(): RangeBegin not aligned to 16. Begin %08x End %08x", rangeBegin, rangeEnd);
rangeBegin = (rangeBegin + 0xF) & ~0xF;
rangeEnd = std::max(rangeBegin, rangeEnd);
}
if (rangeEnd & 0xF)
{
// todo - add support for 4 byte granularity for streamout writes and cache
// used by Affordable Space Adventures and YWW Level 1-8
// also used by CoD Ghosts (8 byte granularity)
//forceLogDebug_printf("Streamout write size is not aligned to 16 bytes");
rangeEnd &= ~0xF;
}
//cemu_assert_debug((rangeEnd & 0xF) == 0);
rangeBegin = std::max(rangeBegin, m_rangeBegin);
rangeEnd = std::min(rangeEnd, m_rangeEnd);
if (rangeBegin >= rangeEnd)
return;
sint32 numPages = getPageCountFromRange(rangeBegin, rangeEnd);
sint32 pageIndex = getPageIndexFromAddr(rangeBegin);
cemu_assert_debug((m_rangeBegin + pageIndex * CACHE_PAGE_SIZE) <= rangeBegin);
cemu_assert_debug((m_rangeBegin + (pageIndex + numPages) * CACHE_PAGE_SIZE) >= rangeEnd);
for (sint32 i = 0; i < numPages; i++)
{
pageWriteStreamoutSignatures(pageIndex, rangeBegin, rangeEnd);
pageIndex++;
//pageInfo->hasStreamoutData = true;
//pageInfo++;
}
if (numPages > 0)
m_hasStreamoutData = true;
}
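// hash every page in [rangeBegin, rangeEnd) and (if uploadData is set) upload the pages whose hash changed; pages flagged as streamout are uploaded with 16-byte filtering so GPU-written blocks are preserved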
void checkAndSyncModifications(MPTR rangeBegin, MPTR rangeEnd, bool uploadData)
{
cemu_assert_debug(rangeBegin >= m_rangeBegin);
cemu_assert_debug(rangeEnd <= m_rangeEnd);
cemu_assert_debug(rangeBegin < m_rangeEnd);
cemu_assert_debug((rangeBegin % CACHE_PAGE_SIZE) == 0);
cemu_assert_debug((rangeEnd % CACHE_PAGE_SIZE) == 0);
sint32 basePageIndex = getPageIndexFromAddrAligned(rangeBegin);
sint32 numPages = getPageCountFromRangeAligned(rangeBegin, rangeEnd);
uint8* pagePtr = memory_getPointerFromPhysicalOffset(rangeBegin);
sint32 uploadPageBegin = -1;
CachePageInfo* pageInfo = m_pageInfo.data() + basePageIndex;
for (sint32 i = 0; i < numPages; i++)
{
if (pageInfo->hasStreamoutData)
{
// first upload any pending sequence of pages
if (uploadPageBegin != -1)
{
// upload range
if (uploadData)
uploadPages(uploadPageBegin, basePageIndex + i);
uploadPageBegin = -1;
}
// check if hash changed
uint64 pageHash = hashPage(pagePtr);
if (pageInfo->hash != pageHash)
{
pageInfo->hash = pageHash;
// for pages that contain streamout data we do uploads with a much smaller granularity
// and skip uploading any data that is marked with streamout filler bytes
if (!uploadPageWithStreamoutFiltered(basePageIndex + i))
pageInfo->hasStreamoutData = false; // all streamout data was replaced
}
pagePtr += CACHE_PAGE_SIZE;
pageInfo++;
continue;
}
uint64 pageHash = hashPage(pagePtr);
pagePtr += CACHE_PAGE_SIZE;
if (pageInfo->hash != pageHash)
{
if (uploadPageBegin == -1)
uploadPageBegin = i + basePageIndex;
pageInfo->hash = pageHash;
}
else
{
if (uploadPageBegin != -1)
{
// upload range
if (uploadData)
uploadPages(uploadPageBegin, basePageIndex + i);
uploadPageBegin = -1;
}
}
pageInfo++;
}
if (uploadPageBegin != -1)
{
if (uploadData)
uploadPages(uploadPageBegin, basePageIndex + numPages);
}
}
void checkAndSyncModifications(bool uploadData)
{
checkAndSyncModifications(m_rangeBegin, m_rangeEnd, uploadData);
m_lastModifyCheckCronon = g_currentCacheChronon;
m_hasInvalidation = false;
}
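// g_currentCacheChronon acts as a coherency epoch. If it changed since the last check the whole range is re-hashed and synced,
// otherwise only the pending invalidation range (clamped to the reserved range where possible) is synced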
void checkAndSyncModificationsIfChrononChanged(MPTR reservePhysAddress, uint32 reserveSize)
{
if (m_lastModifyCheckCronon != g_currentCacheChronon)
{
m_lastModifyCheckCronon = g_currentCacheChronon;
checkAndSyncModifications(m_rangeBegin, m_rangeEnd, true);
m_hasInvalidation = false;
}
if (m_hasInvalidation)
{
// ideally we would only upload the pages that intersect both the reserve range and the invalidation range
// but this would require complex per-page tracking of invalidation. Since this is on a hot path we do a cheap approximation
// where we only track one continuous invalidation range
// try to bound uploads to the reserve range within the invalidation
uint32 resRangeBegin = reservePhysAddress & ~CACHE_PAGE_SIZE_M1;
uint32 resRangeEnd = ((reservePhysAddress + reserveSize) + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
uint32 uploadBegin = std::max(m_invalidationRangeBegin, resRangeBegin);
uint32 uploadEnd = std::min(resRangeEnd, m_invalidationRangeEnd);
if (uploadBegin >= uploadEnd)
return; // reserve range not within invalidation or range is zero sized
if (uploadBegin == m_invalidationRangeBegin)
{
m_invalidationRangeBegin = uploadEnd;
checkAndSyncModifications(uploadBegin, uploadEnd, true);
}
if (uploadEnd == m_invalidationRangeEnd)
{
m_invalidationRangeEnd = uploadBegin;
checkAndSyncModifications(uploadBegin, uploadEnd, true);
}
else
{
// upload all of invalidation
checkAndSyncModifications(m_invalidationRangeBegin, m_invalidationRangeEnd, true);
m_invalidationRangeBegin = m_invalidationRangeEnd;
}
if(m_invalidationRangeEnd <= m_invalidationRangeBegin)
m_hasInvalidation = false;
//if (resRangeBegin <= m_invalidationRangeBegin)
//{
// // shrink/replace invalidation range from the bottom
// uint32 uploadBegin = m_invalidationRangeBegin;//std::max(m_invalidationRangeBegin, resRangeBegin);
// uint32 uploadEnd = std::min(resRangeEnd, m_invalidationRangeEnd);
// cemu_assert_debug(uploadEnd >= uploadBegin);
// if (uploadBegin != uploadEnd)
// checkAndSyncModifications(uploadBegin, uploadEnd, true);
// m_invalidationRangeBegin = uploadEnd;
// cemu_assert_debug(m_invalidationRangeBegin <= m_invalidationRangeEnd);
// if (m_invalidationRangeBegin >= m_invalidationRangeEnd)
// m_hasInvalidation = false;
//}
//else if (resRangeEnd >= m_invalidationRangeEnd)
//{
// // shrink/replace invalidation range from the top
// uint32 uploadBegin = std::max(m_invalidationRangeBegin, resRangeBegin);
// uint32 uploadEnd = m_invalidationRangeEnd;// std::min(resRangeEnd, m_invalidationRangeEnd);
// cemu_assert_debug(uploadEnd >= uploadBegin);
// if (uploadBegin != uploadEnd)
// checkAndSyncModifications(uploadBegin, uploadEnd, true);
// m_invalidationRangeEnd = uploadBegin;
// cemu_assert_debug(m_invalidationRangeBegin <= m_invalidationRangeEnd);
// if (m_invalidationRangeBegin >= m_invalidationRangeEnd)
// m_hasInvalidation = false;
//}
//else
//{
// // since we cant cut holes into the range upload it in it's entirety
// cemu_assert_debug(m_invalidationRangeEnd <= m_rangeEnd);
// cemu_assert_debug(m_invalidationRangeBegin >= m_rangeBegin);
// cemu_assert_debug(m_invalidationRangeBegin < m_invalidationRangeEnd);
// checkAndSyncModifications(m_invalidationRangeBegin, m_invalidationRangeEnd, true);
// m_hasInvalidation = false;
//}
// todo - don't re-upload the whole range immediately
// under ideal circumstances we would only upload the data range requested for the current draw call
// but this is a hot path so we can't check
}
}
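// record a pending CPU-side modification of [rangeBegin, rangeEnd); overlapping invalidations are merged into a single page-aligned range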
void invalidate(MPTR rangeBegin, MPTR rangeEnd)
{
rangeBegin = std::max(rangeBegin, m_rangeBegin);
rangeEnd = std::min(rangeEnd, m_rangeEnd);
if (rangeBegin >= rangeEnd)
return;
if (m_hasInvalidation)
{
m_invalidationRangeBegin = std::min(m_invalidationRangeBegin, rangeBegin);
m_invalidationRangeEnd = std::max(m_invalidationRangeEnd, rangeEnd);
}
else
{
m_invalidationRangeBegin = rangeBegin;
m_invalidationRangeEnd = rangeEnd;
m_hasInvalidation = true;
}
cemu_assert_debug(m_invalidationRangeBegin >= m_rangeBegin);
cemu_assert_debug(m_invalidationRangeEnd <= m_rangeEnd);
cemu_assert_debug(m_invalidationRangeBegin < m_invalidationRangeEnd);
m_invalidationRangeBegin = m_invalidationRangeBegin & ~CACHE_PAGE_SIZE_M1;
m_invalidationRangeEnd = (m_invalidationRangeEnd + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
}
void flagInUse()
{
m_lastDrawcall = LatteGPUState.drawCallCounter;
m_lastFrame = LatteGPUState.frameCounter;
}
bool isInUse() const
{
return m_lastDrawcall == LatteGPUState.drawCallCounter;
}
// returns true if the range does not contain any GPU-cache-only data and can be fully restored from RAM
bool isRAMOnly() const
{
return !m_hasStreamoutData;
}
MPTR GetRangeBegin() const { return m_rangeBegin; }
MPTR GetRangeEnd() const { return m_rangeEnd; }
uint32 GetDrawcallAge() const { return LatteGPUState.drawCallCounter - m_lastDrawcall; };
uint32 GetFrameAge() const { return LatteGPUState.frameCounter - m_lastFrame; };
bool HasStreamoutData() const { return m_hasStreamoutData; };
private:
struct CachePageInfo
{
uint64 hash{ 0 };
bool hasStreamoutData{ false };
};
MPTR m_rangeBegin;
MPTR m_rangeEnd; // (exclusive)
bool m_hasCacheAlloc{ false };
uint32 m_cacheOffset{ 0 };
// usage
uint32 m_lastDrawcall;
uint32 m_lastFrame;
uint32 m_arrayIndex;
// state tracking
uint32 m_lastModifyCheckCronon{ g_currentCacheChronon - 1 };
std::vector<CachePageInfo> m_pageInfo;
bool m_hasStreamoutData{ false };
// invalidation
bool m_hasInvalidation{false};
MPTR m_invalidationRangeBegin;
MPTR m_invalidationRangeEnd;
BufferCacheNode(MPTR rangeBegin, MPTR rangeEnd): m_rangeBegin(rangeBegin), m_rangeEnd(rangeEnd)
{
flagInUse();
cemu_assert_debug(rangeBegin < rangeEnd);
size_t numPages = getPageCountFromRangeAligned(rangeBegin, rangeEnd);
m_pageInfo.resize(numPages);
// append to array
m_arrayIndex = (uint32)s_allCacheNodes.size();
s_allCacheNodes.emplace_back(this);
};
~BufferCacheNode()
{
if (m_hasCacheAlloc)
g_deallocateQueue.emplace_back(m_cacheOffset); // release after current drawcall
// remove from array
auto temp = s_allCacheNodes.back();
s_allCacheNodes.pop_back();
if (this != temp)
{
s_allCacheNodes[m_arrayIndex] = temp;
temp->m_arrayIndex = m_arrayIndex;
}
}
uint32 getPageIndexFromAddrAligned(uint32 offset) const
{
cemu_assert_debug((offset % CACHE_PAGE_SIZE) == 0);
return (offset - m_rangeBegin) / CACHE_PAGE_SIZE;
}
uint32 getPageIndexFromAddr(uint32 offset) const
{
offset &= ~CACHE_PAGE_SIZE_M1;
return (offset - m_rangeBegin) / CACHE_PAGE_SIZE;
}
uint32 getPageCountFromRangeAligned(MPTR rangeBegin, MPTR rangeEnd) const
{
cemu_assert_debug((rangeBegin % CACHE_PAGE_SIZE) == 0);
cemu_assert_debug((rangeEnd % CACHE_PAGE_SIZE) == 0);
cemu_assert_debug(rangeBegin <= rangeEnd);
return (rangeEnd - rangeBegin) / CACHE_PAGE_SIZE;
}
uint32 getPageCountFromRange(MPTR rangeBegin, MPTR rangeEnd) const
{
rangeEnd = (rangeEnd + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
rangeBegin &= ~CACHE_PAGE_SIZE_M1;
cemu_assert_debug(rangeBegin <= rangeEnd);
return (rangeEnd - rangeBegin) / CACHE_PAGE_SIZE;
}
void syncFromRAM(MPTR rangeBegin, MPTR rangeEnd)
{
cemu_assert_debug(rangeBegin >= m_rangeBegin);
cemu_assert_debug(rangeEnd <= m_rangeEnd);
cemu_assert_debug(rangeEnd > rangeBegin);
cemu_assert_debug(m_hasCacheAlloc);
// reset write tracking
checkAndSyncModifications(rangeBegin, rangeEnd, false);
g_renderer->bufferCache_upload(memory_getPointerFromPhysicalOffset(rangeBegin), rangeEnd - rangeBegin, getBufferOffset(rangeBegin));
}
void syncFromNode(BufferCacheNode* srcNode)
{
// get shared range
MPTR rangeBegin = std::max(m_rangeBegin, srcNode->m_rangeBegin);
MPTR rangeEnd = std::min(m_rangeEnd, srcNode->m_rangeEnd);
cemu_assert_debug(rangeBegin < rangeEnd);
g_renderer->bufferCache_copy(srcNode->getBufferOffset(rangeBegin), this->getBufferOffset(rangeBegin), rangeEnd - rangeBegin);
// copy page checksums and information
sint32 numPages = getPageCountFromRangeAligned(rangeBegin, rangeEnd);
CachePageInfo* pageInfoDst = this->m_pageInfo.data() + this->getPageIndexFromAddrAligned(rangeBegin);
CachePageInfo* pageInfoSrc = srcNode->m_pageInfo.data() + srcNode->getPageIndexFromAddrAligned(rangeBegin);
for (sint32 i = 0; i < numPages; i++)
{
pageInfoDst[i] = pageInfoSrc[i];
if (pageInfoSrc[i].hasStreamoutData)
m_hasStreamoutData = true;
}
}
void uploadPages(uint32 firstPage, uint32 lastPagePlusOne)
{
cemu_assert_debug(lastPagePlusOne > firstPage);
uint32 uploadRangeBegin = m_rangeBegin + firstPage * CACHE_PAGE_SIZE;
uint32 uploadRangeEnd = m_rangeBegin + lastPagePlusOne * CACHE_PAGE_SIZE;
cemu_assert_debug(uploadRangeEnd > uploadRangeBegin);
// make sure uploaded pages and hashes match
uint32 numPages = lastPagePlusOne - firstPage;
if (s_pageUploadBuffer.size() < (numPages * CACHE_PAGE_SIZE))
s_pageUploadBuffer.resize(numPages * CACHE_PAGE_SIZE);
// todo - improve performance by merging memcpy + hashPage() ?
memcpy(s_pageUploadBuffer.data(), memory_getPointerFromPhysicalOffset(uploadRangeBegin), numPages * CACHE_PAGE_SIZE);
for (uint32 i = 0; i < numPages; i++)
{
m_pageInfo[firstPage + i].hash = hashPage(s_pageUploadBuffer.data() + i * CACHE_PAGE_SIZE);
}
g_renderer->bufferCache_upload(s_pageUploadBuffer.data(), uploadRangeEnd - uploadRangeBegin, getBufferOffset(uploadRangeBegin));
}
// upload only non-streamout data of a single page
// returns true if at least one streamout 16-byte block is present
// also updates the page hash to match the uploaded data (strict match)
bool uploadPageWithStreamoutFiltered(uint32 pageIndex)
{
uint8 pageCopy[CACHE_PAGE_SIZE];
memcpy(pageCopy, memory_getPointerFromPhysicalOffset(m_rangeBegin + pageIndex * CACHE_PAGE_SIZE), CACHE_PAGE_SIZE);
MPTR pageBase = m_rangeBegin + pageIndex * CACHE_PAGE_SIZE;
sint32 blockBegin = -1;
uint64* pagePtrU64 = (uint64*)pageCopy;
m_pageInfo[pageIndex].hash = hashPage(pageCopy);
bool hasStreamoutBlocks = false;
for (sint32 i = 0; i < CACHE_PAGE_SIZE / 16; i++)
{
if (pagePtrU64[0] == c_streamoutSig0 && pagePtrU64[1] == c_streamoutSig1)
{
hasStreamoutBlocks = true;
if (blockBegin != -1)
{
uint32 uploadRelRangeBegin = blockBegin * 16;
uint32 uploadRelRangeEnd = i * 16;
cemu_assert_debug(uploadRelRangeEnd > uploadRelRangeBegin);
g_renderer->bufferCache_upload(pageCopy + uploadRelRangeBegin, uploadRelRangeEnd - uploadRelRangeBegin, getBufferOffset(pageBase + uploadRelRangeBegin));
blockBegin = -1;
}
pagePtrU64 += 2;
continue;
}
else if (blockBegin == -1)
blockBegin = i;
pagePtrU64 += 2;
}
if (blockBegin != -1)
{
uint32 uploadRelRangeBegin = blockBegin * 16;
uint32 uploadRelRangeEnd = CACHE_PAGE_SIZE;
cemu_assert_debug(uploadRelRangeEnd > uploadRelRangeBegin);
g_renderer->bufferCache_upload(pageCopy + uploadRelRangeBegin, uploadRelRangeEnd - uploadRelRangeBegin, getBufferOffset(pageBase + uploadRelRangeBegin));
blockBegin = -1;
}
return hasStreamoutBlocks;
}
void shrink(MPTR newRangeBegin, MPTR newRangeEnd)
{
cemu_assert_debug(newRangeBegin >= m_rangeBegin);
cemu_assert_debug(newRangeEnd >= m_rangeEnd);
cemu_assert_debug(newRangeEnd > m_rangeBegin);
assert_dbg(); // todo (resize page array)
m_rangeBegin = newRangeBegin;
m_rangeEnd = newRangeEnd;
}
static uint64 hashPage(uint8* mem)
{
// note - this algorithm is/was also baked into pageWriteStreamoutSignatures()
uint64 h = 0;
uint64* memU64 = (uint64*)mem;
for (uint32 i = 0; i < CACHE_PAGE_SIZE / 8; i++)
{
//h = _rotr64(h, 7);
//h ^= *memU64;
//memU64++;
h = std::rotr<uint64>(h, 7);
h += (*memU64 + (uint64)i);
memU64++;
}
return h;
}
// flag the page as having streamout data and write streamout signature blocks to the page memory
// for fully covered pages the hash is set to the precomputed all-signature hash, so the signature write itself does not trigger a re-upload
// partially covered pages get their hash reset, which forces a streamout-filtered re-upload on the next modification check
void pageWriteStreamoutSignatures(uint32 pageIndex, MPTR rangeBegin, MPTR rangeEnd)
{
uint32 pageRangeBegin = m_rangeBegin + pageIndex * CACHE_PAGE_SIZE;
uint32 pageRangeEnd = pageRangeBegin + CACHE_PAGE_SIZE;
rangeBegin = std::max(pageRangeBegin, rangeBegin);
rangeEnd = std::min(pageRangeEnd, rangeEnd);
cemu_assert_debug(rangeEnd > rangeBegin);
cemu_assert_debug(rangeBegin >= pageRangeBegin);
cemu_assert_debug(rangeEnd <= pageRangeEnd);
cemu_assert_debug((rangeBegin & 0xF) == 0);
cemu_assert_debug((rangeEnd & 0xF) == 0);
auto pageInfo = m_pageInfo.data() + pageIndex;
pageInfo->hasStreamoutData = true;
// if the whole page is replaced we can use a cached hash
if (pageRangeBegin == rangeBegin && pageRangeEnd == rangeEnd)
{
uint64* pageMem = (uint64*)memory_getPointerFromPhysicalOffset(rangeBegin);
uint32 numBlocks = (rangeEnd - rangeBegin) / 16;
for (uint32 i = 0; i < numBlocks; i++)
{
pageMem[0] = c_streamoutSig0;
pageMem[1] = c_streamoutSig1;
pageMem += 2;
}
pageInfo->hash = c_fullStreamoutPageHash;
return;
}
uint64* pageMem = (uint64*)memory_getPointerFromPhysicalOffset(rangeBegin);
uint32 numBlocks = (rangeEnd - rangeBegin) / 16;
for (uint32 i = 0; i < numBlocks; i++)
{
pageMem[0] = c_streamoutSig0;
pageMem[1] = c_streamoutSig1;
pageMem += 2;
}
pageInfo->hash = 0; // reset hash
}
static uint64 genStreamoutPageHash()
{
uint8 pageMem[CACHE_PAGE_SIZE];
uint64* pageMemU64 = (uint64*)pageMem;
for (uint32 i = 0; i < sizeof(pageMem) / sizeof(uint64) / 2; i++)
{
pageMemU64[0] = c_streamoutSig0;
pageMemU64[1] = c_streamoutSig1;
pageMemU64 += 2;
}
return hashPage(pageMem);
}
static inline uint64 c_fullStreamoutPageHash = genStreamoutPageHash();
static std::vector<uint32> g_deallocateQueue;
public:
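// free the heap allocations of nodes destroyed since the last call; deallocation is deferred so the memory stays valid for the current drawcall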
static void ProcessDeallocations()
{
for(auto& itr : g_deallocateQueue)
g_gpuBufferHeap->freeOffset(itr);
g_deallocateQueue.clear();
}
// drops every range from the cache that is not in use, contains no streamout (GPU-only) data and does not overlap the excluded range
static void CleanupCacheAggressive(MPTR excludedRangeBegin, MPTR excludedRangeEnd)
{
size_t i = 0;
while (i < s_allCacheNodes.size())
{
BufferCacheNode* node = s_allCacheNodes[i];
if (node->isInUse())
{
i++;
continue;
}
if(!node->isRAMOnly())
{
i++;
continue;
}
if(node->GetRangeBegin() < excludedRangeEnd && node->GetRangeEnd() > excludedRangeBegin)
{
i++;
continue;
}
// delete range
node->ReleaseCacheMemoryImmediately();
LatteBufferCache_removeSingleNodeFromTree(node);
delete node;
}
}
/* callbacks from IntervalTree */
static BufferCacheNode* Create(MPTR rangeBegin, MPTR rangeEnd, std::span<BufferCacheNode*> overlappingObjects)
{
auto newRange = new BufferCacheNode(rangeBegin, rangeEnd);
if (!newRange->allocateCacheMemory())
{
// not enough memory available, try to drop ram-only ranges from the ones we replace
for (size_t i = 0; i < overlappingObjects.size(); i++)
{
BufferCacheNode* nodeItr = overlappingObjects[i];
if (!nodeItr->isInUse() && nodeItr->isRAMOnly())
{
nodeItr->ReleaseCacheMemoryImmediately();
delete nodeItr;
overlappingObjects[i] = nullptr;
}
}
// retry allocation
if (!newRange->allocateCacheMemory())
{
forceLog_printf("Out-of-memory in GPU buffer (trying to allocate: %dKB) Cleaning up cache...", (rangeEnd - rangeBegin + 1023) / 1024);
CleanupCacheAggressive(rangeBegin, rangeEnd);
if (!newRange->allocateCacheMemory())
{
forceLog_printf("Failed to free enough memory in GPU buffer");
cemu_assert(false);
}
}
}
newRange->syncFromRAM(rangeBegin, rangeEnd); // possible small optimization: only load the ranges from RAM which are not overwritten by ->syncFromNode()
for (auto itr : overlappingObjects)
{
if(itr == nullptr)
continue;
newRange->syncFromNode(itr);
delete itr;
}
return newRange;
}
static void Delete(BufferCacheNode* nodeObject)
{
delete nodeObject;
}
static void Resize(BufferCacheNode* nodeObject, MPTR rangeBegin, MPTR rangeEnd)
{
nodeObject->shrink(rangeBegin, rangeEnd);
}
static BufferCacheNode* Split(BufferCacheNode* nodeObject, MPTR firstRangeBegin, MPTR firstRangeEnd, MPTR secondRangeBegin, MPTR secondRangeEnd)
{
auto newRange = new BufferCacheNode(secondRangeBegin, secondRangeEnd);
// todo - add support for splitting BufferCacheNode memory allocations, then we dont need to do a separate allocation
if (!newRange->allocateCacheMemory())
{
forceLog_printf("Out-of-memory in GPU buffer during split operation");
cemu_assert(false);
}
newRange->syncFromNode(nodeObject);
nodeObject->shrink(firstRangeBegin, firstRangeEnd);
return newRange;
}
};
std::vector<uint32> BufferCacheNode::g_deallocateQueue;
IntervalTree2<MPTR, BufferCacheNode> g_gpuBufferCache;
void LatteBufferCache_removeSingleNodeFromTree(BufferCacheNode* node)
{
g_gpuBufferCache.removeRangeSingleWithoutCallback(node->GetRangeBegin(), node->GetRangeEnd());
}
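// return the cache node covering the page-aligned range around [physAddress, physAddress+size), creating (and merging) nodes as needed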
BufferCacheNode* LatteBufferCache_reserveRange(MPTR physAddress, uint32 size)
{
MPTR rangeStart = physAddress - (physAddress % CACHE_PAGE_SIZE);
MPTR rangeEnd = (physAddress + size + CACHE_PAGE_SIZE_M1) & ~CACHE_PAGE_SIZE_M1;
auto range = g_gpuBufferCache.getRange(rangeStart, rangeEnd);
if (!range)
{
g_gpuBufferCache.addRange(rangeStart, rangeEnd);
range = g_gpuBufferCache.getRange(rangeStart, rangeEnd);
cemu_assert_debug(range);
}
cemu_assert_debug(range->GetRangeBegin() <= physAddress);
cemu_assert_debug(range->GetRangeEnd() >= (physAddress + size));
return range;
}
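// make sure [physAddress, physAddress+size) is present and up to date in the GPU buffer cache and return its offset within the cache buffer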
uint32 LatteBufferCache_retrieveDataInCache(MPTR physAddress, uint32 size)
{
auto range = LatteBufferCache_reserveRange(physAddress, size);
range->flagInUse();
range->checkAndSyncModificationsIfChrononChanged(physAddress, size);
return range->getBufferOffset(physAddress);
}
void LatteBufferCache_copyStreamoutDataToCache(MPTR physAddress, uint32 size, uint32 streamoutBufferOffset)
{
if (size == 0)
return;
cemu_assert_debug(size >= 16);
auto range = LatteBufferCache_reserveRange(physAddress, size);
range->flagInUse();
g_renderer->bufferCache_copyStreamoutToMainBuffer(streamoutBufferOffset, range->getBufferOffset(physAddress), size);
// write streamout signatures, flag affected pages
range->writeStreamout(physAddress, (physAddress + size));
}
void LatteBufferCache_invalidate(MPTR physAddress, uint32 size)
{
if (size == 0)
return;
g_gpuBufferCache.forEachOverlapping(physAddress, physAddress + size, [](BufferCacheNode* node, MPTR invalidationRangeBegin, MPTR invalidationRangeEnd)
{
node->invalidate(invalidationRangeBegin, invalidationRangeEnd);
}
);
}
// optimized version of LatteBufferCache_invalidate() if physAddress points to the beginning of a page
void LatteBufferCache_invalidatePage(MPTR physAddress)
{
cemu_assert_debug((physAddress & CACHE_PAGE_SIZE_M1) == 0);
BufferCacheNode* node = g_gpuBufferCache.getRangeByPoint(physAddress);
if (node)
node->invalidate(physAddress, physAddress+CACHE_PAGE_SIZE);
}
void LatteBufferCache_processDeallocations()
{
BufferCacheNode::ProcessDeallocations();
}
void LatteBufferCache_init(size_t bufferSize)
{
g_gpuBufferHeap.reset(new VHeap(nullptr, (uint32)bufferSize));
g_renderer->bufferCache_init((uint32)bufferSize);
}
void LatteBufferCache_getStats(uint32& heapSize, uint32& allocationSize, uint32& allocNum)
{
g_gpuBufferHeap->getStats(heapSize, allocationSize, allocNum);
}
FSpinlock g_spinlockDCFlushQueue;
std::unordered_set<uint32>* g_DCFlushQueue = new std::unordered_set<uint32>(); // queued pages
std::unordered_set<uint32>* g_DCFlushQueueAlternate = new std::unordered_set<uint32>();
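// queue the pages touched by a data cache flush for invalidation; the queue is drained by LatteBufferCache_processDCFlushQueue().
// Presumably called from outside the GPU thread, hence the spinlock-protected double-buffered queue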
void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
{
if (address == 0 || size == 0xFFFFFFFF)
return; // global flushes are ignored for now
uint32 firstPage = address / CACHE_PAGE_SIZE;
uint32 lastPage = (address + size - 1) / CACHE_PAGE_SIZE;
g_spinlockDCFlushQueue.acquire();
for (uint32 i = firstPage; i <= lastPage; i++)
g_DCFlushQueue->emplace(i);
g_spinlockDCFlushQueue.release();
}
void LatteBufferCache_processDCFlushQueue()
{
if (g_DCFlushQueue->empty()) // accessing this outside of the lock is technically undefined/unsafe behavior but on all known implementations this is fine and we can avoid the spinlock
return;
g_spinlockDCFlushQueue.acquire();
std::swap(g_DCFlushQueue, g_DCFlushQueueAlternate);
g_spinlockDCFlushQueue.release();
for (auto& itr : *g_DCFlushQueueAlternate)
LatteBufferCache_invalidatePage(itr * CACHE_PAGE_SIZE);
g_DCFlushQueueAlternate->clear();
}
void LatteBufferCache_notifyDrawDone()
{
}
void LatteBufferCache_notifySwapTVScanBuffer()
{
if( ActiveSettings::FlushGPUCacheOnSwap() )
g_currentCacheChronon++;
}
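// called periodically; each call inspects one cache node (round-robin) and frees it if its last use is older than a frame-age threshold that scales with heap utilization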
void LatteBufferCache_incrementalCleanup()
{
static uint32 s_counter = 0;
if (s_allCacheNodes.empty())
return;
s_counter++;
s_counter %= (uint32)s_allCacheNodes.size();
auto range = s_allCacheNodes[s_counter];
if (range->HasStreamoutData())
{
// currently we never delete streamout ranges
// todo - check if streamout pages got overwritten + if the range would lose the hasStreamoutData flag
return;
}
uint32 heapSize;
uint32 allocationSize;
uint32 allocNum;
g_gpuBufferHeap->getStats(heapSize, allocationSize, allocNum);
if (allocationSize >= (heapSize * 4 / 5))
{
// heap is at least 80% filled
if (range->GetFrameAge() >= 2)
{
g_gpuBufferCache.removeRangeSingle(range->GetRangeBegin(), range->GetRangeEnd());
}
}
else if (allocationSize >= (heapSize * 3 / 4))
{
// heap is 75-80% filled
if (range->GetFrameAge() >= 4)
{
g_gpuBufferCache.removeRangeSingle(range->GetRangeBegin(), range->GetRangeEnd());
}
}
else if (allocationSize >= (heapSize / 2))
{
// heap is 50-75% filled
if (range->GetFrameAge() >= 20)
{
g_gpuBufferCache.removeRangeSingle(range->GetRangeBegin(), range->GetRangeEnd());
}
}
else
{
// heap is under 50% capacity
if (range->GetFrameAge() >= 500)
{
g_gpuBufferCache.removeRangeSingle(range->GetRangeBegin(), range->GetRangeEnd());
}
}
}