#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Core/LatteShaderAssembly.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/OS/libs/gx2/GX2.h"
#include "Cafe/HW/Latte/Core/Latte.h"
#include "Cafe/HW/Latte/Core/LatteDraw.h"
#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h"
#include "Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInstructions.h"
#include "Cafe/HW/Latte/Core/FetchShader.h"
#include "Cafe/HW/Latte/ISA/LatteInstructions.h"
#include "util/containers/LookupTableL3.h"
#include "util/helpers/fspinlock.h"
#include <openssl/sha.h> /* SHA1_DIGEST_LENGTH */
#include <openssl/evp.h> /* EVP_Digest */

uint32 LatteShaderRecompiler_getAttributeSize(LatteParsedFetchShaderAttribute_t* attrib)
{
	if (attrib->format == FMT_32_32_32_32 || attrib->format == FMT_32_32_32_32_FLOAT)
		return 4 * 4;
	else if (attrib->format == FMT_32_32_32 || attrib->format == FMT_32_32_32_FLOAT)
		return 3 * 4;
	else if (attrib->format == FMT_32_32 || attrib->format == FMT_32_32_FLOAT)
		return 2 * 4;
	else if (attrib->format == FMT_32 || attrib->format == FMT_32_FLOAT)
		return 1 * 4;
	else if (attrib->format == FMT_16_16_16_16 || attrib->format == FMT_16_16_16_16_FLOAT)
		return 4 * 2;
	else if (attrib->format == FMT_16_16 || attrib->format == FMT_16_16_FLOAT)
		return 2 * 2;
	else if (attrib->format == FMT_16 || attrib->format == FMT_16_FLOAT)
		return 1 * 2;
	else if (attrib->format == FMT_8_8_8_8)
		return 4 * 1;
	else if (attrib->format == FMT_8_8)
		return 2 * 1;
	else if (attrib->format == FMT_8)
		return 1 * 1;
	else if (attrib->format == FMT_2_10_10_10)
		return 4;
	else
		cemu_assert_unimplemented();
	return 0;
}

uint32 LatteShaderRecompiler_getAttributeAlignment(LatteParsedFetchShaderAttribute_t* attrib)
{
	if (attrib->format == FMT_32_32_32_32 || attrib->format == FMT_32_32_32_32_FLOAT)
		return 4;
	else if (attrib->format == FMT_32_32_32 || attrib->format == FMT_32_32_32_FLOAT)
		return 4;
	else if (attrib->format == FMT_32_32 || attrib->format == FMT_32_32_FLOAT)
		return 4;
	else if (attrib->format == FMT_32 || attrib->format == FMT_32_FLOAT)
		return 4;
	else if (attrib->format == FMT_16_16_16_16 || attrib->format == FMT_16_16_16_16_FLOAT)
		return 2;
	else if (attrib->format == FMT_16_16 || attrib->format == FMT_16_16_FLOAT)
		return 2;
	else if (attrib->format == FMT_16 || attrib->format == FMT_16_FLOAT)
		return 2;
	else if (attrib->format == FMT_8_8_8_8)
		return 1;
	else if (attrib->format == FMT_8_8)
		return 1;
	else if (attrib->format == FMT_8)
		return 1;
	else if (attrib->format == FMT_2_10_10_10)
		return 4;
	else
		cemu_assert_unimplemented();
	return 4;
}

void LatteShader_calculateFSKey(LatteFetchShader* fetchShader)
{
	uint64 key = 0;
	for (sint32 g = 0; g < fetchShader->bufferGroups.size(); g++)
	{
		LatteParsedFetchShaderBufferGroup_t& group = fetchShader->bufferGroups[g];
		for (sint32 f = 0; f < group.attribCount; f++)
		{
			LatteParsedFetchShaderAttribute_t* attrib = group.attrib + f;
			key += (uint64)attrib->endianSwap;
			key = std::rotl(key, 3);
			key += (uint64)attrib->nfa;
			key = std::rotl(key, 3);
			key += (uint64)(attrib->isSigned?1:0);
			key = std::rotl(key, 1);
			key += (uint64)attrib->format;
			key = std::rotl(key, 7);
			key += (uint64)attrib->fetchType;
			key = std::rotl(key, 8);
			key += (uint64)attrib->ds[0];
			key = std::rotl(key, 2);
			key += (uint64)attrib->ds[1];
			key = std::rotl(key, 2);
			key += (uint64)attrib->ds[2];
			key = std::rotl(key, 2);
			key += (uint64)attrib->ds[3];
			key = std::rotl(key, 2);
			key += (uint64)(attrib->aluDivisor+1);
			key = std::rotl(key, 2);
			key += (uint64)attrib->attributeBufferIndex;
			key = std::rotl(key, 8);
			key += (uint64)attrib->semanticId;
			key = std::rotl(key, 8);
			key += (uint64)(attrib->offset & 3);
			key = std::rotl(key, 2);
		}
	}
	// todo - also hash invalid buffer groups?
	fetchShader->key = key;
}

uint32 LatteParsedFetchShaderBufferGroup_t::getCurrentBufferStride(uint32* contextRegister) const
{
	uint32 bufferIndex = this->attributeBufferIndex;
	uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7;
	uint32 bufferStride = (contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF;
	return bufferStride;
}

void LatteFetchShader::CalculateFetchShaderVkHash()
{
	// calculate SHA1 of all states that are part of the Vulkan graphics pipeline
	EVP_MD_CTX *ctx = EVP_MD_CTX_new();
	EVP_DigestInit(ctx, EVP_sha1());
	for(auto& group : bufferGroups)
	{
		// offsets
		for (sint32 t = 0; t < group.attribCount; t++)
		{
			uint32 offset = group.attrib[t].offset;
			EVP_DigestUpdate(ctx, &t, sizeof(t));
			EVP_DigestUpdate(ctx, &offset, sizeof(offset));
		}
	}
	uint8 shaDigest[SHA_DIGEST_LENGTH];
	EVP_DigestFinal_ex(ctx, shaDigest, NULL);
	EVP_MD_CTX_free(ctx);

	// fold SHA1 hash into a 64bit value
	uint64 h = *(uint64*)(shaDigest + 0);
	h += *(uint64*)(shaDigest + 8);
	h += (uint64)*(uint32*)(shaDigest + 16);
	this->vkPipelineHashFragment = h;
}

void _fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(LatteFetchShader* parsedFetchShader, uint32* contextRegister, const LatteClauseInstruction_VTX* instr)
{
	uint32 semanticId = instr->getFieldSEM_SEMANTIC_ID(); // location (attribute index inside shader)
	uint32 bufferId = instr->getField_BUFFER_ID(); // the index used for GX2SetAttribBuffer (+0xA0)
	LatteConst::VertexFetchType2 fetchType = instr->getField_FETCH_TYPE();
	auto srcSelX = instr->getField_SRC_SEL_X();
	auto dsx = instr->getField_DST_SEL(0);
	auto dsy = instr->getField_DST_SEL(1);
	auto dsz = instr->getField_DST_SEL(2);
	auto dsw = instr->getField_DST_SEL(3);
	auto dataFormat = instr->getField_DATA_FORMAT();
	uint32 offset = instr->getField_OFFSET();
	auto nfa = instr->getField_NUM_FORMAT_ALL();
	bool isSigned = instr->getField_FORMAT_COMP_ALL() == LatteClauseInstruction_VTX::FORMAT_COMP::COMP_SIGNED;
	auto endianSwap = instr->getField_ENDIAN_SWAP();
	
	// get buffer
	cemu_assert_debug(bufferId >= 0xA0 && bufferId < 0xB0);
	uint32 bufferIndex = (bufferId - 0xA0);

	// get or add new attribute group (by buffer index)
	LatteParsedFetchShaderBufferGroup_t* attribGroup = nullptr;
	if (LatteFetchShader::isValidBufferIndex(bufferIndex))
	{
		auto bufferGroupItr = std::find_if(parsedFetchShader->bufferGroups.begin(), parsedFetchShader->bufferGroups.end(), [bufferIndex](LatteParsedFetchShaderBufferGroup_t& bufferGroup) {return bufferGroup.attributeBufferIndex == bufferIndex; });
		if (bufferGroupItr != parsedFetchShader->bufferGroups.end())
			attribGroup = &(*bufferGroupItr);
	}
	else
	{
		auto bufferGroupItr = std::find_if(parsedFetchShader->bufferGroupsInvalid.begin(), parsedFetchShader->bufferGroupsInvalid.end(), [bufferIndex](LatteParsedFetchShaderBufferGroup_t& bufferGroup) {return bufferGroup.attributeBufferIndex == bufferIndex; });
		if (bufferGroupItr != parsedFetchShader->bufferGroupsInvalid.end())
			attribGroup = &(*bufferGroupItr);
	}
	// create new group if none found
	if (attribGroup == nullptr)
	{
		if (LatteFetchShader::isValidBufferIndex(bufferIndex))
			attribGroup = &parsedFetchShader->bufferGroups.emplace_back();
		else
			attribGroup = &parsedFetchShader->bufferGroupsInvalid.emplace_back();

		attribGroup->attributeBufferIndex = bufferIndex;
		attribGroup->minOffset = offset;
		attribGroup->maxOffset = offset;
	}
	// add attribute
	sint32 groupAttribIndex = attribGroup->attribCount;
	if (attribGroup->attribCount < (groupAttribIndex + 1))
	{
		attribGroup->attribCount = (groupAttribIndex + 1);
		attribGroup->attrib = (LatteParsedFetchShaderAttribute_t*)realloc(attribGroup->attrib, sizeof(LatteParsedFetchShaderAttribute_t) * attribGroup->attribCount);
	}
	attribGroup->attrib[groupAttribIndex].semanticId = semanticId;
	attribGroup->attrib[groupAttribIndex].format = (uint8)dataFormat;
	attribGroup->attrib[groupAttribIndex].fetchType = fetchType;
	attribGroup->attrib[groupAttribIndex].nfa = (uint8)nfa;
	attribGroup->attrib[groupAttribIndex].isSigned = isSigned;
	attribGroup->attrib[groupAttribIndex].offset = offset;
	attribGroup->attrib[groupAttribIndex].ds[0] = (uint8)dsx;
	attribGroup->attrib[groupAttribIndex].ds[1] = (uint8)dsy;
	attribGroup->attrib[groupAttribIndex].ds[2] = (uint8)dsz;
	attribGroup->attrib[groupAttribIndex].ds[3] = (uint8)dsw;
	attribGroup->attrib[groupAttribIndex].attributeBufferIndex = bufferIndex;
	attribGroup->attrib[groupAttribIndex].endianSwap = endianSwap;
	attribGroup->minOffset = (std::min)(attribGroup->minOffset, offset);
	attribGroup->maxOffset = (std::max)(attribGroup->maxOffset, offset);
	// get alu divisor
	if (srcSelX == LatteClauseInstruction_VTX::SRC_SEL::SEL_X)
	{
		cemu_assert_debug(fetchType != LatteConst::VertexFetchType2::INSTANCE_DATA); // aluDivisor 0 in combination with instanced data is not allowed?
		attribGroup->attrib[groupAttribIndex].aluDivisor = -1;
	}
	else if (srcSelX == LatteClauseInstruction_VTX::SRC_SEL::SEL_W)
	{
		cemu_assert_debug(fetchType == LatteConst::VertexFetchType2::INSTANCE_DATA); // using constant divisor 1 with per-vertex data seems strange? (divisor is instance-only)
		// aluDivisor is constant 1
		attribGroup->attrib[groupAttribIndex].aluDivisor = 1;
	}
	else if (srcSelX == LatteClauseInstruction_VTX::SRC_SEL::SEL_Y)
	{
		// use alu divisor 1
		attribGroup->attrib[groupAttribIndex].aluDivisor = (sint32)contextRegister[mmVGT_INSTANCE_STEP_RATE_0 + 0];
		cemu_assert_debug(attribGroup->attrib[groupAttribIndex].aluDivisor > 0);
	}
	else if (srcSelX == LatteClauseInstruction_VTX::SRC_SEL::SEL_Z)
	{
		// use alu divisor 2
		attribGroup->attrib[groupAttribIndex].aluDivisor = (sint32)contextRegister[mmVGT_INSTANCE_STEP_RATE_0 + 1];
		cemu_assert_debug(attribGroup->attrib[groupAttribIndex].aluDivisor > 0);
	}
}

void _fetchShaderDecompiler_parseVTXClause(LatteFetchShader* parsedFetchShader, uint32* contextRegister, std::span<uint8> clauseCode, size_t numInstructions)
{
	const LatteClauseInstruction_VTX* instr = (LatteClauseInstruction_VTX*)clauseCode.data();
	const LatteClauseInstruction_VTX* end = instr + numInstructions;
	while (instr < end)
	{
		if (instr->getField_VTX_INST() == LatteClauseInstruction_VTX::VTX_INST::_VTX_INST_SEMANTIC)
		{
			_fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(parsedFetchShader, contextRegister, instr);
		}
		else
		{
			assert_dbg();
		}
		instr++;
	}
}

void _fetchShaderDecompiler_parseCF(LatteFetchShader* parsedFetchShader, uint32* contextRegister, std::span<uint8> programCode)
{
	size_t maxCountCFInstructions = programCode.size_bytes() / sizeof(LatteCFInstruction);
	const LatteCFInstruction* cfInstruction = (LatteCFInstruction*)programCode.data();
	const LatteCFInstruction* end = cfInstruction + maxCountCFInstructions;
	while (cfInstruction < end)
	{
		if (cfInstruction->getField_Opcode() == LatteCFInstruction::INST_VTX_TC)
		{
			auto vtxInstruction = cfInstruction->getParserIfOpcodeMatch<LatteCFInstruction_DEFAULT>();
			cemu_assert_debug(vtxInstruction->getField_COND() == LatteCFInstruction::CF_COND::CF_COND_ACTIVE);
			_fetchShaderDecompiler_parseVTXClause(parsedFetchShader, contextRegister, vtxInstruction->getClauseCode(programCode), vtxInstruction->getField_COUNT());
		}
		else if (cfInstruction->getField_Opcode() == LatteCFInstruction::INST_RETURN)
		{
			cemu_assert_debug(!cfInstruction->getField_END_OF_PROGRAM());
			return;
		}
		else
		{
			cemu_assert_debug(false); // unhandled / unexpected CF instruction
		}
		if (cfInstruction->getField_END_OF_PROGRAM())
		{
			cemu_assert_debug(false); // unusual for fetch shader? They should end with a return instruction
			break;
		}
		cfInstruction++;
	}
	cemu_assert_debug(false); // program must be terminated with an instruction that has EOP set?
}

// parse fetch shader and create LatteFetchShader object
// also registers the fs in the cache (s_fetchShaderByHash)
// can be assumed to be thread-safe, if called simultaneously on the same fetch shader only one shader will become registered. The others will be destroyed
LatteFetchShader* LatteShaderRecompiler_createFetchShader(LatteFetchShader::CacheHash fsHash, uint32* contextRegister, uint32* fsProgramCode, uint32 fsProgramSize)
{
	LatteFetchShader* newFetchShader = new LatteFetchShader();
	newFetchShader->m_cacheHash = fsHash;
	if( (fsProgramSize&0xF) != 0 )
		debugBreakpoint();
	uint32 index = 0;

	// if the first instruction is a CF instruction then parse shader properly
	// otherwise fall back to our broken legacy method (where we assumed fetch shaders had no CF program)
	// this workaround is required to make sure old shader caches dont break

	// from old fetch shader gen (CF part missing):
	//			{0x0000a001, 0x27961000, 0x00020000, 0x00000000}
	//			{0x0000a001, 0x2c151002, 0x00020000, 0x00000000, 0x0000a001, 0x068d1000, 0x0000000c, ...}
	//			{0x0000a001, 0x2c151000, 0x00020000, 0x00000000}
	//          {0x0300aa21, 0x28cd1006, 0x00000000, 0x00000000, 0x0300ab21, 0x28cd1007, 0x00000000, ...}

	// shaders shipped with games (e.g. BotW):
	//			{0x00000002, 0x01800400, 0x00000000, 0x8a000000, 0x1c00a001, 0x280d1000, 0x00090000, ...}
	//			{0x00000002, 0x01800000, 0x00000000, 0x8a000000, 0x1c00a001, 0x27961000, 0x000a0000, ...}
	//			{0x00000002, 0x01800c00, 0x00000000, 0x8a000000, 0x2c00a001, 0x2c151000, 0x000a0000, ...} // size 0x50
	//          {0x00000002, 0x01801000, 0x00000000, 0x8a000000, 0x1c00a001, 0x280d1000, 0x00090000, ...} // size 0x60
	//			{0x00000002, 0x01801c00, 0x00000000, 0x8a000000, 0x1c00a001, 0x280d1000, 0x00090000, ...} // size 0x90
	
	// our new implementation:
	//			{0x00000002, 0x01800400, 0x00000000, 0x8a000000, 0x0000a001, 0x2c151000, 0x00020000, ...}

	// for ALU instructions everything except the 01 is dynamic
	newFetchShader->bufferGroups.reserve(16);
	if (fsProgramSize == 0)
	{
		// empty fetch shader, seen in Minecraft
		// these only make sense when vertex shader does not call FS?
		LatteShader_calculateFSKey(newFetchShader);
		newFetchShader->CalculateFetchShaderVkHash();
		return newFetchShader;
	}

	if ((fsProgramCode[0] & 1) == 0 && fsProgramCode[0] <= 0x30 && (fsProgramCode[1]&~((3 << 10)| (1 << 19))) == 0x01800000)
	{
		// very likely a CF instruction
		_fetchShaderDecompiler_parseCF(newFetchShader, contextRegister, { (uint8*)fsProgramCode, fsProgramSize });
	}
	else
	{
		while (index < (fsProgramSize / 4))
		{
			uint32 dword0 = fsProgramCode[index];
			uint32 opcode = dword0 & 0x1F;
			index++;
			if (opcode == VTX_INST_MEM)
			{
				// this might be the clause initialization instruction? (Seems to be the first instruction always)
				// todo - upon further investigation, it seems like fetch shaders also start with a CF program. Our implementation doesnt emit one right now
				uint32 opcode2 = (dword0 >> 8) & 7;

				index += 3;
			}
			else if (opcode == VTX_INST_SEMANTIC)
			{
				_fetchShaderDecompiler_parseInstruction_VTX_SEMANTIC(newFetchShader, contextRegister, (const LatteClauseInstruction_VTX*)(fsProgramCode + index - 1));
				index += 3;
			}
		}
	}
	newFetchShader->bufferGroups.shrink_to_fit();
	// calculate group information
	// VBO offsets and stride
	uint32 vboOffset = 0;
	for (auto& bufferGroup : newFetchShader->bufferGroups)
	{
		for(sint32 i=0; i< bufferGroup.attribCount; i++)
		{
			uint32 attribSize = LatteShaderRecompiler_getAttributeSize(bufferGroup.attrib+i);
			uint32 attribAlignment = LatteShaderRecompiler_getAttributeAlignment(bufferGroup.attrib+i);
			// fix alignment
			vboOffset = (vboOffset+attribAlignment-1)&~(attribAlignment-1);
			vboOffset += attribSize;
			// index type
			if(bufferGroup.attrib[i].fetchType == LatteConst::VERTEX_DATA)
				bufferGroup.hasVtxIndexAccess = true;
			else if (bufferGroup.attrib[i].fetchType == LatteConst::INSTANCE_DATA)
				bufferGroup.hasInstanceIndexAccess = true;
		}
		// fix alignment of whole vertex
		if(bufferGroup.attribCount > 0 )
		{
			uint32 attribAlignment = LatteShaderRecompiler_getAttributeAlignment(bufferGroup.attrib+0);
			vboOffset = (vboOffset+attribAlignment-1)&~(attribAlignment-1);
		}
		bufferGroup.vboStride = vboOffset;
	}
	LatteShader_calculateFSKey(newFetchShader);
	newFetchShader->CalculateFetchShaderVkHash();

	// register in cache
	// its possible that during multi-threaded shader cache loading, two identical (same hash) fetch shaders get created simultaneously
	// we catch and handle this case here. RegisterInCache() is atomic and if another fetch shader is already registered, we abandon the local instance
	LatteFetchShader* registeredFS = newFetchShader->RegisterInCache(fsHash);
	if (registeredFS)
	{
		delete newFetchShader;
		newFetchShader = registeredFS;
	}
	else
	{
		newFetchShader->m_isRegistered = true;
	}


	return newFetchShader;
}

LatteFetchShader::~LatteFetchShader()
{
	UnregisterInCache();
}

struct FetchShaderLookupInfo 
{
	LatteFetchShader* fetchShader;
	uint32 programSize;
	uint32 lastFrameAccessed;
};

LookupTableL3<8, 8, 8, FetchShaderLookupInfo*> g_fetchShaderLookupCache;

LatteFetchShader::CacheHash LatteFetchShader::CalculateCacheHash(void* programCode, uint32 programSize)
{
	uint32* programCodeU32 = (uint32*)programCode;
	uint64 progHash1 = 0;
	uint64 progHash2 = 0;
	for (uint32 i = 0; i < programSize / 4; i++)
	{
		uint32 temp = programCodeU32[i];
		progHash1 += (uint64)temp;
		progHash2 ^= (uint64)temp;
		progHash1 = (progHash1 << 3) | (progHash1 >> 61);
		progHash2 = (progHash2 >> 7) | (progHash2 << 57);
	}

	// todo - we should incorporate the value of VGT_INSTANCE_STEP_RATE_0/1 into the hash since it affects the generated LatteFetchShader object
	//        However, this would break compatibility with shader caches and gfx packs due to altering the shader base hashes

	return progHash1 + progHash2;
}

LatteFetchShader* LatteFetchShader::FindInCacheByHash(LatteFetchShader::CacheHash fsHash)
{
	// does not hold s_fetchShaderCache for better performance. Be careful not to call this while another thread invokes RegisterInCache()
	auto itr = s_fetchShaderByHash.find(fsHash);
	if (itr == s_fetchShaderByHash.end())
		return nullptr;
	return itr->second;
}

void* _getFSProgramPtr()
{
	return memory_getPointerFromPhysicalOffset(LatteGPUState.contextRegister[mmSQ_PGM_START_FS + 0] << 8);
}

uint32 _getFSProgramSize()
{
	return LatteGPUState.contextRegister[mmSQ_PGM_START_FS + 1] << 3;
}

LatteFetchShader* LatteFetchShader::FindByGPUState()
{
	// retrieve fetch shader that matches the currently set GPU context registers
	uint32 fsPhysAddr24 = LatteGPUState.contextRegister[mmSQ_PGM_START_FS + 0];
	cemu_assert_debug(fsPhysAddr24 < 0x1000000); // should only contain the upper 24 bit of the address in the lower 24 bit of the register

	FetchShaderLookupInfo* lookupInfo = g_fetchShaderLookupCache.lookup(fsPhysAddr24);
	if (lookupInfo)
	{
		// return fetch shader if still the same
		uint32 fsSize = _getFSProgramSize();
		uint32 framesSinceLastAccess = LatteGPUState.frameCounter - lookupInfo->lastFrameAccessed;
		if (lookupInfo->programSize == fsSize && framesSinceLastAccess == 0)
		{
			lookupInfo->lastFrameAccessed = LatteGPUState.frameCounter;
			return lookupInfo->fetchShader;
		}
		// update lookup info
		CacheHash fsHash = CalculateCacheHash(_getFSProgramPtr(), _getFSProgramSize());
		LatteFetchShader* fetchShader = FindInCacheByHash(fsHash);
		if (!fetchShader)
		{
			fetchShader = LatteShaderRecompiler_createFetchShader(fsHash, LatteGPUState.contextNew.GetRawView(), (uint32*)_getFSProgramPtr(), _getFSProgramSize());
			cemu_assert(fetchShader);
		}
		lookupInfo->fetchShader = fetchShader;
		lookupInfo->programSize = fsSize;
		lookupInfo->lastFrameAccessed = LatteGPUState.frameCounter;
		return fetchShader;
	}
	else
	{
		// try to find fetch shader by data hash
		CacheHash fsHash = CalculateCacheHash(_getFSProgramPtr(), _getFSProgramSize());
		LatteFetchShader* fetchShader = FindInCacheByHash(fsHash);
		if (!fetchShader)
		{
			fetchShader = LatteShaderRecompiler_createFetchShader(fsHash, LatteGPUState.contextNew.GetRawView(), (uint32*)_getFSProgramPtr(), _getFSProgramSize());
			cemu_assert(fetchShader);
		}
		// create new lookup entry
		lookupInfo = new FetchShaderLookupInfo();
		lookupInfo->fetchShader = fetchShader;
		lookupInfo->programSize = _getFSProgramSize();
		lookupInfo->lastFrameAccessed = LatteGPUState.frameCounter;
		g_fetchShaderLookupCache.store(fsPhysAddr24, lookupInfo);
#ifndef PUBLIC_RELEASE
		cemu_assert_debug(g_fetchShaderLookupCache.lookup(fsPhysAddr24) == lookupInfo);
#endif
	}
	return lookupInfo->fetchShader;
}

FSpinlock s_spinlockFetchShaderCache;

LatteFetchShader* LatteFetchShader::RegisterInCache(CacheHash fsHash)
{
	s_spinlockFetchShaderCache.acquire();
	auto itr = s_fetchShaderByHash.find(fsHash);
	if (itr != s_fetchShaderByHash.end())
	{
		LatteFetchShader* fs = itr->second;
		s_spinlockFetchShaderCache.release();
		return fs;
	}
	s_fetchShaderByHash.emplace(fsHash, this);
	s_spinlockFetchShaderCache.release();
	return nullptr;
}

void LatteFetchShader::UnregisterInCache()
{
	if (!m_isRegistered)
		return;
	s_spinlockFetchShaderCache.acquire();
	auto itr = s_fetchShaderByHash.find(m_cacheHash);
	cemu_assert(itr == s_fetchShaderByHash.end());
	s_fetchShaderByHash.erase(itr);
	s_spinlockFetchShaderCache.release();
}

std::unordered_map<LatteFetchShader::CacheHash, LatteFetchShader*> LatteFetchShader::s_fetchShaderByHash;