GX2+TCL: Reimplement command buffer submission

- GX2 utilizes the TCL(.rpl) API for command submission instead of directly writing to an internal GPU FIFO
- Submission & retire timestamps are correctly implemented as incremental counters
- Command buffering behaviour matches console
- Fixes race conditions on aarch64
2025-07-02 13:01:18 +12:00 · 2025-05-14 18:59:50 +02:00 · 2025-05-14 18:59:50 +02:00 · 28ea70b6d8
commit 28ea70b6d8
parent 96765e4ac6
21 changed files with 761 additions and 472 deletions
--- a/src/Cafe/HW/Latte/Core/Latte.h
+++ b/src/Cafe/HW/Latte/Core/Latte.h
@ -47,8 +47,6 @@ struct LatteGPUState_t
 	gx2GPUSharedArea_t* sharedArea; // quick reference to shared area
 	MPTR sharedAreaAddr;
 	// other
-	// todo: Currently we have the command buffer logic implemented as a FIFO ringbuffer. On real HW it's handled as a series of command buffers that are pushed individually.
-	std::atomic<uint64> lastSubmittedCommandBufferTimestamp;
 	uint32 gx2InitCalled; // incremented every time GX2Init() is called
 	// OpenGL control
 	uint32 glVendor; // GLVENDOR_*
@ -75,8 +73,6 @@ struct LatteGPUState_t

 extern LatteGPUState_t LatteGPUState;

-extern uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
-
 // texture

 #include "Cafe/HW/Latte/Core/LatteTexture.h"
--- a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp
@ -13,6 +13,7 @@
 #include "Cafe/HW/Latte/Core/LattePM4.h"

 #include "Cafe/OS/libs/coreinit/coreinit_Time.h"
+#include "Cafe/OS/libs/TCL/TCL.h" // TCL currently handles the GPU command ringbuffer

 #include "Cafe/CafeSystem.h"

@ -28,11 +29,6 @@ typedef uint32be* LatteCMDPtr;
 #define LatteReadCMD() ((uint32)*(cmd++))
 #define LatteSkipCMD(_nWords) cmd += (_nWords)

-uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
-uint8* gx2CPParserDisplayListPtr;
-uint8* gx2CPParserDisplayListStart; // used for debugging
-uint8* gx2CPParserDisplayListEnd;
-
 void LatteThread_HandleOSScreen();

 void LatteThread_Exit();
@ -155,16 +151,12 @@ void LatteCP_signalEnterWait()
 */
 uint32 LatteCP_readU32Deprc()
 {
-	uint32 v;
-	uint8* gxRingBufferWritePtr;
-	sint32 readDistance;
 	// no display list active
 	while (true)
 	{
-		gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
-		readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
-		if (readDistance != 0)
-			break;
+		uint32 cmdWord;
+		if ( TCL::TCLGPUReadRBWord(cmdWord) )
+			return cmdWord;

 		g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
 		performanceMonitor.gpuTime_idleTime.beginMeasuring();
@ -175,56 +167,8 @@ uint32 LatteCP_readU32Deprc()
 		}
 		LatteThread_HandleOSScreen(); // check if new frame was presented via OSScreen API

-		readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
-		if (readDistance != 0)
-			break;
-		if (Latte_GetStopSignal())
-			LatteThread_Exit();
-
-		// still no command data available, do some other tasks
-		LatteTiming_HandleTimedVsync();
-		LatteAsyncCommands_checkAndExecute();
-		std::this_thread::yield();
-		performanceMonitor.gpuTime_idleTime.endMeasuring();
-	}
-	v = *(uint32*)gxRingBufferReadPtr;
-	gxRingBufferReadPtr += 4;
-#ifdef CEMU_DEBUG_ASSERT
-	if (v == 0xcdcdcdcd)
-		assert_dbg();
-#endif
-	v = _swapEndianU32(v);
-	return v;
-}
-
-void LatteCP_waitForNWords(uint32 numWords)
-{
-	uint8* gxRingBufferWritePtr;
-	sint32 readDistance;
-	bool isFlushed = false;
-	sint32 waitDistance = numWords * sizeof(uint32be);
-	// no display list active
-	while (true)
-	{
-		gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
-		readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
-		if (readDistance < 0)
-			return; // wrap around means there is at least one full command queued after this
-		if (readDistance >= waitDistance)
-			break;
-		g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
-		performanceMonitor.gpuTime_idleTime.beginMeasuring();
-		// no command data available, spin in a busy loop for a while then check again
-		for (sint32 busy = 0; busy < 80; busy++)
-		{
-			_mm_pause();
-		}
-		readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
-		if (readDistance < 0)
-			return; // wrap around means there is at least one full command queued after this
-		if (readDistance >= waitDistance)
-			break;
-
+		if ( TCL::TCLGPUReadRBWord(cmdWord) )
+			return cmdWord;
 		if (Latte_GetStopSignal())
 			LatteThread_Exit();

@ -234,6 +178,7 @@ void LatteCP_waitForNWords(uint32 numWords)
 		std::this_thread::yield();
 		performanceMonitor.gpuTime_idleTime.endMeasuring();
 	}
+	UNREACHABLE;
 }

 template<uint32 readU32()>
@ -270,21 +215,23 @@ void LatteCP_itIndirectBufferDepr(LatteCMDPtr cmd, uint32 nWords)
 	cemu_assert_debug(nWords == 3);
 	uint32 physicalAddress = LatteReadCMD();
 	uint32 physicalAddressHigh = LatteReadCMD(); // unused
-	uint32 sizeInDWords = LatteReadCMD();
-	uint32 displayListSize = sizeInDWords * 4;
-	DrawPassContext drawPassCtx;
+	uint32 sizeInU32s = LatteReadCMD();

 #ifdef LATTE_CP_LOGGING
 	if (GetAsyncKeyState('A'))
 		LatteCP_DebugPrintCmdBuffer(MEMPTR<uint32be>(physicalAddress), displayListSize);
 #endif

+	if (sizeInU32s > 0)
+	{
+		DrawPassContext drawPassCtx;
 		uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
-	drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
+		drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInU32s);

 		LatteCP_processCommandBuffer(drawPassCtx);
 		if (drawPassCtx.isWithinDrawPass())
 			drawPassCtx.endDrawPass();
+	}
 }

 // pushes the command buffer to the stack
@ -294,11 +241,12 @@ void LatteCP_itIndirectBuffer(LatteCMDPtr cmd, uint32 nWords, DrawPassContext& d
 	uint32 physicalAddress = LatteReadCMD();
 	uint32 physicalAddressHigh = LatteReadCMD(); // unused
 	uint32 sizeInDWords = LatteReadCMD();
+	if (sizeInDWords > 0)
+	{
 		uint32 displayListSize = sizeInDWords * 4;
-	cemu_assert_debug(displayListSize >= 4);
-
 		uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
 		drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
+	}
 }

 LatteCMDPtr LatteCP_itStreamoutBufferUpdate(LatteCMDPtr cmd, uint32 nWords)
@ -565,26 +513,55 @@ LatteCMDPtr LatteCP_itMemWrite(LatteCMDPtr cmd, uint32 nWords)
 	if (word1 == 0x40000)
 	{
 		// write U32
-		*memPtr = word2;
+		stdx::atomic_ref<uint32be> atomicRef(*memPtr);
+		atomicRef.store(word2);
 	}
 	else if (word1 == 0x00000)
 	{
-		// write U64 (as two U32)
-		// note: The U32s are swapped
-		memPtr[0] = word2;
-		memPtr[1] = word3;
+		// write U64
+		// note: The U32s are swapped here, but this needs verification. Also, it seems like the two U32 halves are written independently and the U64 as a whole is not atomic -> investigate
+		stdx::atomic_ref<uint64be> atomicRef(*(uint64be*)memPtr);
+		atomicRef.store(((uint64le)word2 << 32) | word3);
 	}
 	else if (word1 == 0x20000)
 	{
 		// write U64 (little endian)
-		memPtr[0] = _swapEndianU32(word2);
-		memPtr[1] = _swapEndianU32(word3);
+		stdx::atomic_ref<uint64le> atomicRef(*(uint64le*)memPtr);
+		atomicRef.store(((uint64le)word3 << 32) | word2);
 	}
 	else
 		cemu_assert_unimplemented();
 	return cmd;
 }

+// Handles the IT_EVENT_WRITE_EOP packet (5 payload words).
+// If the packet has the form emitted by TCLSubmitRetireMarker (word0 == 0x504 and bit 0x40000000 set in word2),
+// the 64-bit value word4:word3 is stored atomically at the physical address given by word1.
+// Any other form hits cemu_assert_unimplemented().
+// TCL::TCLGPUNotifyNewRetirementTimestamp() is called in every case.
+// Returns the command pointer advanced past the five payload words.
+LatteCMDPtr LatteCP_itEventWriteEOP(LatteCMDPtr cmd, uint32 nWords)
+{
+	cemu_assert_debug(nWords == 5);
+	uint32 word0 = LatteReadCMD(); // event type and event index
+	uint32 word1 = LatteReadCMD(); // address the value is written to
+	uint32 word2 = LatteReadCMD(); // flags (0x40000000 is required for the write below, 0x2000000 requests an interrupt)
+	uint32 word3 = LatteReadCMD(); // value low bits
+	uint32 word4 = LatteReadCMD(); // value high bits
+
+	cemu_assert_debug(word2 == 0x40000000 || word2 == 0x42000000);
+
+	if (word0 == 0x504 && (word2&0x40000000)) // todo - figure out the flags
+	{
+		// store the value as one big-endian 64-bit write
+		stdx::atomic_ref<uint64be> atomicRef(*(uint64be*)memory_getPointerFromPhysicalOffset(word1));
+		uint64 val = ((uint64)word4 << 32) | word3;
+		atomicRef.store(val);
+	}
+	else
+	{	cemu_assert_unimplemented();
+	}
+	bool triggerInterrupt = (word2 & 0x2000000) != 0;
+	if (triggerInterrupt)
+	{
+		// todo - timestamp interrupt
+	}
+	// signal the retirement event so that waiters re-check the marker
+	TCL::TCLGPUNotifyNewRetirementTimestamp();
+	return cmd;
+}

 LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
 {
@ -783,16 +760,6 @@ LatteCMDPtr LatteCP_itDrawImmediate(LatteCMDPtr cmd, uint32 nWords, DrawPassCont

 	drawPassCtx.executeDraw(count, false, _tempIndexArrayMPTR);
 	return cmd;
-
-}
-
-LatteCMDPtr LatteCP_itHLEFifoWrapAround(LatteCMDPtr cmd, uint32 nWords)
-{
-	cemu_assert_debug(nWords == 1);
-	uint32 unused = LatteReadCMD();
-	gxRingBufferReadPtr = gx2WriteGatherPipe.gxRingBuffer;
-	cmd = (LatteCMDPtr)gxRingBufferReadPtr;
-	return cmd;
 }

 LatteCMDPtr LatteCP_itHLESampleTimer(LatteCMDPtr cmd, uint32 nWords)
@ -819,16 +786,6 @@ LatteCMDPtr LatteCP_itHLESpecialState(LatteCMDPtr cmd, uint32 nWords)
 	return cmd;
 }

-LatteCMDPtr LatteCP_itHLESetRetirementTimestamp(LatteCMDPtr cmd, uint32 nWords)
-{
-	cemu_assert_debug(nWords == 2);
-	uint32 timestampHigh = (uint32)LatteReadCMD();
-	uint32 timestampLow = (uint32)LatteReadCMD();
-	uint64 timestamp = ((uint64)timestampHigh << 32ULL) | (uint64)timestampLow;
-	GX2::__GX2NotifyNewRetirementTimestamp(timestamp);
-	return cmd;
-}
-
 LatteCMDPtr LatteCP_itHLEBeginOcclusionQuery(LatteCMDPtr cmd, uint32 nWords)
 {
 	cemu_assert_debug(nWords == 1);
@ -1145,9 +1102,10 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
 		LatteCMDPtr cmd, cmdStart, cmdEnd;
 		if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd))
 			break;
+		uint32 itHeader;
 		while (cmd < cmdEnd)
 		{
-			uint32 itHeader = LatteReadCMD();
+			itHeader = LatteReadCMD();
 			uint32 itHeaderType = (itHeader >> 30) & 3;
 			if (itHeaderType == 3)
 			{
@ -1361,11 +1319,6 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
 					LatteCP_itHLEEndOcclusionQuery(cmdData, nWords);
 					break;
 				}
-				case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
-				{
-					LatteCP_itHLESetRetirementTimestamp(cmdData, nWords);
-					break;
-				}
 				case IT_HLE_BOTTOM_OF_PIPE_CB:
 				{
 					LatteCP_itHLEBottomOfPipeCB(cmdData, nWords);
@ -1421,6 +1374,7 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
 void LatteCP_ProcessRingbuffer()
 {
 	sint32 timerRecheck = 0; // estimates how much CP processing time has elapsed based on the executed commands, if the value exceeds CP_TIMER_RECHECK then _handleTimers() is called
+	uint32be tmpBuffer[128];
 	while (true)
 	{
 		uint32 itHeader = LatteCP_readU32Deprc();
@ -1429,10 +1383,13 @@ void LatteCP_ProcessRingbuffer()
 		{
 			uint32 itCode = (itHeader >> 8) & 0xFF;
 			uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1;
-			LatteCP_waitForNWords(nWords);
-			LatteCMDPtr cmd = (LatteCMDPtr)gxRingBufferReadPtr;
-			uint8* cmdEnd = gxRingBufferReadPtr + nWords * 4;
-			gxRingBufferReadPtr = cmdEnd;
+			cemu_assert(nWords < 128);
+			for (sint32 i=0; i<nWords; i++)
+			{
+				uint32 word = LatteCP_readU32Deprc();
+				tmpBuffer[i] = word;
+			}
+			LatteCMDPtr cmd = (LatteCMDPtr)tmpBuffer;
 			switch (itCode)
 			{
 			case IT_SURFACE_SYNC:
@ -1599,6 +1556,11 @@ void LatteCP_ProcessRingbuffer()
 				timerRecheck += CP_TIMER_RECHECK / 512;
 				break;
 			}
+			case IT_EVENT_WRITE_EOP:
+			{
+				LatteCP_itEventWriteEOP(cmd, nWords);
+				break;
+			}
 			case IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER:
 			{
 				LatteCP_itHLECopyColorBufferToScanBuffer(cmd, nWords);
@ -1637,12 +1599,6 @@ void LatteCP_ProcessRingbuffer()
 				timerRecheck += CP_TIMER_RECHECK / 128;
 				break;
 			}
-			case IT_HLE_FIFO_WRAP_AROUND:
-			{
-				LatteCP_itHLEFifoWrapAround(cmd, nWords);
-				timerRecheck += CP_TIMER_RECHECK / 512;
-				break;
-			}
 			case IT_HLE_SAMPLE_TIMER:
 			{
 				LatteCP_itHLESampleTimer(cmd, nWords);
@ -1667,12 +1623,6 @@ void LatteCP_ProcessRingbuffer()
 				timerRecheck += CP_TIMER_RECHECK / 512;
 				break;
 			}
-			case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
-			{
-				LatteCP_itHLESetRetirementTimestamp(cmd, nWords);
-				timerRecheck += CP_TIMER_RECHECK / 512;
-				break;
-			}
 			case IT_HLE_BOTTOM_OF_PIPE_CB:
 			{
 				LatteCP_itHLEBottomOfPipeCB(cmd, nWords);
@ -1933,11 +1883,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
 				cemuLog_log(LogType::Force, "{} IT_HLE_COPY_SURFACE_NEW", strPrefix);
 				break;
 			}
-			case IT_HLE_FIFO_WRAP_AROUND:
-			{
-				cemuLog_log(LogType::Force, "{} IT_HLE_FIFO_WRAP_AROUND", strPrefix);
-				break;
-			}
 			case IT_HLE_SAMPLE_TIMER:
 			{
 				cemuLog_log(LogType::Force, "{} IT_HLE_SAMPLE_TIMER", strPrefix);
@ -1958,11 +1903,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
 				cemuLog_log(LogType::Force, "{} IT_HLE_END_OCCLUSION_QUERY", strPrefix);
 				break;
 			}
-			case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
-			{
-				cemuLog_log(LogType::Force, "{} IT_HLE_SET_CB_RETIREMENT_TIMESTAMP", strPrefix);
-				break;
-			}
 			case IT_HLE_BOTTOM_OF_PIPE_CB:
 			{
 				cemuLog_log(LogType::Force, "{} IT_HLE_BOTTOM_OF_PIPE_CB", strPrefix);
--- a/src/Cafe/HW/Latte/Core/LattePM4.h
+++ b/src/Cafe/HW/Latte/Core/LattePM4.h
@ -14,6 +14,7 @@
 #define IT_MEM_WRITE				0x3D
 #define IT_SURFACE_SYNC				0x43
 #define IT_EVENT_WRITE				0x46
+#define IT_EVENT_WRITE_EOP			0x47	// end of pipe

 #define IT_LOAD_CONFIG_REG			0x60
 #define IT_LOAD_CONTEXT_REG			0x61
@ -47,14 +48,12 @@
 #define IT_HLE_WAIT_FOR_FLIP					0xF1
 #define IT_HLE_BOTTOM_OF_PIPE_CB				0xF2
 #define IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER	0xF3
-#define IT_HLE_FIFO_WRAP_AROUND					0xF4
 #define IT_HLE_CLEAR_COLOR_DEPTH_STENCIL		0xF5
 #define IT_HLE_SAMPLE_TIMER						0xF7
 #define IT_HLE_TRIGGER_SCANBUFFER_SWAP			0xF8
 #define IT_HLE_SPECIAL_STATE					0xF9
 #define IT_HLE_BEGIN_OCCLUSION_QUERY			0xFA
 #define IT_HLE_END_OCCLUSION_QUERY				0xFB
-#define IT_HLE_SET_CB_RETIREMENT_TIMESTAMP		0xFD

 #define pm4HeaderType3(__itCode, __dataDWordCount) (0xC0000000|((uint32)(__itCode)<<8)|((uint32)((__dataDWordCount)-1)<<16))
 #define pm4HeaderType2Filler() (0x80000000)
--- a/src/Cafe/HW/Latte/Core/LatteThread.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteThread.cpp
@ -207,7 +207,6 @@ int Latte_ThreadEntry()
 		if (Latte_GetStopSignal())
 			LatteThread_Exit();
 	}
-	gxRingBufferReadPtr = gx2WriteGatherPipe.gxRingBuffer;
 	LatteCP_ProcessRingbuffer();
 	cemu_assert_debug(false); // should never reach
 	return 0;
--- a/src/Cafe/OS/libs/TCL/TCL.cpp
+++ b/src/Cafe/OS/libs/TCL/TCL.cpp
@ -1,28 +1,161 @@
 #include "Cafe/OS/common/OSCommon.h"
 #include "Cafe/OS/libs/TCL/TCL.h"

+#include "HW/Latte/Core/LattePM4.h"
+
 namespace TCL
 {
+	SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
+	uint64 s_currentRetireMarker = 0;

-	enum class TCL_SUBMISSION_FLAG : uint32
+	struct TCLStatePPC // mapped into PPC space
 	{
-		SURFACE_SYNC = 0x400000, // submit surface sync packet before cmd
-		TRIGGER_INTERRUPT = 0x200000, // probably
-		UKN_20000000 = 0x20000000,
+		uint64be gpuRetireMarker; // written by GPU
 	};

-	int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, uint32be* controlFlags, uint64* submissionTimestamp)
+	SysAllocator<TCLStatePPC> s_tclStatePPC;
+
+	// called from GPU for timestamp EOP event
+	void TCLGPUNotifyNewRetirementTimestamp()
 	{
-		// todo - figure out all the bits of *controlFlags
-		// if submissionTimestamp != nullptr then set it to the timestamp of the submission. Note: We should make sure that uint64's are written atomically by the GPU command processor
+		// gpuRetireMarker is updated via event eop command
+		__OSLockScheduler();
+		coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
+		__OSUnlockScheduler();
+	}

-		cemu_assert_debug(false);
+	// Writes the current value of the requested timestamp to *timestampOut.
+	// Only TIMESTAMP_LAST_BUFFER_RETIRED is supported. For any other id a message is logged and 0 is written.
+	// Always returns 0.
+	int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut)
+	{
+		if (id == TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
+		{
+			// this is the timestamp of the last buffer that was retired by the GPU
+			stdx::atomic_ref<uint64be> retireTimestamp(s_tclStatePPC->gpuRetireMarker);
+			*timestampOut = retireTimestamp.load();
+			return 0;
+		}
+		else
+		{
+			cemuLog_log(LogType::Force, "TCLTimestamp(): Unsupported timestamp ID {}", (uint32)id);
+			*timestampOut = 0;
+			return 0;
+		}
+	}

+	// Waits until the requested timestamp has reached waitTs.
+	// For TIMESTAMP_LAST_BUFFER_RETIRED the retire marker is compared against waitTs, and while it is still lower
+	// the caller waits on s_updateRetirementEvent and then checks again.
+	// Any other id is logged as unsupported and not waited on.
+	// NOTE(review): the timeout parameter is not referenced in the body, so the wait has no upper bound - confirm whether this is intended.
+	// Always returns 0.
+	int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout)
+	{
+		if (id == TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
+		{
+			while ( true )
+			{
+				stdx::atomic_ref<uint64be> retireTimestamp(s_tclStatePPC->gpuRetireMarker);
+				uint64 currentTimestamp = retireTimestamp.load();
+				if (currentTimestamp >= waitTs)
+					return 0;
+				// marker not reached yet, wait for the next retirement notification
+				coreinit::OSWaitEvent(s_updateRetirementEvent.GetPtr());
+			}
+		}
+		else
+		{
+			cemuLog_log(LogType::Force, "TCLWaitTimestamp(): Unsupported timestamp ID {}", (uint32)id);
+		}
+		return 0;
+	}
+
+	static constexpr uint32 TCL_RING_BUFFER_SIZE = 4096; // in U32s
+	static constexpr uint32 TCL_RING_BUFFER_MASK = TCL_RING_BUFFER_SIZE - 1;
+	static_assert((TCL_RING_BUFFER_SIZE & TCL_RING_BUFFER_MASK) == 0, "TCL_RING_BUFFER_SIZE must be a power of two");
+
+	// Command ring buffer. Filled by TCLWriteCmd (called from TCLSubmitToRing) and drained by TCLGPUReadRBWord
+	// (called from the Latte command processor). One slot always stays unused so that a full buffer can be told
+	// apart from an empty one.
+	// Both indices are atomic: the reader must never observe a write index that is ahead of the stored data,
+	// and the write index must never be visible with an out-of-range intermediate value.
+	std::atomic<uint32> tclRingBufferA[TCL_RING_BUFFER_SIZE];
+	std::atomic<uint32> tclRingBufferA_readIndex{0};
+	std::atomic<uint32> tclRingBufferA_writeIndex{0};
+
+	// GPU code calls this to grab the next command word
+	// Returns false (and leaves cmdWord untouched) if the ring buffer is empty
+	bool TCLGPUReadRBWord(uint32& cmdWord)
+	{
+		const uint32 readIndex = tclRingBufferA_readIndex.load();
+		if (readIndex == tclRingBufferA_writeIndex.load())
+			return false;
+		cmdWord = tclRingBufferA[readIndex].load();
+		tclRingBufferA_readIndex.store((readIndex + 1) & TCL_RING_BUFFER_MASK);
+		return true;
+	}
+
+	// Spins until at least numU32s words can be written to the ring buffer
+	void TCLWaitForRBSpace(uint32 numU32s)
+	{
+		// at most TCL_RING_BUFFER_SIZE-1 words fit, a larger request would spin forever
+		cemu_assert_debug(numU32s < TCL_RING_BUFFER_SIZE);
+		while ( true )
+		{
+			const uint32 readIndex = tclRingBufferA_readIndex.load();
+			const uint32 writeIndex = tclRingBufferA_writeIndex.load();
+			uint32 distance = (readIndex + TCL_RING_BUFFER_SIZE - writeIndex) & TCL_RING_BUFFER_MASK;
+			if (distance == 0) // buffer completely empty
+				distance = TCL_RING_BUFFER_SIZE;
+			if (distance >= numU32s+1) // assume distance minus one, because we are never allowed to completely wrap around
+				break;
+			_mm_pause();
+		}
+	}
+
+	// this function assumes that TCLWaitForRBSpace was called and that there is enough space
+	void TCLWriteCmd(uint32be* cmd, uint32 cmdLen)
+	{
+		uint32 writeIndex = tclRingBufferA_writeIndex.load();
+		for (uint32 i = 0; i < cmdLen; i++)
+		{
+			const uint32 word = cmd[i];
+			tclRingBufferA[writeIndex].store(word);
+			writeIndex = (writeIndex + 1) & TCL_RING_BUFFER_MASK;
+			// publish the word only after it has been stored
+			tclRingBufferA_writeIndex.store(writeIndex);
+		}
+	}
+
+	// event type value placed in bits 8-15 of the first payload word of IT_EVENT_WRITE_EOP
+	#define EVENT_TYPE_TS		5
+
+	// Increments s_currentRetireMarker and writes a 6-word IT_EVENT_WRITE_EOP packet to the ring buffer.
+	// The packet targets s_tclStatePPC->gpuRetireMarker and carries the new marker value.
+	// triggerEventInterrupt: if true, the interrupt bit (0x2000000) is set in the packet.
+	// No space check is done here; TCLWriteCmd assumes the space was already reserved via TCLWaitForRBSpace.
+	void TCLSubmitRetireMarker(bool triggerEventInterrupt)
+	{
+		s_currentRetireMarker++;
+		uint32be cmd[6];
+		cmd[0] = pm4HeaderType3(IT_EVENT_WRITE_EOP, 5);
+		cmd[1] = (4 | (EVENT_TYPE_TS << 8)); // event type (bits 8-15) and event index (bits 0-7).
+		cmd[2] = MEMPTR<void>(&s_tclStatePPC->gpuRetireMarker).GetMPTR(); // address lower 32bits + data sel bits
+		cmd[3] = 0x40000000; // select 64bit write, lower 16 bits are the upper bits of the address
+		if (triggerEventInterrupt)
+			cmd[3] |= 0x2000000; // trigger interrupt after value has been written
+		cmd[4] = (uint32)s_currentRetireMarker; // data lower 32 bits
+		cmd[5] = (uint32)(s_currentRetireMarker>>32); // data higher 32 bits
+		TCLWriteCmd(cmd, 6);
+	}
+
+	// Submits a command buffer to the ring buffer, followed by a retire marker.
+	// cmd / cmdLen: command words and their count (in U32s).
+	// controlFlags: submission flags. Only USE_RETIRED_MARKER (optionally with NO_MARKER_INTERRUPT) is implemented;
+	//               without USE_RETIRED_MARKER cemu_assert_unimplemented() is hit.
+	// timestampValueOut: if not null, receives the retire marker value assigned to this submission.
+	// Always returns 0.
+	int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype<TCLSubmissionFlag>* controlFlags, uint64be* timestampValueOut)
+	{
+		TCLSubmissionFlag flags = *controlFlags;
+
+		// make sure there is enough space to submit all commands at once
+		uint32 totalCommandLength = cmdLen;
+		totalCommandLength += 6; // space needed for TCLSubmitRetireMarker
+
+		TCLWaitForRBSpace(totalCommandLength);
+
+		// submit command buffer
+		TCLWriteCmd(cmd, cmdLen);
+
+		// create new marker timestamp and tell GPU to write it to our variable after its done processing the command
+		if ((HAS_FLAG(flags, TCLSubmissionFlag::USE_RETIRED_MARKER)))
+		{
+			TCLSubmitRetireMarker(!HAS_FLAG(flags, TCLSubmissionFlag::NO_MARKER_INTERRUPT));
+			// the timestamp is only reported back if the caller provided a destination
+			if (timestampValueOut)
+				*timestampValueOut = s_currentRetireMarker; // incremented before each submit
+		}
+		else
+		{
+			cemu_assert_unimplemented();
+		}
 		return 0;
 	}

+	// Registers the TCL exports and resets the retire marker state:
+	// both the host-side counter and the PPC-visible marker are set to 0, and the retirement event is
+	// initialized as not signaled in auto mode.
 	void Initialize()
 	{
 		cafeExportRegister("TCL", TCLSubmitToRing, LogType::Placeholder);
+		cafeExportRegister("TCL", TCLTimestamp, LogType::Placeholder);
+		cafeExportRegister("TCL", TCLWaitTimestamp, LogType::Placeholder);
+
+		s_currentRetireMarker = 0;
+		s_tclStatePPC->gpuRetireMarker = 0;
+		coreinit::OSInitEvent(s_updateRetirementEvent.GetPtr(), coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
 	}
 }
--- a/src/Cafe/OS/libs/TCL/TCL.h
+++ b/src/Cafe/OS/libs/TCL/TCL.h
@ -1,4 +1,25 @@
 namespace TCL
 {
+	// selects which timestamp TCLTimestamp / TCLWaitTimestamp operate on
+	enum class TCLTimestampId
+	{
+		TIMESTAMP_LAST_BUFFER_RETIRED = 1, // timestamp of the last buffer that was retired by the GPU
+	};
+
+	// control flags passed to TCLSubmitToRing
+	enum class TCLSubmissionFlag : uint32
+	{
+		SURFACE_SYNC = 0x400000, // submit surface sync packet before cmd
+		NO_MARKER_INTERRUPT = 0x200000, // when set, the retire marker is submitted without requesting an interrupt
+		USE_RETIRED_MARKER = 0x20000000, // Controls whether the timer is updated before or after (retired) the cmd. Also controls which timestamp is returned for the submission. Before and after using separate counters
+	};
+
+	// API registered as TCL exports in Initialize()
+	int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut);
+	int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout);
+	int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype<TCLSubmissionFlag>* controlFlags, uint64be* timestampValueOut);
+
+	// called from Latte code
+	bool TCLGPUReadRBWord(uint32& cmdWord); // fetches the next ring buffer word, returns false if none is available
+	void TCLGPUNotifyNewRetirementTimestamp(); // signals the event that TCLWaitTimestamp waits on
+
 	void Initialize();
 }
+ENABLE_BITMASK_OPERATORS(TCL::TCLSubmissionFlag);
--- a/src/Cafe/OS/libs/gx2/GX2.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2.cpp
@ -59,7 +59,7 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU)
 	if (isPokken)
 		GX2::GX2DrawDone();

-	GX2ReserveCmdSpace(5+2);
+	GX2::GX2ReserveCmdSpace(5+2);

 	uint64 tick64 = PPCInterpreter_getMainCoreCycleCounter() / 20ULL;
 	lastSwapTime = tick64;
@ -86,24 +86,16 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU)
 		GX2::GX2WaitForFlip();
 	}

-	GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
 	osLib_returnFromFunction(hCPU, 0);
 }

 void gx2Export_GX2CopyColorBufferToScanBuffer(PPCInterpreter_t* hCPU)
 {
 	cemuLog_log(LogType::GX2, "GX2CopyColorBufferToScanBuffer(0x{:08x},{})", hCPU->gpr[3], hCPU->gpr[4]);
-	GX2ReserveCmdSpace(5);
+	GX2::GX2ReserveCmdSpace(10);

 	// todo: proper implementation

-	// hack: Avoid running to far ahead of GPU. Normally this would be guaranteed by the circular buffer model, which we currently dont fully emulate
-	if(GX2::GX2WriteGather_getReadWriteDistance() > 32*1024*1024 )
-	{
-		debug_printf("Waiting for GPU to catch up...\n");
-		PPCInterpreter_relinquishTimeslice(); // release current thread
-		return;
-	}
 	GX2ColorBuffer* colorBuffer = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);

 	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER, 9));
@ -309,81 +301,6 @@ void gx2Export_GX2SetSemaphore(PPCInterpreter_t* hCPU)
 	osLib_returnFromFunction(hCPU, 0);
 }

-void gx2Export_GX2Flush(PPCInterpreter_t* hCPU)
-{
-	cemuLog_log(LogType::GX2, "GX2Flush()");
-	_GX2SubmitToTCL();
-	osLib_returnFromFunction(hCPU, 0);
-}
-
-uint8* _GX2LastFlushPtr[PPC_CORE_COUNT] = {NULL};
-
-uint64 _prevReturnedGPUTime = 0;
-
-uint64 Latte_GetTime()
-{
-	uint64 gpuTime = coreinit::OSGetSystemTime();
-	gpuTime *= 20000ULL;
-	if (gpuTime <= _prevReturnedGPUTime)
-		gpuTime = _prevReturnedGPUTime + 1; // avoid ever returning identical timestamps
-	_prevReturnedGPUTime = gpuTime;
-	return gpuTime;
-}
-
-void _GX2SubmitToTCL()
-{
-	uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
-	// do nothing if called from non-main GX2 core
-	if (GX2::sGX2MainCoreIndex != coreIndex)
-	{
-		cemuLog_logDebug(LogType::Force, "_GX2SubmitToTCL() called on non-main GX2 core");
-		return;
-	}
-	if( gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL )
-		return; // quit if in display list
-	_GX2LastFlushPtr[coreIndex] = (gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
-	// update last submitted CB timestamp
-	uint64 commandBufferTimestamp = Latte_GetTime();
-	LatteGPUState.lastSubmittedCommandBufferTimestamp.store(commandBufferTimestamp);
-	cemuLog_log(LogType::GX2, "Submitting GX2 command buffer with timestamp {:016x}", commandBufferTimestamp);
-	// submit HLE packet to write retirement timestamp
-	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SET_CB_RETIREMENT_TIMESTAMP, 2));
-	gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp>>32ULL));
-	gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp&0xFFFFFFFFULL));
-}
-
-uint32 _GX2GetUnflushedBytes(uint32 coreIndex)
-{
-	uint32 unflushedBytes = 0;
-	if (_GX2LastFlushPtr[coreIndex] != NULL)
-	{
-		if (_GX2LastFlushPtr[coreIndex] > gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex])
-			unflushedBytes = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer + 4); // this isn't 100% correct since we ignore the bytes between the last flush address and the start of the wrap around
-		else
-			unflushedBytes = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - _GX2LastFlushPtr[coreIndex]);
-	}
-	else
-		unflushedBytes = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer);
-	return unflushedBytes;
-}
-
-/*
- * Guarantees that the requested amount of space is available on the current command buffer
- * If the space is not available, the current command buffer is pushed to the GPU and a new one is allocated
- */
-void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32)
-{
-	uint32 coreIndex = coreinit::OSGetCoreId();
-	// if we are in a display list then do nothing
-	if( gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL )
-		return;
-	uint32 unflushedBytes = _GX2GetUnflushedBytes(coreIndex);
-	if( unflushedBytes >= 0x1000 )
-	{
-		_GX2SubmitToTCL();
-	}
-}
-
 void gx2_load()
 {
 	osLib_addFunction("gx2", "GX2GetContextStateDisplayList", gx2Export_GX2GetContextStateDisplayList);
@ -445,10 +362,6 @@ void gx2_load()
 	// semaphore
 	osLib_addFunction("gx2", "GX2SetSemaphore", gx2Export_GX2SetSemaphore);

-	// command buffer
-	osLib_addFunction("gx2", "GX2Flush", gx2Export_GX2Flush);
-
-	GX2::GX2Init_writeGather();
 	GX2::GX2MemInit();
 	GX2::GX2ResourceInit();
 	GX2::GX2CommandInit();
--- a/src/Cafe/OS/libs/gx2/GX2.h
+++ b/src/Cafe/OS/libs/gx2/GX2.h
@ -68,9 +68,3 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU);
 void gx2Export_GX2SetDefaultState(PPCInterpreter_t* hCPU);
 void gx2Export_GX2SetupContextStateEx(PPCInterpreter_t* hCPU);
 void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU);
-
-// command buffer
-
-uint32 _GX2GetUnflushedBytes(uint32 coreIndex);
-void _GX2SubmitToTCL();
-void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32);
--- a/src/Cafe/OS/libs/gx2/GX2_Blit.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_Blit.cpp
@ -132,7 +132,6 @@ namespace GX2
 			depthFirstSlice = _swapEndianU32(depthBuffer->viewFirstSlice);
 			depthNumSlices = _swapEndianU32(depthBuffer->viewNumSlices);
 		}
-
 		gx2WriteGather_submit(pm4HeaderType3(IT_HLE_CLEAR_COLOR_DEPTH_STENCIL, 23),
 		hleClearFlags,
 		colorPhysAddr,
--- a/src/Cafe/OS/libs/gx2/GX2_Command.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_Command.cpp
@ -4,178 +4,397 @@
 #include "Cafe/HW/Latte/Core/LattePM4.h"
 #include "Cafe/OS/libs/coreinit/coreinit.h"
 #include "Cafe/OS/libs/coreinit/coreinit_Thread.h"
+#include "Cafe/OS/libs/TCL/TCL.h"
 #include "Cafe/HW/Latte/ISA/RegDefines.h"
 #include "GX2.h"
 #include "GX2_Command.h"
 #include "GX2_Shader.h"
 #include "GX2_Misc.h"
+#include "OS/libs/coreinit/coreinit_MEM.h"

-extern uint8* gxRingBufferReadPtr;
-
-GX2WriteGatherPipeState gx2WriteGatherPipe = { 0 };
+namespace GX2
+{
+	GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT];
+}

 void gx2WriteGather_submitU32AsBE(uint32 v)
 {
 	uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
-	if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
+	if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
 		return;
-	*(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = _swapEndianU32(v);
-	(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4;
+	*(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = _swapEndianU32(v);
+	GX2::s_perCoreCBState[coreIndex].currentWritePtr++;
+	cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
 }

 void gx2WriteGather_submitU32AsLE(uint32 v)
 {
 	uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
-	if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
+	if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
 		return;
-	*(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = v;
-	(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4;
+	*(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = v;
+	GX2::s_perCoreCBState[coreIndex].currentWritePtr++;
+	cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
 }

 void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues)
 {
 	uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
-	if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
+	if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
 		return;
-	memcpy_dwords((*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]), v, numValues);
-	(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4 * numValues;
+	memcpy_dwords(GX2::s_perCoreCBState[coreIndex].currentWritePtr, v, numValues);
+	GX2::s_perCoreCBState[coreIndex].currentWritePtr += numValues;
+	cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
 }

 namespace GX2
 {
-	sint32 gx2WriteGatherCurrentMainCoreIndex = -1;
-	bool gx2WriteGatherInited = false;

-    void GX2WriteGather_ResetToDefaultState()
+	struct GX2CommandState // mapped to PPC space since the GPU writes here
    {
-        gx2WriteGatherCurrentMainCoreIndex = -1;
-        gx2WriteGatherInited = false;
-    }
+    	// command pool
+		MEMPTR<uint32be> commandPoolBase;
+    	uint32 commandPoolSizeInU32s;
+		MEMPTR<uint32be> gpuCommandReadPtr;
+		// timestamp
+		uint64be lastSubmissionTime;
+    };

-	void GX2Init_writeGather() // init write gather, make current core 
+	SysAllocator<GX2CommandState> s_commandState;
+	GX2PerCoreCBState s_mainCoreLastCommandState;
+	bool s_cbBufferIsInternallyAllocated;
+
+	void GX2Command_StartNewCommandBuffer(uint32 numU32s);
+
+	// called from GX2Init. Allocates a 4MB memory chunk from which command buffers are suballocated from
+	void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize)
 	{
-		if (gx2WriteGatherPipe.gxRingBuffer == NULL)
-			gx2WriteGatherPipe.gxRingBuffer = (uint8*)malloc(GX2_COMMAND_RING_BUFFER_SIZE);
-		if (gx2WriteGatherCurrentMainCoreIndex == sGX2MainCoreIndex)
-			return; // write gather already configured for same core
-		for (sint32 i = 0; i < PPC_CORE_COUNT; i++)
+		cemu_assert_debug(!s_commandState->commandPoolBase); // should not be allocated already
+    	// setup command buffer pool. If not provided allocate a 4MB or custom size buffer
+		uint32 poolSize = bufferSize ? bufferSize : 0x400000; // 4MB (can be overwritten by custom GX2Init parameters?)
+		if (bufferBase)
 		{
-			if (i == sGX2MainCoreIndex)
-			{
-				gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = gx2WriteGatherPipe.gxRingBuffer;
-				gx2WriteGatherPipe.writeGatherPtrWrite[i] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[i];
+			s_commandState->commandPoolBase = (uint32be*)bufferBase;
+			s_cbBufferIsInternallyAllocated = false;
 		}
 		else
 		{
-				gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = NULL;
-				gx2WriteGatherPipe.writeGatherPtrWrite[i] = NULL;
+			s_commandState->commandPoolBase = (uint32be*)coreinit::_weak_MEMAllocFromDefaultHeapEx(poolSize, 0x100);
+			s_cbBufferIsInternallyAllocated = true;
 		}
-			gx2WriteGatherPipe.displayListStart[i] = MPTR_NULL;
-			gx2WriteGatherPipe.writeGatherPtrDisplayList[i] = NULL;
-			gx2WriteGatherPipe.displayListMaxSize[i] = 0;
+		if (!s_commandState->commandPoolBase)
+		{
+			cemuLog_log(LogType::Force, "GX2: Failed to allocate command buffer pool");
+		}
+		s_commandState->commandPoolSizeInU32s = poolSize / sizeof(uint32be);
+		s_commandState->gpuCommandReadPtr = s_commandState->commandPoolBase;
+		// init per-core command buffer state
+		for (uint32 i = 0; i < Espresso::CORE_COUNT; i++)
+		{
+			s_perCoreCBState[i].bufferPtr = nullptr;
+			s_perCoreCBState[i].bufferSizeInU32s = 0;
+			s_perCoreCBState[i].currentWritePtr = nullptr;
+		}
+		// start first command buffer for main core
+		GX2Command_StartNewCommandBuffer(0x100);
+	}
+
+	// Tears down the command buffer pool state set up by GX2Init_commandBufferPool. Does nothing if no pool base is set
+	void GX2Shutdown_commandBufferPool()
+	{
+		if (!s_commandState->commandPoolBase)
+			return;
+		// only free the pool memory if it was allocated by GX2Init_commandBufferPool. A caller-provided buffer is not freed here
+		if (s_cbBufferIsInternallyAllocated)
+			coreinit::_weak_MEMFreeToDefaultHeap(s_commandState->commandPoolBase.GetPtr());
+		s_cbBufferIsInternallyAllocated = false;
+		// clear the pool description so that a later init starts from a clean state
+		s_commandState->commandPoolBase = nullptr;
+		s_commandState->commandPoolSizeInU32s = 0;
+		s_commandState->gpuCommandReadPtr = nullptr;
+	}
+
+	// current position of where the GPU is reading from. Updated via a memory write command submitted to the GPU
+	// returns the position as an index in u32 words relative to the start of the command pool
+	uint32 GX2Command_GetPoolGPUReadIndex()
+	{
+		// gpuCommandReadPtr is read through an atomic_ref since it is not only written by this code (see comment above)
+		stdx::atomic_ref<MEMPTR<uint32be>> _readPtr(s_commandState->gpuCommandReadPtr);
+		MEMPTR<uint32be> currentReadPtr = _readPtr.load();
+		cemu_assert_debug(currentReadPtr);
+		return (uint32)(currentReadPtr.GetPtr() - s_commandState->commandPoolBase.GetPtr());
+	}
+
+	// Waits until the retired timestamp has advanced by one relative to its current value
+	// The target is clamped to the last submission timestamp so we never wait for a timestamp that has not been submitted yet
+	void GX2Command_WaitForNextBufferRetired()
+	{
+		uint64 retiredTimeStamp = GX2GetRetiredTimeStamp();
+		retiredTimeStamp += 1;
+		// but can't be higher than the submission timestamp
+		stdx::atomic_ref<uint64be> _lastSubmissionTime(s_commandState->lastSubmissionTime);
+		uint64 submissionTimeStamp = _lastSubmissionTime.load();
+		if (retiredTimeStamp > submissionTimeStamp)
+			retiredTimeStamp = submissionTimeStamp;
+		GX2WaitTimeStamp(retiredTimeStamp);
+	}
+
+	// Makes the given buffer the write gather target of the calling core
+	// buffer: start of the buffer, sizeInU32s: capacity in u32 words, isDisplayList: true if the buffer is a display list instead of a pool command buffer
+	// The write pointer is reset to the start of the buffer
+	void GX2Command_SetupCoreCommandBuffer(uint32be* buffer, uint32 sizeInU32s, bool isDisplayList)
+	{
+		uint32 coreIndex = coreinit::OSGetCoreId();
+		auto& coreCBState = s_perCoreCBState[coreIndex];
+		coreCBState.bufferPtr = buffer;
+		coreCBState.bufferSizeInU32s = sizeInU32s;
+		coreCBState.currentWritePtr = buffer;
+		coreCBState.isDisplayList = isDisplayList;
+	}
+
+	// Suballocates a new command buffer from the command pool and makes it the write gather target of the calling core
+	// numU32s: requested size in u32 words. Raised to a minimum of 0x100
+	// If the pool has no suitable free range this loops and waits for the GPU to retire buffers until one becomes available
+	void GX2Command_StartNewCommandBuffer(uint32 numU32s)
+	{
+		uint32 coreIndex = coreinit::OSGetCoreId();
+		auto& coreCBState = s_perCoreCBState[coreIndex];
+		numU32s = std::max<uint32>(numU32s, 0x100);
+		// grab space from command buffer pool and if necessary wait for it
+		uint32be* bufferPtr = nullptr;
+		uint32 bufferSizeInU32s = 0;
+		uint32 readIndex;
+		while (true)
+		{
+			// try to grab buffer data from first available spot:
+			// 1. At the current write location up to the end of the buffer (avoiding an overlap with the read location)
+			// 2. From the start of the buffer up to the read location
+			readIndex = GX2Command_GetPoolGPUReadIndex();
+			// the new buffer starts directly behind the core's current buffer, or at the pool base if the core has no buffer yet
+			uint32be* nextWritePos = coreCBState.bufferPtr ? coreCBState.bufferPtr + coreCBState.bufferSizeInU32s : s_commandState->commandPoolBase.GetPtr();
+			uint32 writeIndex = nextWritePos - s_commandState->commandPoolBase;
+			uint32 poolSizeInU32s = s_commandState->commandPoolSizeInU32s;
+			// readIndex == writeIndex can mean either buffer full or buffer empty
+			// we could use GX2GetRetiredTimeStamp() == GX2GetLastSubmittedTimeStamp() to determine if the buffer is truly empty
+			// but this can have false negatives since the last submission timestamp is updated independently of the read index
+			// so instead we just avoid ever filling the buffer completely
+			cemu_assert_debug(readIndex < poolSizeInU32s);
+			cemu_assert_debug(writeIndex < poolSizeInU32s);
+			if (writeIndex < readIndex)
+			{
+				// writeIndex has wrapped around
+				// the only free range is the gap between writeIndex and readIndex
+				uint32 wordsAvailable = readIndex - writeIndex;
+				if (wordsAvailable > 0)
+					wordsAvailable--; // avoid writeIndex becoming equal to readIndex
+				if (wordsAvailable >= numU32s)
+				{
+					bufferPtr = s_commandState->commandPoolBase + writeIndex;
+					bufferSizeInU32s = wordsAvailable;
+					break;
+				}
+			}
+			else
+			{
+				// first candidate: the range from writeIndex up to the end of the pool
+				uint32 wordsAvailable = poolSizeInU32s - writeIndex;
+				if (wordsAvailable > 0)
+					wordsAvailable--; // avoid writeIndex becoming equal to readIndex
+				if (wordsAvailable >= numU32s)
+				{
+					bufferPtr = nextWritePos;
+					bufferSizeInU32s = wordsAvailable;
+					break;
+				}
+				// not enough space at end of buffer, try to grab from the beginning of the buffer
+				wordsAvailable = readIndex;
+				if (wordsAvailable > 0)
+					wordsAvailable--; // avoid writeIndex becoming equal to readIndex
+				if (wordsAvailable >= numU32s)
+				{
+					bufferPtr = s_commandState->commandPoolBase;
+					bufferSizeInU32s = wordsAvailable;
+					break;
+				}
+			}
+			// no range is large enough yet, wait for the GPU to make progress and then retry with the updated read index
+			GX2Command_WaitForNextBufferRetired();
+		}
+		cemu_assert_debug(bufferPtr);
+		// the final buffer size is the requested size (capped), not the full free range that was found above
+		// NOTE(review): a request larger than 0x20000 words results in a buffer smaller than requested - confirm callers never ask for more
+		bufferSizeInU32s = std::min<uint32>(numU32s, 0x20000); // size cap
+#ifdef CEMU_DEBUG_ASSERT
+		uint32 newWriteIndex = ((bufferPtr - s_commandState->commandPoolBase) + bufferSizeInU32s) % s_commandState->commandPoolSizeInU32s;
+		cemu_assert_debug(newWriteIndex != readIndex);
+#endif
+		// setup buffer and make it the current write gather target
+		cemu_assert_debug(bufferPtr >= s_commandState->commandPoolBase && (bufferPtr + bufferSizeInU32s) <= s_commandState->commandPoolBase + s_commandState->commandPoolSizeInU32s);
+		GX2Command_SetupCoreCommandBuffer(bufferPtr, bufferSizeInU32s, false);
+	}
+
+	// Submits a command buffer to the GPU by passing a small PM4 packet to TCL::TCLSubmitToRing which references the buffer as an indirect buffer
+	// buffer / sizeInU32s: the command buffer and its length in u32 words
+	// completionGPUReadPointer: optional. If set, a memory write is appended which stores the address directly behind the buffer into it
+	// triggerMarkerInterrupt: if false the submission is flagged with NO_MARKER_INTERRUPT
+	void GX2Command_SubmitCommandBuffer(uint32be* buffer, uint32 sizeInU32s, MEMPTR<uint32be>* completionGPUReadPointer, bool triggerMarkerInterrupt)
+	{
+		uint32be cmd[10];
+		uint32 cmdLen = 4; // length of the indirect buffer packet (header + 3 words)
+		cmd[0] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3);
+		cmd[1] = memory_virtualToPhysical(MEMPTR<void>(buffer).GetMPTR());
+		cmd[2] = 0x00000000; // address high bits
+		cmd[3] = sizeInU32s;
+		if (completionGPUReadPointer)
+		{
+			// append command to update completionGPUReadPointer after the GPU is done with the command buffer
+			cmd[4] = pm4HeaderType3(IT_MEM_WRITE, 4);
+			cmd[5] = memory_virtualToPhysical(MEMPTR<void>(completionGPUReadPointer).GetMPTR()) | 2;
+			cmd[6] = 0x40000;
+			cmd[7] = MEMPTR<void>(buffer + sizeInU32s).GetMPTR(); // value to write
+			cmd[8] = 0x00000000;
+			cmdLen = 9; // indirect buffer packet (4 words) + memory write packet (5 words)
+		}
+
+		betype<TCL::TCLSubmissionFlag> submissionFlags{};
+		if (!triggerMarkerInterrupt)
+			submissionFlags |= TCL::TCLSubmissionFlag::NO_MARKER_INTERRUPT;
+		submissionFlags |= TCL::TCLSubmissionFlag::USE_RETIRED_MARKER;
+
+		// NOTE(review): lastSubmissionTime is presumably updated by TCLSubmitToRing with the timestamp of this submission - confirm against TCL
+		TCL::TCLSubmitToRing(cmd, cmdLen, &submissionFlags, &s_commandState->lastSubmissionTime);
+	}
+
+	// Appends PM4 filler words to the current buffer of the calling core until the write offset is a multiple of 8 words (32 bytes)
+	// Does nothing if the core has no active write pointer
+	// NOTE(review): the padding is not checked against bufferSizeInU32s here
+	void GX2Command_PadCurrentBuffer()
+	{
+		uint32 coreIndex = coreinit::OSGetCoreId();
+		auto& coreCBState = s_perCoreCBState[coreIndex];
+		if (!coreCBState.currentWritePtr)
+			return;
+		// number of u32 words written to the buffer so far
+		uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
+		if ((writeDistance&7) != 0)
+		{
+			uint32 distanceToPad = 0x8 - (writeDistance & 0x7);
+			while (distanceToPad)
+			{
+				*coreCBState.currentWritePtr = pm4HeaderType2Filler();
+				coreCBState.currentWritePtr++;
+				distanceToPad--;
+			}
+		}
+	}
+
+	/*
+	 * Submits the current command buffer of the calling core to the GPU and starts a new one
+	 * numU32sForNextBuffer: number of u32 words the command buffer must be able to hold after this call
+	 * triggerMarkerInterrupt: forwarded to GX2Command_SubmitCommandBuffer
+	 * If the core is currently writing to a display list then nothing is submitted
+	 */
+	void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt)
+	{
+		uint32 coreIndex = coreinit::OSGetCoreId();
+		auto& coreCBState = s_perCoreCBState[coreIndex];
+		if (coreCBState.isDisplayList)
+		{
+			// display list
+			cemu_assert_debug((uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) < coreCBState.bufferSizeInU32s);
+			cemuLog_logDebugOnce(LogType::Force, "GX2 flush called on display list");
+		}
+		else
+		{
+			// command buffer
+			if (coreCBState.currentWritePtr != coreCBState.bufferPtr)
+			{
+				// pad the command buffer to 32 byte alignment
+				GX2Command_PadCurrentBuffer();
+				// submit it to the GPU
+				uint32 bufferLength = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
+				cemu_assert_debug(bufferLength <= coreCBState.bufferSizeInU32s);
+				GX2Command_SubmitCommandBuffer(coreCBState.bufferPtr, bufferLength, &s_commandState->gpuCommandReadPtr, triggerMarkerInterrupt);
+				GX2Command_StartNewCommandBuffer(numU32sForNextBuffer);
+			}
+			else
+			{
+				// current buffer is empty so we don't need to queue it
+				// but it still has to be replaced if it is too small for the requested size
+				// (compare against the size of the current buffer, not the size of the whole pool. Otherwise callers like GX2ReserveCmdSpace would not get the space they asked for)
+				if (numU32sForNextBuffer > coreCBState.bufferSizeInU32s)
+					GX2Command_StartNewCommandBuffer(numU32sForNextBuffer);
+			}
+		}
+	}
+
+	// Exported as gx2.GX2Flush. Flushes the current command buffer, requesting at least 256 words for the next buffer and with the marker interrupt enabled
+	void GX2Flush()
+	{
+		GX2Command_Flush(256, true);
+	}
+
+	// Returns the current value of s_commandState->lastSubmissionTime (the field handed to TCLSubmitToRing on every submission)
+	uint64 GX2GetLastSubmittedTimeStamp()
+	{
+		stdx::atomic_ref<uint64be> _lastSubmissionTime(s_commandState->lastSubmissionTime);
+		return _lastSubmissionTime.load();
+	}
+
+	// Returns the TIMESTAMP_LAST_BUFFER_RETIRED timestamp as queried from TCL
+	uint64 GX2GetRetiredTimeStamp()
+	{
+		uint64be ts = 0;
+		TCL::TCLTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, &ts);
+		return ts;
+	}
+
+	// Waits via TCL until the TIMESTAMP_LAST_BUFFER_RETIRED timestamp has reached tsWait
+	// Always returns true. The result of TCLWaitTimestamp is not inspected, so a timeout is not reported to the caller
+	bool GX2WaitTimeStamp(uint64 tsWait)
+	{
+		// handle GPU timeout here? But for now we timeout after 60 seconds
+		TCL::TCLWaitTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, tsWait, Espresso::TIMER_CLOCK * 60);
+		return true;
+	}
+
+	/*
+	 * Guarantees that the requested amount of space is available on the current command buffer
+	 * If the space is not available, the current command buffer is pushed to the GPU and a new one is allocated
+	 */
+	void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32)
+	{
+		uint32 coreIndex = coreinit::OSGetCoreId();
+		auto& coreCBState = s_perCoreCBState[coreIndex];
+		if (coreCBState.currentWritePtr == nullptr)
+			return;
+		uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
+		if (writeDistance + reservedFreeSpaceInU32 > coreCBState.bufferSizeInU32s)
+		{
+			GX2Command_Flush(reservedFreeSpaceInU32, true);
 		}
-		gx2WriteGatherCurrentMainCoreIndex = sGX2MainCoreIndex;
-		gx2WriteGatherInited = true;
 	}

 	void GX2WriteGather_beginDisplayList(PPCInterpreter_t* hCPU, MPTR buffer, uint32 maxSize)
 	{
 		uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU);
-		gx2WriteGatherPipe.displayListStart[coreIndex] = buffer;
-		gx2WriteGatherPipe.displayListMaxSize[coreIndex] = maxSize;
-		// set new write gather ptr
-		gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex] = memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]);
-		gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex];
+		if (coreIndex == sGX2MainCoreIndex)
+		{
+			GX2Command_PadCurrentBuffer();
+			cemu_assert_debug(!s_perCoreCBState[coreIndex].isDisplayList);
+			s_mainCoreLastCommandState = s_perCoreCBState[coreIndex];
+		}
+		GX2Command_SetupCoreCommandBuffer(MEMPTR<uint32be>(buffer), maxSize/4, true);
 	}

 	uint32 GX2WriteGather_getDisplayListWriteDistance(sint32 coreIndex)
 	{
-		return (uint32)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] - memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]));
-	}
-
-	uint32 GX2WriteGather_getFifoWriteDistance(uint32 coreIndex)
-	{
-		uint32 writeDistance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer);
-		return writeDistance;
+		auto& coreCBState = s_perCoreCBState[coreIndex];
+		cemu_assert_debug(coreCBState.isDisplayList);
+		if (coreCBState.currentWritePtr == nullptr)
+			return 0;
+		return (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) * 4;
 	}

 	uint32 GX2WriteGather_endDisplayList(PPCInterpreter_t* hCPU, MPTR buffer)
 	{
-		uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU);
-		if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
+		uint32 coreIndex = coreinit::OSGetCoreId();
+		auto& coreCBState = s_perCoreCBState[coreIndex];
+		GX2Command_PadCurrentBuffer();
+		uint32 finalWriteIndex = coreCBState.currentWritePtr - coreCBState.bufferPtr;
+		cemu_assert_debug(finalWriteIndex <= coreCBState.bufferSizeInU32s);
+		// if we are on the main GX2 core then restore the GPU command buffer
+		if (coreIndex == sGX2MainCoreIndex)
 		{
-			uint32 currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex);
-			// pad to 32 byte
-			if (gx2WriteGatherPipe.displayListMaxSize[coreIndex] >= ((gx2WriteGatherPipe.displayListMaxSize[coreIndex] + 0x1F) & ~0x1F))
-			{
-				while ((currentWriteSize & 0x1F) != 0)
-				{
-					gx2WriteGather_submitU32AsBE(pm4HeaderType2Filler());
-					currentWriteSize += 4;
-				}
-			}
-			// get size of written data
-			currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex);
-			// disable current display list and restore write gather ptr
-			gx2WriteGatherPipe.displayListStart[coreIndex] = MPTR_NULL;
-			if (sGX2MainCoreIndex == coreIndex)
-				gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex];
-			else
-				gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = NULL;
-			// return size of (written) display list
-			return currentWriteSize;
+  			coreCBState = s_mainCoreLastCommandState;
 		}
 		else
 		{
-			// no active display list
-			// return a size of 0
-			return 0;
+			coreCBState.bufferPtr = nullptr;
+			coreCBState.currentWritePtr = nullptr;
+			coreCBState.bufferSizeInU32s = 0;
+			coreCBState.isDisplayList = false;
 		}
+		return finalWriteIndex * 4;
 	}

-	bool GX2GetCurrentDisplayList(betype<MPTR>* displayListAddr, uint32be* displayListSize)
+	bool GX2GetCurrentDisplayList(MEMPTR<uint32be>* displayListAddr, uint32be* displayListSize)
 	{
 		uint32 coreIndex = coreinit::OSGetCoreId();
-		if (gx2WriteGatherPipe.displayListStart[coreIndex] == MPTR_NULL)
+		auto& coreCBState = s_perCoreCBState[coreIndex];
+		if (!coreCBState.isDisplayList)
 			return false;
-
 		if (displayListAddr)
-			*displayListAddr = gx2WriteGatherPipe.displayListStart[coreIndex];
+			*displayListAddr = coreCBState.bufferPtr;
 		if (displayListSize)
-			*displayListSize = gx2WriteGatherPipe.displayListMaxSize[coreIndex];
-
+			*displayListSize = coreCBState.bufferSizeInU32s * sizeof(uint32be);
 		return true;
 	}

+	// returns true if we are writing to a display list
 	bool GX2GetDisplayListWriteStatus()
 	{
-		// returns true if we are writing to a display list
 		uint32 coreIndex = coreinit::OSGetCoreId();
-		return gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL;
-	}
-
-	uint32 GX2WriteGather_getReadWriteDistance()
-	{
-		uint32 coreIndex = sGX2MainCoreIndex;
-		uint32 writeDistance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] + GX2_COMMAND_RING_BUFFER_SIZE - gxRingBufferReadPtr);
-		writeDistance %= GX2_COMMAND_RING_BUFFER_SIZE;
-		return writeDistance;
-	}
-
-	void GX2WriteGather_checkAndInsertWrapAroundMark()
-	{
-		uint32 coreIndex = coreinit::OSGetCoreId();
-		if (coreIndex != sGX2MainCoreIndex) // only if main gx2 core
-			return;
-		if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
-			return;
-		uint32 writeDistance = GX2WriteGather_getFifoWriteDistance(coreIndex);
-		if (writeDistance >= (GX2_COMMAND_RING_BUFFER_SIZE * 3 / 5))
-		{
-			gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_FIFO_WRAP_AROUND, 1));
-			gx2WriteGather_submitU32AsBE(0); // empty word since we can't send commands with zero data words
-			gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] = gx2WriteGatherPipe.gxRingBuffer;
-		}
+		return s_perCoreCBState[coreIndex].isDisplayList;
 	}

 	void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size)
@ -204,28 +423,23 @@ namespace GX2
 			memory_virtualToPhysical(addr),
 			0, // high address bits
 			size / 4);
-		GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
 	}

 	void GX2DirectCallDisplayList(void* addr, uint32 size)
 	{
 		// this API submits to TCL directly and bypasses write-gatherer
 		// its basically a way to manually submit a command buffer to the GPU
-		// as such it also affects the submission and retire timestamps
-
-		uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
-		cemu_assert_debug(coreIndex == sGX2MainCoreIndex);
-		coreIndex = sGX2MainCoreIndex; // always submit to main queue which is owned by GX2 main core (TCLSubmitToRing does not need this workaround)
-
-		uint32be* cmdStream = (uint32be*)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
-		cmdStream[0] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3);
-		cmdStream[1] = memory_virtualToPhysical(MEMPTR<void>(addr).GetMPTR());
-		cmdStream[2] = 0;
-		cmdStream[3] = size / 4;
-		gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] += 16;
-
-		// update submission timestamp and retired timestamp
-		_GX2SubmitToTCL();
+		uint32 coreIndex = coreinit::OSGetCoreId();
+		if (coreIndex != sGX2MainCoreIndex)
+		{
+			cemuLog_logDebugOnce(LogType::Force, "GX2DirectCallDisplayList() called on non-main GX2 core");
+		}
+		if (!s_perCoreCBState[coreIndex].isDisplayList)
+		{
+			// make sure any preceding commands are submitted first
+			GX2Command_Flush(0x100, false);
+		}
+		GX2Command_SubmitCommandBuffer(static_cast<uint32be*>(addr), size / 4, nullptr, false);
 	}

 	void GX2CopyDisplayList(MEMPTR<uint32be*> addr, uint32 size)
@ -288,6 +502,12 @@ namespace GX2

 	void GX2CommandInit()
 	{
+		cafeExportRegister("gx2", GX2Flush, LogType::GX2);
+
+		cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2);
+		cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2);
+		cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2);
+
 		cafeExportRegister("gx2", GX2BeginDisplayList, LogType::GX2);
 		cafeExportRegister("gx2", GX2BeginDisplayListEx, LogType::GX2);
 		cafeExportRegister("gx2", GX2EndDisplayList, LogType::GX2);
@ -295,7 +515,6 @@ namespace GX2
 		cafeExportRegister("gx2", GX2GetCurrentDisplayList, LogType::GX2);
 		cafeExportRegister("gx2", GX2GetDisplayListWriteStatus, LogType::GX2);

-
 		cafeExportRegister("gx2", GX2CallDisplayList, LogType::GX2);
 		cafeExportRegister("gx2", GX2DirectCallDisplayList, LogType::GX2);
 		cafeExportRegister("gx2", GX2CopyDisplayList, LogType::GX2);
@ -305,7 +524,10 @@ namespace GX2

    void GX2CommandResetToDefaultState()
    {
-        GX2WriteGather_ResetToDefaultState();
+		s_commandState->commandPoolBase = nullptr;
+		s_commandState->commandPoolSizeInU32s = 0;
+		s_commandState->gpuCommandReadPtr = nullptr;
+		s_cbBufferIsInternallyAllocated = false;
    }

 }
--- a/src/Cafe/OS/libs/gx2/GX2_Command.h
+++ b/src/Cafe/OS/libs/gx2/GX2_Command.h
@ -2,21 +2,19 @@
 #include "Cafe/HW/Latte/ISA/LatteReg.h"
 #include "Cafe/HW/Espresso/Const.h"

-struct GX2WriteGatherPipeState
+namespace GX2
 {
-	uint8* gxRingBuffer;
-	// each core has it's own write gatherer and display list state (writing)
-	uint8* writeGatherPtrGxBuffer[Espresso::CORE_COUNT];
-	uint8** writeGatherPtrWrite[Espresso::CORE_COUNT];
-	uint8* writeGatherPtrDisplayList[Espresso::CORE_COUNT];
-	MPTR displayListStart[Espresso::CORE_COUNT];
-	uint32 displayListMaxSize[Espresso::CORE_COUNT];
+	struct GX2PerCoreCBState
+	{
+		uint32be* bufferPtr;
+		uint32 bufferSizeInU32s;
+		uint32be* currentWritePtr;
+		bool isDisplayList;
+	};
+
+	extern GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT];
 };

-extern GX2WriteGatherPipeState gx2WriteGatherPipe;
-
-void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32); // move to GX2 namespace eventually
-
 void gx2WriteGather_submitU32AsBE(uint32 v);
 void gx2WriteGather_submitU32AsLE(uint32 v);
 void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues);
@ -27,7 +25,8 @@ uint32 PPCInterpreter_getCurrentCoreIndex();
 template <typename ...Targs>
 inline void gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr)
 {
-	(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = (uint8*)writePtr;
+	GX2::s_perCoreCBState[coreIndex].currentWritePtr = writePtr;
+	cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
 }

 template <typename T, typename ...Targs>
@ -75,17 +74,23 @@ template <typename ...Targs>
 inline void gx2WriteGather_submit(Targs... args)
 {
 	uint32 coreIndex = PPCInterpreter_getCurrentCoreIndex();
-	if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == nullptr)
+	if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
+	{
+		cemu_assert_suspicious(); // writing to command buffer without valid write pointer?
 		return;
-
-	uint32be* writePtr = (uint32be*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]);
+	}
+	uint32be* writePtr = GX2::s_perCoreCBState[coreIndex].currentWritePtr;
 	gx2WriteGather_submit_(coreIndex, writePtr, std::forward<Targs>(args)...);
 }

 namespace GX2
 {
-	uint32 GX2WriteGather_getReadWriteDistance();
-	void GX2WriteGather_checkAndInsertWrapAroundMark();
+	void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt = true);
+	void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32);
+
+	uint64 GX2GetLastSubmittedTimeStamp();
+	uint64 GX2GetRetiredTimeStamp();
+	bool GX2WaitTimeStamp(uint64 tsWait);

 	void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size);
 	void GX2BeginDisplayListEx(MEMPTR<void> displayListAddr, uint32 size, bool profiling);
@ -96,7 +101,8 @@ namespace GX2

 	bool GX2GetDisplayListWriteStatus();

-	void GX2Init_writeGather();
    void GX2CommandInit();
+	void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize);
+	void GX2Shutdown_commandBufferPool();
    void GX2CommandResetToDefaultState();
 }
--- a/src/Cafe/OS/libs/gx2/GX2_ContextState.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_ContextState.cpp
@ -168,7 +168,7 @@ uint32 _GX2Context_CalcStateSize()

 void _GX2Context_CreateLoadDL()
 {
-	GX2ReserveCmdSpace(3);
+	GX2::GX2ReserveCmdSpace(3);
 	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
 	gx2WriteGather_submitU32AsBE(0x80000077);
 	gx2WriteGather_submitU32AsBE(0x80000077);
@ -176,7 +176,7 @@ void _GX2Context_CreateLoadDL()

 void _GX2Context_WriteCmdDisableStateShadowing()
 {
-	GX2ReserveCmdSpace(3);
+	GX2::GX2ReserveCmdSpace(3);
 	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
 	gx2WriteGather_submitU32AsBE(0x80000000);
 	gx2WriteGather_submitU32AsBE(0x80000000);
@ -184,7 +184,7 @@ void _GX2Context_WriteCmdDisableStateShadowing()

 void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, uint32 waitForIdle, uint32 numRegOffsetEntries, GX2RegLoadPktEntry_t* regOffsetEntries)
 {
-	GX2ReserveCmdSpace(3 + numRegOffsetEntries*2);
+	GX2::GX2ReserveCmdSpace(3 + numRegOffsetEntries*2);
 	gx2WriteGather_submitU32AsBE(pm4Header);
 	gx2WriteGather_submitU32AsBE(physAddrRegArea);
 	gx2WriteGather_submitU32AsBE(waitForIdle);
@ -199,7 +199,6 @@ void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, u

 void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32 ukn)
 {
-	GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
 	MPTR physAddrContextState = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(gx2ContextState));
 	_GX2Context_CreateLoadDL();
 	__cmdStateLoad(NULL, IT_LOAD_CONFIG_REG, gx2ContextState->hwContext.areaConfigReg, 0x80000000, configReg_loadPktEntries);
@ -212,7 +211,7 @@ void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32

 void GX2SetDefaultState()
 {
-	GX2ReserveCmdSpace(0x100);
+	GX2::GX2ReserveCmdSpace(0x100);

 	Latte::LATTE_PA_CL_VTE_CNTL reg{};
 	reg.set_VPORT_X_OFFSET_ENA(true).set_VPORT_X_SCALE_ENA(true);
@ -376,7 +375,6 @@ void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU)
 	osLib_returnFromFunction(hCPU, 0);
 }

-
 void gx2Export_GX2GetContextStateDisplayList(PPCInterpreter_t* hCPU)
 {
 	cemuLog_log(LogType::GX2, "GX2GetContextStateDisplayList(0x{:08x}, 0x{:08x}, 0x{:08x})", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);
--- a/src/Cafe/OS/libs/gx2/GX2_Draw.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_Draw.cpp
@ -52,7 +52,6 @@ namespace GX2
 			0,
 			count,
 			0);
-		GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
 	}

 	void GX2DrawIndexedEx2(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances, uint32 baseInstance)
@ -85,7 +84,6 @@ namespace GX2
 			pm4HeaderType3(IT_SET_CTL_CONST, 2), 1,
 			0 // baseInstance
 		);
-		GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
 	}

 	void GX2DrawEx(GX2PrimitiveMode2 primitiveMode, uint32 count, uint32 baseVertex, uint32 numInstances)
@ -109,7 +107,6 @@ namespace GX2
 			count,
 			0 // DRAW_INITIATOR
 		);
-		GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
 	}

 	void GX2DrawIndexedImmediateEx(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances)
@ -177,7 +174,6 @@ namespace GX2
 			}
 		}

-		GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
 	}

 	struct GX2DispatchComputeParam
--- a/src/Cafe/OS/libs/gx2/GX2_Event.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_Event.cpp
@ -16,18 +16,6 @@ namespace GX2
 	SysAllocator<coreinit::OSThreadQueue> g_vsyncThreadQueue;
 	SysAllocator<coreinit::OSThreadQueue> g_flipThreadQueue;

-	SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
-	std::atomic<uint64> s_lastRetirementTimestamp = 0;
-
-	// called from GPU code when a command buffer is retired
-	void __GX2NotifyNewRetirementTimestamp(uint64 tsRetire)
-	{
-		__OSLockScheduler();
-		s_lastRetirementTimestamp = tsRetire;
-		coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
-		__OSUnlockScheduler();
-	}
-
 	void GX2SetGPUFence(uint32be* fencePtr, uint32 mask, uint32 compareOp, uint32 compareValue)
 	{
 		GX2ReserveCmdSpace(7);
@ -210,16 +198,6 @@ namespace GX2
 		osLib_returnFromFunction(hCPU, 0);
 	}

-	uint64 GX2GetLastSubmittedTimeStamp()
-	{
-		return LatteGPUState.lastSubmittedCommandBufferTimestamp.load();
-	}
-
-	uint64 GX2GetRetiredTimeStamp()
-	{
-		return s_lastRetirementTimestamp;
-	}
-
 	void GX2WaitForVsync()
 	{
 		__OSLockScheduler();
@ -236,19 +214,6 @@ namespace GX2
 		__OSUnlockScheduler();
 	}

-	bool GX2WaitTimeStamp(uint64 tsWait)
-	{
-		__OSLockScheduler();
-		while (tsWait > s_lastRetirementTimestamp)
-		{
-			// GPU hasn't caught up yet
-			coreinit::OSWaitEventInternal(s_updateRetirementEvent.GetPtr());
-		}
-		__OSUnlockScheduler();
-		// return true to indicate no timeout
-		return true;
-	}
-
 	void GX2DrawDone()
 	{
 		// optional force full sync (texture readback and occlusion queries)
@ -263,13 +228,10 @@ namespace GX2
 			gx2WriteGather_submitU32AsBE(0x00000000); // unused
 		}
 		// flush pipeline
-		if (_GX2GetUnflushedBytes(coreinit::OSGetCoreId()) > 0)
-			_GX2SubmitToTCL();
+		GX2Command_Flush(0x100, true);

 		uint64 ts = GX2GetLastSubmittedTimeStamp();
 		GX2WaitTimeStamp(ts);
-
-		GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
 	}

 	void GX2Init_event()
@ -294,25 +256,19 @@ namespace GX2
 		cafeExportRegister("gx2", GX2SetEventCallback, LogType::GX2);
 		cafeExportRegister("gx2", GX2GetEventCallback, LogType::GX2);

-		cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2);
-		cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2);
-
 		cafeExportRegister("gx2", GX2WaitForVsync, LogType::GX2);
 		cafeExportRegister("gx2", GX2WaitForFlip, LogType::GX2);
-		cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2);
 		cafeExportRegister("gx2", GX2DrawDone, LogType::GX2);

 		coreinit::OSInitThreadQueue(g_vsyncThreadQueue.GetPtr());
 		coreinit::OSInitThreadQueue(g_flipThreadQueue.GetPtr());

-		coreinit::OSInitEvent(s_updateRetirementEvent, coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
 		coreinit::OSInitSemaphore(s_eventCbQueueSemaphore, 0);
 	}

    void GX2EventResetToDefaultState()
    {
        s_callbackThreadLaunched = false;
-        s_lastRetirementTimestamp = 0;
        for(auto& it : s_eventCallback)
        {
            it.callbackFuncPtr = nullptr;
--- a/src/Cafe/OS/libs/gx2/GX2_Misc.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_Misc.cpp
@ -81,19 +81,68 @@ namespace GX2

 	void _test_AddrLib();

-	void GX2Init(void* initSettings)
+	using GX2InitArg = uint32; // one word of the GX2Init argument stream (GX2Init reads it through betype<GX2InitArg>)
+	enum class GX2InitArgId : GX2InitArg // ids recognized by the GX2Init() parser; every id handled there except EndOfArgs is followed by one value word
+	{
+		EndOfArgs = 0, // terminates the argument stream
+		CommandPoolBase = 1, // value word is wrapped in MEMPTR<void> and forwarded to GX2Init_commandBufferPool
+		CommandPoolSize = 2, // value word is forwarded to GX2Init_commandBufferPool as the pool size
+		UknArg7 = 7, // meaning unknown; GX2Init consumes and ignores its value word
+		UknArg8 = 8, // meaning unknown; GX2Init consumes and ignores its value word
+		UknArg9 = 9, // meaning unknown; GX2Init consumes and ignores its value word
+		UknArg11 = 11, // meaning unknown; GX2Init consumes and ignores its value word
+	};
+
+	void GX2Init(betype<GX2InitArg>* initArgStream)
 	{
 		if (LatteGPUState.gx2InitCalled)
 		{
 			cemuLog_logDebug(LogType::Force, "GX2Init() called while already initialized");
 			return;
 		}
+		// parse init params from the stream
+		MEMPTR<void> commandPoolBase = nullptr;
+		uint32 commandPoolSize = 0;
+		if (initArgStream)
+		{
+			while (true)
+			{
+				GX2InitArgId paramId = static_cast<GX2InitArgId>((GX2InitArg)*initArgStream);
+				initArgStream++;
+				if (paramId == GX2InitArgId::EndOfArgs)
+				{
+					break;
+				}
+				else if (paramId == GX2InitArgId::CommandPoolBase)
+				{
+					commandPoolBase = MEMPTR<void>(*initArgStream);
+					initArgStream++;
+				}
+				else if (paramId == GX2InitArgId::CommandPoolSize)
+				{
+					commandPoolSize = *initArgStream;
+					initArgStream++;
+				}
+				else if (paramId == GX2InitArgId::UknArg7 ||
+					paramId == GX2InitArgId::UknArg8 ||
+					paramId == GX2InitArgId::UknArg9 ||
+					paramId == GX2InitArgId::UknArg11)
+				{
+					initArgStream++;
+				}
+				else
+				{
+					cemuLog_log(LogType::Force, "GX2Init: Unsupported init arg {}", (uint32)paramId);
+				}
+			}
+		}
+		// init main core
 		uint32 coreIndex = coreinit::OSGetCoreId();
 		cemuLog_log(LogType::GX2, "GX2Init() on core {} by thread 0x{:08x}", coreIndex, MEMPTR<OSThread_t>(coreinit::OSGetCurrentThread()).GetMPTR());
 		sGX2MainCoreIndex = coreIndex;
 		// init submodules
 		GX2::GX2Init_event();
-		GX2::GX2Init_writeGather();
+		GX2::GX2Init_commandBufferPool(commandPoolBase, commandPoolSize);
 		// init shared area
 		if (LatteGPUState.sharedAreaAddr == MPTR_NULL)
 		{
@ -112,6 +161,21 @@ namespace GX2
 		_test_AddrLib();
 	}

+	// Decrements gx2InitCalled; once it reaches zero, runs GX2DrawDone() and then GX2Shutdown_commandBufferPool()
+	void GX2Shutdown()
+	{
+		if (LatteGPUState.gx2InitCalled == 0)
+		{
+			cemuLog_logDebug(LogType::Force, "GX2Shutdown() called while not initialized");
+			return;
+		}
+		if (--LatteGPUState.gx2InitCalled != 0)
+			return; // counter still non-zero, nothing to tear down yet
+		GX2DrawDone();
+		GX2Shutdown_commandBufferPool();
+		cemuLog_log(LogType::Force, "GX2 shutdown");
+	}
+
 	void _GX2DriverReset()
 	{
 		LatteGPUState.gx2InitCalled = 0;
@ -237,6 +301,7 @@ namespace GX2
 	void GX2MiscInit()
 	{
 		cafeExportRegister("gx2", GX2Init, LogType::GX2);
+		cafeExportRegister("gx2", GX2Shutdown, LogType::GX2);
 		cafeExportRegister("gx2", GX2GetMainCoreId, LogType::GX2);
 		cafeExportRegister("gx2", GX2ResetGPU, LogType::GX2);

--- a/src/Cafe/OS/libs/gx2/GX2_RenderTarget.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_RenderTarget.cpp
@ -135,7 +135,7 @@ void gx2Export_GX2InitDepthBufferRegs(PPCInterpreter_t* hCPU)
 void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU)
 {
 	cemuLog_log(LogType::GX2, "GX2SetColorBuffer(0x{:08x}, {})", hCPU->gpr[3], hCPU->gpr[4]);
-	GX2ReserveCmdSpace(20);
+	GX2::GX2ReserveCmdSpace(20);

 	GX2ColorBuffer* colorBufferBE = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);

@ -198,15 +198,13 @@ void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU)
 		mmCB_COLOR0_INFO - 0xA000 + hCPU->gpr[4],
 		colorBufferBE->reg_info);

-	GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
-
 	osLib_returnFromFunction(hCPU, 0);
 }

 void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU)
 {
 	cemuLog_log(LogType::GX2, "GX2SetDepthBuffer(0x{:08x})", hCPU->gpr[3]);
-	GX2ReserveCmdSpace(20);
+	GX2::GX2ReserveCmdSpace(20);

 	GX2DepthBuffer* depthBufferBE = (GX2DepthBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);

@ -264,8 +262,6 @@ void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU)
 	gx2WriteGather_submitU32AsBE(mmDB_DEPTH_VIEW - 0xA000);
 	gx2WriteGather_submitU32AsBE(db_view);

-	GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
-
 	osLib_returnFromFunction(hCPU, 0);
 }

@ -281,7 +277,7 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU)
 	uint32 scanTarget = hCPU->gpr[3];
 	if( scanTarget == GX2_SCAN_TARGET_TV )
 	{
-		GX2ReserveCmdSpace(10);
+		GX2::GX2ReserveCmdSpace(10);

 		uint32 physAddr = (MEMORY_TILINGAPERTURE_AREA_ADDR+0x200000);

--- a/src/Cafe/OS/libs/gx2/GX2_Shader.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_Shader.cpp
@ -303,7 +303,27 @@ namespace GX2

 	void GX2SetVertexShader(GX2VertexShader* vertexShader)
 	{
-		GX2ReserveCmdSpace(100);
+		uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize;
+		numOutputIds = std::min<uint32>(numOutputIds, 0xA);
+		uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize;
+
+		uint32 reserveSize = 31;
+		if (vertexShader->shaderMode == GX2_SHADER_MODE::GEOMETRY_SHADER)
+		{
+			reserveSize += 7;
+		}
+		else
+		{
+			reserveSize += 18;
+			reserveSize += numOutputIds;
+			if (vertexShader->usesStreamOut != 0)
+				reserveSize += 2+12;
+		}
+		if (vsSemanticTableSize > 0)
+		{
+			reserveSize += 5 + vsSemanticTableSize;
+		}
+		GX2ReserveCmdSpace(reserveSize);

 		MPTR shaderProgramAddr;
 		uint32 shaderProgramSize;
@ -361,8 +381,6 @@ namespace GX2

 			cemu_assert_debug(vertexShader->regs.SPI_VS_OUT_CONFIG.value().get_VS_PER_COMPONENT() == false); // not handled on the GPU side

-			uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize;
-			numOutputIds = std::min<uint32>(numOutputIds, 0xA);
 			gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
 			gx2WriteGather_submitU32AsBE(Latte::REGADDR::SPI_VS_OUT_ID_0-0xA000);
 			for(uint32 i=0; i<numOutputIds; i++)
@ -392,7 +410,6 @@ namespace GX2
 			}
 		}
 		// update semantic table
-		uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize;
 		if (vsSemanticTableSize > 0)
 		{
 			gx2WriteGather_submit(
--- a/src/Cafe/OS/libs/gx2/GX2_State.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_State.cpp
@ -213,7 +213,6 @@ namespace GX2

 	void GX2SetViewportReg(GX2ViewportReg* viewportReg)
 	{
-		GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
 		GX2ReserveCmdSpace(2 + 6);

 		gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 6),
--- a/src/Cafe/OS/libs/gx2/GX2_Surface_Copy.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_Surface_Copy.cpp
@ -264,7 +264,7 @@ void gx2Surface_GX2CopySurface(GX2Surface* srcSurface, uint32 srcMip, uint32 src
 	// send copy command to GPU
 	if( srcHwTileMode > 0 && srcHwTileMode < 16 && dstHwTileMode > 0 && dstHwTileMode < 16 || requestGPURAMCopy )
 	{
-		GX2ReserveCmdSpace(1+13*2);
+		GX2::GX2ReserveCmdSpace(1+13*2);

 		gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13*2),
 		// src
@ -540,7 +540,7 @@ void gx2Export_GX2ResolveAAColorBuffer(PPCInterpreter_t* hCPU)
 	uint32 dstDepth = std::max<uint32>(surfOutDst.depth, 1);

 	// send copy command to GPU
-	GX2ReserveCmdSpace(1 + 13 * 2);
+	GX2::GX2ReserveCmdSpace(1 + 13 * 2);
 	gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13 * 2),
 		// src
 		(uint32)srcSurface->imagePtr,
@ -619,7 +619,7 @@ void gx2Export_GX2ConvertDepthBufferToTextureSurface(PPCInterpreter_t* hCPU)
 	sint32 srcMip = 0;

 	uint32 numSlices = std::max<uint32>(_swapEndianU32(depthBuffer->viewNumSlices), 1);
-	GX2ReserveCmdSpace((1 + 13 * 2) * numSlices);
+	GX2::GX2ReserveCmdSpace((1 + 13 * 2) * numSlices);
 	for (uint32 subSliceIndex = 0; subSliceIndex < numSlices; subSliceIndex++)
 	{
 		// send copy command to GPU
--- a/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp
@ -11,9 +11,14 @@
 void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
 {
 	cemuLog_log(LogType::GX2, "GX2SetPixelShader(0x{:08x})", hCPU->gpr[3]);
-	GX2ReserveCmdSpace(100);
-
 	GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
+
+	uint32 numInputs = _swapEndianU32(pixelShader->regs[4]);
+	if( numInputs > 0x20 )
+		numInputs = 0x20;
+
+	GX2::GX2ReserveCmdSpace(26 + numInputs);
+
 	MPTR shaderProgramAddr;
 	uint32 shaderProgramSize;

@ -44,9 +49,6 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
 		_swapEndianU32(pixelShader->regs[2]),
 		_swapEndianU32(pixelShader->regs[3]));
 	// setup pixel shader extended inputs control
-	uint32 numInputs = _swapEndianU32(pixelShader->regs[4]);
-	if( numInputs > 0x20 )
-		numInputs = 0x20;
 	gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numInputs));
 	gx2WriteGather_submitU32AsBE(mmSPI_PS_INPUT_CNTL_0-0xA000);
 	for(uint32 i=0; i<numInputs; i++)
@ -79,9 +81,17 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
 void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
 {
 	cemuLog_log(LogType::GX2, "GX2SetGeometryShader(0x{:08x})", hCPU->gpr[3]);
-	GX2ReserveCmdSpace(100);

 	GX2GeometryShader_t* geometryShader = (GX2GeometryShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
+	uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]);
+	numOutputIds = std::min<uint32>(numOutputIds, 0xA);
+	uint32 reserveSize = 38; // 38 fixed parameters
+	if (numOutputIds != 0)
+		reserveSize += 2 + numOutputIds;
+	if( _swapEndianU32(geometryShader->useStreamout) != 0 )
+		reserveSize += 2 + 12;
+
+	GX2::GX2ReserveCmdSpace(reserveSize);

 	MPTR shaderProgramAddr;
 	uint32 shaderProgramSize;
@ -128,6 +138,7 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
 	
 	if( _swapEndianU32(geometryShader->useStreamout) != 0 )
 	{
+		// todo - IT_EVENT_WRITE packet here
 		// stride 0
 		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
 		gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_VTX_STRIDE_0-0xA000);
@ -180,8 +191,6 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
 	gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[3]));

 	// GS outputs
-	uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]);
-	numOutputIds = std::min<uint32>(numOutputIds, 0xA);
 	if( numOutputIds != 0 )
 	{
 		gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
@ -254,8 +263,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU)
 		shaderPtr = computeShader->rBuffer.GetVirtualAddr();
 		shaderSize = computeShader->rBuffer.GetSize();
 	}
-
-	GX2ReserveCmdSpace(0x11);
+	GX2::GX2ReserveCmdSpace(0x11);

 	gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 6),
 		mmSQ_PGM_START_ES-0xA000,
@ -272,7 +280,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU)

 void _GX2SubmitUniformBlock(uint32 registerBase, uint32 index, MPTR virtualAddress, uint32 size)
 {
-	GX2ReserveCmdSpace(9);
+	GX2::GX2ReserveCmdSpace(9);
 	gx2WriteGather_submit(pm4HeaderType3(IT_SET_RESOURCE, 8),
 		registerBase + index * 7,
 		memory_virtualToPhysical(virtualAddress),
@ -307,7 +315,7 @@ void gx2Export_GX2SetGeometryUniformBlock(PPCInterpreter_t* hCPU)

 void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU)
 {
-	GX2ReserveCmdSpace(9);
+	GX2::GX2ReserveCmdSpace(9);

 	GX2RBuffer* bufferPtr = (GX2RBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
 	uint32 index = hCPU->gpr[4];
@ -320,7 +328,7 @@ void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU)

 void gx2Export_GX2SetShaderModeEx(PPCInterpreter_t* hCPU)
 {
-	GX2ReserveCmdSpace(8+4);
+	GX2::GX2ReserveCmdSpace(8+4);
 	uint32 mode = hCPU->gpr[3];

 	uint32 sqConfig = hCPU->gpr[3] == 0 ? 4 : 0;
--- a/src/Common/precompiled.h
+++ b/src/Common/precompiled.h
@ -616,4 +616,36 @@ namespace stdx
 		scope_exit& operator=(scope_exit) = delete;
 		void release() { m_released = true;}
 	};
+
+	// Xcode 16 doesn't have std::atomic_ref support and we provide a minimalist reimplementation (load/store only) as fallback
+#ifdef __cpp_lib_atomic_ref
+	// deliberately no #include here: this code is inside a namespace, and the #else branch already requires <atomic> to be included earlier
+	template<typename T>
+	using atomic_ref = std::atomic_ref<T>;
+#else
+	template<typename T>
+	class atomic_ref
+	{
+		static_assert(std::is_trivially_copyable<T>::value, "atomic_ref requires trivially copyable types");
+	public:
+		using value_type = T;
+		// stores only the address of obj (no copy is made), so obj must outlive this wrapper
+		explicit atomic_ref(T& obj) noexcept : ptr_(std::addressof(obj)) {}
+		// reads the object via std::atomic<T>::load; the cast assumes std::atomic<T> shares T's layout, which the standard does not guarantee
+		T load(std::memory_order order = std::memory_order_seq_cst) const noexcept
+		{
+			auto aptr = reinterpret_cast<std::atomic<T>*>(ptr_);
+			return aptr->load(order);
+		}
+		// overwrites the object via std::atomic<T>::store; same layout assumption as load()
+		void store(T desired, std::memory_order order = std::memory_order_seq_cst) const noexcept
+		{
+			auto aptr = reinterpret_cast<std::atomic<T>*>(ptr_);
+			aptr->store(desired, order);
+		}
+
+	private:
+		T* ptr_;
+	};
+#endif
 }