GX2+TCL: Reimplement command buffer submission

- GX2 utilizes TCL(.rpl) API for command submission instead of directly writing to an internal GPU fifo
- Submission & retire timestamps are correctly implemented as incremental counters
- Command buffering behaviour matches console
- Fixes race conditions on aarch64
This commit is contained in:
Exzap 2025-05-14 18:59:50 +02:00
parent 96765e4ac6
commit 28ea70b6d8
21 changed files with 761 additions and 472 deletions

View file

@ -47,8 +47,6 @@ struct LatteGPUState_t
gx2GPUSharedArea_t* sharedArea; // quick reference to shared area
MPTR sharedAreaAddr;
// other
// todo: Currently we have the command buffer logic implemented as a FIFO ringbuffer. On real HW it's handled as a series of command buffers that are pushed individually.
std::atomic<uint64> lastSubmittedCommandBufferTimestamp;
uint32 gx2InitCalled; // incremented every time GX2Init() is called
// OpenGL control
uint32 glVendor; // GLVENDOR_*
@ -75,8 +73,6 @@ struct LatteGPUState_t
extern LatteGPUState_t LatteGPUState;
extern uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
// texture
#include "Cafe/HW/Latte/Core/LatteTexture.h"

View file

@ -13,6 +13,7 @@
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/libs/coreinit/coreinit_Time.h"
#include "Cafe/OS/libs/TCL/TCL.h" // TCL currently handles the GPU command ringbuffer
#include "Cafe/CafeSystem.h"
@ -28,11 +29,6 @@ typedef uint32be* LatteCMDPtr;
#define LatteReadCMD() ((uint32)*(cmd++))
#define LatteSkipCMD(_nWords) cmd += (_nWords)
uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
uint8* gx2CPParserDisplayListPtr;
uint8* gx2CPParserDisplayListStart; // used for debugging
uint8* gx2CPParserDisplayListEnd;
void LatteThread_HandleOSScreen();
void LatteThread_Exit();
@ -155,16 +151,12 @@ void LatteCP_signalEnterWait()
*/
uint32 LatteCP_readU32Deprc()
{
uint32 v;
uint8* gxRingBufferWritePtr;
sint32 readDistance;
// no display list active
while (true)
{
gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
if (readDistance != 0)
break;
uint32 cmdWord;
if ( TCL::TCLGPUReadRBWord(cmdWord) )
return cmdWord;
g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
performanceMonitor.gpuTime_idleTime.beginMeasuring();
@ -175,56 +167,8 @@ uint32 LatteCP_readU32Deprc()
}
LatteThread_HandleOSScreen(); // check if new frame was presented via OSScreen API
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
if (readDistance != 0)
break;
if (Latte_GetStopSignal())
LatteThread_Exit();
// still no command data available, do some other tasks
LatteTiming_HandleTimedVsync();
LatteAsyncCommands_checkAndExecute();
std::this_thread::yield();
performanceMonitor.gpuTime_idleTime.endMeasuring();
}
v = *(uint32*)gxRingBufferReadPtr;
gxRingBufferReadPtr += 4;
#ifdef CEMU_DEBUG_ASSERT
if (v == 0xcdcdcdcd)
assert_dbg();
#endif
v = _swapEndianU32(v);
return v;
}
void LatteCP_waitForNWords(uint32 numWords)
{
uint8* gxRingBufferWritePtr;
sint32 readDistance;
bool isFlushed = false;
sint32 waitDistance = numWords * sizeof(uint32be);
// no display list active
while (true)
{
gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
if (readDistance < 0)
return; // wrap around means there is at least one full command queued after this
if (readDistance >= waitDistance)
break;
g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
performanceMonitor.gpuTime_idleTime.beginMeasuring();
// no command data available, spin in a busy loop for a while then check again
for (sint32 busy = 0; busy < 80; busy++)
{
_mm_pause();
}
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
if (readDistance < 0)
return; // wrap around means there is at least one full command queued after this
if (readDistance >= waitDistance)
break;
if ( TCL::TCLGPUReadRBWord(cmdWord) )
return cmdWord;
if (Latte_GetStopSignal())
LatteThread_Exit();
@ -234,6 +178,7 @@ void LatteCP_waitForNWords(uint32 numWords)
std::this_thread::yield();
performanceMonitor.gpuTime_idleTime.endMeasuring();
}
UNREACHABLE;
}
template<uint32 readU32()>
@ -270,21 +215,23 @@ void LatteCP_itIndirectBufferDepr(LatteCMDPtr cmd, uint32 nWords)
cemu_assert_debug(nWords == 3);
uint32 physicalAddress = LatteReadCMD();
uint32 physicalAddressHigh = LatteReadCMD(); // unused
uint32 sizeInDWords = LatteReadCMD();
uint32 displayListSize = sizeInDWords * 4;
DrawPassContext drawPassCtx;
uint32 sizeInU32s = LatteReadCMD();
#ifdef LATTE_CP_LOGGING
if (GetAsyncKeyState('A'))
LatteCP_DebugPrintCmdBuffer(MEMPTR<uint32be>(physicalAddress), displayListSize);
#endif
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
if (sizeInU32s > 0)
{
DrawPassContext drawPassCtx;
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInU32s);
LatteCP_processCommandBuffer(drawPassCtx);
if (drawPassCtx.isWithinDrawPass())
drawPassCtx.endDrawPass();
LatteCP_processCommandBuffer(drawPassCtx);
if (drawPassCtx.isWithinDrawPass())
drawPassCtx.endDrawPass();
}
}
// pushes the command buffer to the stack
@ -294,11 +241,12 @@ void LatteCP_itIndirectBuffer(LatteCMDPtr cmd, uint32 nWords, DrawPassContext& d
uint32 physicalAddress = LatteReadCMD();
uint32 physicalAddressHigh = LatteReadCMD(); // unused
uint32 sizeInDWords = LatteReadCMD();
uint32 displayListSize = sizeInDWords * 4;
cemu_assert_debug(displayListSize >= 4);
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
if (sizeInDWords > 0)
{
uint32 displayListSize = sizeInDWords * 4;
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
}
}
LatteCMDPtr LatteCP_itStreamoutBufferUpdate(LatteCMDPtr cmd, uint32 nWords)
@ -565,26 +513,55 @@ LatteCMDPtr LatteCP_itMemWrite(LatteCMDPtr cmd, uint32 nWords)
if (word1 == 0x40000)
{
// write U32
*memPtr = word2;
stdx::atomic_ref<uint32be> atomicRef(*memPtr);
atomicRef.store(word2);
}
else if (word1 == 0x00000)
{
// write U64 (as two U32)
// note: The U32s are swapped
memPtr[0] = word2;
memPtr[1] = word3;
// write U64
// note: The U32s are swapped here, but this needs verification. Also, it seems like the two U32 halves are written independently and the U64 as a whole is not atomic -> investigate
stdx::atomic_ref<uint64be> atomicRef(*(uint64be*)memPtr);
atomicRef.store(((uint64le)word2 << 32) | word3);
}
else if (word1 == 0x20000)
{
// write U64 (little endian)
memPtr[0] = _swapEndianU32(word2);
memPtr[1] = _swapEndianU32(word3);
stdx::atomic_ref<uint64le> atomicRef(*(uint64le*)memPtr);
atomicRef.store(((uint64le)word3 << 32) | word2);
}
else
cemu_assert_unimplemented();
return cmd;
}
// Handler for the IT_EVENT_WRITE_EOP packet (expects five payload words).
// For the supported form (first word 0x504 with bit 0x40000000 set in the third word),
// the 64-bit value built from the last two words is stored atomically at the physical
// offset given by the second word. Any other form hits cemu_assert_unimplemented().
// TCL::TCLGPUNotifyNewRetirementTimestamp() is called in all cases.
// Returns the command pointer advanced past the payload words that were read.
LatteCMDPtr LatteCP_itEventWriteEOP(LatteCMDPtr cmd, uint32 nWords)
{
	cemu_assert_debug(nWords == 5);
	const uint32 eventWord = LatteReadCMD();
	const uint32 destPhysOffset = LatteReadCMD();
	const uint32 controlWord = LatteReadCMD();
	const uint32 valueLow = LatteReadCMD(); // value low bits
	const uint32 valueHigh = LatteReadCMD(); // value high bits
	cemu_assert_debug(controlWord == 0x40000000 || controlWord == 0x42000000);

	// todo - figure out the flags
	const bool isSupportedWrite = (eventWord == 0x504) && ((controlWord & 0x40000000) != 0);
	if (!isSupportedWrite)
	{
		cemu_assert_unimplemented();
	}
	else
	{
		uint64be* destPtr = (uint64be*)memory_getPointerFromPhysicalOffset(destPhysOffset);
		stdx::atomic_ref<uint64be> destRef(*destPtr);
		const uint64 combinedValue = ((uint64)valueHigh << 32) | valueLow;
		destRef.store(combinedValue);
	}

	if ((controlWord & 0x2000000) != 0)
	{
		// todo - timestamp interrupt
	}
	TCL::TCLGPUNotifyNewRetirementTimestamp();
	return cmd;
}
LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
{
@ -783,16 +760,6 @@ LatteCMDPtr LatteCP_itDrawImmediate(LatteCMDPtr cmd, uint32 nWords, DrawPassCont
drawPassCtx.executeDraw(count, false, _tempIndexArrayMPTR);
return cmd;
}
// Handler for the IT_HLE_FIFO_WRAP_AROUND packet (expects one payload word).
// Consumes the payload word, resets gxRingBufferReadPtr to gx2WriteGatherPipe.gxRingBuffer
// and returns that address as the new command pointer.
LatteCMDPtr LatteCP_itHLEFifoWrapAround(LatteCMDPtr cmd, uint32 nWords)
{
	cemu_assert_debug(nWords == 1);
	(void)LatteReadCMD(); // the payload value itself is unused
	gxRingBufferReadPtr = gx2WriteGatherPipe.gxRingBuffer;
	return (LatteCMDPtr)gxRingBufferReadPtr;
}
LatteCMDPtr LatteCP_itHLESampleTimer(LatteCMDPtr cmd, uint32 nWords)
@ -819,16 +786,6 @@ LatteCMDPtr LatteCP_itHLESpecialState(LatteCMDPtr cmd, uint32 nWords)
return cmd;
}
// Handler for the IT_HLE_SET_CB_RETIREMENT_TIMESTAMP packet (expects two payload words).
// The first word supplies the upper 32 bits and the second word the lower 32 bits of a
// 64-bit timestamp, which is passed to GX2::__GX2NotifyNewRetirementTimestamp().
// Returns the command pointer advanced past both words.
LatteCMDPtr LatteCP_itHLESetRetirementTimestamp(LatteCMDPtr cmd, uint32 nWords)
{
	cemu_assert_debug(nWords == 2);
	const uint64 upperHalf = (uint32)LatteReadCMD();
	const uint64 lowerHalf = (uint32)LatteReadCMD();
	GX2::__GX2NotifyNewRetirementTimestamp((upperHalf << 32) | lowerHalf);
	return cmd;
}
LatteCMDPtr LatteCP_itHLEBeginOcclusionQuery(LatteCMDPtr cmd, uint32 nWords)
{
cemu_assert_debug(nWords == 1);
@ -1145,9 +1102,10 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
LatteCMDPtr cmd, cmdStart, cmdEnd;
if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd))
break;
uint32 itHeader;
while (cmd < cmdEnd)
{
uint32 itHeader = LatteReadCMD();
itHeader = LatteReadCMD();
uint32 itHeaderType = (itHeader >> 30) & 3;
if (itHeaderType == 3)
{
@ -1361,11 +1319,6 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
LatteCP_itHLEEndOcclusionQuery(cmdData, nWords);
break;
}
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
{
LatteCP_itHLESetRetirementTimestamp(cmdData, nWords);
break;
}
case IT_HLE_BOTTOM_OF_PIPE_CB:
{
LatteCP_itHLEBottomOfPipeCB(cmdData, nWords);
@ -1421,6 +1374,7 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
void LatteCP_ProcessRingbuffer()
{
sint32 timerRecheck = 0; // estimates how much CP processing time has elapsed based on the executed commands, if the value exceeds CP_TIMER_RECHECK then _handleTimers() is called
uint32be tmpBuffer[128];
while (true)
{
uint32 itHeader = LatteCP_readU32Deprc();
@ -1429,10 +1383,13 @@ void LatteCP_ProcessRingbuffer()
{
uint32 itCode = (itHeader >> 8) & 0xFF;
uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1;
LatteCP_waitForNWords(nWords);
LatteCMDPtr cmd = (LatteCMDPtr)gxRingBufferReadPtr;
uint8* cmdEnd = gxRingBufferReadPtr + nWords * 4;
gxRingBufferReadPtr = cmdEnd;
cemu_assert(nWords < 128);
for (sint32 i=0; i<nWords; i++)
{
uint32 word = LatteCP_readU32Deprc();
tmpBuffer[i] = word;
}
LatteCMDPtr cmd = (LatteCMDPtr)tmpBuffer;
switch (itCode)
{
case IT_SURFACE_SYNC:
@ -1599,6 +1556,11 @@ void LatteCP_ProcessRingbuffer()
timerRecheck += CP_TIMER_RECHECK / 512;
break;
}
case IT_EVENT_WRITE_EOP:
{
LatteCP_itEventWriteEOP(cmd, nWords);
break;
}
case IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER:
{
LatteCP_itHLECopyColorBufferToScanBuffer(cmd, nWords);
@ -1637,12 +1599,6 @@ void LatteCP_ProcessRingbuffer()
timerRecheck += CP_TIMER_RECHECK / 128;
break;
}
case IT_HLE_FIFO_WRAP_AROUND:
{
LatteCP_itHLEFifoWrapAround(cmd, nWords);
timerRecheck += CP_TIMER_RECHECK / 512;
break;
}
case IT_HLE_SAMPLE_TIMER:
{
LatteCP_itHLESampleTimer(cmd, nWords);
@ -1667,12 +1623,6 @@ void LatteCP_ProcessRingbuffer()
timerRecheck += CP_TIMER_RECHECK / 512;
break;
}
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
{
LatteCP_itHLESetRetirementTimestamp(cmd, nWords);
timerRecheck += CP_TIMER_RECHECK / 512;
break;
}
case IT_HLE_BOTTOM_OF_PIPE_CB:
{
LatteCP_itHLEBottomOfPipeCB(cmd, nWords);
@ -1933,11 +1883,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
cemuLog_log(LogType::Force, "{} IT_HLE_COPY_SURFACE_NEW", strPrefix);
break;
}
case IT_HLE_FIFO_WRAP_AROUND:
{
cemuLog_log(LogType::Force, "{} IT_HLE_FIFO_WRAP_AROUND", strPrefix);
break;
}
case IT_HLE_SAMPLE_TIMER:
{
cemuLog_log(LogType::Force, "{} IT_HLE_SAMPLE_TIMER", strPrefix);
@ -1958,11 +1903,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
cemuLog_log(LogType::Force, "{} IT_HLE_END_OCCLUSION_QUERY", strPrefix);
break;
}
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
{
cemuLog_log(LogType::Force, "{} IT_HLE_SET_CB_RETIREMENT_TIMESTAMP", strPrefix);
break;
}
case IT_HLE_BOTTOM_OF_PIPE_CB:
{
cemuLog_log(LogType::Force, "{} IT_HLE_BOTTOM_OF_PIPE_CB", strPrefix);

View file

@ -14,6 +14,7 @@
#define IT_MEM_WRITE 0x3D
#define IT_SURFACE_SYNC 0x43
#define IT_EVENT_WRITE 0x46
#define IT_EVENT_WRITE_EOP 0x47 // end of pipe
#define IT_LOAD_CONFIG_REG 0x60
#define IT_LOAD_CONTEXT_REG 0x61
@ -47,14 +48,12 @@
#define IT_HLE_WAIT_FOR_FLIP 0xF1
#define IT_HLE_BOTTOM_OF_PIPE_CB 0xF2
#define IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER 0xF3
#define IT_HLE_FIFO_WRAP_AROUND 0xF4
#define IT_HLE_CLEAR_COLOR_DEPTH_STENCIL 0xF5
#define IT_HLE_SAMPLE_TIMER 0xF7
#define IT_HLE_TRIGGER_SCANBUFFER_SWAP 0xF8
#define IT_HLE_SPECIAL_STATE 0xF9
#define IT_HLE_BEGIN_OCCLUSION_QUERY 0xFA
#define IT_HLE_END_OCCLUSION_QUERY 0xFB
#define IT_HLE_SET_CB_RETIREMENT_TIMESTAMP 0xFD
#define pm4HeaderType3(__itCode, __dataDWordCount) (0xC0000000|((uint32)(__itCode)<<8)|((uint32)((__dataDWordCount)-1)<<16))
#define pm4HeaderType2Filler() (0x80000000)

View file

@ -207,7 +207,6 @@ int Latte_ThreadEntry()
if (Latte_GetStopSignal())
LatteThread_Exit();
}
gxRingBufferReadPtr = gx2WriteGatherPipe.gxRingBuffer;
LatteCP_ProcessRingbuffer();
cemu_assert_debug(false); // should never reach
return 0;