GX2+TCL: Reimplement command buffer submission

- GX2 utilizes TCL(.rpl) API for command submission instead of directly writing to an internal GPU fifo
- Submission & retire timestamps are correctly implemented as incremental counters
- Command buffering behaviour matches console
- Fixes race conditions on aarch64
This commit is contained in:
Exzap 2025-05-14 18:59:50 +02:00
parent 96765e4ac6
commit 28ea70b6d8
21 changed files with 761 additions and 472 deletions

View file

@ -47,8 +47,6 @@ struct LatteGPUState_t
gx2GPUSharedArea_t* sharedArea; // quick reference to shared area
MPTR sharedAreaAddr;
// other
// todo: Currently we have the command buffer logic implemented as a FIFO ringbuffer. On real HW it's handled as a series of command buffers that are pushed individually.
std::atomic<uint64> lastSubmittedCommandBufferTimestamp;
uint32 gx2InitCalled; // incremented every time GX2Init() is called
// OpenGL control
uint32 glVendor; // GLVENDOR_*
@ -75,8 +73,6 @@ struct LatteGPUState_t
extern LatteGPUState_t LatteGPUState;
extern uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
// texture
#include "Cafe/HW/Latte/Core/LatteTexture.h"

View file

@ -13,6 +13,7 @@
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/libs/coreinit/coreinit_Time.h"
#include "Cafe/OS/libs/TCL/TCL.h" // TCL currently handles the GPU command ringbuffer
#include "Cafe/CafeSystem.h"
@ -28,11 +29,6 @@ typedef uint32be* LatteCMDPtr;
#define LatteReadCMD() ((uint32)*(cmd++))
#define LatteSkipCMD(_nWords) cmd += (_nWords)
uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
uint8* gx2CPParserDisplayListPtr;
uint8* gx2CPParserDisplayListStart; // used for debugging
uint8* gx2CPParserDisplayListEnd;
void LatteThread_HandleOSScreen();
void LatteThread_Exit();
@ -155,16 +151,12 @@ void LatteCP_signalEnterWait()
*/
uint32 LatteCP_readU32Deprc()
{
uint32 v;
uint8* gxRingBufferWritePtr;
sint32 readDistance;
// no display list active
while (true)
{
gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
if (readDistance != 0)
break;
uint32 cmdWord;
if ( TCL::TCLGPUReadRBWord(cmdWord) )
return cmdWord;
g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
performanceMonitor.gpuTime_idleTime.beginMeasuring();
@ -175,56 +167,8 @@ uint32 LatteCP_readU32Deprc()
}
LatteThread_HandleOSScreen(); // check if new frame was presented via OSScreen API
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
if (readDistance != 0)
break;
if (Latte_GetStopSignal())
LatteThread_Exit();
// still no command data available, do some other tasks
LatteTiming_HandleTimedVsync();
LatteAsyncCommands_checkAndExecute();
std::this_thread::yield();
performanceMonitor.gpuTime_idleTime.endMeasuring();
}
v = *(uint32*)gxRingBufferReadPtr;
gxRingBufferReadPtr += 4;
#ifdef CEMU_DEBUG_ASSERT
if (v == 0xcdcdcdcd)
assert_dbg();
#endif
v = _swapEndianU32(v);
return v;
}
void LatteCP_waitForNWords(uint32 numWords)
{
uint8* gxRingBufferWritePtr;
sint32 readDistance;
bool isFlushed = false;
sint32 waitDistance = numWords * sizeof(uint32be);
// no display list active
while (true)
{
gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
if (readDistance < 0)
return; // wrap around means there is at least one full command queued after this
if (readDistance >= waitDistance)
break;
g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
performanceMonitor.gpuTime_idleTime.beginMeasuring();
// no command data available, spin in a busy loop for a while then check again
for (sint32 busy = 0; busy < 80; busy++)
{
_mm_pause();
}
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
if (readDistance < 0)
return; // wrap around means there is at least one full command queued after this
if (readDistance >= waitDistance)
break;
if ( TCL::TCLGPUReadRBWord(cmdWord) )
return cmdWord;
if (Latte_GetStopSignal())
LatteThread_Exit();
@ -234,6 +178,7 @@ void LatteCP_waitForNWords(uint32 numWords)
std::this_thread::yield();
performanceMonitor.gpuTime_idleTime.endMeasuring();
}
UNREACHABLE;
}
template<uint32 readU32()>
@ -270,21 +215,23 @@ void LatteCP_itIndirectBufferDepr(LatteCMDPtr cmd, uint32 nWords)
cemu_assert_debug(nWords == 3);
uint32 physicalAddress = LatteReadCMD();
uint32 physicalAddressHigh = LatteReadCMD(); // unused
uint32 sizeInDWords = LatteReadCMD();
uint32 displayListSize = sizeInDWords * 4;
DrawPassContext drawPassCtx;
uint32 sizeInU32s = LatteReadCMD();
#ifdef LATTE_CP_LOGGING
if (GetAsyncKeyState('A'))
LatteCP_DebugPrintCmdBuffer(MEMPTR<uint32be>(physicalAddress), displayListSize);
#endif
if (sizeInU32s > 0)
{
DrawPassContext drawPassCtx;
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInU32s);
LatteCP_processCommandBuffer(drawPassCtx);
if (drawPassCtx.isWithinDrawPass())
drawPassCtx.endDrawPass();
}
}
// pushes the command buffer to the stack
@ -294,11 +241,12 @@ void LatteCP_itIndirectBuffer(LatteCMDPtr cmd, uint32 nWords, DrawPassContext& d
uint32 physicalAddress = LatteReadCMD();
uint32 physicalAddressHigh = LatteReadCMD(); // unused
uint32 sizeInDWords = LatteReadCMD();
if (sizeInDWords > 0)
{
uint32 displayListSize = sizeInDWords * 4;
cemu_assert_debug(displayListSize >= 4);
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
}
}
LatteCMDPtr LatteCP_itStreamoutBufferUpdate(LatteCMDPtr cmd, uint32 nWords)
@ -565,26 +513,55 @@ LatteCMDPtr LatteCP_itMemWrite(LatteCMDPtr cmd, uint32 nWords)
if (word1 == 0x40000)
{
// write U32
*memPtr = word2;
stdx::atomic_ref<uint32be> atomicRef(*memPtr);
atomicRef.store(word2);
}
else if (word1 == 0x00000)
{
// write U64 (as two U32)
// note: The U32s are swapped
memPtr[0] = word2;
memPtr[1] = word3;
// write U64
// note: The U32s are swapped here, but this needs verification. Also, it seems like the two U32 halves are written independently and the U64 as a whole is not atomic -> investigate
stdx::atomic_ref<uint64be> atomicRef(*(uint64be*)memPtr);
atomicRef.store(((uint64le)word2 << 32) | word3);
}
else if (word1 == 0x20000)
{
// write U64 (little endian)
memPtr[0] = _swapEndianU32(word2);
memPtr[1] = _swapEndianU32(word3);
stdx::atomic_ref<uint64le> atomicRef(*(uint64le*)memPtr);
atomicRef.store(((uint64le)word3 << 32) | word2);
}
else
cemu_assert_unimplemented();
return cmd;
}
// Handles an IT_EVENT_WRITE_EOP packet.
// Reads five payload words (event word, physical address, control word, value low bits, value high bits).
// For the supported combination the 64-bit value is stored to the given physical address, afterwards TCL is
// notified about a new retirement timestamp. Returns the advanced command pointer.
LatteCMDPtr LatteCP_itEventWriteEOP(LatteCMDPtr cmd, uint32 nWords)
{
	cemu_assert_debug(nWords == 5);
	const uint32 eventWord = LatteReadCMD();
	const uint32 physAddr = LatteReadCMD();
	const uint32 controlWord = LatteReadCMD();
	const uint32 valueLow = LatteReadCMD(); // value low bits
	const uint32 valueHigh = LatteReadCMD(); // value high bits
	cemu_assert_debug(controlWord == 0x40000000 || controlWord == 0x42000000);
	const bool isSupportedWrite = (eventWord == 0x504) && ((controlWord & 0x40000000) != 0); // todo - figure out the flags
	if (!isSupportedWrite)
	{
		cemu_assert_unimplemented();
	}
	else
	{
		// store the combined 64-bit value through an atomic reference
		stdx::atomic_ref<uint64be> target(*(uint64be*)memory_getPointerFromPhysicalOffset(physAddr));
		const uint64 value = ((uint64)valueHigh << 32) | valueLow;
		target.store(value);
	}
	if ((controlWord & 0x2000000) != 0)
	{
		// todo - timestamp interrupt
	}
	TCL::TCLGPUNotifyNewRetirementTimestamp();
	return cmd;
}
LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
{
@ -783,16 +760,6 @@ LatteCMDPtr LatteCP_itDrawImmediate(LatteCMDPtr cmd, uint32 nWords, DrawPassCont
drawPassCtx.executeDraw(count, false, _tempIndexArrayMPTR);
return cmd;
}
// Handles the HLE fifo wrap-around packet: resets the ring buffer read pointer to the start of the
// GX2 ring buffer and continues command parsing from there.
LatteCMDPtr LatteCP_itHLEFifoWrapAround(LatteCMDPtr cmd, uint32 nWords)
{
	cemu_assert_debug(nWords == 1);
	(void)LatteReadCMD(); // the single payload word is read but its value is ignored
	uint8* const ringStart = gx2WriteGatherPipe.gxRingBuffer;
	gxRingBufferReadPtr = ringStart;
	return (LatteCMDPtr)ringStart;
}
LatteCMDPtr LatteCP_itHLESampleTimer(LatteCMDPtr cmd, uint32 nWords)
@ -819,16 +786,6 @@ LatteCMDPtr LatteCP_itHLESpecialState(LatteCMDPtr cmd, uint32 nWords)
return cmd;
}
// Handles the HLE retirement timestamp packet. The payload holds a 64-bit timestamp as two words
// (high half first, then low half) which is forwarded to GX2.
LatteCMDPtr LatteCP_itHLESetRetirementTimestamp(LatteCMDPtr cmd, uint32 nWords)
{
	cemu_assert_debug(nWords == 2);
	uint64 timestamp = (uint64)LatteReadCMD() << 32ULL; // first word: upper 32 bits
	timestamp |= (uint64)LatteReadCMD(); // second word: lower 32 bits
	GX2::__GX2NotifyNewRetirementTimestamp(timestamp);
	return cmd;
}
LatteCMDPtr LatteCP_itHLEBeginOcclusionQuery(LatteCMDPtr cmd, uint32 nWords)
{
cemu_assert_debug(nWords == 1);
@ -1145,9 +1102,10 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
LatteCMDPtr cmd, cmdStart, cmdEnd;
if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd))
break;
uint32 itHeader;
while (cmd < cmdEnd)
{
uint32 itHeader = LatteReadCMD();
itHeader = LatteReadCMD();
uint32 itHeaderType = (itHeader >> 30) & 3;
if (itHeaderType == 3)
{
@ -1361,11 +1319,6 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
LatteCP_itHLEEndOcclusionQuery(cmdData, nWords);
break;
}
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
{
LatteCP_itHLESetRetirementTimestamp(cmdData, nWords);
break;
}
case IT_HLE_BOTTOM_OF_PIPE_CB:
{
LatteCP_itHLEBottomOfPipeCB(cmdData, nWords);
@ -1421,6 +1374,7 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
void LatteCP_ProcessRingbuffer()
{
sint32 timerRecheck = 0; // estimates how much CP processing time has elapsed based on the executed commands, if the value exceeds CP_TIMER_RECHECK then _handleTimers() is called
uint32be tmpBuffer[128];
while (true)
{
uint32 itHeader = LatteCP_readU32Deprc();
@ -1429,10 +1383,13 @@ void LatteCP_ProcessRingbuffer()
{
uint32 itCode = (itHeader >> 8) & 0xFF;
uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1;
LatteCP_waitForNWords(nWords);
LatteCMDPtr cmd = (LatteCMDPtr)gxRingBufferReadPtr;
uint8* cmdEnd = gxRingBufferReadPtr + nWords * 4;
gxRingBufferReadPtr = cmdEnd;
cemu_assert(nWords < 128);
for (sint32 i=0; i<nWords; i++)
{
uint32 word = LatteCP_readU32Deprc();
tmpBuffer[i] = word;
}
LatteCMDPtr cmd = (LatteCMDPtr)tmpBuffer;
switch (itCode)
{
case IT_SURFACE_SYNC:
@ -1599,6 +1556,11 @@ void LatteCP_ProcessRingbuffer()
timerRecheck += CP_TIMER_RECHECK / 512;
break;
}
case IT_EVENT_WRITE_EOP:
{
LatteCP_itEventWriteEOP(cmd, nWords);
break;
}
case IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER:
{
LatteCP_itHLECopyColorBufferToScanBuffer(cmd, nWords);
@ -1637,12 +1599,6 @@ void LatteCP_ProcessRingbuffer()
timerRecheck += CP_TIMER_RECHECK / 128;
break;
}
case IT_HLE_FIFO_WRAP_AROUND:
{
LatteCP_itHLEFifoWrapAround(cmd, nWords);
timerRecheck += CP_TIMER_RECHECK / 512;
break;
}
case IT_HLE_SAMPLE_TIMER:
{
LatteCP_itHLESampleTimer(cmd, nWords);
@ -1667,12 +1623,6 @@ void LatteCP_ProcessRingbuffer()
timerRecheck += CP_TIMER_RECHECK / 512;
break;
}
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
{
LatteCP_itHLESetRetirementTimestamp(cmd, nWords);
timerRecheck += CP_TIMER_RECHECK / 512;
break;
}
case IT_HLE_BOTTOM_OF_PIPE_CB:
{
LatteCP_itHLEBottomOfPipeCB(cmd, nWords);
@ -1933,11 +1883,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
cemuLog_log(LogType::Force, "{} IT_HLE_COPY_SURFACE_NEW", strPrefix);
break;
}
case IT_HLE_FIFO_WRAP_AROUND:
{
cemuLog_log(LogType::Force, "{} IT_HLE_FIFO_WRAP_AROUND", strPrefix);
break;
}
case IT_HLE_SAMPLE_TIMER:
{
cemuLog_log(LogType::Force, "{} IT_HLE_SAMPLE_TIMER", strPrefix);
@ -1958,11 +1903,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
cemuLog_log(LogType::Force, "{} IT_HLE_END_OCCLUSION_QUERY", strPrefix);
break;
}
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
{
cemuLog_log(LogType::Force, "{} IT_HLE_SET_CB_RETIREMENT_TIMESTAMP", strPrefix);
break;
}
case IT_HLE_BOTTOM_OF_PIPE_CB:
{
cemuLog_log(LogType::Force, "{} IT_HLE_BOTTOM_OF_PIPE_CB", strPrefix);

View file

@ -14,6 +14,7 @@
#define IT_MEM_WRITE 0x3D
#define IT_SURFACE_SYNC 0x43
#define IT_EVENT_WRITE 0x46
#define IT_EVENT_WRITE_EOP 0x47 // end of pipe
#define IT_LOAD_CONFIG_REG 0x60
#define IT_LOAD_CONTEXT_REG 0x61
@ -47,14 +48,12 @@
#define IT_HLE_WAIT_FOR_FLIP 0xF1
#define IT_HLE_BOTTOM_OF_PIPE_CB 0xF2
#define IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER 0xF3
#define IT_HLE_FIFO_WRAP_AROUND 0xF4
#define IT_HLE_CLEAR_COLOR_DEPTH_STENCIL 0xF5
#define IT_HLE_SAMPLE_TIMER 0xF7
#define IT_HLE_TRIGGER_SCANBUFFER_SWAP 0xF8
#define IT_HLE_SPECIAL_STATE 0xF9
#define IT_HLE_BEGIN_OCCLUSION_QUERY 0xFA
#define IT_HLE_END_OCCLUSION_QUERY 0xFB
#define IT_HLE_SET_CB_RETIREMENT_TIMESTAMP 0xFD
#define pm4HeaderType3(__itCode, __dataDWordCount) (0xC0000000|((uint32)(__itCode)<<8)|((uint32)((__dataDWordCount)-1)<<16))
#define pm4HeaderType2Filler() (0x80000000)

View file

@ -207,7 +207,6 @@ int Latte_ThreadEntry()
if (Latte_GetStopSignal())
LatteThread_Exit();
}
gxRingBufferReadPtr = gx2WriteGatherPipe.gxRingBuffer;
LatteCP_ProcessRingbuffer();
cemu_assert_debug(false); // should never reach
return 0;

View file

@ -1,28 +1,161 @@
#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/OS/libs/TCL/TCL.h"
#include "HW/Latte/Core/LattePM4.h"
namespace TCL
{
SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
uint64 s_currentRetireMarker = 0;
enum class TCL_SUBMISSION_FLAG : uint32
struct TCLStatePPC // mapped into PPC space
{
SURFACE_SYNC = 0x400000, // submit surface sync packet before cmd
TRIGGER_INTERRUPT = 0x200000, // probably
UKN_20000000 = 0x20000000,
uint64be gpuRetireMarker; // written by GPU
};
int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, uint32be* controlFlags, uint64* submissionTimestamp)
SysAllocator<TCLStatePPC> s_tclStatePPC;
// called from GPU for timestamp EOP event
// Wakes every thread waiting on s_updateRetirementEvent (see TCLWaitTimestamp) so it can re-check the retire marker.
void TCLGPUNotifyNewRetirementTimestamp()
{
// gpuRetireMarker is updated via event eop command
// this function only signals the event, it does not write the marker itself
// the signal is issued while the scheduler lock is held
__OSLockScheduler();
coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
__OSUnlockScheduler();
}
cemu_assert_debug(false);
// Writes the timestamp selected by `id` to *timestampOut.
// Only TIMESTAMP_LAST_BUFFER_RETIRED is supported; any other id is logged and yields a timestamp of 0.
// Always returns 0.
int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut)
{
	if (id != TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
	{
		cemuLog_log(LogType::Force, "TCLTimestamp(): Unsupported timestamp ID {}", (uint32)id);
		*timestampOut = 0;
		return 0;
	}
	// this is the timestamp of the last buffer that was retired by the GPU
	stdx::atomic_ref<uint64be> retireTimestamp(s_tclStatePPC->gpuRetireMarker);
	*timestampOut = retireTimestamp.load();
	return 0;
}
// Blocks the calling thread until the timestamp selected by `id` has reached `waitTs`.
// Only TIMESTAMP_LAST_BUFFER_RETIRED is supported; other ids are logged and the call returns immediately.
// NOTE(review): `timeout` is not used by this implementation, the wait has no upper bound.
// Always returns 0.
int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout)
{
	if (id != TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
	{
		cemuLog_log(LogType::Force, "TCLWaitTimestamp(): Unsupported timestamp ID {}", (uint32)id);
		return 0;
	}
	stdx::atomic_ref<uint64be> retireTimestamp(s_tclStatePPC->gpuRetireMarker);
	for (;;)
	{
		const uint64 currentTimestamp = retireTimestamp.load();
		if (currentTimestamp >= waitTs)
			return 0;
		// sleep until the retirement event gets signaled, then check the marker again
		coreinit::OSWaitEvent(s_updateRetirementEvent.GetPtr());
	}
}
// Ring buffer used to pass command words from the submitting side (TCLWriteCmd) to the GPU side (TCLGPUReadRBWord)
static constexpr uint32 TCL_RING_BUFFER_SIZE = 4096; // in U32s
static_assert((TCL_RING_BUFFER_SIZE & (TCL_RING_BUFFER_SIZE - 1)) == 0, "index wrapping uses a bit mask, so the size must be a power of two");
std::atomic<uint32> tclRingBufferA[TCL_RING_BUFFER_SIZE];
std::atomic<uint32> tclRingBufferA_readIndex{0}; // advanced by TCLGPUReadRBWord
std::atomic<uint32> tclRingBufferA_writeIndex{0}; // advanced by TCLWriteCmd and compared against by TCLGPUReadRBWord, therefore it has to be atomic as well
// GPU code calls this to grab the next command word
// Returns false without touching cmdWord if read and write index are equal (no word available),
// otherwise stores the next word in cmdWord, advances the read index by one (with wrap-around) and returns true
bool TCLGPUReadRBWord(uint32& cmdWord)
{
	const uint32 readIndex = tclRingBufferA_readIndex;
	if (readIndex == tclRingBufferA_writeIndex)
		return false;
	cmdWord = tclRingBufferA[readIndex];
	tclRingBufferA_readIndex = (readIndex + 1) % TCL_RING_BUFFER_SIZE;
	return true;
}
void TCLWaitForRBSpace(uint32be numU32s)
{
while ( true )
{
uint32 distance = (tclRingBufferA_readIndex + TCL_RING_BUFFER_SIZE - tclRingBufferA_writeIndex) & (TCL_RING_BUFFER_SIZE - 1);
if (tclRingBufferA_writeIndex == tclRingBufferA_readIndex) // buffer completely empty
distance = TCL_RING_BUFFER_SIZE;
if (distance >= numU32s+1) // assume distance minus one, because we are never allowed to completely wrap around
break;
_mm_pause();
}
}
// Copies cmdLen command words from cmd into the ring buffer and advances the write index word by word.
// this function assumes that TCLWaitForRBSpace was called and that there is enough space
void TCLWriteCmd(uint32be* cmd, uint32 cmdLen)
{
	// The next index is computed on a local copy so that the shared write index only ever holds
	// values below TCL_RING_BUFFER_SIZE. Incrementing and masking the shared variable in two steps
	// would briefly expose an out-of-range index to TCLGPUReadRBWord.
	uint32 writeIndex = tclRingBufferA_writeIndex;
	for (uint32 i = 0; i < cmdLen; i++)
	{
		tclRingBufferA[writeIndex] = cmd[i];
		writeIndex = (writeIndex + 1) & (TCL_RING_BUFFER_SIZE - 1);
		tclRingBufferA_writeIndex = writeIndex; // make the word visible to the reader only after it has been stored
	}
}
#define EVENT_TYPE_TS 5
void TCLSubmitRetireMarker(bool triggerEventInterrupt)
{
s_currentRetireMarker++;
uint32be cmd[6];
cmd[0] = pm4HeaderType3(IT_EVENT_WRITE_EOP, 5);
cmd[1] = (4 | (EVENT_TYPE_TS << 8)); // event type (bits 8-15) and event index (bits 0-7).
cmd[2] = MEMPTR<void>(&s_tclStatePPC->gpuRetireMarker).GetMPTR(); // address lower 32bits + data sel bits
cmd[3] = 0x40000000; // select 64bit write, lower 16 bits are the upper bits of the address
if (triggerEventInterrupt)
cmd[3] |= 0x2000000; // trigger interrupt after value has been written
cmd[4] = (uint32)s_currentRetireMarker; // data lower 32 bits
cmd[5] = (uint32)(s_currentRetireMarker>>32); // data higher 32 bits
TCLWriteCmd(cmd, 6);
}
// Submits cmdLen command words to the GPU ring buffer, followed by a retire marker packet.
// cmd / cmdLen       command words and their count
// controlFlags       submission flags, only the USE_RETIRED_MARKER path is implemented
// timestampValueOut  optional, receives the retire marker value assigned to this submission. May be null
// Always returns 0.
int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype<TCLSubmissionFlag>* controlFlags, uint64be* timestampValueOut)
{
	TCLSubmissionFlag flags = *controlFlags;
	// make sure there is enough space to submit all commands at once
	uint32 totalCommandLength = cmdLen;
	totalCommandLength += 6; // space needed for TCLSubmitRetireMarker
	TCLWaitForRBSpace(totalCommandLength);
	// submit command buffer
	TCLWriteCmd(cmd, cmdLen);
	// create new marker timestamp and tell GPU to write it to our variable after its done processing the command
	if ((HAS_FLAG(flags, TCLSubmissionFlag::USE_RETIRED_MARKER)))
	{
		TCLSubmitRetireMarker(!HAS_FLAG(flags, TCLSubmissionFlag::NO_MARKER_INTERRUPT));
		// the output pointer is optional, only write the timestamp if the caller asked for it
		if (timestampValueOut)
			*timestampValueOut = s_currentRetireMarker; // incremented before each submit
	}
	else
	{
		cemu_assert_unimplemented();
	}
	return 0;
}
// Sets up the TCL module: registers the exported functions and resets the retirement state
void Initialize()
{
// register TCLSubmitToRing, TCLTimestamp and TCLWaitTimestamp as exports of "TCL"
cafeExportRegister("TCL", TCLSubmitToRing, LogType::Placeholder);
cafeExportRegister("TCL", TCLTimestamp, LogType::Placeholder);
cafeExportRegister("TCL", TCLWaitTimestamp, LogType::Placeholder);
// both the submission-side counter and the marker written by the GPU start at zero
s_currentRetireMarker = 0;
s_tclStatePPC->gpuRetireMarker = 0;
// event used by TCLWaitTimestamp, created as not signaled and in auto mode
coreinit::OSInitEvent(s_updateRetirementEvent.GetPtr(), coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
}
}

View file

@ -1,4 +1,25 @@
// Interface of the TCL module (command submission and timestamps)
namespace TCL
{
// selects which timestamp TCLTimestamp / TCLWaitTimestamp operate on
enum class TCLTimestampId
{
TIMESTAMP_LAST_BUFFER_RETIRED = 1,
};
// flags passed to TCLSubmitToRing via controlFlags
enum class TCLSubmissionFlag : uint32
{
SURFACE_SYNC = 0x400000, // submit surface sync packet before cmd
NO_MARKER_INTERRUPT = 0x200000,
USE_RETIRED_MARKER = 0x20000000, // Controls whether the timer is updated before or after (retired) the cmd. Also controls which timestamp is returned for the submission. Before and after using separate counters
};
// writes the timestamp selected by id to *timestampOut
int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut);
// waits for the timestamp selected by id to reach waitTs
int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout);
// submits cmdLen command words to the GPU, timestampValueOut receives the timestamp of the submission
int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype<TCLSubmissionFlag>* controlFlags, uint64be* timestampValueOut);
// called from Latte code
// TCLGPUReadRBWord returns true and sets cmdWord if a command word is available
bool TCLGPUReadRBWord(uint32& cmdWord);
void TCLGPUNotifyNewRetirementTimestamp();
// module setup
void Initialize();
}
ENABLE_BITMASK_OPERATORS(TCL::TCLSubmissionFlag);

View file

@ -59,7 +59,7 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU)
if (isPokken)
GX2::GX2DrawDone();
GX2ReserveCmdSpace(5+2);
GX2::GX2ReserveCmdSpace(5+2);
uint64 tick64 = PPCInterpreter_getMainCoreCycleCounter() / 20ULL;
lastSwapTime = tick64;
@ -86,24 +86,16 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU)
GX2::GX2WaitForFlip();
}
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
osLib_returnFromFunction(hCPU, 0);
}
void gx2Export_GX2CopyColorBufferToScanBuffer(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2CopyColorBufferToScanBuffer(0x{:08x},{})", hCPU->gpr[3], hCPU->gpr[4]);
GX2ReserveCmdSpace(5);
GX2::GX2ReserveCmdSpace(10);
// todo: proper implementation
// hack: Avoid running too far ahead of the GPU. Normally this would be guaranteed by the circular buffer model, which we currently don't fully emulate
if(GX2::GX2WriteGather_getReadWriteDistance() > 32*1024*1024 )
{
debug_printf("Waiting for GPU to catch up...\n");
PPCInterpreter_relinquishTimeslice(); // release current thread
return;
}
GX2ColorBuffer* colorBuffer = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER, 9));
@ -309,81 +301,6 @@ void gx2Export_GX2SetSemaphore(PPCInterpreter_t* hCPU)
osLib_returnFromFunction(hCPU, 0);
}
// Export handler for GX2Flush: logs the call, submits the current command data via _GX2SubmitToTCL
// and returns 0 to the caller
void gx2Export_GX2Flush(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2Flush()");
_GX2SubmitToTCL();
osLib_returnFromFunction(hCPU, 0);
}
uint8* _GX2LastFlushPtr[PPC_CORE_COUNT] = {NULL};
uint64 _prevReturnedGPUTime = 0;
// Returns a GPU timestamp derived from the system time (scaled by 20000).
// The returned values are strictly increasing: if the scaled time did not advance past the previously
// returned value, the previous value plus one is returned instead.
uint64 Latte_GetTime()
{
	const uint64 scaledTime = (uint64)coreinit::OSGetSystemTime() * 20000ULL;
	// avoid ever returning identical timestamps
	const uint64 result = (scaledTime > _prevReturnedGPUTime) ? scaledTime : (_prevReturnedGPUTime + 1);
	_prevReturnedGPUTime = result;
	return result;
}
void _GX2SubmitToTCL()
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
// do nothing if called from non-main GX2 core
if (GX2::sGX2MainCoreIndex != coreIndex)
{
cemuLog_logDebug(LogType::Force, "_GX2SubmitToTCL() called on non-main GX2 core");
return;
}
if( gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL )
return; // quit if in display list
_GX2LastFlushPtr[coreIndex] = (gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
// update last submitted CB timestamp
uint64 commandBufferTimestamp = Latte_GetTime();
LatteGPUState.lastSubmittedCommandBufferTimestamp.store(commandBufferTimestamp);
cemuLog_log(LogType::GX2, "Submitting GX2 command buffer with timestamp {:016x}", commandBufferTimestamp);
// submit HLE packet to write retirement timestamp
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SET_CB_RETIREMENT_TIMESTAMP, 2));
gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp>>32ULL));
gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp&0xFFFFFFFFULL));
}
// Returns the number of bytes written by the given core since its last flush position.
// Without a previous flush the distance from the start of the ring buffer is returned.
uint32 _GX2GetUnflushedBytes(uint32 coreIndex)
{
	uint8* const writePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex];
	uint8* const lastFlushPtr = _GX2LastFlushPtr[coreIndex];
	if (lastFlushPtr == NULL)
		return (uint32)(writePtr - gx2WriteGatherPipe.gxRingBuffer);
	if (lastFlushPtr > writePtr)
	{
		// the write pointer is behind the last flush position
		// this isn't 100% correct since we ignore the bytes between the last flush address and the start of the wrap around
		return (uint32)(writePtr - gx2WriteGatherPipe.gxRingBuffer + 4);
	}
	return (uint32)(writePtr - lastFlushPtr);
}
/*
 * Called before command words are written by the current core
 * Submits the pending command data via _GX2SubmitToTCL() once 0x1000 or more unflushed bytes have accumulated
 * Does nothing while the current core is recording a display list
 * Note: reservedFreeSpaceInU32 is not evaluated by this implementation
 */
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32)
{
	const uint32 coreIndex = coreinit::OSGetCoreId();
	const bool isInDisplayList = gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL;
	if (isInDisplayList)
		return;
	if (_GX2GetUnflushedBytes(coreIndex) >= 0x1000)
		_GX2SubmitToTCL();
}
void gx2_load()
{
osLib_addFunction("gx2", "GX2GetContextStateDisplayList", gx2Export_GX2GetContextStateDisplayList);
@ -445,10 +362,6 @@ void gx2_load()
// semaphore
osLib_addFunction("gx2", "GX2SetSemaphore", gx2Export_GX2SetSemaphore);
// command buffer
osLib_addFunction("gx2", "GX2Flush", gx2Export_GX2Flush);
GX2::GX2Init_writeGather();
GX2::GX2MemInit();
GX2::GX2ResourceInit();
GX2::GX2CommandInit();

View file

@ -68,9 +68,3 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetDefaultState(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetupContextStateEx(PPCInterpreter_t* hCPU);
void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU);
// command buffer
uint32 _GX2GetUnflushedBytes(uint32 coreIndex);
void _GX2SubmitToTCL();
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32);

View file

@ -132,7 +132,6 @@ namespace GX2
depthFirstSlice = _swapEndianU32(depthBuffer->viewFirstSlice);
depthNumSlices = _swapEndianU32(depthBuffer->viewNumSlices);
}
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_CLEAR_COLOR_DEPTH_STENCIL, 23),
hleClearFlags,
colorPhysAddr,

View file

@ -4,178 +4,397 @@
#include "Cafe/HW/Latte/Core/LattePM4.h"
#include "Cafe/OS/libs/coreinit/coreinit.h"
#include "Cafe/OS/libs/coreinit/coreinit_Thread.h"
#include "Cafe/OS/libs/TCL/TCL.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "GX2.h"
#include "GX2_Command.h"
#include "GX2_Shader.h"
#include "GX2_Misc.h"
#include "OS/libs/coreinit/coreinit_MEM.h"
extern uint8* gxRingBufferReadPtr;
GX2WriteGatherPipeState gx2WriteGatherPipe = { 0 };
namespace GX2
{
GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT];
}
void gx2WriteGather_submitU32AsBE(uint32 v)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
return;
*(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = _swapEndianU32(v);
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4;
*(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = _swapEndianU32(v);
GX2::s_perCoreCBState[coreIndex].currentWritePtr++;
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
}
void gx2WriteGather_submitU32AsLE(uint32 v)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
return;
*(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = v;
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4;
*(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = v;
GX2::s_perCoreCBState[coreIndex].currentWritePtr++;
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
}
void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
return;
memcpy_dwords((*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]), v, numValues);
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4 * numValues;
memcpy_dwords(GX2::s_perCoreCBState[coreIndex].currentWritePtr, v, numValues);
GX2::s_perCoreCBState[coreIndex].currentWritePtr += numValues;
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
}
namespace GX2
{
sint32 gx2WriteGatherCurrentMainCoreIndex = -1;
bool gx2WriteGatherInited = false;
void GX2WriteGather_ResetToDefaultState()
struct GX2CommandState // mapped to PPC space since the GPU writes here
{
gx2WriteGatherCurrentMainCoreIndex = -1;
gx2WriteGatherInited = false;
}
// command pool
MEMPTR<uint32be> commandPoolBase;
uint32 commandPoolSizeInU32s;
MEMPTR<uint32be> gpuCommandReadPtr;
// timestamp
uint64be lastSubmissionTime;
};
void GX2Init_writeGather() // init write gather, make current core
SysAllocator<GX2CommandState> s_commandState;
GX2PerCoreCBState s_mainCoreLastCommandState;
bool s_cbBufferIsInternallyAllocated;
void GX2Command_StartNewCommandBuffer(uint32 numU32s);
// called from GX2Init. Allocates a 4MB memory chunk from which command buffers are suballocated from
void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize)
{
if (gx2WriteGatherPipe.gxRingBuffer == NULL)
gx2WriteGatherPipe.gxRingBuffer = (uint8*)malloc(GX2_COMMAND_RING_BUFFER_SIZE);
if (gx2WriteGatherCurrentMainCoreIndex == sGX2MainCoreIndex)
return; // write gather already configured for same core
for (sint32 i = 0; i < PPC_CORE_COUNT; i++)
cemu_assert_debug(!s_commandState->commandPoolBase); // should not be allocated already
// setup command buffer pool. If not provided allocate a 4MB or custom size buffer
uint32 poolSize = bufferSize ? bufferSize : 0x400000; // 4MB (can be overwritten by custom GX2Init parameters?)
if (bufferBase)
{
if (i == sGX2MainCoreIndex)
{
gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = gx2WriteGatherPipe.gxRingBuffer;
gx2WriteGatherPipe.writeGatherPtrWrite[i] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[i];
s_commandState->commandPoolBase = (uint32be*)bufferBase;
s_cbBufferIsInternallyAllocated = false;
}
else
{
gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = NULL;
gx2WriteGatherPipe.writeGatherPtrWrite[i] = NULL;
s_commandState->commandPoolBase = (uint32be*)coreinit::_weak_MEMAllocFromDefaultHeapEx(poolSize, 0x100);
s_cbBufferIsInternallyAllocated = true;
}
gx2WriteGatherPipe.displayListStart[i] = MPTR_NULL;
gx2WriteGatherPipe.writeGatherPtrDisplayList[i] = NULL;
gx2WriteGatherPipe.displayListMaxSize[i] = 0;
if (!s_commandState->commandPoolBase)
{
cemuLog_log(LogType::Force, "GX2: Failed to allocate command buffer pool");
}
s_commandState->commandPoolSizeInU32s = poolSize / sizeof(uint32be);
s_commandState->gpuCommandReadPtr = s_commandState->commandPoolBase;
// init per-core command buffer state
for (uint32 i = 0; i < Espresso::CORE_COUNT; i++)
{
s_perCoreCBState[i].bufferPtr = nullptr;
s_perCoreCBState[i].bufferSizeInU32s = 0;
s_perCoreCBState[i].currentWritePtr = nullptr;
}
// start first command buffer for main core
GX2Command_StartNewCommandBuffer(0x100);
}
void GX2Shutdown_commandBufferPool()
{
if (!s_commandState->commandPoolBase)
return;
if (s_cbBufferIsInternallyAllocated)
coreinit::_weak_MEMFreeToDefaultHeap(s_commandState->commandPoolBase.GetPtr());
s_cbBufferIsInternallyAllocated = false;
s_commandState->commandPoolBase = nullptr;
s_commandState->commandPoolSizeInU32s = 0;
s_commandState->gpuCommandReadPtr = nullptr;
}
// current position of where the GPU is reading from. Updated via a memory write command submitted to the GPU
// The position is returned as an index in U32s relative to the start of the command pool
uint32 GX2Command_GetPoolGPUReadIndex()
{
	stdx::atomic_ref<MEMPTR<uint32be>> atomicReadPtr(s_commandState->gpuCommandReadPtr);
	MEMPTR<uint32be> gpuReadPtr = atomicReadPtr.load();
	cemu_assert_debug(gpuReadPtr);
	uint32be* const poolBase = s_commandState->commandPoolBase.GetPtr();
	return (uint32)(gpuReadPtr.GetPtr() - poolBase);
}
void GX2Command_WaitForNextBufferRetired()
{
uint64 retiredTimeStamp = GX2GetRetiredTimeStamp();
retiredTimeStamp += 1;
// but cant be higher than the submission timestamp
stdx::atomic_ref<uint64be> _lastSubmissionTime(s_commandState->lastSubmissionTime);
uint64 submissionTimeStamp = _lastSubmissionTime.load();
if (retiredTimeStamp > submissionTimeStamp)
retiredTimeStamp = submissionTimeStamp;
GX2WaitTimeStamp(retiredTimeStamp);
}
// Assigns a command buffer to the core this is called on. Writing starts at the beginning of the buffer.
// buffer / sizeInU32s  the buffer and its capacity in U32s
// isDisplayList        stored in the per-core state as-is
void GX2Command_SetupCoreCommandBuffer(uint32be* buffer, uint32 sizeInU32s, bool isDisplayList)
{
	GX2PerCoreCBState& state = s_perCoreCBState[coreinit::OSGetCoreId()];
	state.isDisplayList = isDisplayList;
	state.bufferSizeInU32s = sizeInU32s;
	state.bufferPtr = buffer;
	state.currentWritePtr = buffer; // nothing written yet
}
// Carves a new command buffer for the calling core out of the command buffer pool and makes it the active write target.
// numU32s is the requested size in u32 words; it is raised to at least 0x100.
// The pool is used like a ring: the new buffer is placed after the end of the core's current buffer (or at the pool base
// if the core has no buffer yet). If there is not enough room, this waits for the GPU to retire buffers and tries again.
void GX2Command_StartNewCommandBuffer(uint32 numU32s)
{
uint32 coreIndex = coreinit::OSGetCoreId();
auto& coreCBState = s_perCoreCBState[coreIndex];
numU32s = std::max<uint32>(numU32s, 0x100);
// grab space from command buffer pool and if necessary wait for it
uint32be* bufferPtr = nullptr;
uint32 bufferSizeInU32s = 0;
uint32 readIndex;
// NOTE(review): the loop only exits once a region of at least numU32s words is found; a request larger than the pool
// would never be satisfied - confirm callers cannot request that much
while (true)
{
// try to grab buffer data from first available spot:
// 1. At the current write location up to the end of the buffer (avoiding an overlap with the read location)
// 2. From the start of the buffer up to the read location
readIndex = GX2Command_GetPoolGPUReadIndex();
// the next write position is directly behind the core's current buffer
uint32be* nextWritePos = coreCBState.bufferPtr ? coreCBState.bufferPtr + coreCBState.bufferSizeInU32s : s_commandState->commandPoolBase.GetPtr();
uint32 writeIndex = nextWritePos - s_commandState->commandPoolBase;
uint32 poolSizeInU32s = s_commandState->commandPoolSizeInU32s;
// readIndex == writeIndex can mean either buffer full or buffer empty
// we could use GX2GetRetiredTimeStamp() == GX2GetLastSubmittedTimeStamp() to determine if the buffer is truly empty
// but this can have false negatives since the last submission timestamp is updated independently of the read index
// so instead we just avoid ever filling the buffer completely
cemu_assert_debug(readIndex < poolSizeInU32s);
cemu_assert_debug(writeIndex < poolSizeInU32s);
if (writeIndex < readIndex)
{
// writeIndex has wrapped around
// only the gap between the write position and the read position is usable
uint32 wordsAvailable = readIndex - writeIndex;
if (wordsAvailable > 0)
wordsAvailable--; // avoid writeIndex becoming equal to readIndex
if (wordsAvailable >= numU32s)
{
bufferPtr = s_commandState->commandPoolBase + writeIndex;
bufferSizeInU32s = wordsAvailable;
break;
}
}
else
{
// first choice: the space between the write position and the end of the pool
uint32 wordsAvailable = poolSizeInU32s - writeIndex;
if (wordsAvailable > 0)
wordsAvailable--; // avoid writeIndex becoming equal to readIndex
if (wordsAvailable >= numU32s)
{
bufferPtr = nextWritePos;
bufferSizeInU32s = wordsAvailable;
break;
}
// not enough space at end of buffer, try to grab from the beginning of the buffer
wordsAvailable = readIndex;
if (wordsAvailable > 0)
wordsAvailable--; // avoid writeIndex becoming equal to readIndex
if (wordsAvailable >= numU32s)
{
bufferPtr = s_commandState->commandPoolBase;
bufferSizeInU32s = wordsAvailable;
break;
}
}
// no suitable region yet, wait until the GPU has retired another buffer and retry
GX2Command_WaitForNextBufferRetired();
}
cemu_assert_debug(bufferPtr);
// NOTE(review): this replaces the wordsAvailable value stored in the loop with min(numU32s, 0x20000), so the buffer is
// sized to the request rather than to the free space, and a request above 0x20000 yields a buffer smaller than requested - confirm intended
bufferSizeInU32s = std::min<uint32>(numU32s, 0x20000); // size cap
#ifdef CEMU_DEBUG_ASSERT
uint32 newWriteIndex = ((bufferPtr - s_commandState->commandPoolBase) + bufferSizeInU32s) % s_commandState->commandPoolSizeInU32s;
cemu_assert_debug(newWriteIndex != readIndex);
#endif
// setup buffer and make it the current write gather target
cemu_assert_debug(bufferPtr >= s_commandState->commandPoolBase && (bufferPtr + bufferSizeInU32s) <= s_commandState->commandPoolBase + s_commandState->commandPoolSizeInU32s);
GX2Command_SetupCoreCommandBuffer(bufferPtr, bufferSizeInU32s, false);
}
// Submits a command buffer to the GPU ring through TCL.
// buffer / sizeInU32s: the commands to execute, referenced via an IT_INDIRECT_BUFFER_PRIV packet
// completionGPUReadPointer: optional. If set, an IT_MEM_WRITE packet is appended which stores the address of the
//                           end of the buffer (buffer + sizeInU32s) to this location
// triggerMarkerInterrupt: if false, the submission is flagged with NO_MARKER_INTERRUPT
// The submission timestamp is written to s_commandState->lastSubmissionTime by TCLSubmitToRing
void GX2Command_SubmitCommandBuffer(uint32be* buffer, uint32 sizeInU32s, MEMPTR<uint32be>* completionGPUReadPointer, bool triggerMarkerInterrupt)
{
	uint32be packet[10];
	uint32 packetLen = 0;
	// indirect buffer packet pointing at the command buffer
	packet[packetLen++] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3);
	packet[packetLen++] = memory_virtualToPhysical(MEMPTR<void>(buffer).GetMPTR());
	packet[packetLen++] = 0x00000000; // address high bits
	packet[packetLen++] = sizeInU32s;
	if (completionGPUReadPointer != nullptr)
	{
		// memory write packet which updates completionGPUReadPointer after the GPU is done with the command buffer
		packet[packetLen++] = pm4HeaderType3(IT_MEM_WRITE, 4);
		packet[packetLen++] = memory_virtualToPhysical(MEMPTR<void>(completionGPUReadPointer).GetMPTR()) | 2;
		packet[packetLen++] = 0x40000;
		packet[packetLen++] = MEMPTR<void>(buffer + sizeInU32s).GetMPTR(); // value to write
		packet[packetLen++] = 0x00000000;
	}
	betype<TCL::TCLSubmissionFlag> submissionFlags{};
	submissionFlags |= TCL::TCLSubmissionFlag::USE_RETIRED_MARKER;
	if (!triggerMarkerInterrupt)
	{
		submissionFlags |= TCL::TCLSubmissionFlag::NO_MARKER_INTERRUPT;
	}
	TCL::TCLSubmitToRing(packet, packetLen, &submissionFlags, &s_commandState->lastSubmissionTime);
}
void GX2Command_PadCurrentBuffer()
{
uint32 coreIndex = coreinit::OSGetCoreId();
auto& coreCBState = s_perCoreCBState[coreIndex];
if (!coreCBState.currentWritePtr)
return;
uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
if ((writeDistance&7) != 0)
{
uint32 distanceToPad = 0x8 - (writeDistance & 0x7);
while (distanceToPad)
{
*coreCBState.currentWritePtr = pm4HeaderType2Filler();
coreCBState.currentWritePtr++;
distanceToPad--;
}
}
}
// Flushes the calling core's current command buffer.
// numU32sForNextBuffer: minimum size (in u32 words) required for the buffer that is active after the flush
// triggerMarkerInterrupt: forwarded to GX2Command_SubmitCommandBuffer
// Behavior:
// - If the core is currently writing a display list nothing is submitted (only a debug message is logged)
// - If the current command buffer contains data it is padded, submitted to the GPU and replaced with a new buffer
// - If the current command buffer is empty it is kept, unless it is too small for numU32sForNextBuffer
void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt)
{
	uint32 coreIndex = coreinit::OSGetCoreId();
	auto& coreCBState = s_perCoreCBState[coreIndex];
	if (coreCBState.isDisplayList)
	{
		// display list
		cemu_assert_debug((uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) < coreCBState.bufferSizeInU32s);
		cemuLog_logDebugOnce(LogType::Force, "GX2 flush called on display list");
		return;
	}
	// command buffer
	if (coreCBState.currentWritePtr != coreCBState.bufferPtr)
	{
		// pad the command buffer to 32 byte alignment
		GX2Command_PadCurrentBuffer();
		// submit it to the GPU
		uint32 bufferLength = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
		cemu_assert_debug(bufferLength <= coreCBState.bufferSizeInU32s);
		GX2Command_SubmitCommandBuffer(coreCBState.bufferPtr, bufferLength, &s_commandState->gpuCommandReadPtr, triggerMarkerInterrupt);
		GX2Command_StartNewCommandBuffer(numU32sForNextBuffer);
		return;
	}
	// current buffer is empty so we don't need to queue it
	// but it has to be replaced if it cannot hold the requested amount of data.
	// The comparison must be against the capacity of the current buffer (not the size of the whole pool),
	// otherwise a too small empty buffer is kept and subsequent writes run past its end
	if (numU32sForNextBuffer > coreCBState.bufferSizeInU32s)
		GX2Command_StartNewCommandBuffer(numU32sForNextBuffer);
}
void GX2Flush()
{
GX2Command_Flush(256, true);
}
// Returns the last submission timestamp stored in the command state (atomic load)
uint64 GX2GetLastSubmittedTimeStamp()
{
	const uint64 lastSubmitted = stdx::atomic_ref<uint64be>(s_commandState->lastSubmissionTime).load();
	return lastSubmitted;
}
// Queries TCL for the TIMESTAMP_LAST_BUFFER_RETIRED timestamp and returns it
uint64 GX2GetRetiredTimeStamp()
{
	uint64be retiredTimeStamp = 0;
	TCL::TCLTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, &retiredTimeStamp);
	return static_cast<uint64>(retiredTimeStamp);
}
// Waits on the TIMESTAMP_LAST_BUFFER_RETIRED timestamp with tsWait as the target value.
// Always returns true; the result of the TCL wait is not evaluated.
bool GX2WaitTimeStamp(uint64 tsWait)
{
	// handle GPU timeout here? But for now we timeout after 60 seconds
	const auto timeoutTicks = Espresso::TIMER_CLOCK * 60;
	TCL::TCLWaitTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, tsWait, timeoutTicks);
	return true;
}
/*
* Guarantees that the requested amount of space is available on the current command buffer
* If the space is not available, the current command buffer is pushed to the GPU and a new one is allocated
* reservedFreeSpaceInU32 is the number of u32 words that must fit behind the current write position
* If the calling core has no write pointer this returns without doing anything
*/
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32)
{
uint32 coreIndex = coreinit::OSGetCoreId();
auto& coreCBState = s_perCoreCBState[coreIndex];
if (coreCBState.currentWritePtr == nullptr)
return;
// number of u32 words already written to the current buffer
uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
if (writeDistance + reservedFreeSpaceInU32 > coreCBState.bufferSizeInU32s)
{
// not enough room left, flush and request a buffer which is large enough
GX2Command_Flush(reservedFreeSpaceInU32, true);
}
// NOTE(review): the two statements below are unrelated to reserving space and look like leftovers
// from a different (write gather init) function - confirm whether they belong here
gx2WriteGatherCurrentMainCoreIndex = sGX2MainCoreIndex;
gx2WriteGatherInited = true;
}
// Starts writing a display list on the core that hCPU runs on.
// buffer is the virtual address of the display list memory, maxSize its size in bytes (passed on as maxSize/4 u32 words).
// NOTE(review): this body updates both the gx2WriteGatherPipe bookkeeping and the s_perCoreCBState bookkeeping;
// it looks like two versions of the implementation are mixed - confirm which statements are still needed
void GX2WriteGather_beginDisplayList(PPCInterpreter_t* hCPU, MPTR buffer, uint32 maxSize)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU);
gx2WriteGatherPipe.displayListStart[coreIndex] = buffer;
gx2WriteGatherPipe.displayListMaxSize[coreIndex] = maxSize;
// set new write gather ptr
gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex] = memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]);
gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex];
if (coreIndex == sGX2MainCoreIndex)
{
// on the main core: pad the current command buffer and remember its state in s_mainCoreLastCommandState
GX2Command_PadCurrentBuffer();
cemu_assert_debug(!s_perCoreCBState[coreIndex].isDisplayList);
s_mainCoreLastCommandState = s_perCoreCBState[coreIndex];
}
// make the display list the active write target
// NOTE(review): GX2Command_SetupCoreCommandBuffer and GX2Command_PadCurrentBuffer use coreinit::OSGetCoreId(),
// presumably the same core as coreIndex - verify
GX2Command_SetupCoreCommandBuffer(MEMPTR<uint32be>(buffer), maxSize/4, true);
}
// Returns the distance between the active write gather pointer of the given core
// and the start of that core's display list
uint32 GX2WriteGather_getDisplayListWriteDistance(sint32 coreIndex)
{
	auto currentWritePos = *gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex];
	auto displayListBegin = memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]);
	return (uint32)(currentWritePos - displayListBegin);
}
// Returns the distance between the given core's ring buffer write pointer and the start of the ring buffer
uint32 GX2WriteGather_getFifoWriteDistance(uint32 coreIndex)
{
uint32 writeDistance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer);
return writeDistance;
// NOTE(review): everything below is unreachable because of the return above. It looks like it belongs to a
// different function (it asserts isDisplayList and returns a size in bytes) - confirm and remove or relocate
auto& coreCBState = s_perCoreCBState[coreIndex];
cemu_assert_debug(coreCBState.isDisplayList);
if (coreCBState.currentWritePtr == nullptr)
return 0;
return (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) * 4;
}
// Ends the display list that is being written on the current core and returns the size of the written data.
// NOTE(review): this body appears to interleave two implementations (one based on gx2WriteGatherPipe, one based on
// s_perCoreCBState). Both branches of the if/else below return before the coreCBState assignments that follow them,
// and coreIndex is declared twice - confirm against the intended version before relying on any of the comments here
uint32 GX2WriteGather_endDisplayList(PPCInterpreter_t* hCPU, MPTR buffer)
{
uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU);
if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
uint32 coreIndex = coreinit::OSGetCoreId();
auto& coreCBState = s_perCoreCBState[coreIndex];
GX2Command_PadCurrentBuffer();
// number of u32 words written to the current buffer after padding
uint32 finalWriteIndex = coreCBState.currentWritePtr - coreCBState.bufferPtr;
cemu_assert_debug(finalWriteIndex <= coreCBState.bufferSizeInU32s);
// if we are on the main GX2 core then restore the GPU command buffer
if (coreIndex == sGX2MainCoreIndex)
{
uint32 currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex);
// pad to 32 byte
if (gx2WriteGatherPipe.displayListMaxSize[coreIndex] >= ((gx2WriteGatherPipe.displayListMaxSize[coreIndex] + 0x1F) & ~0x1F))
{
while ((currentWriteSize & 0x1F) != 0)
{
gx2WriteGather_submitU32AsBE(pm4HeaderType2Filler());
currentWriteSize += 4;
}
}
// get size of written data
currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex);
// disable current display list and restore write gather ptr
gx2WriteGatherPipe.displayListStart[coreIndex] = MPTR_NULL;
if (sGX2MainCoreIndex == coreIndex)
gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex];
else
gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = NULL;
// return size of (written) display list
return currentWriteSize;
// NOTE(review): unreachable after the return above
coreCBState = s_mainCoreLastCommandState;
}
else
{
// no active display list
// return a size of 0
return 0;
// NOTE(review): unreachable after the return above
coreCBState.bufferPtr = nullptr;
coreCBState.currentWritePtr = nullptr;
coreCBState.bufferSizeInU32s = 0;
coreCBState.isDisplayList = false;
}
// size in bytes (u32 words * 4)
return finalWriteIndex * 4;
}
bool GX2GetCurrentDisplayList(betype<MPTR>* displayListAddr, uint32be* displayListSize)
bool GX2GetCurrentDisplayList(MEMPTR<uint32be>* displayListAddr, uint32be* displayListSize)
{
uint32 coreIndex = coreinit::OSGetCoreId();
if (gx2WriteGatherPipe.displayListStart[coreIndex] == MPTR_NULL)
auto& coreCBState = s_perCoreCBState[coreIndex];
if (!coreCBState.isDisplayList)
return false;
if (displayListAddr)
*displayListAddr = gx2WriteGatherPipe.displayListStart[coreIndex];
*displayListAddr = coreCBState.bufferPtr;
if (displayListSize)
*displayListSize = gx2WriteGatherPipe.displayListMaxSize[coreIndex];
*displayListSize = coreCBState.bufferSizeInU32s * sizeof(uint32be);
return true;
}
// returns true if we are writing to a display list
bool GX2GetDisplayListWriteStatus()
{
// returns true if we are writing to a display list
uint32 coreIndex = coreinit::OSGetCoreId();
return gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL;
}
// Returns the distance from the ring buffer read pointer to the main core's write pointer,
// wrapped to the size of the ring buffer
uint32 GX2WriteGather_getReadWriteDistance()
{
	const uint32 mainCore = sGX2MainCoreIndex;
	uint32 distance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[mainCore] + GX2_COMMAND_RING_BUFFER_SIZE - gxRingBufferReadPtr);
	distance = distance % GX2_COMMAND_RING_BUFFER_SIZE;
	return distance;
}
// Inserts a wrap-around marker into the ring buffer and resets the write pointer to the start of the ring buffer
// once the write distance has reached 3/5 of the ring buffer size.
// Only acts on the main GX2 core and only while no display list is active on it.
void GX2WriteGather_checkAndInsertWrapAroundMark()
{
uint32 coreIndex = coreinit::OSGetCoreId();
if (coreIndex != sGX2MainCoreIndex) // only if main gx2 core
return;
if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
return;
uint32 writeDistance = GX2WriteGather_getFifoWriteDistance(coreIndex);
if (writeDistance >= (GX2_COMMAND_RING_BUFFER_SIZE * 3 / 5))
{
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_FIFO_WRAP_AROUND, 1));
gx2WriteGather_submitU32AsBE(0); // empty word since we can't send commands with zero data words
gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] = gx2WriteGatherPipe.gxRingBuffer;
}
// NOTE(review): this function is declared void but the statement below returns a value; it looks like it
// belongs to a bool function (e.g. a display list status query) - confirm and relocate
return s_perCoreCBState[coreIndex].isDisplayList;
}
void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size)
@ -204,28 +423,23 @@ namespace GX2
memory_virtualToPhysical(addr),
0, // high address bits
size / 4);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2DirectCallDisplayList(void* addr, uint32 size)
{
// this API submits to TCL directly and bypasses the write-gatherer
// it's basically a way to manually submit a command buffer to the GPU
// as such it also affects the submission and retire timestamps
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
cemu_assert_debug(coreIndex == sGX2MainCoreIndex);
coreIndex = sGX2MainCoreIndex; // always submit to main queue which is owned by GX2 main core (TCLSubmitToRing does not need this workaround)
uint32be* cmdStream = (uint32be*)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
cmdStream[0] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3);
cmdStream[1] = memory_virtualToPhysical(MEMPTR<void>(addr).GetMPTR());
cmdStream[2] = 0;
cmdStream[3] = size / 4;
gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] += 16;
// update submission timestamp and retired timestamp
_GX2SubmitToTCL();
uint32 coreIndex = coreinit::OSGetCoreId();
if (coreIndex != sGX2MainCoreIndex)
{
cemuLog_logDebugOnce(LogType::Force, "GX2DirectCallDisplayList() called on non-main GX2 core");
}
if (!s_perCoreCBState[coreIndex].isDisplayList)
{
// make sure any preceding commands are submitted first
GX2Command_Flush(0x100, false);
}
GX2Command_SubmitCommandBuffer(static_cast<uint32be*>(addr), size / 4, nullptr, false);
}
void GX2CopyDisplayList(MEMPTR<uint32be*> addr, uint32 size)
@ -288,6 +502,12 @@ namespace GX2
void GX2CommandInit()
{
cafeExportRegister("gx2", GX2Flush, LogType::GX2);
cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2BeginDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2BeginDisplayListEx, LogType::GX2);
cafeExportRegister("gx2", GX2EndDisplayList, LogType::GX2);
@ -295,7 +515,6 @@ namespace GX2
cafeExportRegister("gx2", GX2GetCurrentDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2GetDisplayListWriteStatus, LogType::GX2);
cafeExportRegister("gx2", GX2CallDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2DirectCallDisplayList, LogType::GX2);
cafeExportRegister("gx2", GX2CopyDisplayList, LogType::GX2);
@ -305,7 +524,10 @@ namespace GX2
// Resets the write gather state and clears the command buffer pool bookkeeping.
// No pool memory is released here.
void GX2CommandResetToDefaultState()
{
	GX2WriteGather_ResetToDefaultState();
	// forget the command buffer pool
	s_cbBufferIsInternallyAllocated = false;
	s_commandState->gpuCommandReadPtr = nullptr;
	s_commandState->commandPoolSizeInU32s = 0;
	s_commandState->commandPoolBase = nullptr;
}
}

View file

@ -2,21 +2,19 @@
#include "Cafe/HW/Latte/ISA/LatteReg.h"
#include "Cafe/HW/Espresso/Const.h"
struct GX2WriteGatherPipeState
namespace GX2
{
uint8* gxRingBuffer;
// each core has it's own write gatherer and display list state (writing)
uint8* writeGatherPtrGxBuffer[Espresso::CORE_COUNT];
uint8** writeGatherPtrWrite[Espresso::CORE_COUNT];
uint8* writeGatherPtrDisplayList[Espresso::CORE_COUNT];
MPTR displayListStart[Espresso::CORE_COUNT];
uint32 displayListMaxSize[Espresso::CORE_COUNT];
// State of the buffer a core is currently writing commands to (either a command buffer or a display list)
struct GX2PerCoreCBState
{
uint32be* bufferPtr; // start of the buffer
uint32 bufferSizeInU32s; // capacity of the buffer in u32 words
uint32be* currentWritePtr; // position where the next word is written. nullptr if there is no write target
bool isDisplayList; // true if the buffer is a display list
};
extern GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT];
};
extern GX2WriteGatherPipeState gx2WriteGatherPipe;
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32); // move to GX2 namespace eventually
void gx2WriteGather_submitU32AsBE(uint32 v);
void gx2WriteGather_submitU32AsLE(uint32 v);
void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues);
@ -27,7 +25,8 @@ uint32 PPCInterpreter_getCurrentCoreIndex();
template <typename ...Targs>
inline void gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr)
{
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = (uint8*)writePtr;
GX2::s_perCoreCBState[coreIndex].currentWritePtr = writePtr;
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
}
template <typename T, typename ...Targs>
@ -75,17 +74,23 @@ template <typename ...Targs>
inline void gx2WriteGather_submit(Targs... args)
{
uint32 coreIndex = PPCInterpreter_getCurrentCoreIndex();
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == nullptr)
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
{
cemu_assert_suspicious(); // writing to command buffer without valid write pointer?
return;
uint32be* writePtr = (uint32be*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]);
}
uint32be* writePtr = GX2::s_perCoreCBState[coreIndex].currentWritePtr;
gx2WriteGather_submit_(coreIndex, writePtr, std::forward<Targs>(args)...);
}
namespace GX2
{
uint32 GX2WriteGather_getReadWriteDistance();
void GX2WriteGather_checkAndInsertWrapAroundMark();
void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt = true);
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32);
uint64 GX2GetLastSubmittedTimeStamp();
uint64 GX2GetRetiredTimeStamp();
bool GX2WaitTimeStamp(uint64 tsWait);
void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size);
void GX2BeginDisplayListEx(MEMPTR<void> displayListAddr, uint32 size, bool profiling);
@ -96,7 +101,8 @@ namespace GX2
bool GX2GetDisplayListWriteStatus();
void GX2Init_writeGather();
void GX2CommandInit();
void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize);
void GX2Shutdown_commandBufferPool();
void GX2CommandResetToDefaultState();
}

View file

@ -168,7 +168,7 @@ uint32 _GX2Context_CalcStateSize()
void _GX2Context_CreateLoadDL()
{
GX2ReserveCmdSpace(3);
GX2::GX2ReserveCmdSpace(3);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
gx2WriteGather_submitU32AsBE(0x80000077);
gx2WriteGather_submitU32AsBE(0x80000077);
@ -176,7 +176,7 @@ void _GX2Context_CreateLoadDL()
void _GX2Context_WriteCmdDisableStateShadowing()
{
GX2ReserveCmdSpace(3);
GX2::GX2ReserveCmdSpace(3);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
gx2WriteGather_submitU32AsBE(0x80000000);
gx2WriteGather_submitU32AsBE(0x80000000);
@ -184,7 +184,7 @@ void _GX2Context_WriteCmdDisableStateShadowing()
void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, uint32 waitForIdle, uint32 numRegOffsetEntries, GX2RegLoadPktEntry_t* regOffsetEntries)
{
GX2ReserveCmdSpace(3 + numRegOffsetEntries*2);
GX2::GX2ReserveCmdSpace(3 + numRegOffsetEntries*2);
gx2WriteGather_submitU32AsBE(pm4Header);
gx2WriteGather_submitU32AsBE(physAddrRegArea);
gx2WriteGather_submitU32AsBE(waitForIdle);
@ -199,7 +199,6 @@ void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, u
void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32 ukn)
{
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
MPTR physAddrContextState = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(gx2ContextState));
_GX2Context_CreateLoadDL();
__cmdStateLoad(NULL, IT_LOAD_CONFIG_REG, gx2ContextState->hwContext.areaConfigReg, 0x80000000, configReg_loadPktEntries);
@ -212,7 +211,7 @@ void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32
void GX2SetDefaultState()
{
GX2ReserveCmdSpace(0x100);
GX2::GX2ReserveCmdSpace(0x100);
Latte::LATTE_PA_CL_VTE_CNTL reg{};
reg.set_VPORT_X_OFFSET_ENA(true).set_VPORT_X_SCALE_ENA(true);
@ -376,7 +375,6 @@ void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU)
osLib_returnFromFunction(hCPU, 0);
}
void gx2Export_GX2GetContextStateDisplayList(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2GetContextStateDisplayList(0x{:08x}, 0x{:08x}, 0x{:08x})", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);

View file

@ -52,7 +52,6 @@ namespace GX2
0,
count,
0);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2DrawIndexedEx2(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances, uint32 baseInstance)
@ -85,7 +84,6 @@ namespace GX2
pm4HeaderType3(IT_SET_CTL_CONST, 2), 1,
0 // baseInstance
);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2DrawEx(GX2PrimitiveMode2 primitiveMode, uint32 count, uint32 baseVertex, uint32 numInstances)
@ -109,7 +107,6 @@ namespace GX2
count,
0 // DRAW_INITIATOR
);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2DrawIndexedImmediateEx(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances)
@ -177,7 +174,6 @@ namespace GX2
}
}
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
struct GX2DispatchComputeParam

View file

@ -16,18 +16,6 @@ namespace GX2
SysAllocator<coreinit::OSThreadQueue> g_vsyncThreadQueue;
SysAllocator<coreinit::OSThreadQueue> g_flipThreadQueue;
SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
std::atomic<uint64> s_lastRetirementTimestamp = 0;
// called from GPU code when a command buffer is retired
void __GX2NotifyNewRetirementTimestamp(uint64 tsRetire)
{
__OSLockScheduler();
s_lastRetirementTimestamp = tsRetire;
coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
__OSUnlockScheduler();
}
void GX2SetGPUFence(uint32be* fencePtr, uint32 mask, uint32 compareOp, uint32 compareValue)
{
GX2ReserveCmdSpace(7);
@ -210,16 +198,6 @@ namespace GX2
osLib_returnFromFunction(hCPU, 0);
}
uint64 GX2GetLastSubmittedTimeStamp()
{
return LatteGPUState.lastSubmittedCommandBufferTimestamp.load();
}
uint64 GX2GetRetiredTimeStamp()
{
return s_lastRetirementTimestamp;
}
void GX2WaitForVsync()
{
__OSLockScheduler();
@ -236,19 +214,6 @@ namespace GX2
__OSUnlockScheduler();
}
bool GX2WaitTimeStamp(uint64 tsWait)
{
__OSLockScheduler();
while (tsWait > s_lastRetirementTimestamp)
{
// GPU hasn't caught up yet
coreinit::OSWaitEventInternal(s_updateRetirementEvent.GetPtr());
}
__OSUnlockScheduler();
// return true to indicate no timeout
return true;
}
void GX2DrawDone()
{
// optional force full sync (texture readback and occlusion queries)
@ -263,13 +228,10 @@ namespace GX2
gx2WriteGather_submitU32AsBE(0x00000000); // unused
}
// flush pipeline
if (_GX2GetUnflushedBytes(coreinit::OSGetCoreId()) > 0)
_GX2SubmitToTCL();
GX2Command_Flush(0x100, true);
uint64 ts = GX2GetLastSubmittedTimeStamp();
GX2WaitTimeStamp(ts);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
}
void GX2Init_event()
@ -294,25 +256,19 @@ namespace GX2
cafeExportRegister("gx2", GX2SetEventCallback, LogType::GX2);
cafeExportRegister("gx2", GX2GetEventCallback, LogType::GX2);
cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2WaitForVsync, LogType::GX2);
cafeExportRegister("gx2", GX2WaitForFlip, LogType::GX2);
cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2);
cafeExportRegister("gx2", GX2DrawDone, LogType::GX2);
coreinit::OSInitThreadQueue(g_vsyncThreadQueue.GetPtr());
coreinit::OSInitThreadQueue(g_flipThreadQueue.GetPtr());
coreinit::OSInitEvent(s_updateRetirementEvent, coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
coreinit::OSInitSemaphore(s_eventCbQueueSemaphore, 0);
}
void GX2EventResetToDefaultState()
{
s_callbackThreadLaunched = false;
s_lastRetirementTimestamp = 0;
for(auto& it : s_eventCallback)
{
it.callbackFuncPtr = nullptr;

View file

@ -81,19 +81,68 @@ namespace GX2
void _test_AddrLib();
void GX2Init(void* initSettings)
using GX2InitArg = uint32;
// Ids of the entries in the argument stream passed to GX2Init()
enum class GX2InitArgId : GX2InitArg
{
EndOfArgs = 0, // terminates the stream
CommandPoolBase = 1, // followed by one value: base address of the command pool
CommandPoolSize = 2, // followed by one value: size of the command pool
// unknown args, each followed by one value which GX2Init skips
UknArg7 = 7,
UknArg8 = 8,
UknArg9 = 9,
UknArg11 = 11,
};
void GX2Init(betype<GX2InitArg>* initArgStream)
{
if (LatteGPUState.gx2InitCalled)
{
cemuLog_logDebug(LogType::Force, "GX2Init() called while already initialized");
return;
}
// parse init params from the stream
MEMPTR<void> commandPoolBase = nullptr;
uint32 commandPoolSize = 0;
if (initArgStream)
{
while (true)
{
GX2InitArgId paramId = static_cast<GX2InitArgId>((GX2InitArg)*initArgStream);
initArgStream++;
if (paramId == GX2InitArgId::EndOfArgs)
{
break;
}
else if (paramId == GX2InitArgId::CommandPoolBase)
{
commandPoolBase = MEMPTR<void>(*initArgStream);
initArgStream++;
}
else if (paramId == GX2InitArgId::CommandPoolSize)
{
commandPoolSize = *initArgStream;
initArgStream++;
}
else if (paramId == GX2InitArgId::UknArg7 ||
paramId == GX2InitArgId::UknArg8 ||
paramId == GX2InitArgId::UknArg9 ||
paramId == GX2InitArgId::UknArg11)
{
initArgStream++;
}
else
{
cemuLog_log(LogType::Force, "GX2Init: Unsupported init arg {}", (uint32)paramId);
}
}
}
// init main core
uint32 coreIndex = coreinit::OSGetCoreId();
cemuLog_log(LogType::GX2, "GX2Init() on core {} by thread 0x{:08x}", coreIndex, MEMPTR<OSThread_t>(coreinit::OSGetCurrentThread()).GetMPTR());
sGX2MainCoreIndex = coreIndex;
// init submodules
GX2::GX2Init_event();
GX2::GX2Init_writeGather();
GX2::GX2Init_commandBufferPool(commandPoolBase, commandPoolSize);
// init shared area
if (LatteGPUState.sharedAreaAddr == MPTR_NULL)
{
@ -112,6 +161,21 @@ namespace GX2
_test_AddrLib();
}
// Counterpart to GX2Init(). Decrements the init counter and, once it reaches zero,
// waits for outstanding GPU work (GX2DrawDone) and shuts down the command buffer pool.
// Calling it while GX2 is not initialized only logs a debug message.
void GX2Shutdown()
{
	if (LatteGPUState.gx2InitCalled == 0)
	{
		cemuLog_logDebug(LogType::Force, "GX2Shutdown() called while not initialized");
		return;
	}
	// only the call which drops the counter to zero performs the actual shutdown
	if (--LatteGPUState.gx2InitCalled != 0)
	{
		return;
	}
	GX2DrawDone();
	GX2Shutdown_commandBufferPool();
	cemuLog_log(LogType::Force, "GX2 shutdown");
}
void _GX2DriverReset()
{
LatteGPUState.gx2InitCalled = 0;
@ -237,6 +301,7 @@ namespace GX2
void GX2MiscInit()
{
cafeExportRegister("gx2", GX2Init, LogType::GX2);
cafeExportRegister("gx2", GX2Shutdown, LogType::GX2);
cafeExportRegister("gx2", GX2GetMainCoreId, LogType::GX2);
cafeExportRegister("gx2", GX2ResetGPU, LogType::GX2);

View file

@ -135,7 +135,7 @@ void gx2Export_GX2InitDepthBufferRegs(PPCInterpreter_t* hCPU)
void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2SetColorBuffer(0x{:08x}, {})", hCPU->gpr[3], hCPU->gpr[4]);
GX2ReserveCmdSpace(20);
GX2::GX2ReserveCmdSpace(20);
GX2ColorBuffer* colorBufferBE = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
@ -198,15 +198,13 @@ void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU)
mmCB_COLOR0_INFO - 0xA000 + hCPU->gpr[4],
colorBufferBE->reg_info);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
osLib_returnFromFunction(hCPU, 0);
}
void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2SetDepthBuffer(0x{:08x})", hCPU->gpr[3]);
GX2ReserveCmdSpace(20);
GX2::GX2ReserveCmdSpace(20);
GX2DepthBuffer* depthBufferBE = (GX2DepthBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
@ -264,8 +262,6 @@ void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU)
gx2WriteGather_submitU32AsBE(mmDB_DEPTH_VIEW - 0xA000);
gx2WriteGather_submitU32AsBE(db_view);
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
osLib_returnFromFunction(hCPU, 0);
}
@ -281,7 +277,7 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU)
uint32 scanTarget = hCPU->gpr[3];
if( scanTarget == GX2_SCAN_TARGET_TV )
{
GX2ReserveCmdSpace(10);
GX2::GX2ReserveCmdSpace(10);
uint32 physAddr = (MEMORY_TILINGAPERTURE_AREA_ADDR+0x200000);

View file

@ -303,7 +303,27 @@ namespace GX2
void GX2SetVertexShader(GX2VertexShader* vertexShader)
{
GX2ReserveCmdSpace(100);
uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize;
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize;
uint32 reserveSize = 31;
if (vertexShader->shaderMode == GX2_SHADER_MODE::GEOMETRY_SHADER)
{
reserveSize += 7;
}
else
{
reserveSize += 18;
reserveSize += numOutputIds;
if (vertexShader->usesStreamOut != 0)
reserveSize += 2+12;
}
if (vsSemanticTableSize > 0)
{
reserveSize += 5 + vsSemanticTableSize;
}
GX2ReserveCmdSpace(reserveSize);
MPTR shaderProgramAddr;
uint32 shaderProgramSize;
@ -361,8 +381,6 @@ namespace GX2
cemu_assert_debug(vertexShader->regs.SPI_VS_OUT_CONFIG.value().get_VS_PER_COMPONENT() == false); // not handled on the GPU side
uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize;
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
gx2WriteGather_submitU32AsBE(Latte::REGADDR::SPI_VS_OUT_ID_0-0xA000);
for(uint32 i=0; i<numOutputIds; i++)
@ -392,7 +410,6 @@ namespace GX2
}
}
// update semantic table
uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize;
if (vsSemanticTableSize > 0)
{
gx2WriteGather_submit(

View file

@ -213,7 +213,6 @@ namespace GX2
void GX2SetViewportReg(GX2ViewportReg* viewportReg)
{
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
GX2ReserveCmdSpace(2 + 6);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 6),

View file

@ -264,7 +264,7 @@ void gx2Surface_GX2CopySurface(GX2Surface* srcSurface, uint32 srcMip, uint32 src
// send copy command to GPU
if( srcHwTileMode > 0 && srcHwTileMode < 16 && dstHwTileMode > 0 && dstHwTileMode < 16 || requestGPURAMCopy )
{
GX2ReserveCmdSpace(1+13*2);
GX2::GX2ReserveCmdSpace(1+13*2);
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13*2),
// src
@ -540,7 +540,7 @@ void gx2Export_GX2ResolveAAColorBuffer(PPCInterpreter_t* hCPU)
uint32 dstDepth = std::max<uint32>(surfOutDst.depth, 1);
// send copy command to GPU
GX2ReserveCmdSpace(1 + 13 * 2);
GX2::GX2ReserveCmdSpace(1 + 13 * 2);
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13 * 2),
// src
(uint32)srcSurface->imagePtr,
@ -619,7 +619,7 @@ void gx2Export_GX2ConvertDepthBufferToTextureSurface(PPCInterpreter_t* hCPU)
sint32 srcMip = 0;
uint32 numSlices = std::max<uint32>(_swapEndianU32(depthBuffer->viewNumSlices), 1);
GX2ReserveCmdSpace((1 + 13 * 2) * numSlices);
GX2::GX2ReserveCmdSpace((1 + 13 * 2) * numSlices);
for (uint32 subSliceIndex = 0; subSliceIndex < numSlices; subSliceIndex++)
{
// send copy command to GPU

View file

@ -11,9 +11,14 @@
void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2SetPixelShader(0x{:08x})", hCPU->gpr[3]);
GX2ReserveCmdSpace(100);
GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
uint32 numInputs = _swapEndianU32(pixelShader->regs[4]);
if( numInputs > 0x20 )
numInputs = 0x20;
GX2::GX2ReserveCmdSpace(26 + numInputs);
MPTR shaderProgramAddr;
uint32 shaderProgramSize;
@ -44,9 +49,6 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
_swapEndianU32(pixelShader->regs[2]),
_swapEndianU32(pixelShader->regs[3]));
// setup pixel shader extended inputs control
uint32 numInputs = _swapEndianU32(pixelShader->regs[4]);
if( numInputs > 0x20 )
numInputs = 0x20;
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numInputs));
gx2WriteGather_submitU32AsBE(mmSPI_PS_INPUT_CNTL_0-0xA000);
for(uint32 i=0; i<numInputs; i++)
@ -79,9 +81,17 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
{
cemuLog_log(LogType::GX2, "GX2SetGeometryShader(0x{:08x})", hCPU->gpr[3]);
GX2ReserveCmdSpace(100);
GX2GeometryShader_t* geometryShader = (GX2GeometryShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]);
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
uint32 reserveSize = 38; // 38 fixed parameters
if (numOutputIds != 0)
reserveSize += 2 + numOutputIds;
if( _swapEndianU32(geometryShader->useStreamout) != 0 )
reserveSize += 2 + 12;
GX2::GX2ReserveCmdSpace(reserveSize);
MPTR shaderProgramAddr;
uint32 shaderProgramSize;
@ -128,6 +138,7 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
if( _swapEndianU32(geometryShader->useStreamout) != 0 )
{
// todo - IT_EVENT_WRITE packet here
// stride 0
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_VTX_STRIDE_0-0xA000);
@ -180,8 +191,6 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[3]));
// GS outputs
uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]);
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
if( numOutputIds != 0 )
{
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
@ -254,8 +263,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU)
shaderPtr = computeShader->rBuffer.GetVirtualAddr();
shaderSize = computeShader->rBuffer.GetSize();
}
GX2ReserveCmdSpace(0x11);
GX2::GX2ReserveCmdSpace(0x11);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 6),
mmSQ_PGM_START_ES-0xA000,
@ -272,7 +280,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU)
void _GX2SubmitUniformBlock(uint32 registerBase, uint32 index, MPTR virtualAddress, uint32 size)
{
GX2ReserveCmdSpace(9);
GX2::GX2ReserveCmdSpace(9);
gx2WriteGather_submit(pm4HeaderType3(IT_SET_RESOURCE, 8),
registerBase + index * 7,
memory_virtualToPhysical(virtualAddress),
@ -307,7 +315,7 @@ void gx2Export_GX2SetGeometryUniformBlock(PPCInterpreter_t* hCPU)
void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU)
{
GX2ReserveCmdSpace(9);
GX2::GX2ReserveCmdSpace(9);
GX2RBuffer* bufferPtr = (GX2RBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
uint32 index = hCPU->gpr[4];
@ -320,7 +328,7 @@ void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU)
void gx2Export_GX2SetShaderModeEx(PPCInterpreter_t* hCPU)
{
GX2ReserveCmdSpace(8+4);
GX2::GX2ReserveCmdSpace(8+4);
uint32 mode = hCPU->gpr[3];
uint32 sqConfig = hCPU->gpr[3] == 0 ? 4 : 0;

View file

@ -616,4 +616,36 @@ namespace stdx
scope_exit& operator=(scope_exit) = delete;
void release() { m_released = true;}
};
// Xcode 16 doesn't have std::atomic_ref support and we provide a minimalist reimplementation as fallback
#ifdef __cpp_lib_atomic_ref
#include <atomic>
template<typename T>
using atomic_ref = std::atomic_ref<T>;
#else
// Minimal fallback for std::atomic_ref<T> on standard libraries that do not ship it.
// Only load() and store() are provided.
//
// The referenced object is accessed by reinterpreting its storage as std::atomic<T>.
// This is only meaningful when std::atomic<T> adds no extra state (lock, padding) on top of T,
// which is enforced at compile time below. The caller remains responsible for:
//  - keeping the referenced object alive for as long as this reference is used
//  - making sure the object is aligned at least as strictly as std::atomic<T> requires
template<typename T>
class atomic_ref
{
	static_assert(std::is_trivially_copyable<T>::value, "atomic_ref requires trivially copyable types");
	// if std::atomic<T> is larger than T, the reinterpret_cast below would access memory outside of the object
	static_assert(sizeof(std::atomic<T>) == sizeof(T), "atomic_ref fallback requires std::atomic<T> to have the same size as T");
public:
	using value_type = T;

	// Binds to obj without copying it; obj must outlive this reference
	explicit atomic_ref(T& obj) noexcept : ptr_(std::addressof(obj)) {}

	// Atomically reads the referenced object using the given memory order
	T load(std::memory_order order = std::memory_order_seq_cst) const noexcept
	{
		return asAtomic()->load(order);
	}

	// Atomically replaces the referenced object with desired using the given memory order
	void store(T desired, std::memory_order order = std::memory_order_seq_cst) const noexcept
	{
		asAtomic()->store(desired, order);
	}
private:
	// View the referenced storage as std::atomic<T> (single place for the cast)
	std::atomic<T>* asAtomic() const noexcept
	{
		return reinterpret_cast<std::atomic<T>*>(ptr_);
	}

	T* ptr_;
};
#endif
}