Cemu/src/Cafe/OS/libs/TCL/TCL.cpp
Exzap 28ea70b6d8 GX2+TCL: Reimplement command buffer submission
- GX2 utilizes TCL(.rpl) API for command submission instead of directly writing to an internal GPU fifo
- Submission & retire timestamps are correctly implemented as incremental counters
- Command buffering behaviour matches console
- Fixes race conditions on aarch64
2025-05-17 21:35:42 +02:00

161 lines
5 KiB
C++

#include "Cafe/OS/common/OSCommon.h"
#include "Cafe/OS/libs/TCL/TCL.h"
#include "HW/Latte/Core/LattePM4.h"
namespace TCL
{
// Event signaled by TCLGPUNotifyNewRetirementTimestamp(); TCLWaitTimestamp() blocks on it between polls of the retire marker
SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
// Value of the most recently queued retire marker; incremented by TCLSubmitRetireMarker() before each marker is written to the ring
uint64 s_currentRetireMarker = 0;
// State block whose guest address is handed to the GPU (see the EVENT_WRITE_EOP packet built in TCLSubmitRetireMarker())
struct TCLStatePPC // mapped into PPC space
{
// Big-endian 64bit retire marker. Read atomically by TCLTimestamp() and TCLWaitTimestamp().
uint64be gpuRetireMarker; // written by GPU
};
// Single instance of the state block; Initialize() resets gpuRetireMarker to 0
SysAllocator<TCLStatePPC> s_tclStatePPC;
// called from GPU for timestamp EOP event
void TCLGPUNotifyNewRetirementTimestamp()
{
// gpuRetireMarker is updated via event eop command
__OSLockScheduler();
coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
__OSUnlockScheduler();
}
// Reads a TCL timestamp counter into *timestampOut.
//   id           - which timestamp to query; only TIMESTAMP_LAST_BUFFER_RETIRED is handled
//   timestampOut - receives the timestamp; set to 0 when the id is not supported
// Always returns 0 (unsupported ids are only reported via the log).
int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut)
{
	if (id == TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
	{
		// this is the timestamp of the last buffer that was retired by the GPU
		// the value is written from the GPU side, so read it atomically
		stdx::atomic_ref<uint64be> retireTimestamp(s_tclStatePPC->gpuRetireMarker);
		*timestampOut = retireTimestamp.load();
		return 0;
	}
	cemuLog_log(LogType::Force, "TCLTimestamp(): Unsupported timestamp ID {}", (uint32)id);
	*timestampOut = 0;
	return 0;
}
// Blocks the calling thread until the requested timestamp has reached waitTs.
//   id      - which timestamp to wait on; only TIMESTAMP_LAST_BUFFER_RETIRED is handled
//   waitTs  - value the retire marker has to reach (>=) before the call returns
//   timeout - currently not read by this implementation; the wait has no upper bound
// Always returns 0. Unsupported ids are logged and return immediately.
int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout)
{
	if (id != TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
	{
		cemuLog_log(LogType::Force, "TCLWaitTimestamp(): Unsupported timestamp ID {}", (uint32)id);
		return 0;
	}

	stdx::atomic_ref<uint64be> retireTimestamp(s_tclStatePPC->gpuRetireMarker);
	for (;;)
	{
		const uint64 retiredTimestamp = retireTimestamp.load();
		if (retiredTimestamp >= waitTs)
			return 0;
		// not there yet; sleep until TCLGPUNotifyNewRetirementTimestamp() signals the event, then check again
		coreinit::OSWaitEvent(s_updateRetirementEvent.GetPtr());
	}
}
static constexpr uint32 TCL_RING_BUFFER_SIZE = 4096; // in U32s
// all index arithmetic below wraps with a bit mask, which is only valid for power-of-two sizes
static_assert((TCL_RING_BUFFER_SIZE & (TCL_RING_BUFFER_SIZE - 1)) == 0, "TCL_RING_BUFFER_SIZE must be a power of two");

// Command ring shared between the submitting side (TCLWriteCmd) and the GPU side (TCLGPUReadRBWord).
// readIndex == writeIndex means the ring is empty; one slot always stays unused so that a full ring
// can be told apart from an empty one.
std::atomic<uint32> tclRingBufferA[TCL_RING_BUFFER_SIZE];
std::atomic<uint32> tclRingBufferA_readIndex{0}; // advanced by TCLGPUReadRBWord
// advanced by TCLWriteCmd. Atomic because TCLGPUReadRBWord reads it while TCLWriteCmd may be updating it;
// a plain integer here would be a data race
std::atomic<uint32> tclRingBufferA_writeIndex{0};

// GPU code calls this to grab the next command word
// Returns false (cmdWord untouched) if the ring is currently empty, otherwise stores the word in cmdWord,
// consumes it and returns true.
bool TCLGPUReadRBWord(uint32& cmdWord)
{
	const uint32 readIndex = tclRingBufferA_readIndex.load();
	if (readIndex == tclRingBufferA_writeIndex.load())
		return false;
	cmdWord = tclRingBufferA[readIndex].load();
	// release the slot only after the word has been copied out
	tclRingBufferA_readIndex.store((readIndex + 1) & (TCL_RING_BUFFER_SIZE - 1));
	return true;
}

// Spins until the ring has room for numU32s words (plus the one slot that must stay free).
// NOTE: a request of TCL_RING_BUFFER_SIZE words or more can never be satisfied and will spin forever.
void TCLWaitForRBSpace(uint32be numU32s)
{
	const uint32 requiredWords = numU32s + 1; // plus one, because we are never allowed to completely wrap around
	while ( true )
	{
		// take one snapshot of each index so the distance is computed from a consistent pair
		const uint32 readIndex = tclRingBufferA_readIndex.load();
		const uint32 writeIndex = tclRingBufferA_writeIndex.load();
		uint32 distance = (readIndex + TCL_RING_BUFFER_SIZE - writeIndex) & (TCL_RING_BUFFER_SIZE - 1);
		if (writeIndex == readIndex) // buffer completely empty
			distance = TCL_RING_BUFFER_SIZE;
		if (distance >= requiredWords)
			break;
		_mm_pause();
	}
}

// Appends cmdLen words from cmd to the ring.
// this function assumes that TCLWaitForRBSpace was called and that there is enough space
void TCLWriteCmd(uint32be* cmd, uint32 cmdLen)
{
	uint32 writeIndex = tclRingBufferA_writeIndex.load();
	for (uint32 i = 0; i < cmdLen; i++)
	{
		tclRingBufferA[writeIndex].store(cmd[i]);
		writeIndex = (writeIndex + 1) & (TCL_RING_BUFFER_SIZE - 1);
		// publish the already-wrapped index in a single store, after the word itself is in place,
		// so the reader never observes an out-of-range index or a slot that has not been filled yet
		tclRingBufferA_writeIndex.store(writeIndex);
	}
}
#define EVENT_TYPE_TS 5
void TCLSubmitRetireMarker(bool triggerEventInterrupt)
{
s_currentRetireMarker++;
uint32be cmd[6];
cmd[0] = pm4HeaderType3(IT_EVENT_WRITE_EOP, 5);
cmd[1] = (4 | (EVENT_TYPE_TS << 8)); // event type (bits 8-15) and event index (bits 0-7).
cmd[2] = MEMPTR<void>(&s_tclStatePPC->gpuRetireMarker).GetMPTR(); // address lower 32bits + data sel bits
cmd[3] = 0x40000000; // select 64bit write, lower 16 bits are the upper bits of the address
if (triggerEventInterrupt)
cmd[3] |= 0x2000000; // trigger interrupt after value has been written
cmd[4] = (uint32)s_currentRetireMarker; // data lower 32 bits
cmd[5] = (uint32)(s_currentRetireMarker>>32); // data higher 32 bits
TCLWriteCmd(cmd, 6);
}
// Submits a command buffer to the ring, followed by a retire marker.
//   cmd               - command words to submit
//   cmdLen            - number of words in cmd
//   controlFlags      - submission flags; USE_RETIRED_MARKER selects the (only implemented) marker path,
//                       NO_MARKER_INTERRUPT suppresses the interrupt request on the marker
//   timestampValueOut - receives the marker value assigned to this submission; skipped if null
// Always returns 0. Blocks (spinning) until the ring has room for the commands plus the marker.
int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype<TCLSubmissionFlag>* controlFlags, uint64be* timestampValueOut)
{
	TCLSubmissionFlag flags = *controlFlags;
	cemu_assert_debug(timestampValueOut); // not expected to be null; flagged in debug builds, tolerated below
	// make sure there is enough space to submit all commands at once
	uint32 totalCommandLength = cmdLen;
	totalCommandLength += 6; // space needed for TCLSubmitRetireMarker
	TCLWaitForRBSpace(totalCommandLength);
	// submit command buffer
	TCLWriteCmd(cmd, cmdLen);
	// create new marker timestamp and tell GPU to write it to our variable after its done processing the command
	if (HAS_FLAG(flags, TCLSubmissionFlag::USE_RETIRED_MARKER))
	{
		TCLSubmitRetireMarker(!HAS_FLAG(flags, TCLSubmissionFlag::NO_MARKER_INTERRUPT));
		// the marker is incremented before each submit, so this is the value belonging to this submission
		// guard the write so a null output pointer does not crash the host
		if (timestampValueOut)
			*timestampValueOut = s_currentRetireMarker;
	}
	else
	{
		cemu_assert_unimplemented();
	}
	return 0;
}
void Initialize()
{
cafeExportRegister("TCL", TCLSubmitToRing, LogType::Placeholder);
cafeExportRegister("TCL", TCLTimestamp, LogType::Placeholder);
cafeExportRegister("TCL", TCLWaitTimestamp, LogType::Placeholder);
s_currentRetireMarker = 0;
s_tclStatePPC->gpuRetireMarker = 0;
coreinit::OSInitEvent(s_updateRetirementEvent.GetPtr(), coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
}
}