mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-07-02 13:01:18 +12:00
- GX2 utilizes TCL(.rpl) API for command submission instead of directly writing to an internal GPU fifo - Submission & retire timestamps are correctly implemented as incremental counters - Command buffering behaviour matches console - Fixes race conditions on aarch64
161 lines
5 KiB
C++
161 lines
5 KiB
C++
#include "Cafe/OS/common/OSCommon.h"
|
|
#include "Cafe/OS/libs/TCL/TCL.h"
|
|
|
|
#include "HW/Latte/Core/LattePM4.h"
|
|
|
|
namespace TCL
|
|
{
|
|
// Event signaled by TCLGPUNotifyNewRetirementTimestamp() and waited on by TCLWaitTimestamp().
// Initialized in Initialize() as an auto-mode event in the not-signaled state.
SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
|
|
// Value of the most recently submitted retire marker. Incremented by TCLSubmitRetireMarker()
// before each marker command is written, and reset to zero in Initialize().
uint64 s_currentRetireMarker = 0;
|
|
|
|
struct TCLStatePPC // mapped into PPC space
|
|
{
|
|
uint64be gpuRetireMarker; // written by GPU
|
|
};
|
|
|
|
// The single TCLStatePPC instance; its gpuRetireMarker field is the target of the retire marker
// write requested by TCLSubmitRetireMarker().
SysAllocator<TCLStatePPC> s_tclStatePPC;
|
|
|
|
// called from GPU for timestamp EOP event
|
|
void TCLGPUNotifyNewRetirementTimestamp()
|
|
{
|
|
// gpuRetireMarker is updated via event eop command
|
|
__OSLockScheduler();
|
|
coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
|
|
__OSUnlockScheduler();
|
|
}
|
|
|
|
// Queries a TCL timestamp.
//   id           - which timestamp to read; only TIMESTAMP_LAST_BUFFER_RETIRED is supported
//   timestampOut - receives the timestamp value; it is written unconditionally, so it must not be null
// Always returns 0. For an unsupported id a message is logged and *timestampOut is set to 0.
int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut)
{
	if (id == TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
	{
		// this is the timestamp of the last buffer that was retired by the GPU
		stdx::atomic_ref<uint64be> retireTimestamp(s_tclStatePPC->gpuRetireMarker);
		*timestampOut = retireTimestamp.load();
		return 0;
	}

	cemuLog_log(LogType::Force, "TCLTimestamp(): Unsupported timestamp ID {}", (uint32)id);
	*timestampOut = 0;
	return 0;
}
|
|
|
|
// Blocks the caller until the given timestamp has reached at least waitTs.
//   id      - which timestamp to wait on; only TIMESTAMP_LAST_BUFFER_RETIRED is supported
//   waitTs  - timestamp value to wait for
//   timeout - NOTE(review): currently never read; the wait is unbounded
// Always returns 0. For an unsupported id a message is logged and the call returns immediately.
int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout)
{
	if (id != TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
	{
		cemuLog_log(LogType::Force, "TCLWaitTimestamp(): Unsupported timestamp ID {}", (uint32)id);
		return 0;
	}

	stdx::atomic_ref<uint64be> retireTimestamp(s_tclStatePPC->gpuRetireMarker);
	uint64 retiredTimestamp = retireTimestamp.load();
	while (retiredTimestamp < waitTs)
	{
		// sleep until the retirement event is signaled, then re-check the marker
		coreinit::OSWaitEvent(s_updateRetirementEvent.GetPtr());
		retiredTimestamp = retireTimestamp.load();
	}
	return 0;
}
|
|
|
|
static constexpr uint32 TCL_RING_BUFFER_SIZE = 4096; // in U32s
|
|
|
|
std::atomic<uint32> tclRingBufferA[TCL_RING_BUFFER_SIZE];
|
|
std::atomic<uint32> tclRingBufferA_readIndex{0};
|
|
uint32 tclRingBufferA_writeIndex{0};
|
|
|
|
// GPU code calls this to grab the next command word
|
|
bool TCLGPUReadRBWord(uint32& cmdWord)
|
|
{
|
|
if (tclRingBufferA_readIndex == tclRingBufferA_writeIndex)
|
|
return false;
|
|
cmdWord = tclRingBufferA[tclRingBufferA_readIndex];
|
|
tclRingBufferA_readIndex = (tclRingBufferA_readIndex+1) % TCL_RING_BUFFER_SIZE;
|
|
return true;
|
|
}
|
|
|
|
void TCLWaitForRBSpace(uint32be numU32s)
|
|
{
|
|
while ( true )
|
|
{
|
|
uint32 distance = (tclRingBufferA_readIndex + TCL_RING_BUFFER_SIZE - tclRingBufferA_writeIndex) & (TCL_RING_BUFFER_SIZE - 1);
|
|
if (tclRingBufferA_writeIndex == tclRingBufferA_readIndex) // buffer completely empty
|
|
distance = TCL_RING_BUFFER_SIZE;
|
|
if (distance >= numU32s+1) // assume distance minus one, because we are never allowed to completely wrap around
|
|
break;
|
|
_mm_pause();
|
|
}
|
|
}
|
|
|
|
// this function assumes that TCLWaitForRBSpace was called and that there is enough space
|
|
void TCLWriteCmd(uint32be* cmd, uint32 cmdLen)
|
|
{
|
|
while (cmdLen > 0)
|
|
{
|
|
tclRingBufferA[tclRingBufferA_writeIndex] = *cmd;
|
|
tclRingBufferA_writeIndex++;
|
|
tclRingBufferA_writeIndex &= (TCL_RING_BUFFER_SIZE - 1);
|
|
cmd++;
|
|
cmdLen--;
|
|
}
|
|
}
|
|
|
|
#define EVENT_TYPE_TS 5
|
|
|
|
void TCLSubmitRetireMarker(bool triggerEventInterrupt)
|
|
{
|
|
s_currentRetireMarker++;
|
|
uint32be cmd[6];
|
|
cmd[0] = pm4HeaderType3(IT_EVENT_WRITE_EOP, 5);
|
|
cmd[1] = (4 | (EVENT_TYPE_TS << 8)); // event type (bits 8-15) and event index (bits 0-7).
|
|
cmd[2] = MEMPTR<void>(&s_tclStatePPC->gpuRetireMarker).GetMPTR(); // address lower 32bits + data sel bits
|
|
cmd[3] = 0x40000000; // select 64bit write, lower 16 bits are the upper bits of the address
|
|
if (triggerEventInterrupt)
|
|
cmd[3] |= 0x2000000; // trigger interrupt after value has been written
|
|
cmd[4] = (uint32)s_currentRetireMarker; // data lower 32 bits
|
|
cmd[5] = (uint32)(s_currentRetireMarker>>32); // data higher 32 bits
|
|
TCLWriteCmd(cmd, 6);
|
|
}
|
|
|
|
// Submits a command buffer to the ring buffer, followed by a retire marker.
//   cmd               - command words to submit
//   cmdLen            - number of command words in cmd
//   controlFlags      - submission flags; only submissions with USE_RETIRED_MARKER set are
//                       implemented, NO_MARKER_INTERRUPT suppresses the marker interrupt
//   timestampValueOut - receives the retire marker value assigned to this submission. Not expected
//                       to be null (asserted in debug builds); if it is, the value is not stored
// Blocks until the ring buffer has room for the commands plus the marker. Always returns 0.
int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype<TCLSubmissionFlag>* controlFlags, uint64be* timestampValueOut)
{
	TCLSubmissionFlag flags = *controlFlags;
	cemu_assert_debug(timestampValueOut); // unexpected; release builds skip the write below instead of dereferencing null

	// make sure there is enough space to submit all commands at once
	uint32 totalCommandLength = cmdLen;
	totalCommandLength += 6; // space needed for TCLSubmitRetireMarker

	TCLWaitForRBSpace(totalCommandLength);

	// submit command buffer
	TCLWriteCmd(cmd, cmdLen);

	// create new marker timestamp and tell GPU to write it to our variable after it's done processing the command
	if ((HAS_FLAG(flags, TCLSubmissionFlag::USE_RETIRED_MARKER)))
	{
		TCLSubmitRetireMarker(!HAS_FLAG(flags, TCLSubmissionFlag::NO_MARKER_INTERRUPT));
		if (timestampValueOut)
			*timestampValueOut = s_currentRetireMarker; // incremented before each submit
	}
	else
	{
		cemu_assert_unimplemented();
	}
	return 0;
}
|
|
|
|
void Initialize()
|
|
{
|
|
cafeExportRegister("TCL", TCLSubmitToRing, LogType::Placeholder);
|
|
cafeExportRegister("TCL", TCLTimestamp, LogType::Placeholder);
|
|
cafeExportRegister("TCL", TCLWaitTimestamp, LogType::Placeholder);
|
|
|
|
s_currentRetireMarker = 0;
|
|
s_tclStatePPC->gpuRetireMarker = 0;
|
|
coreinit::OSInitEvent(s_updateRetirementEvent.GetPtr(), coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
|
|
}
|
|
}
|