mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-07-02 04:51:19 +12:00
GX2+TCL: Reimplement command buffer submission
- GX2 utilizes TCL(.rpl) API for command submission instead of directly writing to an internal GPU fifo - Submission & retire timestamps are correctly implemented as incremental counters - Command buffering behaviour matches console - Fixes race conditions on aarch64
This commit is contained in:
parent
96765e4ac6
commit
28ea70b6d8
21 changed files with 761 additions and 472 deletions
|
@ -47,8 +47,6 @@ struct LatteGPUState_t
|
|||
gx2GPUSharedArea_t* sharedArea; // quick reference to shared area
|
||||
MPTR sharedAreaAddr;
|
||||
// other
|
||||
// todo: Currently we have the command buffer logic implemented as a FIFO ringbuffer. On real HW it's handled as a series of command buffers that are pushed individually.
|
||||
std::atomic<uint64> lastSubmittedCommandBufferTimestamp;
|
||||
uint32 gx2InitCalled; // incremented every time GX2Init() is called
|
||||
// OpenGL control
|
||||
uint32 glVendor; // GLVENDOR_*
|
||||
|
@ -75,8 +73,6 @@ struct LatteGPUState_t
|
|||
|
||||
extern LatteGPUState_t LatteGPUState;
|
||||
|
||||
extern uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
|
||||
|
||||
// texture
|
||||
|
||||
#include "Cafe/HW/Latte/Core/LatteTexture.h"
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "Cafe/HW/Latte/Core/LattePM4.h"
|
||||
|
||||
#include "Cafe/OS/libs/coreinit/coreinit_Time.h"
|
||||
#include "Cafe/OS/libs/TCL/TCL.h" // TCL currently handles the GPU command ringbuffer
|
||||
|
||||
#include "Cafe/CafeSystem.h"
|
||||
|
||||
|
@ -28,11 +29,6 @@ typedef uint32be* LatteCMDPtr;
|
|||
#define LatteReadCMD() ((uint32)*(cmd++))
|
||||
#define LatteSkipCMD(_nWords) cmd += (_nWords)
|
||||
|
||||
uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list)
|
||||
uint8* gx2CPParserDisplayListPtr;
|
||||
uint8* gx2CPParserDisplayListStart; // used for debugging
|
||||
uint8* gx2CPParserDisplayListEnd;
|
||||
|
||||
void LatteThread_HandleOSScreen();
|
||||
|
||||
void LatteThread_Exit();
|
||||
|
@ -155,16 +151,12 @@ void LatteCP_signalEnterWait()
|
|||
*/
|
||||
uint32 LatteCP_readU32Deprc()
|
||||
{
|
||||
uint32 v;
|
||||
uint8* gxRingBufferWritePtr;
|
||||
sint32 readDistance;
|
||||
// no display list active
|
||||
while (true)
|
||||
{
|
||||
gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
|
||||
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
|
||||
if (readDistance != 0)
|
||||
break;
|
||||
uint32 cmdWord;
|
||||
if ( TCL::TCLGPUReadRBWord(cmdWord) )
|
||||
return cmdWord;
|
||||
|
||||
g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
|
||||
performanceMonitor.gpuTime_idleTime.beginMeasuring();
|
||||
|
@ -175,56 +167,8 @@ uint32 LatteCP_readU32Deprc()
|
|||
}
|
||||
LatteThread_HandleOSScreen(); // check if new frame was presented via OSScreen API
|
||||
|
||||
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
|
||||
if (readDistance != 0)
|
||||
break;
|
||||
if (Latte_GetStopSignal())
|
||||
LatteThread_Exit();
|
||||
|
||||
// still no command data available, do some other tasks
|
||||
LatteTiming_HandleTimedVsync();
|
||||
LatteAsyncCommands_checkAndExecute();
|
||||
std::this_thread::yield();
|
||||
performanceMonitor.gpuTime_idleTime.endMeasuring();
|
||||
}
|
||||
v = *(uint32*)gxRingBufferReadPtr;
|
||||
gxRingBufferReadPtr += 4;
|
||||
#ifdef CEMU_DEBUG_ASSERT
|
||||
if (v == 0xcdcdcdcd)
|
||||
assert_dbg();
|
||||
#endif
|
||||
v = _swapEndianU32(v);
|
||||
return v;
|
||||
}
|
||||
|
||||
void LatteCP_waitForNWords(uint32 numWords)
|
||||
{
|
||||
uint8* gxRingBufferWritePtr;
|
||||
sint32 readDistance;
|
||||
bool isFlushed = false;
|
||||
sint32 waitDistance = numWords * sizeof(uint32be);
|
||||
// no display list active
|
||||
while (true)
|
||||
{
|
||||
gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex];
|
||||
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
|
||||
if (readDistance < 0)
|
||||
return; // wrap around means there is at least one full command queued after this
|
||||
if (readDistance >= waitDistance)
|
||||
break;
|
||||
g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands
|
||||
performanceMonitor.gpuTime_idleTime.beginMeasuring();
|
||||
// no command data available, spin in a busy loop for a while then check again
|
||||
for (sint32 busy = 0; busy < 80; busy++)
|
||||
{
|
||||
_mm_pause();
|
||||
}
|
||||
readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr);
|
||||
if (readDistance < 0)
|
||||
return; // wrap around means there is at least one full command queued after this
|
||||
if (readDistance >= waitDistance)
|
||||
break;
|
||||
|
||||
if ( TCL::TCLGPUReadRBWord(cmdWord) )
|
||||
return cmdWord;
|
||||
if (Latte_GetStopSignal())
|
||||
LatteThread_Exit();
|
||||
|
||||
|
@ -234,6 +178,7 @@ void LatteCP_waitForNWords(uint32 numWords)
|
|||
std::this_thread::yield();
|
||||
performanceMonitor.gpuTime_idleTime.endMeasuring();
|
||||
}
|
||||
UNREACHABLE;
|
||||
}
|
||||
|
||||
template<uint32 readU32()>
|
||||
|
@ -270,21 +215,23 @@ void LatteCP_itIndirectBufferDepr(LatteCMDPtr cmd, uint32 nWords)
|
|||
cemu_assert_debug(nWords == 3);
|
||||
uint32 physicalAddress = LatteReadCMD();
|
||||
uint32 physicalAddressHigh = LatteReadCMD(); // unused
|
||||
uint32 sizeInDWords = LatteReadCMD();
|
||||
uint32 displayListSize = sizeInDWords * 4;
|
||||
DrawPassContext drawPassCtx;
|
||||
uint32 sizeInU32s = LatteReadCMD();
|
||||
|
||||
#ifdef LATTE_CP_LOGGING
|
||||
if (GetAsyncKeyState('A'))
|
||||
LatteCP_DebugPrintCmdBuffer(MEMPTR<uint32be>(physicalAddress), displayListSize);
|
||||
#endif
|
||||
|
||||
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
|
||||
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
|
||||
if (sizeInU32s > 0)
|
||||
{
|
||||
DrawPassContext drawPassCtx;
|
||||
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
|
||||
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInU32s);
|
||||
|
||||
LatteCP_processCommandBuffer(drawPassCtx);
|
||||
if (drawPassCtx.isWithinDrawPass())
|
||||
drawPassCtx.endDrawPass();
|
||||
LatteCP_processCommandBuffer(drawPassCtx);
|
||||
if (drawPassCtx.isWithinDrawPass())
|
||||
drawPassCtx.endDrawPass();
|
||||
}
|
||||
}
|
||||
|
||||
// pushes the command buffer to the stack
|
||||
|
@ -294,11 +241,12 @@ void LatteCP_itIndirectBuffer(LatteCMDPtr cmd, uint32 nWords, DrawPassContext& d
|
|||
uint32 physicalAddress = LatteReadCMD();
|
||||
uint32 physicalAddressHigh = LatteReadCMD(); // unused
|
||||
uint32 sizeInDWords = LatteReadCMD();
|
||||
uint32 displayListSize = sizeInDWords * 4;
|
||||
cemu_assert_debug(displayListSize >= 4);
|
||||
|
||||
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
|
||||
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
|
||||
if (sizeInDWords > 0)
|
||||
{
|
||||
uint32 displayListSize = sizeInDWords * 4;
|
||||
uint32be* buf = MEMPTR<uint32be>(physicalAddress).GetPtr();
|
||||
drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords);
|
||||
}
|
||||
}
|
||||
|
||||
LatteCMDPtr LatteCP_itStreamoutBufferUpdate(LatteCMDPtr cmd, uint32 nWords)
|
||||
|
@ -565,26 +513,55 @@ LatteCMDPtr LatteCP_itMemWrite(LatteCMDPtr cmd, uint32 nWords)
|
|||
if (word1 == 0x40000)
|
||||
{
|
||||
// write U32
|
||||
*memPtr = word2;
|
||||
stdx::atomic_ref<uint32be> atomicRef(*memPtr);
|
||||
atomicRef.store(word2);
|
||||
}
|
||||
else if (word1 == 0x00000)
|
||||
{
|
||||
// write U64 (as two U32)
|
||||
// note: The U32s are swapped
|
||||
memPtr[0] = word2;
|
||||
memPtr[1] = word3;
|
||||
// write U64
|
||||
// note: The U32s are swapped here, but needs verification. Also, it seems like the two U32 halves are written independently and the U64 as a whole is not atomic -> investigate
|
||||
stdx::atomic_ref<uint64be> atomicRef(*(uint64be*)memPtr);
|
||||
atomicRef.store(((uint64le)word2 << 32) | word3);
|
||||
}
|
||||
else if (word1 == 0x20000)
|
||||
{
|
||||
// write U64 (little endian)
|
||||
memPtr[0] = _swapEndianU32(word2);
|
||||
memPtr[1] = _swapEndianU32(word3);
|
||||
stdx::atomic_ref<uint64le> atomicRef(*(uint64le*)memPtr);
|
||||
atomicRef.store(((uint64le)word3 << 32) | word2);
|
||||
}
|
||||
else
|
||||
cemu_assert_unimplemented();
|
||||
return cmd;
|
||||
}
|
||||
|
||||
// Handles the IT_EVENT_WRITE_EOP (end of pipe) packet.
// Consumes the five payload words of the packet and, for the only supported variant
// (event word 0x504 with the 64bit-write select bit set), atomically stores the 64bit
// payload value at the given physical address. Afterwards TCL is told that a new
// retirement timestamp may be available.
// cmd: points at the first payload word; nWords: payload length, expected to be 5.
// Returns the command pointer advanced past the payload.
LatteCMDPtr LatteCP_itEventWriteEOP(LatteCMDPtr cmd, uint32 nWords)
{
	cemu_assert_debug(nWords == 5);
	const uint32 eventInfo = LatteReadCMD();    // event type / event index
	const uint32 dstPhysAddr = LatteReadCMD();  // physical address that receives the value
	const uint32 writeControl = LatteReadCMD(); // data select and interrupt bits
	const uint32 valueLow = LatteReadCMD();     // value low bits
	const uint32 valueHigh = LatteReadCMD();    // value high bits

	cemu_assert_debug(writeControl == 0x40000000 || writeControl == 0x42000000);

	// todo - figure out the flags
	const bool isSupportedWrite = (eventInfo == 0x504) && ((writeControl & 0x40000000) != 0);
	if (!isSupportedWrite)
	{
		cemu_assert_unimplemented();
	}
	else
	{
		// written as a single atomic 64bit store so a reader never observes a half-updated value
		uint64be* dst = (uint64be*)memory_getPointerFromPhysicalOffset(dstPhysAddr);
		stdx::atomic_ref<uint64be> dstRef(*dst);
		const uint64 value = ((uint64)valueHigh << 32) | valueLow;
		dstRef.store(value);
	}

	const bool triggerInterrupt = (writeControl & 0x2000000) != 0;
	if (triggerInterrupt)
	{
		// todo - timestamp interrupt
	}

	// signalled for every EOP packet, including the unimplemented variants
	TCL::TCLGPUNotifyNewRetirementTimestamp();
	return cmd;
}
|
||||
|
||||
LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
|
||||
{
|
||||
|
@ -783,16 +760,6 @@ LatteCMDPtr LatteCP_itDrawImmediate(LatteCMDPtr cmd, uint32 nWords, DrawPassCont
|
|||
|
||||
drawPassCtx.executeDraw(count, false, _tempIndexArrayMPTR);
|
||||
return cmd;
|
||||
|
||||
}
|
||||
|
||||
LatteCMDPtr LatteCP_itHLEFifoWrapAround(LatteCMDPtr cmd, uint32 nWords)
|
||||
{
|
||||
cemu_assert_debug(nWords == 1);
|
||||
uint32 unused = LatteReadCMD();
|
||||
gxRingBufferReadPtr = gx2WriteGatherPipe.gxRingBuffer;
|
||||
cmd = (LatteCMDPtr)gxRingBufferReadPtr;
|
||||
return cmd;
|
||||
}
|
||||
|
||||
LatteCMDPtr LatteCP_itHLESampleTimer(LatteCMDPtr cmd, uint32 nWords)
|
||||
|
@ -819,16 +786,6 @@ LatteCMDPtr LatteCP_itHLESpecialState(LatteCMDPtr cmd, uint32 nWords)
|
|||
return cmd;
|
||||
}
|
||||
|
||||
LatteCMDPtr LatteCP_itHLESetRetirementTimestamp(LatteCMDPtr cmd, uint32 nWords)
|
||||
{
|
||||
cemu_assert_debug(nWords == 2);
|
||||
uint32 timestampHigh = (uint32)LatteReadCMD();
|
||||
uint32 timestampLow = (uint32)LatteReadCMD();
|
||||
uint64 timestamp = ((uint64)timestampHigh << 32ULL) | (uint64)timestampLow;
|
||||
GX2::__GX2NotifyNewRetirementTimestamp(timestamp);
|
||||
return cmd;
|
||||
}
|
||||
|
||||
LatteCMDPtr LatteCP_itHLEBeginOcclusionQuery(LatteCMDPtr cmd, uint32 nWords)
|
||||
{
|
||||
cemu_assert_debug(nWords == 1);
|
||||
|
@ -1145,9 +1102,10 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
|
|||
LatteCMDPtr cmd, cmdStart, cmdEnd;
|
||||
if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd))
|
||||
break;
|
||||
uint32 itHeader;
|
||||
while (cmd < cmdEnd)
|
||||
{
|
||||
uint32 itHeader = LatteReadCMD();
|
||||
itHeader = LatteReadCMD();
|
||||
uint32 itHeaderType = (itHeader >> 30) & 3;
|
||||
if (itHeaderType == 3)
|
||||
{
|
||||
|
@ -1361,11 +1319,6 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
|
|||
LatteCP_itHLEEndOcclusionQuery(cmdData, nWords);
|
||||
break;
|
||||
}
|
||||
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
|
||||
{
|
||||
LatteCP_itHLESetRetirementTimestamp(cmdData, nWords);
|
||||
break;
|
||||
}
|
||||
case IT_HLE_BOTTOM_OF_PIPE_CB:
|
||||
{
|
||||
LatteCP_itHLEBottomOfPipeCB(cmdData, nWords);
|
||||
|
@ -1421,6 +1374,7 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
|
|||
void LatteCP_ProcessRingbuffer()
|
||||
{
|
||||
sint32 timerRecheck = 0; // estimates how much CP processing time has elapsed based on the executed commands, if the value exceeds CP_TIMER_RECHECK then _handleTimers() is called
|
||||
uint32be tmpBuffer[128];
|
||||
while (true)
|
||||
{
|
||||
uint32 itHeader = LatteCP_readU32Deprc();
|
||||
|
@ -1429,10 +1383,13 @@ void LatteCP_ProcessRingbuffer()
|
|||
{
|
||||
uint32 itCode = (itHeader >> 8) & 0xFF;
|
||||
uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1;
|
||||
LatteCP_waitForNWords(nWords);
|
||||
LatteCMDPtr cmd = (LatteCMDPtr)gxRingBufferReadPtr;
|
||||
uint8* cmdEnd = gxRingBufferReadPtr + nWords * 4;
|
||||
gxRingBufferReadPtr = cmdEnd;
|
||||
cemu_assert(nWords < 128);
|
||||
for (sint32 i=0; i<nWords; i++)
|
||||
{
|
||||
uint32 word = LatteCP_readU32Deprc();
|
||||
tmpBuffer[i] = word;
|
||||
}
|
||||
LatteCMDPtr cmd = (LatteCMDPtr)tmpBuffer;
|
||||
switch (itCode)
|
||||
{
|
||||
case IT_SURFACE_SYNC:
|
||||
|
@ -1599,6 +1556,11 @@ void LatteCP_ProcessRingbuffer()
|
|||
timerRecheck += CP_TIMER_RECHECK / 512;
|
||||
break;
|
||||
}
|
||||
case IT_EVENT_WRITE_EOP:
|
||||
{
|
||||
LatteCP_itEventWriteEOP(cmd, nWords);
|
||||
break;
|
||||
}
|
||||
case IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER:
|
||||
{
|
||||
LatteCP_itHLECopyColorBufferToScanBuffer(cmd, nWords);
|
||||
|
@ -1637,12 +1599,6 @@ void LatteCP_ProcessRingbuffer()
|
|||
timerRecheck += CP_TIMER_RECHECK / 128;
|
||||
break;
|
||||
}
|
||||
case IT_HLE_FIFO_WRAP_AROUND:
|
||||
{
|
||||
LatteCP_itHLEFifoWrapAround(cmd, nWords);
|
||||
timerRecheck += CP_TIMER_RECHECK / 512;
|
||||
break;
|
||||
}
|
||||
case IT_HLE_SAMPLE_TIMER:
|
||||
{
|
||||
LatteCP_itHLESampleTimer(cmd, nWords);
|
||||
|
@ -1667,12 +1623,6 @@ void LatteCP_ProcessRingbuffer()
|
|||
timerRecheck += CP_TIMER_RECHECK / 512;
|
||||
break;
|
||||
}
|
||||
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
|
||||
{
|
||||
LatteCP_itHLESetRetirementTimestamp(cmd, nWords);
|
||||
timerRecheck += CP_TIMER_RECHECK / 512;
|
||||
break;
|
||||
}
|
||||
case IT_HLE_BOTTOM_OF_PIPE_CB:
|
||||
{
|
||||
LatteCP_itHLEBottomOfPipeCB(cmd, nWords);
|
||||
|
@ -1933,11 +1883,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
|
|||
cemuLog_log(LogType::Force, "{} IT_HLE_COPY_SURFACE_NEW", strPrefix);
|
||||
break;
|
||||
}
|
||||
case IT_HLE_FIFO_WRAP_AROUND:
|
||||
{
|
||||
cemuLog_log(LogType::Force, "{} IT_HLE_FIFO_WRAP_AROUND", strPrefix);
|
||||
break;
|
||||
}
|
||||
case IT_HLE_SAMPLE_TIMER:
|
||||
{
|
||||
cemuLog_log(LogType::Force, "{} IT_HLE_SAMPLE_TIMER", strPrefix);
|
||||
|
@ -1958,11 +1903,6 @@ void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size)
|
|||
cemuLog_log(LogType::Force, "{} IT_HLE_END_OCCLUSION_QUERY", strPrefix);
|
||||
break;
|
||||
}
|
||||
case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP:
|
||||
{
|
||||
cemuLog_log(LogType::Force, "{} IT_HLE_SET_CB_RETIREMENT_TIMESTAMP", strPrefix);
|
||||
break;
|
||||
}
|
||||
case IT_HLE_BOTTOM_OF_PIPE_CB:
|
||||
{
|
||||
cemuLog_log(LogType::Force, "{} IT_HLE_BOTTOM_OF_PIPE_CB", strPrefix);
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#define IT_MEM_WRITE 0x3D
|
||||
#define IT_SURFACE_SYNC 0x43
|
||||
#define IT_EVENT_WRITE 0x46
|
||||
#define IT_EVENT_WRITE_EOP 0x47 // end of pipe
|
||||
|
||||
#define IT_LOAD_CONFIG_REG 0x60
|
||||
#define IT_LOAD_CONTEXT_REG 0x61
|
||||
|
@ -47,14 +48,12 @@
|
|||
#define IT_HLE_WAIT_FOR_FLIP 0xF1
|
||||
#define IT_HLE_BOTTOM_OF_PIPE_CB 0xF2
|
||||
#define IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER 0xF3
|
||||
#define IT_HLE_FIFO_WRAP_AROUND 0xF4
|
||||
#define IT_HLE_CLEAR_COLOR_DEPTH_STENCIL 0xF5
|
||||
#define IT_HLE_SAMPLE_TIMER 0xF7
|
||||
#define IT_HLE_TRIGGER_SCANBUFFER_SWAP 0xF8
|
||||
#define IT_HLE_SPECIAL_STATE 0xF9
|
||||
#define IT_HLE_BEGIN_OCCLUSION_QUERY 0xFA
|
||||
#define IT_HLE_END_OCCLUSION_QUERY 0xFB
|
||||
#define IT_HLE_SET_CB_RETIREMENT_TIMESTAMP 0xFD
|
||||
|
||||
#define pm4HeaderType3(__itCode, __dataDWordCount) (0xC0000000|((uint32)(__itCode)<<8)|((uint32)((__dataDWordCount)-1)<<16))
|
||||
#define pm4HeaderType2Filler() (0x80000000)
|
||||
|
|
|
@ -207,7 +207,6 @@ int Latte_ThreadEntry()
|
|||
if (Latte_GetStopSignal())
|
||||
LatteThread_Exit();
|
||||
}
|
||||
gxRingBufferReadPtr = gx2WriteGatherPipe.gxRingBuffer;
|
||||
LatteCP_ProcessRingbuffer();
|
||||
cemu_assert_debug(false); // should never reach
|
||||
return 0;
|
||||
|
|
|
@ -1,28 +1,161 @@
|
|||
#include "Cafe/OS/common/OSCommon.h"
|
||||
#include "Cafe/OS/libs/TCL/TCL.h"
|
||||
|
||||
#include "HW/Latte/Core/LattePM4.h"
|
||||
|
||||
namespace TCL
|
||||
{
|
||||
SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
|
||||
uint64 s_currentRetireMarker = 0;
|
||||
|
||||
enum class TCL_SUBMISSION_FLAG : uint32
|
||||
struct TCLStatePPC // mapped into PPC space
|
||||
{
|
||||
SURFACE_SYNC = 0x400000, // submit surface sync packet before cmd
|
||||
TRIGGER_INTERRUPT = 0x200000, // probably
|
||||
UKN_20000000 = 0x20000000,
|
||||
uint64be gpuRetireMarker; // written by GPU
|
||||
};
|
||||
|
||||
int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, uint32be* controlFlags, uint64* submissionTimestamp)
|
||||
SysAllocator<TCLStatePPC> s_tclStatePPC;
|
||||
|
||||
// called from GPU for timestamp EOP event
|
||||
void TCLGPUNotifyNewRetirementTimestamp()
|
||||
{
|
||||
// todo - figure out all the bits of *controlFlags
|
||||
// if submissionTimestamp != nullptr then set it to the timestamp of the submission. Note: We should make sure that uint64's are written atomically by the GPU command processor
|
||||
// gpuRetireMarker is updated via event eop command
|
||||
__OSLockScheduler();
|
||||
coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
|
||||
__OSUnlockScheduler();
|
||||
}
|
||||
|
||||
cemu_assert_debug(false);
|
||||
// Queries one of the TCL timestamps.
// id: which timestamp to read; only TIMESTAMP_LAST_BUFFER_RETIRED is supported.
// timestampOut: receives the timestamp value. For unsupported ids a message is logged
// and zero is written instead.
// Always returns 0.
int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut)
{
	if (id == TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
	{
		// this is the timestamp of the last buffer that was retired by the GPU
		// read through atomic_ref so the 64bit value is loaded in one piece
		stdx::atomic_ref<uint64be> retireTimestamp(s_tclStatePPC->gpuRetireMarker);
		*timestampOut = retireTimestamp.load();
		return 0;
	}
	cemuLog_log(LogType::Force, "TCLTimestamp(): Unsupported timestamp ID {}", (uint32)id);
	*timestampOut = 0;
	return 0;
}
|
||||
|
||||
// Blocks the calling thread until the given timestamp has reached waitTs.
// id: which timestamp to wait on; only TIMESTAMP_LAST_BUFFER_RETIRED is supported,
//     any other id is logged and the call returns immediately.
// waitTs: value the retire marker has to reach (>=) before the call returns.
// timeout: currently ignored, the wait is unbounded.
// Always returns 0.
int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout)
{
	if (id != TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED)
	{
		cemuLog_log(LogType::Force, "TCLWaitTimestamp(): Unsupported timestamp ID {}", (uint32)id);
		return 0;
	}

	stdx::atomic_ref<uint64be> retireMarker(s_tclStatePPC->gpuRetireMarker);
	for (;;)
	{
		const uint64 retiredTs = retireMarker.load();
		if (retiredTs >= waitTs)
			return 0;
		// sleep until TCLGPUNotifyNewRetirementTimestamp signals the event, then re-check the marker
		coreinit::OSWaitEvent(s_updateRetirementEvent.GetPtr());
	}
}
|
||||
|
||||
static constexpr uint32 TCL_RING_BUFFER_SIZE = 4096; // in U32s
|
||||
|
||||
std::atomic<uint32> tclRingBufferA[TCL_RING_BUFFER_SIZE];
|
||||
std::atomic<uint32> tclRingBufferA_readIndex{0};
|
||||
uint32 tclRingBufferA_writeIndex{0};
|
||||
|
||||
// GPU code calls this to grab the next command word
|
||||
bool TCLGPUReadRBWord(uint32& cmdWord)
|
||||
{
|
||||
if (tclRingBufferA_readIndex == tclRingBufferA_writeIndex)
|
||||
return false;
|
||||
cmdWord = tclRingBufferA[tclRingBufferA_readIndex];
|
||||
tclRingBufferA_readIndex = (tclRingBufferA_readIndex+1) % TCL_RING_BUFFER_SIZE;
|
||||
return true;
|
||||
}
|
||||
|
||||
void TCLWaitForRBSpace(uint32be numU32s)
|
||||
{
|
||||
while ( true )
|
||||
{
|
||||
uint32 distance = (tclRingBufferA_readIndex + TCL_RING_BUFFER_SIZE - tclRingBufferA_writeIndex) & (TCL_RING_BUFFER_SIZE - 1);
|
||||
if (tclRingBufferA_writeIndex == tclRingBufferA_readIndex) // buffer completely empty
|
||||
distance = TCL_RING_BUFFER_SIZE;
|
||||
if (distance >= numU32s+1) // assume distance minus one, because we are never allowed to completely wrap around
|
||||
break;
|
||||
_mm_pause();
|
||||
}
|
||||
}
|
||||
|
||||
// this function assumes that TCLWaitForRBSpace was called and that there is enough space
|
||||
void TCLWriteCmd(uint32be* cmd, uint32 cmdLen)
|
||||
{
|
||||
while (cmdLen > 0)
|
||||
{
|
||||
tclRingBufferA[tclRingBufferA_writeIndex] = *cmd;
|
||||
tclRingBufferA_writeIndex++;
|
||||
tclRingBufferA_writeIndex &= (TCL_RING_BUFFER_SIZE - 1);
|
||||
cmd++;
|
||||
cmdLen--;
|
||||
}
|
||||
}
|
||||
|
||||
#define EVENT_TYPE_TS 5
|
||||
|
||||
void TCLSubmitRetireMarker(bool triggerEventInterrupt)
|
||||
{
|
||||
s_currentRetireMarker++;
|
||||
uint32be cmd[6];
|
||||
cmd[0] = pm4HeaderType3(IT_EVENT_WRITE_EOP, 5);
|
||||
cmd[1] = (4 | (EVENT_TYPE_TS << 8)); // event type (bits 8-15) and event index (bits 0-7).
|
||||
cmd[2] = MEMPTR<void>(&s_tclStatePPC->gpuRetireMarker).GetMPTR(); // address lower 32bits + data sel bits
|
||||
cmd[3] = 0x40000000; // select 64bit write, lower 16 bits are the upper bits of the address
|
||||
if (triggerEventInterrupt)
|
||||
cmd[3] |= 0x2000000; // trigger interrupt after value has been written
|
||||
cmd[4] = (uint32)s_currentRetireMarker; // data lower 32 bits
|
||||
cmd[5] = (uint32)(s_currentRetireMarker>>32); // data higher 32 bits
|
||||
TCLWriteCmd(cmd, 6);
|
||||
}
|
||||
|
||||
// Submits a command sequence to the GPU ring buffer.
// cmd / cmdLen: command words to submit and their count (in U32s).
// controlFlags: submission flags; only submissions with USE_RETIRED_MARKER set are implemented.
// timestampValueOut: optional. If not null, receives the retire marker value assigned to this submission.
// Blocks (spinning) until the ring buffer has room for the commands plus the retire marker packet.
// Always returns 0.
int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype<TCLSubmissionFlag>* controlFlags, uint64be* timestampValueOut)
{
	TCLSubmissionFlag flags = *controlFlags;

	// make sure there is enough space to submit all commands at once
	uint32 totalCommandLength = cmdLen;
	totalCommandLength += 6; // space needed for TCLSubmitRetireMarker

	TCLWaitForRBSpace(totalCommandLength);

	// submit command buffer
	TCLWriteCmd(cmd, cmdLen);

	// create new marker timestamp and tell GPU to write it to our variable after its done processing the command
	if ((HAS_FLAG(flags, TCLSubmissionFlag::USE_RETIRED_MARKER)))
	{
		TCLSubmitRetireMarker(!HAS_FLAG(flags, TCLSubmissionFlag::NO_MARKER_INTERRUPT));
		// the caller may not be interested in the timestamp, previously a null pointer was dereferenced here in release builds
		if (timestampValueOut)
			*timestampValueOut = s_currentRetireMarker; // incremented before each submit
	}
	else
	{
		cemu_assert_unimplemented();
	}
	return 0;
}
|
||||
|
||||
void Initialize()
|
||||
{
|
||||
cafeExportRegister("TCL", TCLSubmitToRing, LogType::Placeholder);
|
||||
cafeExportRegister("TCL", TCLTimestamp, LogType::Placeholder);
|
||||
cafeExportRegister("TCL", TCLWaitTimestamp, LogType::Placeholder);
|
||||
|
||||
s_currentRetireMarker = 0;
|
||||
s_tclStatePPC->gpuRetireMarker = 0;
|
||||
coreinit::OSInitEvent(s_updateRetirementEvent.GetPtr(), coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,4 +1,25 @@
|
|||
namespace TCL
|
||||
{
|
||||
enum class TCLTimestampId
|
||||
{
|
||||
TIMESTAMP_LAST_BUFFER_RETIRED = 1,
|
||||
};
|
||||
|
||||
enum class TCLSubmissionFlag : uint32
|
||||
{
|
||||
SURFACE_SYNC = 0x400000, // submit surface sync packet before cmd
|
||||
NO_MARKER_INTERRUPT = 0x200000,
|
||||
USE_RETIRED_MARKER = 0x20000000, // Controls whether the timer is updated before or after (retired) the cmd. Also controls which timestamp is returned for the submission. Before and after using separate counters
|
||||
};
|
||||
|
||||
int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut);
|
||||
int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout);
|
||||
int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype<TCLSubmissionFlag>* controlFlags, uint64be* timestampValueOut);
|
||||
|
||||
// called from Latte code
|
||||
bool TCLGPUReadRBWord(uint32& cmdWord);
|
||||
void TCLGPUNotifyNewRetirementTimestamp();
|
||||
|
||||
void Initialize();
|
||||
}
|
||||
}
|
||||
ENABLE_BITMASK_OPERATORS(TCL::TCLSubmissionFlag);
|
||||
|
|
|
@ -59,7 +59,7 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU)
|
|||
if (isPokken)
|
||||
GX2::GX2DrawDone();
|
||||
|
||||
GX2ReserveCmdSpace(5+2);
|
||||
GX2::GX2ReserveCmdSpace(5+2);
|
||||
|
||||
uint64 tick64 = PPCInterpreter_getMainCoreCycleCounter() / 20ULL;
|
||||
lastSwapTime = tick64;
|
||||
|
@ -86,24 +86,16 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU)
|
|||
GX2::GX2WaitForFlip();
|
||||
}
|
||||
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
osLib_returnFromFunction(hCPU, 0);
|
||||
}
|
||||
|
||||
void gx2Export_GX2CopyColorBufferToScanBuffer(PPCInterpreter_t* hCPU)
|
||||
{
|
||||
cemuLog_log(LogType::GX2, "GX2CopyColorBufferToScanBuffer(0x{:08x},{})", hCPU->gpr[3], hCPU->gpr[4]);
|
||||
GX2ReserveCmdSpace(5);
|
||||
GX2::GX2ReserveCmdSpace(10);
|
||||
|
||||
// todo: proper implementation
|
||||
|
||||
// hack: Avoid running to far ahead of GPU. Normally this would be guaranteed by the circular buffer model, which we currently dont fully emulate
|
||||
if(GX2::GX2WriteGather_getReadWriteDistance() > 32*1024*1024 )
|
||||
{
|
||||
debug_printf("Waiting for GPU to catch up...\n");
|
||||
PPCInterpreter_relinquishTimeslice(); // release current thread
|
||||
return;
|
||||
}
|
||||
GX2ColorBuffer* colorBuffer = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
|
||||
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER, 9));
|
||||
|
@ -309,81 +301,6 @@ void gx2Export_GX2SetSemaphore(PPCInterpreter_t* hCPU)
|
|||
osLib_returnFromFunction(hCPU, 0);
|
||||
}
|
||||
|
||||
void gx2Export_GX2Flush(PPCInterpreter_t* hCPU)
|
||||
{
|
||||
cemuLog_log(LogType::GX2, "GX2Flush()");
|
||||
_GX2SubmitToTCL();
|
||||
osLib_returnFromFunction(hCPU, 0);
|
||||
}
|
||||
|
||||
uint8* _GX2LastFlushPtr[PPC_CORE_COUNT] = {NULL};
|
||||
|
||||
uint64 _prevReturnedGPUTime = 0;
|
||||
|
||||
uint64 Latte_GetTime()
|
||||
{
|
||||
uint64 gpuTime = coreinit::OSGetSystemTime();
|
||||
gpuTime *= 20000ULL;
|
||||
if (gpuTime <= _prevReturnedGPUTime)
|
||||
gpuTime = _prevReturnedGPUTime + 1; // avoid ever returning identical timestamps
|
||||
_prevReturnedGPUTime = gpuTime;
|
||||
return gpuTime;
|
||||
}
|
||||
|
||||
void _GX2SubmitToTCL()
|
||||
{
|
||||
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
|
||||
// do nothing if called from non-main GX2 core
|
||||
if (GX2::sGX2MainCoreIndex != coreIndex)
|
||||
{
|
||||
cemuLog_logDebug(LogType::Force, "_GX2SubmitToTCL() called on non-main GX2 core");
|
||||
return;
|
||||
}
|
||||
if( gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL )
|
||||
return; // quit if in display list
|
||||
_GX2LastFlushPtr[coreIndex] = (gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
|
||||
// update last submitted CB timestamp
|
||||
uint64 commandBufferTimestamp = Latte_GetTime();
|
||||
LatteGPUState.lastSubmittedCommandBufferTimestamp.store(commandBufferTimestamp);
|
||||
cemuLog_log(LogType::GX2, "Submitting GX2 command buffer with timestamp {:016x}", commandBufferTimestamp);
|
||||
// submit HLE packet to write retirement timestamp
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SET_CB_RETIREMENT_TIMESTAMP, 2));
|
||||
gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp>>32ULL));
|
||||
gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp&0xFFFFFFFFULL));
|
||||
}
|
||||
|
||||
uint32 _GX2GetUnflushedBytes(uint32 coreIndex)
|
||||
{
|
||||
uint32 unflushedBytes = 0;
|
||||
if (_GX2LastFlushPtr[coreIndex] != NULL)
|
||||
{
|
||||
if (_GX2LastFlushPtr[coreIndex] > gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex])
|
||||
unflushedBytes = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer + 4); // this isn't 100% correct since we ignore the bytes between the last flush address and the start of the wrap around
|
||||
else
|
||||
unflushedBytes = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - _GX2LastFlushPtr[coreIndex]);
|
||||
}
|
||||
else
|
||||
unflushedBytes = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer);
|
||||
return unflushedBytes;
|
||||
}
|
||||
|
||||
/*
|
||||
* Guarantees that the requested amount of space is available on the current command buffer
|
||||
* If the space is not available, the current command buffer is pushed to the GPU and a new one is allocated
|
||||
*/
|
||||
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32)
|
||||
{
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
// if we are in a display list then do nothing
|
||||
if( gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL )
|
||||
return;
|
||||
uint32 unflushedBytes = _GX2GetUnflushedBytes(coreIndex);
|
||||
if( unflushedBytes >= 0x1000 )
|
||||
{
|
||||
_GX2SubmitToTCL();
|
||||
}
|
||||
}
|
||||
|
||||
void gx2_load()
|
||||
{
|
||||
osLib_addFunction("gx2", "GX2GetContextStateDisplayList", gx2Export_GX2GetContextStateDisplayList);
|
||||
|
@ -445,10 +362,6 @@ void gx2_load()
|
|||
// semaphore
|
||||
osLib_addFunction("gx2", "GX2SetSemaphore", gx2Export_GX2SetSemaphore);
|
||||
|
||||
// command buffer
|
||||
osLib_addFunction("gx2", "GX2Flush", gx2Export_GX2Flush);
|
||||
|
||||
GX2::GX2Init_writeGather();
|
||||
GX2::GX2MemInit();
|
||||
GX2::GX2ResourceInit();
|
||||
GX2::GX2CommandInit();
|
||||
|
|
|
@ -67,10 +67,4 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU);
|
|||
|
||||
void gx2Export_GX2SetDefaultState(PPCInterpreter_t* hCPU);
|
||||
void gx2Export_GX2SetupContextStateEx(PPCInterpreter_t* hCPU);
|
||||
void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU);
|
||||
|
||||
// command buffer
|
||||
|
||||
uint32 _GX2GetUnflushedBytes(uint32 coreIndex);
|
||||
void _GX2SubmitToTCL();
|
||||
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32);
|
||||
void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU);
|
|
@ -132,7 +132,6 @@ namespace GX2
|
|||
depthFirstSlice = _swapEndianU32(depthBuffer->viewFirstSlice);
|
||||
depthNumSlices = _swapEndianU32(depthBuffer->viewNumSlices);
|
||||
}
|
||||
|
||||
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_CLEAR_COLOR_DEPTH_STENCIL, 23),
|
||||
hleClearFlags,
|
||||
colorPhysAddr,
|
||||
|
|
|
@ -4,178 +4,397 @@
|
|||
#include "Cafe/HW/Latte/Core/LattePM4.h"
|
||||
#include "Cafe/OS/libs/coreinit/coreinit.h"
|
||||
#include "Cafe/OS/libs/coreinit/coreinit_Thread.h"
|
||||
#include "Cafe/OS/libs/TCL/TCL.h"
|
||||
#include "Cafe/HW/Latte/ISA/RegDefines.h"
|
||||
#include "GX2.h"
|
||||
#include "GX2_Command.h"
|
||||
#include "GX2_Shader.h"
|
||||
#include "GX2_Misc.h"
|
||||
#include "OS/libs/coreinit/coreinit_MEM.h"
|
||||
|
||||
extern uint8* gxRingBufferReadPtr;
|
||||
|
||||
GX2WriteGatherPipeState gx2WriteGatherPipe = { 0 };
|
||||
namespace GX2
|
||||
{
|
||||
GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT];
|
||||
}
|
||||
|
||||
void gx2WriteGather_submitU32AsBE(uint32 v)
|
||||
{
|
||||
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
|
||||
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
|
||||
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
|
||||
return;
|
||||
*(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = _swapEndianU32(v);
|
||||
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4;
|
||||
*(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = _swapEndianU32(v);
|
||||
GX2::s_perCoreCBState[coreIndex].currentWritePtr++;
|
||||
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
|
||||
}
|
||||
|
||||
void gx2WriteGather_submitU32AsLE(uint32 v)
|
||||
{
|
||||
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
|
||||
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
|
||||
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
|
||||
return;
|
||||
*(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = v;
|
||||
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4;
|
||||
*(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = v;
|
||||
GX2::s_perCoreCBState[coreIndex].currentWritePtr++;
|
||||
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
|
||||
}
|
||||
|
||||
void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues)
|
||||
{
|
||||
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
|
||||
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL)
|
||||
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
|
||||
return;
|
||||
memcpy_dwords((*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]), v, numValues);
|
||||
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4 * numValues;
|
||||
memcpy_dwords(GX2::s_perCoreCBState[coreIndex].currentWritePtr, v, numValues);
|
||||
GX2::s_perCoreCBState[coreIndex].currentWritePtr += numValues;
|
||||
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
|
||||
}
|
||||
|
||||
namespace GX2
|
||||
{
|
||||
sint32 gx2WriteGatherCurrentMainCoreIndex = -1;
|
||||
bool gx2WriteGatherInited = false;
|
||||
|
||||
void GX2WriteGather_ResetToDefaultState()
|
||||
struct GX2CommandState // mapped to PPC space since the GPU writes here
|
||||
{
|
||||
gx2WriteGatherCurrentMainCoreIndex = -1;
|
||||
gx2WriteGatherInited = false;
|
||||
}
|
||||
// command pool
|
||||
MEMPTR<uint32be> commandPoolBase;
|
||||
uint32 commandPoolSizeInU32s;
|
||||
MEMPTR<uint32be> gpuCommandReadPtr;
|
||||
// timestamp
|
||||
uint64be lastSubmissionTime;
|
||||
};
|
||||
|
||||
void GX2Init_writeGather() // init write gather, make current core
|
||||
SysAllocator<GX2CommandState> s_commandState;
|
||||
GX2PerCoreCBState s_mainCoreLastCommandState;
|
||||
bool s_cbBufferIsInternallyAllocated;
|
||||
|
||||
void GX2Command_StartNewCommandBuffer(uint32 numU32s);
|
||||
|
||||
// called from GX2Init. Allocates a 4MB memory chunk from which command buffers are suballocated from
|
||||
void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize)
|
||||
{
|
||||
if (gx2WriteGatherPipe.gxRingBuffer == NULL)
|
||||
gx2WriteGatherPipe.gxRingBuffer = (uint8*)malloc(GX2_COMMAND_RING_BUFFER_SIZE);
|
||||
if (gx2WriteGatherCurrentMainCoreIndex == sGX2MainCoreIndex)
|
||||
return; // write gather already configured for same core
|
||||
for (sint32 i = 0; i < PPC_CORE_COUNT; i++)
|
||||
cemu_assert_debug(!s_commandState->commandPoolBase); // should not be allocated already
|
||||
// setup command buffer pool. If not provided allocate a 4MB or custom size buffer
|
||||
uint32 poolSize = bufferSize ? bufferSize : 0x400000; // 4MB (can be overwritten by custom GX2Init parameters?)
|
||||
if (bufferBase)
|
||||
{
|
||||
if (i == sGX2MainCoreIndex)
|
||||
s_commandState->commandPoolBase = (uint32be*)bufferBase;
|
||||
s_cbBufferIsInternallyAllocated = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
s_commandState->commandPoolBase = (uint32be*)coreinit::_weak_MEMAllocFromDefaultHeapEx(poolSize, 0x100);
|
||||
s_cbBufferIsInternallyAllocated = true;
|
||||
}
|
||||
if (!s_commandState->commandPoolBase)
|
||||
{
|
||||
cemuLog_log(LogType::Force, "GX2: Failed to allocate command buffer pool");
|
||||
}
|
||||
s_commandState->commandPoolSizeInU32s = poolSize / sizeof(uint32be);
|
||||
s_commandState->gpuCommandReadPtr = s_commandState->commandPoolBase;
|
||||
// init per-core command buffer state
|
||||
for (uint32 i = 0; i < Espresso::CORE_COUNT; i++)
|
||||
{
|
||||
s_perCoreCBState[i].bufferPtr = nullptr;
|
||||
s_perCoreCBState[i].bufferSizeInU32s = 0;
|
||||
s_perCoreCBState[i].currentWritePtr = nullptr;
|
||||
}
|
||||
// start first command buffer for main core
|
||||
GX2Command_StartNewCommandBuffer(0x100);
|
||||
}
|
||||
|
||||
void GX2Shutdown_commandBufferPool()
|
||||
{
|
||||
if (!s_commandState->commandPoolBase)
|
||||
return;
|
||||
if (s_cbBufferIsInternallyAllocated)
|
||||
coreinit::_weak_MEMFreeToDefaultHeap(s_commandState->commandPoolBase.GetPtr());
|
||||
s_cbBufferIsInternallyAllocated = false;
|
||||
s_commandState->commandPoolBase = nullptr;
|
||||
s_commandState->commandPoolSizeInU32s = 0;
|
||||
s_commandState->gpuCommandReadPtr = nullptr;
|
||||
}
|
||||
|
||||
// current position of where the GPU is reading from. Updated via a memory write command submitted to the GPU
|
||||
uint32 GX2Command_GetPoolGPUReadIndex()
|
||||
{
|
||||
stdx::atomic_ref<MEMPTR<uint32be>> _readPtr(s_commandState->gpuCommandReadPtr);
|
||||
MEMPTR<uint32be> currentReadPtr = _readPtr.load();
|
||||
cemu_assert_debug(currentReadPtr);
|
||||
return (uint32)(currentReadPtr.GetPtr() - s_commandState->commandPoolBase.GetPtr());
|
||||
}
|
||||
|
||||
void GX2Command_WaitForNextBufferRetired()
|
||||
{
|
||||
uint64 retiredTimeStamp = GX2GetRetiredTimeStamp();
|
||||
retiredTimeStamp += 1;
|
||||
// but cant be higher than the submission timestamp
|
||||
stdx::atomic_ref<uint64be> _lastSubmissionTime(s_commandState->lastSubmissionTime);
|
||||
uint64 submissionTimeStamp = _lastSubmissionTime.load();
|
||||
if (retiredTimeStamp > submissionTimeStamp)
|
||||
retiredTimeStamp = submissionTimeStamp;
|
||||
GX2WaitTimeStamp(retiredTimeStamp);
|
||||
}
|
||||
|
||||
// Makes the given buffer the active write target of the calling core.
// buffer:        start of the buffer; also becomes the initial write position
// sizeInU32s:    capacity of the buffer in u32 words
// isDisplayList: whether the buffer is a display list (true) or a regular command buffer (false)
void GX2Command_SetupCoreCommandBuffer(uint32be* buffer, uint32 sizeInU32s, bool isDisplayList)
{
	GX2PerCoreCBState& cbState = s_perCoreCBState[coreinit::OSGetCoreId()];
	cbState.isDisplayList = isDisplayList;
	cbState.bufferSizeInU32s = sizeInU32s;
	// writing starts at the beginning of the buffer
	cbState.bufferPtr = buffer;
	cbState.currentWritePtr = buffer;
}
|
||||
|
||||
void GX2Command_StartNewCommandBuffer(uint32 numU32s)
|
||||
{
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
auto& coreCBState = s_perCoreCBState[coreIndex];
|
||||
numU32s = std::max<uint32>(numU32s, 0x100);
|
||||
// grab space from command buffer pool and if necessary wait for it
|
||||
uint32be* bufferPtr = nullptr;
|
||||
uint32 bufferSizeInU32s = 0;
|
||||
uint32 readIndex;
|
||||
while (true)
|
||||
{
|
||||
// try to grab buffer data from first available spot:
|
||||
// 1. At the current write location up to the end of the buffer (avoiding an overlap with the read location)
|
||||
// 2. From the start of the buffer up to the read location
|
||||
readIndex = GX2Command_GetPoolGPUReadIndex();
|
||||
uint32be* nextWritePos = coreCBState.bufferPtr ? coreCBState.bufferPtr + coreCBState.bufferSizeInU32s : s_commandState->commandPoolBase.GetPtr();
|
||||
uint32 writeIndex = nextWritePos - s_commandState->commandPoolBase;
|
||||
uint32 poolSizeInU32s = s_commandState->commandPoolSizeInU32s;
|
||||
// readIndex == writeIndex can mean either buffer full or buffer empty
|
||||
// we could use GX2GetRetiredTimeStamp() == GX2GetLastSubmittedTimeStamp() to determine if the buffer is truly empty
|
||||
// but this can have false negatives since the last submission timestamp is updated independently of the read index
|
||||
// so instead we just avoid ever filling the buffer completely
|
||||
cemu_assert_debug(readIndex < poolSizeInU32s);
|
||||
cemu_assert_debug(writeIndex < poolSizeInU32s);
|
||||
if (writeIndex < readIndex)
|
||||
{
|
||||
gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = gx2WriteGatherPipe.gxRingBuffer;
|
||||
gx2WriteGatherPipe.writeGatherPtrWrite[i] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[i];
|
||||
// writeIndex has wrapped around
|
||||
uint32 wordsAvailable = readIndex - writeIndex;
|
||||
if (wordsAvailable > 0)
|
||||
wordsAvailable--; // avoid writeIndex becoming equal to readIndex
|
||||
if (wordsAvailable >= numU32s)
|
||||
{
|
||||
bufferPtr = s_commandState->commandPoolBase + writeIndex;
|
||||
bufferSizeInU32s = wordsAvailable;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = NULL;
|
||||
gx2WriteGatherPipe.writeGatherPtrWrite[i] = NULL;
|
||||
uint32 wordsAvailable = poolSizeInU32s - writeIndex;
|
||||
if (wordsAvailable > 0)
|
||||
wordsAvailable--; // avoid writeIndex becoming equal to readIndex
|
||||
if (wordsAvailable >= numU32s)
|
||||
{
|
||||
bufferPtr = nextWritePos;
|
||||
bufferSizeInU32s = wordsAvailable;
|
||||
break;
|
||||
}
|
||||
// not enough space at end of buffer, try to grab from the beginning of the buffer
|
||||
wordsAvailable = readIndex;
|
||||
if (wordsAvailable > 0)
|
||||
wordsAvailable--; // avoid writeIndex becoming equal to readIndex
|
||||
if (wordsAvailable >= numU32s)
|
||||
{
|
||||
bufferPtr = s_commandState->commandPoolBase;
|
||||
bufferSizeInU32s = wordsAvailable;
|
||||
break;
|
||||
}
|
||||
}
|
||||
gx2WriteGatherPipe.displayListStart[i] = MPTR_NULL;
|
||||
gx2WriteGatherPipe.writeGatherPtrDisplayList[i] = NULL;
|
||||
gx2WriteGatherPipe.displayListMaxSize[i] = 0;
|
||||
GX2Command_WaitForNextBufferRetired();
|
||||
}
|
||||
cemu_assert_debug(bufferPtr);
|
||||
bufferSizeInU32s = std::min<uint32>(numU32s, 0x20000); // size cap
|
||||
#ifdef CEMU_DEBUG_ASSERT
|
||||
uint32 newWriteIndex = ((bufferPtr - s_commandState->commandPoolBase) + bufferSizeInU32s) % s_commandState->commandPoolSizeInU32s;
|
||||
cemu_assert_debug(newWriteIndex != readIndex);
|
||||
#endif
|
||||
// setup buffer and make it the current write gather target
|
||||
cemu_assert_debug(bufferPtr >= s_commandState->commandPoolBase && (bufferPtr + bufferSizeInU32s) <= s_commandState->commandPoolBase + s_commandState->commandPoolSizeInU32s);
|
||||
GX2Command_SetupCoreCommandBuffer(bufferPtr, bufferSizeInU32s, false);
|
||||
}
|
||||
|
||||
// Submits a command buffer to the ring via TCL::TCLSubmitToRing.
// buffer / sizeInU32s:       the commands to execute, referenced through an IT_INDIRECT_BUFFER_PRIV packet
// completionGPUReadPointer:  optional. When set, an IT_MEM_WRITE packet is appended which stores the
//                            address right behind the submitted commands into this location
// triggerMarkerInterrupt:    when false, the NO_MARKER_INTERRUPT submission flag is set
// The submission timestamp is received in s_commandState->lastSubmissionTime.
void GX2Command_SubmitCommandBuffer(uint32be* buffer, uint32 sizeInU32s, MEMPTR<uint32be>* completionGPUReadPointer, bool triggerMarkerInterrupt)
{
	uint32be cmd[10];
	uint32 cmdLen = 0;
	// indirect buffer packet pointing at the command buffer
	cmd[cmdLen++] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3);
	cmd[cmdLen++] = memory_virtualToPhysical(MEMPTR<void>(buffer).GetMPTR());
	cmd[cmdLen++] = 0x00000000; // address high bits
	cmd[cmdLen++] = sizeInU32s;
	if (completionGPUReadPointer)
	{
		// append command to update completionGPUReadPointer after the GPU is done with the command buffer
		cmd[cmdLen++] = pm4HeaderType3(IT_MEM_WRITE, 4);
		cmd[cmdLen++] = memory_virtualToPhysical(MEMPTR<void>(completionGPUReadPointer).GetMPTR()) | 2;
		cmd[cmdLen++] = 0x40000;
		cmd[cmdLen++] = MEMPTR<void>(buffer + sizeInU32s).GetMPTR(); // value to write
		cmd[cmdLen++] = 0x00000000;
	}

	betype<TCL::TCLSubmissionFlag> submissionFlags{};
	submissionFlags |= TCL::TCLSubmissionFlag::USE_RETIRED_MARKER;
	if (!triggerMarkerInterrupt)
	{
		submissionFlags |= TCL::TCLSubmissionFlag::NO_MARKER_INTERRUPT;
	}

	TCL::TCLSubmitToRing(cmd, cmdLen, &submissionFlags, &s_commandState->lastSubmissionTime);
}
|
||||
|
||||
void GX2Command_PadCurrentBuffer()
|
||||
{
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
auto& coreCBState = s_perCoreCBState[coreIndex];
|
||||
if (!coreCBState.currentWritePtr)
|
||||
return;
|
||||
uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
|
||||
if ((writeDistance&7) != 0)
|
||||
{
|
||||
uint32 distanceToPad = 0x8 - (writeDistance & 0x7);
|
||||
while (distanceToPad)
|
||||
{
|
||||
*coreCBState.currentWritePtr = pm4HeaderType2Filler();
|
||||
coreCBState.currentWritePtr++;
|
||||
distanceToPad--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Flushes the command buffer of the calling core.
// numU32sForNextBuffer:   number of u32 words requested for the buffer that is started afterwards
// triggerMarkerInterrupt: forwarded to GX2Command_SubmitCommandBuffer
//
// - If the core is currently writing a display list nothing is submitted, the call is only logged once.
// - If the current command buffer contains data it is padded to 32 byte alignment, submitted and
//   a new command buffer is started.
// - If the current command buffer is empty there is nothing to submit. A new buffer is only started
//   when the existing (empty) one is too small to hold numU32sForNextBuffer words.
void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt)
{
	uint32 coreIndex = coreinit::OSGetCoreId();
	auto& coreCBState = s_perCoreCBState[coreIndex];
	if (coreCBState.isDisplayList)
	{
		// display list
		cemu_assert_debug((uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) < coreCBState.bufferSizeInU32s);
		cemuLog_logDebugOnce(LogType::Force, "GX2 flush called on display list");
		return;
	}
	// command buffer
	if (coreCBState.currentWritePtr != coreCBState.bufferPtr)
	{
		// pad the command buffer to 32 byte alignment
		GX2Command_PadCurrentBuffer();
		// submit it to the GPU
		uint32 bufferLength = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
		cemu_assert_debug(bufferLength <= coreCBState.bufferSizeInU32s);
		GX2Command_SubmitCommandBuffer(coreCBState.bufferPtr, bufferLength, &s_commandState->gpuCommandReadPtr, triggerMarkerInterrupt);
		GX2Command_StartNewCommandBuffer(numU32sForNextBuffer);
		return;
	}
	// current buffer is empty so we dont need to queue it
	// But it has to be replaced if it cannot hold the requested amount of words, otherwise callers
	// which reserved space (GX2ReserveCmdSpace) would write past the end of the buffer.
	// The comparison has to be against the capacity of the current buffer, not the size of the whole pool.
	// Cores without an active buffer (bufferPtr == nullptr) never grab pool space here.
	if (coreCBState.bufferPtr != nullptr && numU32sForNextBuffer > coreCBState.bufferSizeInU32s)
		GX2Command_StartNewCommandBuffer(numU32sForNextBuffer);
}
|
||||
|
||||
void GX2Flush()
|
||||
{
|
||||
GX2Command_Flush(256, true);
|
||||
}
|
||||
|
||||
// Returns the timestamp of the most recent submission, read atomically from
// s_commandState->lastSubmissionTime.
uint64 GX2GetLastSubmittedTimeStamp()
{
	return stdx::atomic_ref<uint64be>(s_commandState->lastSubmissionTime).load();
}
|
||||
|
||||
// Returns the TIMESTAMP_LAST_BUFFER_RETIRED timestamp as reported by TCL.
uint64 GX2GetRetiredTimeStamp()
{
	uint64be retiredTimestamp = 0;
	TCL::TCLTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, &retiredTimestamp);
	return static_cast<uint64>(retiredTimestamp);
}
|
||||
|
||||
// Waits via TCL until TIMESTAMP_LAST_BUFFER_RETIRED reaches tsWait.
// Always returns true; the result of the TCL wait is not inspected.
bool GX2WaitTimeStamp(uint64 tsWait)
{
	// handle GPU timeout here? But for now we timeout after 60 seconds
	const auto timeoutTicks = Espresso::TIMER_CLOCK * 60;
	TCL::TCLWaitTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, tsWait, timeoutTicks);
	return true;
}
|
||||
|
||||
/*
|
||||
* Guarantees that the requested amount of space is available on the current command buffer
|
||||
* If the space is not available, the current command buffer is pushed to the GPU and a new one is allocated
|
||||
*/
|
||||
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32)
|
||||
{
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
auto& coreCBState = s_perCoreCBState[coreIndex];
|
||||
if (coreCBState.currentWritePtr == nullptr)
|
||||
return;
|
||||
uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr);
|
||||
if (writeDistance + reservedFreeSpaceInU32 > coreCBState.bufferSizeInU32s)
|
||||
{
|
||||
GX2Command_Flush(reservedFreeSpaceInU32, true);
|
||||
}
|
||||
gx2WriteGatherCurrentMainCoreIndex = sGX2MainCoreIndex;
|
||||
gx2WriteGatherInited = true;
|
||||
}
|
||||
|
||||
void GX2WriteGather_beginDisplayList(PPCInterpreter_t* hCPU, MPTR buffer, uint32 maxSize)
|
||||
{
|
||||
uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU);
|
||||
gx2WriteGatherPipe.displayListStart[coreIndex] = buffer;
|
||||
gx2WriteGatherPipe.displayListMaxSize[coreIndex] = maxSize;
|
||||
// set new write gather ptr
|
||||
gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex] = memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]);
|
||||
gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex];
|
||||
if (coreIndex == sGX2MainCoreIndex)
|
||||
{
|
||||
GX2Command_PadCurrentBuffer();
|
||||
cemu_assert_debug(!s_perCoreCBState[coreIndex].isDisplayList);
|
||||
s_mainCoreLastCommandState = s_perCoreCBState[coreIndex];
|
||||
}
|
||||
GX2Command_SetupCoreCommandBuffer(MEMPTR<uint32be>(buffer), maxSize/4, true);
|
||||
}
|
||||
|
||||
uint32 GX2WriteGather_getDisplayListWriteDistance(sint32 coreIndex)
|
||||
{
|
||||
return (uint32)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] - memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]));
|
||||
}
|
||||
|
||||
uint32 GX2WriteGather_getFifoWriteDistance(uint32 coreIndex)
|
||||
{
|
||||
uint32 writeDistance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer);
|
||||
return writeDistance;
|
||||
auto& coreCBState = s_perCoreCBState[coreIndex];
|
||||
cemu_assert_debug(coreCBState.isDisplayList);
|
||||
if (coreCBState.currentWritePtr == nullptr)
|
||||
return 0;
|
||||
return (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) * 4;
|
||||
}
|
||||
|
||||
uint32 GX2WriteGather_endDisplayList(PPCInterpreter_t* hCPU, MPTR buffer)
|
||||
{
|
||||
uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU);
|
||||
if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
auto& coreCBState = s_perCoreCBState[coreIndex];
|
||||
GX2Command_PadCurrentBuffer();
|
||||
uint32 finalWriteIndex = coreCBState.currentWritePtr - coreCBState.bufferPtr;
|
||||
cemu_assert_debug(finalWriteIndex <= coreCBState.bufferSizeInU32s);
|
||||
// if we are on the main GX2 core then restore the GPU command buffer
|
||||
if (coreIndex == sGX2MainCoreIndex)
|
||||
{
|
||||
uint32 currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex);
|
||||
// pad to 32 byte
|
||||
if (gx2WriteGatherPipe.displayListMaxSize[coreIndex] >= ((gx2WriteGatherPipe.displayListMaxSize[coreIndex] + 0x1F) & ~0x1F))
|
||||
{
|
||||
while ((currentWriteSize & 0x1F) != 0)
|
||||
{
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType2Filler());
|
||||
currentWriteSize += 4;
|
||||
}
|
||||
}
|
||||
// get size of written data
|
||||
currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex);
|
||||
// disable current display list and restore write gather ptr
|
||||
gx2WriteGatherPipe.displayListStart[coreIndex] = MPTR_NULL;
|
||||
if (sGX2MainCoreIndex == coreIndex)
|
||||
gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex];
|
||||
else
|
||||
gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = NULL;
|
||||
// return size of (written) display list
|
||||
return currentWriteSize;
|
||||
coreCBState = s_mainCoreLastCommandState;
|
||||
}
|
||||
else
|
||||
{
|
||||
// no active display list
|
||||
// return a size of 0
|
||||
return 0;
|
||||
coreCBState.bufferPtr = nullptr;
|
||||
coreCBState.currentWritePtr = nullptr;
|
||||
coreCBState.bufferSizeInU32s = 0;
|
||||
coreCBState.isDisplayList = false;
|
||||
}
|
||||
return finalWriteIndex * 4;
|
||||
}
|
||||
|
||||
bool GX2GetCurrentDisplayList(betype<MPTR>* displayListAddr, uint32be* displayListSize)
|
||||
bool GX2GetCurrentDisplayList(MEMPTR<uint32be>* displayListAddr, uint32be* displayListSize)
|
||||
{
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
if (gx2WriteGatherPipe.displayListStart[coreIndex] == MPTR_NULL)
|
||||
auto& coreCBState = s_perCoreCBState[coreIndex];
|
||||
if (!coreCBState.isDisplayList)
|
||||
return false;
|
||||
|
||||
if (displayListAddr)
|
||||
*displayListAddr = gx2WriteGatherPipe.displayListStart[coreIndex];
|
||||
*displayListAddr = coreCBState.bufferPtr;
|
||||
if (displayListSize)
|
||||
*displayListSize = gx2WriteGatherPipe.displayListMaxSize[coreIndex];
|
||||
|
||||
*displayListSize = coreCBState.bufferSizeInU32s * sizeof(uint32be);
|
||||
return true;
|
||||
}
|
||||
|
||||
// returns true if we are writing to a display list
|
||||
bool GX2GetDisplayListWriteStatus()
|
||||
{
|
||||
// returns true if we are writing to a display list
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
return gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL;
|
||||
}
|
||||
|
||||
uint32 GX2WriteGather_getReadWriteDistance()
|
||||
{
|
||||
uint32 coreIndex = sGX2MainCoreIndex;
|
||||
uint32 writeDistance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] + GX2_COMMAND_RING_BUFFER_SIZE - gxRingBufferReadPtr);
|
||||
writeDistance %= GX2_COMMAND_RING_BUFFER_SIZE;
|
||||
return writeDistance;
|
||||
}
|
||||
|
||||
void GX2WriteGather_checkAndInsertWrapAroundMark()
|
||||
{
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
if (coreIndex != sGX2MainCoreIndex) // only if main gx2 core
|
||||
return;
|
||||
if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL)
|
||||
return;
|
||||
uint32 writeDistance = GX2WriteGather_getFifoWriteDistance(coreIndex);
|
||||
if (writeDistance >= (GX2_COMMAND_RING_BUFFER_SIZE * 3 / 5))
|
||||
{
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_FIFO_WRAP_AROUND, 1));
|
||||
gx2WriteGather_submitU32AsBE(0); // empty word since we can't send commands with zero data words
|
||||
gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] = gx2WriteGatherPipe.gxRingBuffer;
|
||||
}
|
||||
return s_perCoreCBState[coreIndex].isDisplayList;
|
||||
}
|
||||
|
||||
void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size)
|
||||
|
@ -204,28 +423,23 @@ namespace GX2
|
|||
memory_virtualToPhysical(addr),
|
||||
0, // high address bits
|
||||
size / 4);
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
}
|
||||
|
||||
void GX2DirectCallDisplayList(void* addr, uint32 size)
|
||||
{
|
||||
// this API submits to TCL directly and bypasses write-gatherer
|
||||
// its basically a way to manually submit a command buffer to the GPU
|
||||
// as such it also affects the submission and retire timestamps
|
||||
|
||||
uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance());
|
||||
cemu_assert_debug(coreIndex == sGX2MainCoreIndex);
|
||||
coreIndex = sGX2MainCoreIndex; // always submit to main queue which is owned by GX2 main core (TCLSubmitToRing does not need this workaround)
|
||||
|
||||
uint32be* cmdStream = (uint32be*)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]);
|
||||
cmdStream[0] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3);
|
||||
cmdStream[1] = memory_virtualToPhysical(MEMPTR<void>(addr).GetMPTR());
|
||||
cmdStream[2] = 0;
|
||||
cmdStream[3] = size / 4;
|
||||
gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] += 16;
|
||||
|
||||
// update submission timestamp and retired timestamp
|
||||
_GX2SubmitToTCL();
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
if (coreIndex != sGX2MainCoreIndex)
|
||||
{
|
||||
cemuLog_logDebugOnce(LogType::Force, "GX2DirectCallDisplayList() called on non-main GX2 core");
|
||||
}
|
||||
if (!s_perCoreCBState[coreIndex].isDisplayList)
|
||||
{
|
||||
// make sure any preceeding commands are submitted first
|
||||
GX2Command_Flush(0x100, false);
|
||||
}
|
||||
GX2Command_SubmitCommandBuffer(static_cast<uint32be*>(addr), size / 4, nullptr, false);
|
||||
}
|
||||
|
||||
void GX2CopyDisplayList(MEMPTR<uint32be*> addr, uint32 size)
|
||||
|
@ -288,6 +502,12 @@ namespace GX2
|
|||
|
||||
void GX2CommandInit()
|
||||
{
|
||||
cafeExportRegister("gx2", GX2Flush, LogType::GX2);
|
||||
|
||||
cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2);
|
||||
|
||||
cafeExportRegister("gx2", GX2BeginDisplayList, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2BeginDisplayListEx, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2EndDisplayList, LogType::GX2);
|
||||
|
@ -295,7 +515,6 @@ namespace GX2
|
|||
cafeExportRegister("gx2", GX2GetCurrentDisplayList, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2GetDisplayListWriteStatus, LogType::GX2);
|
||||
|
||||
|
||||
cafeExportRegister("gx2", GX2CallDisplayList, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2DirectCallDisplayList, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2CopyDisplayList, LogType::GX2);
|
||||
|
@ -305,7 +524,10 @@ namespace GX2
|
|||
|
||||
void GX2CommandResetToDefaultState()
|
||||
{
|
||||
GX2WriteGather_ResetToDefaultState();
|
||||
s_commandState->commandPoolBase = nullptr;
|
||||
s_commandState->commandPoolSizeInU32s = 0;
|
||||
s_commandState->gpuCommandReadPtr = nullptr;
|
||||
s_cbBufferIsInternallyAllocated = false;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,21 +2,19 @@
|
|||
#include "Cafe/HW/Latte/ISA/LatteReg.h"
|
||||
#include "Cafe/HW/Espresso/Const.h"
|
||||
|
||||
struct GX2WriteGatherPipeState
|
||||
namespace GX2
|
||||
{
|
||||
uint8* gxRingBuffer;
|
||||
// each core has it's own write gatherer and display list state (writing)
|
||||
uint8* writeGatherPtrGxBuffer[Espresso::CORE_COUNT];
|
||||
uint8** writeGatherPtrWrite[Espresso::CORE_COUNT];
|
||||
uint8* writeGatherPtrDisplayList[Espresso::CORE_COUNT];
|
||||
MPTR displayListStart[Espresso::CORE_COUNT];
|
||||
uint32 displayListMaxSize[Espresso::CORE_COUNT];
|
||||
struct GX2PerCoreCBState
|
||||
{
|
||||
uint32be* bufferPtr;
|
||||
uint32 bufferSizeInU32s;
|
||||
uint32be* currentWritePtr;
|
||||
bool isDisplayList;
|
||||
};
|
||||
|
||||
extern GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT];
|
||||
};
|
||||
|
||||
extern GX2WriteGatherPipeState gx2WriteGatherPipe;
|
||||
|
||||
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32); // move to GX2 namespace eventually
|
||||
|
||||
void gx2WriteGather_submitU32AsBE(uint32 v);
|
||||
void gx2WriteGather_submitU32AsLE(uint32 v);
|
||||
void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues);
|
||||
|
@ -27,7 +25,8 @@ uint32 PPCInterpreter_getCurrentCoreIndex();
|
|||
template <typename ...Targs>
|
||||
inline void gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr)
|
||||
{
|
||||
(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = (uint8*)writePtr;
|
||||
GX2::s_perCoreCBState[coreIndex].currentWritePtr = writePtr;
|
||||
cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s));
|
||||
}
|
||||
|
||||
template <typename T, typename ...Targs>
|
||||
|
@ -75,17 +74,23 @@ template <typename ...Targs>
|
|||
inline void gx2WriteGather_submit(Targs... args)
|
||||
{
|
||||
uint32 coreIndex = PPCInterpreter_getCurrentCoreIndex();
|
||||
if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == nullptr)
|
||||
if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr)
|
||||
{
|
||||
cemu_assert_suspicious(); // writing to command buffer without valid write pointer?
|
||||
return;
|
||||
|
||||
uint32be* writePtr = (uint32be*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]);
|
||||
}
|
||||
uint32be* writePtr = GX2::s_perCoreCBState[coreIndex].currentWritePtr;
|
||||
gx2WriteGather_submit_(coreIndex, writePtr, std::forward<Targs>(args)...);
|
||||
}
|
||||
|
||||
namespace GX2
|
||||
{
|
||||
uint32 GX2WriteGather_getReadWriteDistance();
|
||||
void GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt = true);
|
||||
void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32);
|
||||
|
||||
uint64 GX2GetLastSubmittedTimeStamp();
|
||||
uint64 GX2GetRetiredTimeStamp();
|
||||
bool GX2WaitTimeStamp(uint64 tsWait);
|
||||
|
||||
void GX2BeginDisplayList(MEMPTR<void> displayListAddr, uint32 size);
|
||||
void GX2BeginDisplayListEx(MEMPTR<void> displayListAddr, uint32 size, bool profiling);
|
||||
|
@ -96,7 +101,8 @@ namespace GX2
|
|||
|
||||
bool GX2GetDisplayListWriteStatus();
|
||||
|
||||
void GX2Init_writeGather();
|
||||
void GX2CommandInit();
|
||||
void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize);
|
||||
void GX2Shutdown_commandBufferPool();
|
||||
void GX2CommandResetToDefaultState();
|
||||
}
|
|
@ -168,7 +168,7 @@ uint32 _GX2Context_CalcStateSize()
|
|||
|
||||
void _GX2Context_CreateLoadDL()
|
||||
{
|
||||
GX2ReserveCmdSpace(3);
|
||||
GX2::GX2ReserveCmdSpace(3);
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
|
||||
gx2WriteGather_submitU32AsBE(0x80000077);
|
||||
gx2WriteGather_submitU32AsBE(0x80000077);
|
||||
|
@ -176,7 +176,7 @@ void _GX2Context_CreateLoadDL()
|
|||
|
||||
void _GX2Context_WriteCmdDisableStateShadowing()
|
||||
{
|
||||
GX2ReserveCmdSpace(3);
|
||||
GX2::GX2ReserveCmdSpace(3);
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2));
|
||||
gx2WriteGather_submitU32AsBE(0x80000000);
|
||||
gx2WriteGather_submitU32AsBE(0x80000000);
|
||||
|
@ -184,7 +184,7 @@ void _GX2Context_WriteCmdDisableStateShadowing()
|
|||
|
||||
void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, uint32 waitForIdle, uint32 numRegOffsetEntries, GX2RegLoadPktEntry_t* regOffsetEntries)
|
||||
{
|
||||
GX2ReserveCmdSpace(3 + numRegOffsetEntries*2);
|
||||
GX2::GX2ReserveCmdSpace(3 + numRegOffsetEntries*2);
|
||||
gx2WriteGather_submitU32AsBE(pm4Header);
|
||||
gx2WriteGather_submitU32AsBE(physAddrRegArea);
|
||||
gx2WriteGather_submitU32AsBE(waitForIdle);
|
||||
|
@ -199,7 +199,6 @@ void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, u
|
|||
|
||||
void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32 ukn)
|
||||
{
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
MPTR physAddrContextState = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(gx2ContextState));
|
||||
_GX2Context_CreateLoadDL();
|
||||
__cmdStateLoad(NULL, IT_LOAD_CONFIG_REG, gx2ContextState->hwContext.areaConfigReg, 0x80000000, configReg_loadPktEntries);
|
||||
|
@ -212,7 +211,7 @@ void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32
|
|||
|
||||
void GX2SetDefaultState()
|
||||
{
|
||||
GX2ReserveCmdSpace(0x100);
|
||||
GX2::GX2ReserveCmdSpace(0x100);
|
||||
|
||||
Latte::LATTE_PA_CL_VTE_CNTL reg{};
|
||||
reg.set_VPORT_X_OFFSET_ENA(true).set_VPORT_X_SCALE_ENA(true);
|
||||
|
@ -376,7 +375,6 @@ void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU)
|
|||
osLib_returnFromFunction(hCPU, 0);
|
||||
}
|
||||
|
||||
|
||||
void gx2Export_GX2GetContextStateDisplayList(PPCInterpreter_t* hCPU)
|
||||
{
|
||||
cemuLog_log(LogType::GX2, "GX2GetContextStateDisplayList(0x{:08x}, 0x{:08x}, 0x{:08x})", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]);
|
||||
|
|
|
@ -52,7 +52,6 @@ namespace GX2
|
|||
0,
|
||||
count,
|
||||
0);
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
}
|
||||
|
||||
void GX2DrawIndexedEx2(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances, uint32 baseInstance)
|
||||
|
@ -85,7 +84,6 @@ namespace GX2
|
|||
pm4HeaderType3(IT_SET_CTL_CONST, 2), 1,
|
||||
0 // baseInstance
|
||||
);
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
}
|
||||
|
||||
void GX2DrawEx(GX2PrimitiveMode2 primitiveMode, uint32 count, uint32 baseVertex, uint32 numInstances)
|
||||
|
@ -109,7 +107,6 @@ namespace GX2
|
|||
count,
|
||||
0 // DRAW_INITIATOR
|
||||
);
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
}
|
||||
|
||||
void GX2DrawIndexedImmediateEx(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances)
|
||||
|
@ -177,7 +174,6 @@ namespace GX2
|
|||
}
|
||||
}
|
||||
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
}
|
||||
|
||||
struct GX2DispatchComputeParam
|
||||
|
|
|
@ -16,18 +16,6 @@ namespace GX2
|
|||
SysAllocator<coreinit::OSThreadQueue> g_vsyncThreadQueue;
|
||||
SysAllocator<coreinit::OSThreadQueue> g_flipThreadQueue;
|
||||
|
||||
SysAllocator<coreinit::OSEvent> s_updateRetirementEvent;
|
||||
std::atomic<uint64> s_lastRetirementTimestamp = 0;
|
||||
|
||||
// called from GPU code when a command buffer is retired
|
||||
void __GX2NotifyNewRetirementTimestamp(uint64 tsRetire)
|
||||
{
|
||||
__OSLockScheduler();
|
||||
s_lastRetirementTimestamp = tsRetire;
|
||||
coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr());
|
||||
__OSUnlockScheduler();
|
||||
}
|
||||
|
||||
void GX2SetGPUFence(uint32be* fencePtr, uint32 mask, uint32 compareOp, uint32 compareValue)
|
||||
{
|
||||
GX2ReserveCmdSpace(7);
|
||||
|
@ -210,16 +198,6 @@ namespace GX2
|
|||
osLib_returnFromFunction(hCPU, 0);
|
||||
}
|
||||
|
||||
// Returns the timestamp of the most recently submitted command buffer,
// read atomically from the shared GPU state.
uint64 GX2GetLastSubmittedTimeStamp()
{
	const auto& tsSubmitted = LatteGPUState.lastSubmittedCommandBufferTimestamp;
	return tsSubmitted.load();
}
|
||||
|
||||
// Returns the timestamp of the most recently retired command buffer.
uint64 GX2GetRetiredTimeStamp()
{
	return s_lastRetirementTimestamp.load();
}
|
||||
|
||||
void GX2WaitForVsync()
|
||||
{
|
||||
__OSLockScheduler();
|
||||
|
@ -236,19 +214,6 @@ namespace GX2
|
|||
__OSUnlockScheduler();
|
||||
}
|
||||
|
||||
// Blocks the calling thread until the retirement timestamp has reached tsWait.
// The wait is performed under the scheduler lock; each wake-up of the
// retirement event re-checks the timestamp.
// Always returns true (meaning: no timeout occurred).
bool GX2WaitTimeStamp(uint64 tsWait)
{
	__OSLockScheduler();
	for (;;)
	{
		if (s_lastRetirementTimestamp >= tsWait)
			break; // GPU has caught up
		coreinit::OSWaitEventInternal(s_updateRetirementEvent.GetPtr());
	}
	__OSUnlockScheduler();
	return true;
}
|
||||
|
||||
void GX2DrawDone()
|
||||
{
|
||||
// optional force full sync (texture readback and occlusion queries)
|
||||
|
@ -263,13 +228,10 @@ namespace GX2
|
|||
gx2WriteGather_submitU32AsBE(0x00000000); // unused
|
||||
}
|
||||
// flush pipeline
|
||||
if (_GX2GetUnflushedBytes(coreinit::OSGetCoreId()) > 0)
|
||||
_GX2SubmitToTCL();
|
||||
GX2Command_Flush(0x100, true);
|
||||
|
||||
uint64 ts = GX2GetLastSubmittedTimeStamp();
|
||||
GX2WaitTimeStamp(ts);
|
||||
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
}
|
||||
|
||||
void GX2Init_event()
|
||||
|
@ -294,25 +256,19 @@ namespace GX2
|
|||
cafeExportRegister("gx2", GX2SetEventCallback, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2GetEventCallback, LogType::GX2);
|
||||
|
||||
cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2);
|
||||
|
||||
cafeExportRegister("gx2", GX2WaitForVsync, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2WaitForFlip, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2DrawDone, LogType::GX2);
|
||||
|
||||
coreinit::OSInitThreadQueue(g_vsyncThreadQueue.GetPtr());
|
||||
coreinit::OSInitThreadQueue(g_flipThreadQueue.GetPtr());
|
||||
|
||||
coreinit::OSInitEvent(s_updateRetirementEvent, coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO);
|
||||
coreinit::OSInitSemaphore(s_eventCbQueueSemaphore, 0);
|
||||
}
|
||||
|
||||
void GX2EventResetToDefaultState()
|
||||
{
|
||||
s_callbackThreadLaunched = false;
|
||||
s_lastRetirementTimestamp = 0;
|
||||
for(auto& it : s_eventCallback)
|
||||
{
|
||||
it.callbackFuncPtr = nullptr;
|
||||
|
|
|
@ -81,19 +81,68 @@ namespace GX2
|
|||
|
||||
void _test_AddrLib();
|
||||
|
||||
void GX2Init(void* initSettings)
|
||||
using GX2InitArg = uint32;
|
||||
enum class GX2InitArgId : GX2InitArg
|
||||
{
|
||||
EndOfArgs = 0,
|
||||
CommandPoolBase = 1,
|
||||
CommandPoolSize = 2,
|
||||
UknArg7 = 7,
|
||||
UknArg8 = 8,
|
||||
UknArg9 = 9,
|
||||
UknArg11 = 11,
|
||||
};
|
||||
|
||||
void GX2Init(betype<GX2InitArg>* initArgStream)
|
||||
{
|
||||
if (LatteGPUState.gx2InitCalled)
|
||||
{
|
||||
cemuLog_logDebug(LogType::Force, "GX2Init() called while already initialized");
|
||||
return;
|
||||
}
|
||||
// parse init params from the stream
|
||||
MEMPTR<void> commandPoolBase = nullptr;
|
||||
uint32 commandPoolSize = 0;
|
||||
if (initArgStream)
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
GX2InitArgId paramId = static_cast<GX2InitArgId>((GX2InitArg)*initArgStream);
|
||||
initArgStream++;
|
||||
if (paramId == GX2InitArgId::EndOfArgs)
|
||||
{
|
||||
break;
|
||||
}
|
||||
else if (paramId == GX2InitArgId::CommandPoolBase)
|
||||
{
|
||||
commandPoolBase = MEMPTR<void>(*initArgStream);
|
||||
initArgStream++;
|
||||
}
|
||||
else if (paramId == GX2InitArgId::CommandPoolSize)
|
||||
{
|
||||
commandPoolSize = *initArgStream;
|
||||
initArgStream++;
|
||||
}
|
||||
else if (paramId == GX2InitArgId::UknArg7 ||
|
||||
paramId == GX2InitArgId::UknArg8 ||
|
||||
paramId == GX2InitArgId::UknArg9 ||
|
||||
paramId == GX2InitArgId::UknArg11)
|
||||
{
|
||||
initArgStream++;
|
||||
}
|
||||
else
|
||||
{
|
||||
cemuLog_log(LogType::Force, "GX2Init: Unsupported init arg {}", (uint32)paramId);
|
||||
}
|
||||
}
|
||||
}
|
||||
// init main core
|
||||
uint32 coreIndex = coreinit::OSGetCoreId();
|
||||
cemuLog_log(LogType::GX2, "GX2Init() on core {} by thread 0x{:08x}", coreIndex, MEMPTR<OSThread_t>(coreinit::OSGetCurrentThread()).GetMPTR());
|
||||
sGX2MainCoreIndex = coreIndex;
|
||||
// init submodules
|
||||
GX2::GX2Init_event();
|
||||
GX2::GX2Init_writeGather();
|
||||
GX2::GX2Init_commandBufferPool(commandPoolBase, commandPoolSize);
|
||||
// init shared area
|
||||
if (LatteGPUState.sharedAreaAddr == MPTR_NULL)
|
||||
{
|
||||
|
@ -112,6 +161,21 @@ namespace GX2
|
|||
_test_AddrLib();
|
||||
}
|
||||
|
||||
void GX2Shutdown()
|
||||
{
|
||||
if (!LatteGPUState.gx2InitCalled)
|
||||
{
|
||||
cemuLog_logDebug(LogType::Force, "GX2Shutdown() called while not initialized");
|
||||
return;
|
||||
}
|
||||
LatteGPUState.gx2InitCalled--;
|
||||
if (LatteGPUState.gx2InitCalled != 0)
|
||||
return;
|
||||
GX2DrawDone();
|
||||
GX2Shutdown_commandBufferPool();
|
||||
cemuLog_log(LogType::Force, "GX2 shutdown");
|
||||
}
|
||||
|
||||
void _GX2DriverReset()
|
||||
{
|
||||
LatteGPUState.gx2InitCalled = 0;
|
||||
|
@ -237,6 +301,7 @@ namespace GX2
|
|||
void GX2MiscInit()
|
||||
{
|
||||
cafeExportRegister("gx2", GX2Init, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2Shutdown, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2GetMainCoreId, LogType::GX2);
|
||||
cafeExportRegister("gx2", GX2ResetGPU, LogType::GX2);
|
||||
|
||||
|
|
|
@ -135,7 +135,7 @@ void gx2Export_GX2InitDepthBufferRegs(PPCInterpreter_t* hCPU)
|
|||
void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU)
|
||||
{
|
||||
cemuLog_log(LogType::GX2, "GX2SetColorBuffer(0x{:08x}, {})", hCPU->gpr[3], hCPU->gpr[4]);
|
||||
GX2ReserveCmdSpace(20);
|
||||
GX2::GX2ReserveCmdSpace(20);
|
||||
|
||||
GX2ColorBuffer* colorBufferBE = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
|
||||
|
||||
|
@ -198,15 +198,13 @@ void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU)
|
|||
mmCB_COLOR0_INFO - 0xA000 + hCPU->gpr[4],
|
||||
colorBufferBE->reg_info);
|
||||
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
|
||||
osLib_returnFromFunction(hCPU, 0);
|
||||
}
|
||||
|
||||
void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU)
|
||||
{
|
||||
cemuLog_log(LogType::GX2, "GX2SetDepthBuffer(0x{:08x})", hCPU->gpr[3]);
|
||||
GX2ReserveCmdSpace(20);
|
||||
GX2::GX2ReserveCmdSpace(20);
|
||||
|
||||
GX2DepthBuffer* depthBufferBE = (GX2DepthBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
|
||||
|
||||
|
@ -264,8 +262,6 @@ void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU)
|
|||
gx2WriteGather_submitU32AsBE(mmDB_DEPTH_VIEW - 0xA000);
|
||||
gx2WriteGather_submitU32AsBE(db_view);
|
||||
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
|
||||
osLib_returnFromFunction(hCPU, 0);
|
||||
}
|
||||
|
||||
|
@ -281,7 +277,7 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU)
|
|||
uint32 scanTarget = hCPU->gpr[3];
|
||||
if( scanTarget == GX2_SCAN_TARGET_TV )
|
||||
{
|
||||
GX2ReserveCmdSpace(10);
|
||||
GX2::GX2ReserveCmdSpace(10);
|
||||
|
||||
uint32 physAddr = (MEMORY_TILINGAPERTURE_AREA_ADDR+0x200000);
|
||||
|
||||
|
|
|
@ -303,7 +303,27 @@ namespace GX2
|
|||
|
||||
void GX2SetVertexShader(GX2VertexShader* vertexShader)
|
||||
{
|
||||
GX2ReserveCmdSpace(100);
|
||||
uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize;
|
||||
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
|
||||
uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize;
|
||||
|
||||
uint32 reserveSize = 31;
|
||||
if (vertexShader->shaderMode == GX2_SHADER_MODE::GEOMETRY_SHADER)
|
||||
{
|
||||
reserveSize += 7;
|
||||
}
|
||||
else
|
||||
{
|
||||
reserveSize += 18;
|
||||
reserveSize += numOutputIds;
|
||||
if (vertexShader->usesStreamOut != 0)
|
||||
reserveSize += 2+12;
|
||||
}
|
||||
if (vsSemanticTableSize > 0)
|
||||
{
|
||||
reserveSize += 5 + vsSemanticTableSize;
|
||||
}
|
||||
GX2ReserveCmdSpace(reserveSize);
|
||||
|
||||
MPTR shaderProgramAddr;
|
||||
uint32 shaderProgramSize;
|
||||
|
@ -361,8 +381,6 @@ namespace GX2
|
|||
|
||||
cemu_assert_debug(vertexShader->regs.SPI_VS_OUT_CONFIG.value().get_VS_PER_COMPONENT() == false); // not handled on the GPU side
|
||||
|
||||
uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize;
|
||||
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
|
||||
gx2WriteGather_submitU32AsBE(Latte::REGADDR::SPI_VS_OUT_ID_0-0xA000);
|
||||
for(uint32 i=0; i<numOutputIds; i++)
|
||||
|
@ -392,7 +410,6 @@ namespace GX2
|
|||
}
|
||||
}
|
||||
// update semantic table
|
||||
uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize;
|
||||
if (vsSemanticTableSize > 0)
|
||||
{
|
||||
gx2WriteGather_submit(
|
||||
|
|
|
@ -213,7 +213,6 @@ namespace GX2
|
|||
|
||||
void GX2SetViewportReg(GX2ViewportReg* viewportReg)
|
||||
{
|
||||
GX2::GX2WriteGather_checkAndInsertWrapAroundMark();
|
||||
GX2ReserveCmdSpace(2 + 6);
|
||||
|
||||
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 6),
|
||||
|
|
|
@ -264,7 +264,7 @@ void gx2Surface_GX2CopySurface(GX2Surface* srcSurface, uint32 srcMip, uint32 src
|
|||
// send copy command to GPU
|
||||
if( srcHwTileMode > 0 && srcHwTileMode < 16 && dstHwTileMode > 0 && dstHwTileMode < 16 || requestGPURAMCopy )
|
||||
{
|
||||
GX2ReserveCmdSpace(1+13*2);
|
||||
GX2::GX2ReserveCmdSpace(1+13*2);
|
||||
|
||||
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13*2),
|
||||
// src
|
||||
|
@ -540,7 +540,7 @@ void gx2Export_GX2ResolveAAColorBuffer(PPCInterpreter_t* hCPU)
|
|||
uint32 dstDepth = std::max<uint32>(surfOutDst.depth, 1);
|
||||
|
||||
// send copy command to GPU
|
||||
GX2ReserveCmdSpace(1 + 13 * 2);
|
||||
GX2::GX2ReserveCmdSpace(1 + 13 * 2);
|
||||
gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13 * 2),
|
||||
// src
|
||||
(uint32)srcSurface->imagePtr,
|
||||
|
@ -619,7 +619,7 @@ void gx2Export_GX2ConvertDepthBufferToTextureSurface(PPCInterpreter_t* hCPU)
|
|||
sint32 srcMip = 0;
|
||||
|
||||
uint32 numSlices = std::max<uint32>(_swapEndianU32(depthBuffer->viewNumSlices), 1);
|
||||
GX2ReserveCmdSpace((1 + 13 * 2) * numSlices);
|
||||
GX2::GX2ReserveCmdSpace((1 + 13 * 2) * numSlices);
|
||||
for (uint32 subSliceIndex = 0; subSliceIndex < numSlices; subSliceIndex++)
|
||||
{
|
||||
// send copy command to GPU
|
||||
|
|
|
@ -11,9 +11,14 @@
|
|||
void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
|
||||
{
|
||||
cemuLog_log(LogType::GX2, "GX2SetPixelShader(0x{:08x})", hCPU->gpr[3]);
|
||||
GX2ReserveCmdSpace(100);
|
||||
|
||||
GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
|
||||
|
||||
uint32 numInputs = _swapEndianU32(pixelShader->regs[4]);
|
||||
if( numInputs > 0x20 )
|
||||
numInputs = 0x20;
|
||||
|
||||
GX2::GX2ReserveCmdSpace(26 + numInputs);
|
||||
|
||||
MPTR shaderProgramAddr;
|
||||
uint32 shaderProgramSize;
|
||||
|
||||
|
@ -44,9 +49,6 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
|
|||
_swapEndianU32(pixelShader->regs[2]),
|
||||
_swapEndianU32(pixelShader->regs[3]));
|
||||
// setup pixel shader extended inputs control
|
||||
uint32 numInputs = _swapEndianU32(pixelShader->regs[4]);
|
||||
if( numInputs > 0x20 )
|
||||
numInputs = 0x20;
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numInputs));
|
||||
gx2WriteGather_submitU32AsBE(mmSPI_PS_INPUT_CNTL_0-0xA000);
|
||||
for(uint32 i=0; i<numInputs; i++)
|
||||
|
@ -79,9 +81,17 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU)
|
|||
void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
|
||||
{
|
||||
cemuLog_log(LogType::GX2, "GX2SetGeometryShader(0x{:08x})", hCPU->gpr[3]);
|
||||
GX2ReserveCmdSpace(100);
|
||||
|
||||
GX2GeometryShader_t* geometryShader = (GX2GeometryShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
|
||||
uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]);
|
||||
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
|
||||
uint32 reserveSize = 38; // 38 fixed parameters
|
||||
if (numOutputIds != 0)
|
||||
reserveSize += 2 + numOutputIds;
|
||||
if( _swapEndianU32(geometryShader->useStreamout) != 0 )
|
||||
reserveSize += 2 + 12;
|
||||
|
||||
GX2::GX2ReserveCmdSpace(reserveSize);
|
||||
|
||||
MPTR shaderProgramAddr;
|
||||
uint32 shaderProgramSize;
|
||||
|
@ -128,6 +138,7 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
|
|||
|
||||
if( _swapEndianU32(geometryShader->useStreamout) != 0 )
|
||||
{
|
||||
// todo - IT_EVENT_WRITE packet here
|
||||
// stride 0
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2));
|
||||
gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_VTX_STRIDE_0-0xA000);
|
||||
|
@ -180,8 +191,6 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU)
|
|||
gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[3]));
|
||||
|
||||
// GS outputs
|
||||
uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]);
|
||||
numOutputIds = std::min<uint32>(numOutputIds, 0xA);
|
||||
if( numOutputIds != 0 )
|
||||
{
|
||||
gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds));
|
||||
|
@ -254,8 +263,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU)
|
|||
shaderPtr = computeShader->rBuffer.GetVirtualAddr();
|
||||
shaderSize = computeShader->rBuffer.GetSize();
|
||||
}
|
||||
|
||||
GX2ReserveCmdSpace(0x11);
|
||||
GX2::GX2ReserveCmdSpace(0x11);
|
||||
|
||||
gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 6),
|
||||
mmSQ_PGM_START_ES-0xA000,
|
||||
|
@ -272,7 +280,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU)
|
|||
|
||||
void _GX2SubmitUniformBlock(uint32 registerBase, uint32 index, MPTR virtualAddress, uint32 size)
|
||||
{
|
||||
GX2ReserveCmdSpace(9);
|
||||
GX2::GX2ReserveCmdSpace(9);
|
||||
gx2WriteGather_submit(pm4HeaderType3(IT_SET_RESOURCE, 8),
|
||||
registerBase + index * 7,
|
||||
memory_virtualToPhysical(virtualAddress),
|
||||
|
@ -307,7 +315,7 @@ void gx2Export_GX2SetGeometryUniformBlock(PPCInterpreter_t* hCPU)
|
|||
|
||||
void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU)
|
||||
{
|
||||
GX2ReserveCmdSpace(9);
|
||||
GX2::GX2ReserveCmdSpace(9);
|
||||
|
||||
GX2RBuffer* bufferPtr = (GX2RBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]);
|
||||
uint32 index = hCPU->gpr[4];
|
||||
|
@ -320,7 +328,7 @@ void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU)
|
|||
|
||||
void gx2Export_GX2SetShaderModeEx(PPCInterpreter_t* hCPU)
|
||||
{
|
||||
GX2ReserveCmdSpace(8+4);
|
||||
GX2::GX2ReserveCmdSpace(8+4);
|
||||
uint32 mode = hCPU->gpr[3];
|
||||
|
||||
uint32 sqConfig = hCPU->gpr[3] == 0 ? 4 : 0;
|
||||
|
|
|
@ -616,4 +616,36 @@ namespace stdx
|
|||
scope_exit& operator=(scope_exit) = delete;
|
||||
void release() { m_released = true;}
|
||||
};
|
||||
|
||||
// Xcode 16 doesn't have std::atomic_ref support and we provide a minimalist reimplementation as fallback
|
||||
#ifdef __cpp_lib_atomic_ref
|
||||
#include <atomic>
|
||||
template<typename T>
|
||||
using atomic_ref = std::atomic_ref<T>;
|
||||
#else
|
||||
// Minimal stand-in for std::atomic_ref<T>, used when the standard library does not ship it.
// Only load() and store() are provided.
//
// The referenced object is accessed by reinterpreting its storage as std::atomic<T>.
// That only makes sense when std::atomic<T> has exactly the same size as T, which is
// enforced at compile time below. The referenced object must additionally be aligned
// suitably for std::atomic<T>; this is a precondition on the caller and is not checked.
template<typename T>
class atomic_ref
{
	static_assert(std::is_trivially_copyable<T>::value, "atomic_ref requires trivially copyable types");
	static_assert(sizeof(std::atomic<T>) == sizeof(T), "atomic_ref fallback requires std::atomic<T> to have the same size as T");
  public:
	using value_type = T;

	// Binds to obj without taking ownership; obj must outlive this reference.
	explicit atomic_ref(T& obj) noexcept : ptr_(std::addressof(obj)) {}

	// Atomically reads the referenced object.
	T load(std::memory_order order = std::memory_order_seq_cst) const noexcept
	{
		return as_atomic()->load(order);
	}

	// Atomically replaces the value of the referenced object with desired.
	void store(T desired, std::memory_order order = std::memory_order_seq_cst) const noexcept
	{
		as_atomic()->store(desired, order);
	}

  private:
	// Single place where the storage of T is viewed as std::atomic<T>.
	std::atomic<T>* as_atomic() const noexcept
	{
		return reinterpret_cast<std::atomic<T>*>(ptr_);
	}

	T* ptr_;
};
|
||||
#endif
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue