diff --git a/src/Cafe/HW/Latte/Core/Latte.h b/src/Cafe/HW/Latte/Core/Latte.h index e5e9dd5c..2636467b 100644 --- a/src/Cafe/HW/Latte/Core/Latte.h +++ b/src/Cafe/HW/Latte/Core/Latte.h @@ -47,8 +47,6 @@ struct LatteGPUState_t gx2GPUSharedArea_t* sharedArea; // quick reference to shared area MPTR sharedAreaAddr; // other - // todo: Currently we have the command buffer logic implemented as a FIFO ringbuffer. On real HW it's handled as a series of command buffers that are pushed individually. - std::atomic lastSubmittedCommandBufferTimestamp; uint32 gx2InitCalled; // incremented every time GX2Init() is called // OpenGL control uint32 glVendor; // GLVENDOR_* @@ -75,8 +73,6 @@ struct LatteGPUState_t extern LatteGPUState_t LatteGPUState; -extern uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list) - // texture #include "Cafe/HW/Latte/Core/LatteTexture.h" diff --git a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp index f592cc9e..4385cf49 100644 --- a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp +++ b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp @@ -13,6 +13,7 @@ #include "Cafe/HW/Latte/Core/LattePM4.h" #include "Cafe/OS/libs/coreinit/coreinit_Time.h" +#include "Cafe/OS/libs/TCL/TCL.h" // TCL currently handles the GPU command ringbuffer #include "Cafe/CafeSystem.h" @@ -28,11 +29,6 @@ typedef uint32be* LatteCMDPtr; #define LatteReadCMD() ((uint32)*(cmd++)) #define LatteSkipCMD(_nWords) cmd += (_nWords) -uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list) -uint8* gx2CPParserDisplayListPtr; -uint8* gx2CPParserDisplayListStart; // used for debugging -uint8* gx2CPParserDisplayListEnd; - void LatteThread_HandleOSScreen(); void LatteThread_Exit(); @@ -155,16 +151,12 @@ void LatteCP_signalEnterWait() */ uint32 LatteCP_readU32Deprc() { - uint32 v; - uint8* gxRingBufferWritePtr; - sint32 readDistance; // no display list active while (true) { - gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex]; - readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr); - if (readDistance != 0) - break; + uint32 cmdWord; + if ( TCL::TCLGPUReadRBWord(cmdWord) ) + return cmdWord; g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands performanceMonitor.gpuTime_idleTime.beginMeasuring(); @@ -175,56 +167,8 @@ uint32 LatteCP_readU32Deprc() } LatteThread_HandleOSScreen(); // check if new frame was presented via OSScreen API - readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr); - if (readDistance != 0) - break; - if (Latte_GetStopSignal()) - LatteThread_Exit(); - - // still no command data available, do some other tasks - LatteTiming_HandleTimedVsync(); - LatteAsyncCommands_checkAndExecute(); - std::this_thread::yield(); - performanceMonitor.gpuTime_idleTime.endMeasuring(); - } - v = *(uint32*)gxRingBufferReadPtr; - gxRingBufferReadPtr += 4; -#ifdef CEMU_DEBUG_ASSERT - if (v == 0xcdcdcdcd) - assert_dbg(); -#endif - v = _swapEndianU32(v); - return v; -} - -void LatteCP_waitForNWords(uint32 numWords) -{ - uint8* gxRingBufferWritePtr; - sint32 readDistance; - bool isFlushed = false; - sint32 waitDistance = numWords * sizeof(uint32be); - // no display list active - while (true) - { - gxRingBufferWritePtr = gx2WriteGatherPipe.writeGatherPtrGxBuffer[GX2::sGX2MainCoreIndex]; - readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr); - if (readDistance < 0) - return; // wrap around means there is at least one full command queued after this - if (readDistance >= waitDistance) - break; - g_renderer->NotifyLatteCommandProcessorIdle(); // let the renderer know in case it wants to flush any commands - performanceMonitor.gpuTime_idleTime.beginMeasuring(); - // no command data available, spin in a busy loop for a while then check again - for (sint32 busy = 0; busy < 80; busy++) - { - _mm_pause(); - } - readDistance = (sint32)(gxRingBufferWritePtr - gxRingBufferReadPtr); - if (readDistance < 0) - return; // wrap around means there is at least one full command queued after this - if (readDistance >= waitDistance) - break; - + if ( TCL::TCLGPUReadRBWord(cmdWord) ) + return cmdWord; if (Latte_GetStopSignal()) LatteThread_Exit(); @@ -234,6 +178,7 @@ void LatteCP_waitForNWords(uint32 numWords) std::this_thread::yield(); performanceMonitor.gpuTime_idleTime.endMeasuring(); } + UNREACHABLE; } template @@ -270,21 +215,23 @@ void LatteCP_itIndirectBufferDepr(LatteCMDPtr cmd, uint32 nWords) cemu_assert_debug(nWords == 3); uint32 physicalAddress = LatteReadCMD(); uint32 physicalAddressHigh = LatteReadCMD(); // unused - uint32 sizeInDWords = LatteReadCMD(); - uint32 displayListSize = sizeInDWords * 4; - DrawPassContext drawPassCtx; + uint32 sizeInU32s = LatteReadCMD(); #ifdef LATTE_CP_LOGGING if (GetAsyncKeyState('A')) LatteCP_DebugPrintCmdBuffer(MEMPTR(physicalAddress), displayListSize); #endif - uint32be* buf = MEMPTR(physicalAddress).GetPtr(); - drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords); + if (sizeInU32s > 0) + { + DrawPassContext drawPassCtx; + uint32be* buf = MEMPTR(physicalAddress).GetPtr(); + drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInU32s); - LatteCP_processCommandBuffer(drawPassCtx); - if (drawPassCtx.isWithinDrawPass()) - drawPassCtx.endDrawPass(); + LatteCP_processCommandBuffer(drawPassCtx); + if (drawPassCtx.isWithinDrawPass()) + drawPassCtx.endDrawPass(); + } } // pushes the command buffer to the stack @@ -294,11 +241,12 @@ void LatteCP_itIndirectBuffer(LatteCMDPtr cmd, uint32 nWords, DrawPassContext& d uint32 physicalAddress = LatteReadCMD(); uint32 physicalAddressHigh = LatteReadCMD(); // unused uint32 sizeInDWords = LatteReadCMD(); - uint32 displayListSize = sizeInDWords * 4; - cemu_assert_debug(displayListSize >= 4); - - uint32be* buf = MEMPTR(physicalAddress).GetPtr(); - drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords); + if (sizeInDWords > 0) + { + uint32 displayListSize = sizeInDWords * 4; + uint32be* buf = MEMPTR(physicalAddress).GetPtr(); + drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords); + } } LatteCMDPtr LatteCP_itStreamoutBufferUpdate(LatteCMDPtr cmd, uint32 nWords) @@ -565,26 +513,55 @@ LatteCMDPtr LatteCP_itMemWrite(LatteCMDPtr cmd, uint32 nWords) if (word1 == 0x40000) { // write U32 - *memPtr = word2; + stdx::atomic_ref atomicRef(*memPtr); + atomicRef.store(word2); } else if (word1 == 0x00000) { - // write U64 (as two U32) - // note: The U32s are swapped - memPtr[0] = word2; - memPtr[1] = word3; + // write U64 + // note: The U32s are swapped here, but needs verification. Also, it seems like the two U32 halves are written independently and the U64 as a whole is not atomic -> investiagte + stdx::atomic_ref atomicRef(*(uint64be*)memPtr); + atomicRef.store(((uint64le)word2 << 32) | word3); } else if (word1 == 0x20000) { // write U64 (little endian) - memPtr[0] = _swapEndianU32(word2); - memPtr[1] = _swapEndianU32(word3); + stdx::atomic_ref atomicRef(*(uint64le*)memPtr); + atomicRef.store(((uint64le)word3 << 32) | word2); } else cemu_assert_unimplemented(); return cmd; } +LatteCMDPtr LatteCP_itEventWriteEOP(LatteCMDPtr cmd, uint32 nWords) +{ + cemu_assert_debug(nWords == 5); + uint32 word0 = LatteReadCMD(); + uint32 word1 = LatteReadCMD(); + uint32 word2 = LatteReadCMD(); + uint32 word3 = LatteReadCMD(); // value low bits + uint32 word4 = LatteReadCMD(); // value high bits + + cemu_assert_debug(word2 == 0x40000000 || word2 == 0x42000000); + + if (word0 == 0x504 && (word2&0x40000000)) // todo - figure out the flags + { + stdx::atomic_ref atomicRef(*(uint64be*)memory_getPointerFromPhysicalOffset(word1)); + uint64 val = ((uint64)word4 << 32) | word3; + atomicRef.store(val); + } + else + { cemu_assert_unimplemented(); + } + bool triggerInterrupt = (word2 & 0x2000000) != 0; + if (triggerInterrupt) + { + // todo - timestamp interrupt + } + TCL::TCLGPUNotifyNewRetirementTimestamp(); + return cmd; +} LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords) { @@ -783,16 +760,6 @@ LatteCMDPtr LatteCP_itDrawImmediate(LatteCMDPtr cmd, uint32 nWords, DrawPassCont drawPassCtx.executeDraw(count, false, _tempIndexArrayMPTR); return cmd; - -} - -LatteCMDPtr LatteCP_itHLEFifoWrapAround(LatteCMDPtr cmd, uint32 nWords) -{ - cemu_assert_debug(nWords == 1); - uint32 unused = LatteReadCMD(); - gxRingBufferReadPtr = gx2WriteGatherPipe.gxRingBuffer; - cmd = (LatteCMDPtr)gxRingBufferReadPtr; - return cmd; } LatteCMDPtr LatteCP_itHLESampleTimer(LatteCMDPtr cmd, uint32 nWords) @@ -819,16 +786,6 @@ LatteCMDPtr LatteCP_itHLESpecialState(LatteCMDPtr cmd, uint32 nWords) return cmd; } -LatteCMDPtr LatteCP_itHLESetRetirementTimestamp(LatteCMDPtr cmd, uint32 nWords) -{ - cemu_assert_debug(nWords == 2); - uint32 timestampHigh = (uint32)LatteReadCMD(); - uint32 timestampLow = (uint32)LatteReadCMD(); - uint64 timestamp = ((uint64)timestampHigh << 32ULL) | (uint64)timestampLow; - GX2::__GX2NotifyNewRetirementTimestamp(timestamp); - return cmd; -} - LatteCMDPtr LatteCP_itHLEBeginOcclusionQuery(LatteCMDPtr cmd, uint32 nWords) { cemu_assert_debug(nWords == 1); @@ -1145,9 +1102,10 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx) LatteCMDPtr cmd, cmdStart, cmdEnd; if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd)) break; + uint32 itHeader; while (cmd < cmdEnd) { - uint32 itHeader = LatteReadCMD(); + itHeader = LatteReadCMD(); uint32 itHeaderType = (itHeader >> 30) & 3; if (itHeaderType == 3) { @@ -1361,11 +1319,6 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx) LatteCP_itHLEEndOcclusionQuery(cmdData, nWords); break; } - case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP: - { - LatteCP_itHLESetRetirementTimestamp(cmdData, nWords); - break; - } case IT_HLE_BOTTOM_OF_PIPE_CB: { LatteCP_itHLEBottomOfPipeCB(cmdData, nWords); @@ -1421,6 +1374,7 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx) void LatteCP_ProcessRingbuffer() { sint32 timerRecheck = 0; // estimates how much CP processing time has elapsed based on the executed commands, if the value exceeds CP_TIMER_RECHECK then _handleTimers() is called + uint32be tmpBuffer[128]; while (true) { uint32 itHeader = LatteCP_readU32Deprc(); @@ -1429,10 +1383,13 @@ void LatteCP_ProcessRingbuffer() { uint32 itCode = (itHeader >> 8) & 0xFF; uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1; - LatteCP_waitForNWords(nWords); - LatteCMDPtr cmd = (LatteCMDPtr)gxRingBufferReadPtr; - uint8* cmdEnd = gxRingBufferReadPtr + nWords * 4; - gxRingBufferReadPtr = cmdEnd; + cemu_assert(nWords < 128); + for (sint32 i=0; i s_updateRetirementEvent; + uint64 s_currentRetireMarker = 0; - enum class TCL_SUBMISSION_FLAG : uint32 + struct TCLStatePPC // mapped into PPC space { - SURFACE_SYNC = 0x400000, // submit surface sync packet before cmd - TRIGGER_INTERRUPT = 0x200000, // probably - UKN_20000000 = 0x20000000, + uint64be gpuRetireMarker; // written by GPU }; - int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, uint32be* controlFlags, uint64* submissionTimestamp) + SysAllocator s_tclStatePPC; + + // called from GPU for timestamp EOP event + void TCLGPUNotifyNewRetirementTimestamp() { - // todo - figure out all the bits of *controlFlags - // if submissionTimestamp != nullptr then set it to the timestamp of the submission. Note: We should make sure that uint64's are written atomically by the GPU command processor + // gpuRetireMarker is updated via event eop command + __OSLockScheduler(); + coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr()); + __OSUnlockScheduler(); + } - cemu_assert_debug(false); + int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut) + { + if (id == TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED) + { + MEMPTR b; + // this is the timestamp of the last buffer that was retired by the GPU + stdx::atomic_ref retireTimestamp(s_tclStatePPC->gpuRetireMarker); + *timestampOut = retireTimestamp.load(); + return 0; + } + else + { + cemuLog_log(LogType::Force, "TCLTimestamp(): Unsupported timestamp ID {}", (uint32)id); + *timestampOut = 0; + return 0; + } + } + int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout) + { + if (id == TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED) + { + while ( true ) + { + stdx::atomic_ref retireTimestamp(s_tclStatePPC->gpuRetireMarker); + uint64 currentTimestamp = retireTimestamp.load(); + if (currentTimestamp >= waitTs) + return 0; + coreinit::OSWaitEvent(s_updateRetirementEvent.GetPtr()); + } + } + else + { + cemuLog_log(LogType::Force, "TCLWaitTimestamp(): Unsupported timestamp ID {}", (uint32)id); + } + return 0; + } + + static constexpr uint32 TCL_RING_BUFFER_SIZE = 4096; // in U32s + + std::atomic tclRingBufferA[TCL_RING_BUFFER_SIZE]; + std::atomic tclRingBufferA_readIndex{0}; + uint32 tclRingBufferA_writeIndex{0}; + + // GPU code calls this to grab the next command word + bool TCLGPUReadRBWord(uint32& cmdWord) + { + if (tclRingBufferA_readIndex == tclRingBufferA_writeIndex) + return false; + cmdWord = tclRingBufferA[tclRingBufferA_readIndex]; + tclRingBufferA_readIndex = (tclRingBufferA_readIndex+1) % TCL_RING_BUFFER_SIZE; + return true; + } + + void TCLWaitForRBSpace(uint32be numU32s) + { + while ( true ) + { + uint32 distance = (tclRingBufferA_readIndex + TCL_RING_BUFFER_SIZE - tclRingBufferA_writeIndex) & (TCL_RING_BUFFER_SIZE - 1); + if (tclRingBufferA_writeIndex == tclRingBufferA_readIndex) // buffer completely empty + distance = TCL_RING_BUFFER_SIZE; + if (distance >= numU32s+1) // assume distance minus one, because we are never allowed to completely wrap around + break; + _mm_pause(); + } + } + + // this function assumes that TCLWaitForRBSpace was called and that there is enough space + void TCLWriteCmd(uint32be* cmd, uint32 cmdLen) + { + while (cmdLen > 0) + { + tclRingBufferA[tclRingBufferA_writeIndex] = *cmd; + tclRingBufferA_writeIndex++; + tclRingBufferA_writeIndex &= (TCL_RING_BUFFER_SIZE - 1); + cmd++; + cmdLen--; + } + } + + #define EVENT_TYPE_TS 5 + + void TCLSubmitRetireMarker(bool triggerEventInterrupt) + { + s_currentRetireMarker++; + uint32be cmd[6]; + cmd[0] = pm4HeaderType3(IT_EVENT_WRITE_EOP, 5); + cmd[1] = (4 | (EVENT_TYPE_TS << 8)); // event type (bits 8-15) and event index (bits 0-7). + cmd[2] = MEMPTR(&s_tclStatePPC->gpuRetireMarker).GetMPTR(); // address lower 32bits + data sel bits + cmd[3] = 0x40000000; // select 64bit write, lower 16 bits are the upper bits of the address + if (triggerEventInterrupt) + cmd[3] |= 0x2000000; // trigger interrupt after value has been written + cmd[4] = (uint32)s_currentRetireMarker; // data lower 32 bits + cmd[5] = (uint32)(s_currentRetireMarker>>32); // data higher 32 bits + TCLWriteCmd(cmd, 6); + } + + int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype* controlFlags, uint64be* timestampValueOut) + { + TCLSubmissionFlag flags = *controlFlags; + cemu_assert_debug(timestampValueOut); // handle case where this is null + + // make sure there is enough space to submit all commands at one + uint32 totalCommandLength = cmdLen; + totalCommandLength += 6; // space needed for TCLSubmitRetireMarker + + TCLWaitForRBSpace(totalCommandLength); + + // submit command buffer + TCLWriteCmd(cmd, cmdLen); + + // create new marker timestamp and tell GPU to write it to our variable after its done processing the command + if ((HAS_FLAG(flags, TCLSubmissionFlag::USE_RETIRED_MARKER))) + { + TCLSubmitRetireMarker(!HAS_FLAG(flags, TCLSubmissionFlag::NO_MARKER_INTERRUPT)); + *timestampValueOut = s_currentRetireMarker; // incremented before each submit + } + else + { + cemu_assert_unimplemented(); + } return 0; } void Initialize() { cafeExportRegister("TCL", TCLSubmitToRing, LogType::Placeholder); + cafeExportRegister("TCL", TCLTimestamp, LogType::Placeholder); + cafeExportRegister("TCL", TCLWaitTimestamp, LogType::Placeholder); + + s_currentRetireMarker = 0; + s_tclStatePPC->gpuRetireMarker = 0; + coreinit::OSInitEvent(s_updateRetirementEvent.GetPtr(), coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO); } } diff --git a/src/Cafe/OS/libs/TCL/TCL.h b/src/Cafe/OS/libs/TCL/TCL.h index ab5358b0..35f0a6bf 100644 --- a/src/Cafe/OS/libs/TCL/TCL.h +++ b/src/Cafe/OS/libs/TCL/TCL.h @@ -1,4 +1,25 @@ namespace TCL { + enum class TCLTimestampId + { + TIMESTAMP_LAST_BUFFER_RETIRED = 1, + }; + + enum class TCLSubmissionFlag : uint32 + { + SURFACE_SYNC = 0x400000, // submit surface sync packet before cmd + NO_MARKER_INTERRUPT = 0x200000, + USE_RETIRED_MARKER = 0x20000000, // Controls whether the timer is updated before or after (retired) the cmd. Also controls which timestamp is returned for the submission. Before and after using separate counters + }; + + int TCLTimestamp(TCLTimestampId id, uint64be* timestampOut); + int TCLWaitTimestamp(TCLTimestampId id, uint64 waitTs, uint64 timeout); + int TCLSubmitToRing(uint32be* cmd, uint32 cmdLen, betype* controlFlags, uint64be* timestampValueOut); + + // called from Latte code + bool TCLGPUReadRBWord(uint32& cmdWord); + void TCLGPUNotifyNewRetirementTimestamp(); + void Initialize(); -} \ No newline at end of file +} +ENABLE_BITMASK_OPERATORS(TCL::TCLSubmissionFlag); diff --git a/src/Cafe/OS/libs/gx2/GX2.cpp b/src/Cafe/OS/libs/gx2/GX2.cpp index 593d31fb..1c3a8dcc 100644 --- a/src/Cafe/OS/libs/gx2/GX2.cpp +++ b/src/Cafe/OS/libs/gx2/GX2.cpp @@ -59,7 +59,7 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU) if (isPokken) GX2::GX2DrawDone(); - GX2ReserveCmdSpace(5+2); + GX2::GX2ReserveCmdSpace(5+2); uint64 tick64 = PPCInterpreter_getMainCoreCycleCounter() / 20ULL; lastSwapTime = tick64; @@ -86,24 +86,16 @@ void gx2Export_GX2SwapScanBuffers(PPCInterpreter_t* hCPU) GX2::GX2WaitForFlip(); } - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); osLib_returnFromFunction(hCPU, 0); } void gx2Export_GX2CopyColorBufferToScanBuffer(PPCInterpreter_t* hCPU) { cemuLog_log(LogType::GX2, "GX2CopyColorBufferToScanBuffer(0x{:08x},{})", hCPU->gpr[3], hCPU->gpr[4]); - GX2ReserveCmdSpace(5); + GX2::GX2ReserveCmdSpace(10); // todo: proper implementation - // hack: Avoid running to far ahead of GPU. Normally this would be guaranteed by the circular buffer model, which we currently dont fully emulate - if(GX2::GX2WriteGather_getReadWriteDistance() > 32*1024*1024 ) - { - debug_printf("Waiting for GPU to catch up...\n"); - PPCInterpreter_relinquishTimeslice(); // release current thread - return; - } GX2ColorBuffer* colorBuffer = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER, 9)); @@ -309,81 +301,6 @@ void gx2Export_GX2SetSemaphore(PPCInterpreter_t* hCPU) osLib_returnFromFunction(hCPU, 0); } -void gx2Export_GX2Flush(PPCInterpreter_t* hCPU) -{ - cemuLog_log(LogType::GX2, "GX2Flush()"); - _GX2SubmitToTCL(); - osLib_returnFromFunction(hCPU, 0); -} - -uint8* _GX2LastFlushPtr[PPC_CORE_COUNT] = {NULL}; - -uint64 _prevReturnedGPUTime = 0; - -uint64 Latte_GetTime() -{ - uint64 gpuTime = coreinit::OSGetSystemTime(); - gpuTime *= 20000ULL; - if (gpuTime <= _prevReturnedGPUTime) - gpuTime = _prevReturnedGPUTime + 1; // avoid ever returning identical timestamps - _prevReturnedGPUTime = gpuTime; - return gpuTime; -} - -void _GX2SubmitToTCL() -{ - uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance()); - // do nothing if called from non-main GX2 core - if (GX2::sGX2MainCoreIndex != coreIndex) - { - cemuLog_logDebug(LogType::Force, "_GX2SubmitToTCL() called on non-main GX2 core"); - return; - } - if( gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL ) - return; // quit if in display list - _GX2LastFlushPtr[coreIndex] = (gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]); - // update last submitted CB timestamp - uint64 commandBufferTimestamp = Latte_GetTime(); - LatteGPUState.lastSubmittedCommandBufferTimestamp.store(commandBufferTimestamp); - cemuLog_log(LogType::GX2, "Submitting GX2 command buffer with timestamp {:016x}", commandBufferTimestamp); - // submit HLE packet to write retirement timestamp - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_SET_CB_RETIREMENT_TIMESTAMP, 2)); - gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp>>32ULL)); - gx2WriteGather_submitU32AsBE((uint32)(commandBufferTimestamp&0xFFFFFFFFULL)); -} - -uint32 _GX2GetUnflushedBytes(uint32 coreIndex) -{ - uint32 unflushedBytes = 0; - if (_GX2LastFlushPtr[coreIndex] != NULL) - { - if (_GX2LastFlushPtr[coreIndex] > gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]) - unflushedBytes = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer + 4); // this isn't 100% correct since we ignore the bytes between the last flush address and the start of the wrap around - else - unflushedBytes = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - _GX2LastFlushPtr[coreIndex]); - } - else - unflushedBytes = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer); - return unflushedBytes; -} - -/* - * Guarantees that the requested amount of space is available on the current command buffer - * If the space is not available, the current command buffer is pushed to the GPU and a new one is allocated - */ -void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32) -{ - uint32 coreIndex = coreinit::OSGetCoreId(); - // if we are in a display list then do nothing - if( gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL ) - return; - uint32 unflushedBytes = _GX2GetUnflushedBytes(coreIndex); - if( unflushedBytes >= 0x1000 ) - { - _GX2SubmitToTCL(); - } -} - void gx2_load() { osLib_addFunction("gx2", "GX2GetContextStateDisplayList", gx2Export_GX2GetContextStateDisplayList); @@ -445,10 +362,6 @@ void gx2_load() // semaphore osLib_addFunction("gx2", "GX2SetSemaphore", gx2Export_GX2SetSemaphore); - // command buffer - osLib_addFunction("gx2", "GX2Flush", gx2Export_GX2Flush); - - GX2::GX2Init_writeGather(); GX2::GX2MemInit(); GX2::GX2ResourceInit(); GX2::GX2CommandInit(); diff --git a/src/Cafe/OS/libs/gx2/GX2.h b/src/Cafe/OS/libs/gx2/GX2.h index a22719f4..92452864 100644 --- a/src/Cafe/OS/libs/gx2/GX2.h +++ b/src/Cafe/OS/libs/gx2/GX2.h @@ -67,10 +67,4 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU); void gx2Export_GX2SetDefaultState(PPCInterpreter_t* hCPU); void gx2Export_GX2SetupContextStateEx(PPCInterpreter_t* hCPU); -void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU); - -// command buffer - -uint32 _GX2GetUnflushedBytes(uint32 coreIndex); -void _GX2SubmitToTCL(); -void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32); \ No newline at end of file +void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU); \ No newline at end of file diff --git a/src/Cafe/OS/libs/gx2/GX2_Blit.cpp b/src/Cafe/OS/libs/gx2/GX2_Blit.cpp index db21c9af..6e0db6aa 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Blit.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Blit.cpp @@ -132,7 +132,6 @@ namespace GX2 depthFirstSlice = _swapEndianU32(depthBuffer->viewFirstSlice); depthNumSlices = _swapEndianU32(depthBuffer->viewNumSlices); } - gx2WriteGather_submit(pm4HeaderType3(IT_HLE_CLEAR_COLOR_DEPTH_STENCIL, 23), hleClearFlags, colorPhysAddr, diff --git a/src/Cafe/OS/libs/gx2/GX2_Command.cpp b/src/Cafe/OS/libs/gx2/GX2_Command.cpp index ec96a4ff..6699e1e1 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Command.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Command.cpp @@ -4,178 +4,397 @@ #include "Cafe/HW/Latte/Core/LattePM4.h" #include "Cafe/OS/libs/coreinit/coreinit.h" #include "Cafe/OS/libs/coreinit/coreinit_Thread.h" +#include "Cafe/OS/libs/TCL/TCL.h" #include "Cafe/HW/Latte/ISA/RegDefines.h" #include "GX2.h" #include "GX2_Command.h" #include "GX2_Shader.h" #include "GX2_Misc.h" +#include "OS/libs/coreinit/coreinit_MEM.h" -extern uint8* gxRingBufferReadPtr; - -GX2WriteGatherPipeState gx2WriteGatherPipe = { 0 }; +namespace GX2 +{ + GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT]; +} void gx2WriteGather_submitU32AsBE(uint32 v) { uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance()); - if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL) + if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr) return; - *(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = _swapEndianU32(v); - (*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4; + *(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = _swapEndianU32(v); + GX2::s_perCoreCBState[coreIndex].currentWritePtr++; + cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s)); } void gx2WriteGather_submitU32AsLE(uint32 v) { uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance()); - if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL) + if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr) return; - *(uint32*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = v; - (*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4; + *(uint32*)(GX2::s_perCoreCBState[coreIndex].currentWritePtr) = v; + GX2::s_perCoreCBState[coreIndex].currentWritePtr++; + cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s)); } void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues) { uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance()); - if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == NULL) + if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr) return; - memcpy_dwords((*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]), v, numValues); - (*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) += 4 * numValues; + memcpy_dwords(GX2::s_perCoreCBState[coreIndex].currentWritePtr, v, numValues); + GX2::s_perCoreCBState[coreIndex].currentWritePtr += numValues; + cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s)); } namespace GX2 { - sint32 gx2WriteGatherCurrentMainCoreIndex = -1; - bool gx2WriteGatherInited = false; - void GX2WriteGather_ResetToDefaultState() + struct GX2CommandState // mapped to PPC space since the GPU writes here { - gx2WriteGatherCurrentMainCoreIndex = -1; - gx2WriteGatherInited = false; - } + // command pool + MEMPTR commandPoolBase; + uint32 commandPoolSizeInU32s; + MEMPTR gpuCommandReadPtr; + // timestamp + uint64be lastSubmissionTime; + }; - void GX2Init_writeGather() // init write gather, make current core + SysAllocator s_commandState; + GX2PerCoreCBState s_mainCoreLastCommandState; + bool s_cbBufferIsInternallyAllocated; + + void GX2Command_StartNewCommandBuffer(uint32 numU32s); + + // called from GX2Init. Allocates a 4MB memory chunk from which command buffers are suballocated from + void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize) { - if (gx2WriteGatherPipe.gxRingBuffer == NULL) - gx2WriteGatherPipe.gxRingBuffer = (uint8*)malloc(GX2_COMMAND_RING_BUFFER_SIZE); - if (gx2WriteGatherCurrentMainCoreIndex == sGX2MainCoreIndex) - return; // write gather already configured for same core - for (sint32 i = 0; i < PPC_CORE_COUNT; i++) + cemu_assert_debug(!s_commandState->commandPoolBase); // should not be allocated already + // setup command buffer pool. If not provided allocate a 4MB or custom size buffer + uint32 poolSize = bufferSize ? bufferSize : 0x400000; // 4MB (can be overwritten by custom GX2Init parameters?) + if (bufferBase) { - if (i == sGX2MainCoreIndex) + s_commandState->commandPoolBase = (uint32be*)bufferBase; + s_cbBufferIsInternallyAllocated = false; + } + else + { + s_commandState->commandPoolBase = (uint32be*)coreinit::_weak_MEMAllocFromDefaultHeapEx(poolSize, 0x100); + s_cbBufferIsInternallyAllocated = true; + } + if (!s_commandState->commandPoolBase) + { + cemuLog_log(LogType::Force, "GX2: Failed to allocate command buffer pool"); + } + s_commandState->commandPoolSizeInU32s = poolSize / sizeof(uint32be); + s_commandState->gpuCommandReadPtr = s_commandState->commandPoolBase; + // init per-core command buffer state + for (uint32 i = 0; i < Espresso::CORE_COUNT; i++) + { + s_perCoreCBState[i].bufferPtr = nullptr; + s_perCoreCBState[i].bufferSizeInU32s = 0; + s_perCoreCBState[i].currentWritePtr = nullptr; + } + // start first command buffer for main core + GX2Command_StartNewCommandBuffer(0x100); + } + + void GX2Shutdown_commandBufferPool() + { + if (!s_commandState->commandPoolBase) + return; + if (s_cbBufferIsInternallyAllocated) + coreinit::_weak_MEMFreeToDefaultHeap(s_commandState->commandPoolBase.GetPtr()); + s_cbBufferIsInternallyAllocated = false; + s_commandState->commandPoolBase = nullptr; + s_commandState->commandPoolSizeInU32s = 0; + s_commandState->gpuCommandReadPtr = nullptr; + } + + // current position of where the GPU is reading from. Updated via a memory write command submitted to the GPU + uint32 GX2Command_GetPoolGPUReadIndex() + { + stdx::atomic_ref> _readPtr(s_commandState->gpuCommandReadPtr); + MEMPTR currentReadPtr = _readPtr.load(); + cemu_assert_debug(currentReadPtr); + return (uint32)(currentReadPtr.GetPtr() - s_commandState->commandPoolBase.GetPtr()); + } + + void GX2Command_WaitForNextBufferRetired() + { + uint64 retiredTimeStamp = GX2GetRetiredTimeStamp(); + retiredTimeStamp += 1; + // but cant be higher than the submission timestamp + stdx::atomic_ref _lastSubmissionTime(s_commandState->lastSubmissionTime); + uint64 submissionTimeStamp = _lastSubmissionTime.load(); + if (retiredTimeStamp > submissionTimeStamp) + retiredTimeStamp = submissionTimeStamp; + GX2WaitTimeStamp(retiredTimeStamp); + } + + void GX2Command_SetupCoreCommandBuffer(uint32be* buffer, uint32 sizeInU32s, bool isDisplayList) + { + uint32 coreIndex = coreinit::OSGetCoreId(); + auto& coreCBState = s_perCoreCBState[coreIndex]; + coreCBState.bufferPtr = buffer; + coreCBState.bufferSizeInU32s = sizeInU32s; + coreCBState.currentWritePtr = buffer; + coreCBState.isDisplayList = isDisplayList; + } + + void GX2Command_StartNewCommandBuffer(uint32 numU32s) + { + uint32 coreIndex = coreinit::OSGetCoreId(); + auto& coreCBState = s_perCoreCBState[coreIndex]; + numU32s = std::max(numU32s, 0x100); + // grab space from command buffer pool and if necessary wait for it + uint32be* bufferPtr = nullptr; + uint32 bufferSizeInU32s = 0; + uint32 readIndex; + while (true) + { + // try to grab buffer data from first available spot: + // 1. At the current write location up to the end of the buffer (avoiding an overlap with the read location) + // 2. From the start of the buffer up to the read location + readIndex = GX2Command_GetPoolGPUReadIndex(); + uint32be* nextWritePos = coreCBState.bufferPtr ? coreCBState.bufferPtr + coreCBState.bufferSizeInU32s : s_commandState->commandPoolBase.GetPtr(); + uint32 writeIndex = nextWritePos - s_commandState->commandPoolBase; + uint32 poolSizeInU32s = s_commandState->commandPoolSizeInU32s; + // readIndex == writeIndex can mean either buffer full or buffer empty + // we could use GX2GetRetiredTimeStamp() == GX2GetLastSubmittedTimeStamp() to determine if the buffer is truly empty + // but this can have false negatives since the last submission timestamp is updated independently of the read index + // so instead we just avoid ever filling the buffer completely + cemu_assert_debug(readIndex < poolSizeInU32s); + cemu_assert_debug(writeIndex < poolSizeInU32s); + if (writeIndex < readIndex) { - gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = gx2WriteGatherPipe.gxRingBuffer; - gx2WriteGatherPipe.writeGatherPtrWrite[i] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[i]; + // writeIndex has wrapped around + uint32 wordsAvailable = readIndex - writeIndex; + if (wordsAvailable > 0) + wordsAvailable--; // avoid writeIndex becoming equal to readIndex + if (wordsAvailable >= numU32s) + { + bufferPtr = s_commandState->commandPoolBase + writeIndex; + bufferSizeInU32s = wordsAvailable; + break; + } } else { - gx2WriteGatherPipe.writeGatherPtrGxBuffer[i] = NULL; - gx2WriteGatherPipe.writeGatherPtrWrite[i] = NULL; + uint32 wordsAvailable = poolSizeInU32s - writeIndex; + if (wordsAvailable > 0) + wordsAvailable--; // avoid writeIndex becoming equal to readIndex + if (wordsAvailable >= numU32s) + { + bufferPtr = nextWritePos; + bufferSizeInU32s = wordsAvailable; + break; + } + // not enough space at end of buffer, try to grab from the beginning of the buffer + wordsAvailable = readIndex; + if (wordsAvailable > 0) + wordsAvailable--; // avoid writeIndex becoming equal to readIndex + if (wordsAvailable >= numU32s) + { + bufferPtr = s_commandState->commandPoolBase; + bufferSizeInU32s = wordsAvailable; + break; + } } - gx2WriteGatherPipe.displayListStart[i] = MPTR_NULL; - gx2WriteGatherPipe.writeGatherPtrDisplayList[i] = NULL; - gx2WriteGatherPipe.displayListMaxSize[i] = 0; + GX2Command_WaitForNextBufferRetired(); + } + cemu_assert_debug(bufferPtr); + bufferSizeInU32s = std::min(numU32s, 0x20000); // size cap +#ifdef CEMU_DEBUG_ASSERT + uint32 newWriteIndex = ((bufferPtr - s_commandState->commandPoolBase) + bufferSizeInU32s) % s_commandState->commandPoolSizeInU32s; + cemu_assert_debug(newWriteIndex != readIndex); +#endif + // setup buffer and make it the current write gather target + cemu_assert_debug(bufferPtr >= s_commandState->commandPoolBase && (bufferPtr + bufferSizeInU32s) <= s_commandState->commandPoolBase + s_commandState->commandPoolSizeInU32s); + GX2Command_SetupCoreCommandBuffer(bufferPtr, bufferSizeInU32s, false); + } + + void GX2Command_SubmitCommandBuffer(uint32be* buffer, uint32 sizeInU32s, MEMPTR* completionGPUReadPointer, bool triggerMarkerInterrupt) + { + uint32be cmd[10]; + uint32 cmdLen = 4; + cmd[0] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3); + cmd[1] = memory_virtualToPhysical(MEMPTR(buffer).GetMPTR()); + cmd[2] = 0x00000000; // address high bits + cmd[3] = sizeInU32s; + if (completionGPUReadPointer) + { + // append command to update completionGPUReadPointer after the GPU is done with the command buffer + cmd[4] = pm4HeaderType3(IT_MEM_WRITE, 4); + cmd[5] = memory_virtualToPhysical(MEMPTR(completionGPUReadPointer).GetMPTR()) | 2; + cmd[6] = 0x40000; + cmd[7] = MEMPTR(buffer + sizeInU32s).GetMPTR(); // value to write + cmd[8] = 0x00000000; + cmdLen = 9; + } + + betype submissionFlags{}; + if (!triggerMarkerInterrupt) + submissionFlags |= TCL::TCLSubmissionFlag::NO_MARKER_INTERRUPT; + submissionFlags |= TCL::TCLSubmissionFlag::USE_RETIRED_MARKER; + + TCL::TCLSubmitToRing(cmd, cmdLen, &submissionFlags, &s_commandState->lastSubmissionTime); + } + + void GX2Command_PadCurrentBuffer() + { + uint32 coreIndex = coreinit::OSGetCoreId(); + auto& coreCBState = s_perCoreCBState[coreIndex]; + if (!coreCBState.currentWritePtr) + return; + uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr); + if ((writeDistance&7) != 0) + { + uint32 distanceToPad = 0x8 - (writeDistance & 0x7); + while (distanceToPad) + { + *coreCBState.currentWritePtr = pm4HeaderType2Filler(); + coreCBState.currentWritePtr++; + distanceToPad--; + } + } + } + + void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt) + { + uint32 coreIndex = coreinit::OSGetCoreId(); + auto& coreCBState = s_perCoreCBState[coreIndex]; + if (coreCBState.isDisplayList) + { + // display list + cemu_assert_debug((uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) < coreCBState.bufferSizeInU32s); + cemuLog_logDebugOnce(LogType::Force, "GX2 flush called on display list"); + } + else + { + // command buffer + if (coreCBState.currentWritePtr != coreCBState.bufferPtr) + { + // pad the command buffer to 32 byte alignment + GX2Command_PadCurrentBuffer(); + // submit it to the GPU + uint32 bufferLength = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr); + cemu_assert_debug(bufferLength <= coreCBState.bufferSizeInU32s); + GX2Command_SubmitCommandBuffer(coreCBState.bufferPtr, bufferLength, &s_commandState->gpuCommandReadPtr, triggerMarkerInterrupt); + GX2Command_StartNewCommandBuffer(numU32sForNextBuffer); + } + else + { + // current buffer is empty so we dont need to queue it + if (numU32sForNextBuffer > s_commandState->commandPoolSizeInU32s) + GX2Command_StartNewCommandBuffer(numU32sForNextBuffer); + } + } + } + + void GX2Flush() + { + GX2Command_Flush(256, true); + } + + uint64 GX2GetLastSubmittedTimeStamp() + { + stdx::atomic_ref _lastSubmissionTime(s_commandState->lastSubmissionTime); + return _lastSubmissionTime.load(); + } + + uint64 GX2GetRetiredTimeStamp() + { + uint64be ts = 0; + TCL::TCLTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, &ts); + return ts; + } + + bool GX2WaitTimeStamp(uint64 tsWait) + { + // handle GPU timeout here? But for now we timeout after 60 seconds + TCL::TCLWaitTimestamp(TCL::TCLTimestampId::TIMESTAMP_LAST_BUFFER_RETIRED, tsWait, Espresso::TIMER_CLOCK * 60); + return true; + } + + /* + * Guarantees that the requested amount of space is available on the current command buffer + * If the space is not available, the current command buffer is pushed to the GPU and a new one is allocated + */ + void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32) + { + uint32 coreIndex = coreinit::OSGetCoreId(); + auto& coreCBState = s_perCoreCBState[coreIndex]; + if (coreCBState.currentWritePtr == nullptr) + return; + uint32 writeDistance = (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr); + if (writeDistance + reservedFreeSpaceInU32 > coreCBState.bufferSizeInU32s) + { + GX2Command_Flush(reservedFreeSpaceInU32, true); } - gx2WriteGatherCurrentMainCoreIndex = sGX2MainCoreIndex; - gx2WriteGatherInited = true; } void GX2WriteGather_beginDisplayList(PPCInterpreter_t* hCPU, MPTR buffer, uint32 maxSize) { uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU); - gx2WriteGatherPipe.displayListStart[coreIndex] = buffer; - gx2WriteGatherPipe.displayListMaxSize[coreIndex] = maxSize; - // set new write gather ptr - gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex] = memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex]); - gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrDisplayList[coreIndex]; + if (coreIndex == sGX2MainCoreIndex) + { + GX2Command_PadCurrentBuffer(); + cemu_assert_debug(!s_perCoreCBState[coreIndex].isDisplayList); + s_mainCoreLastCommandState = s_perCoreCBState[coreIndex]; + } + GX2Command_SetupCoreCommandBuffer(MEMPTR(buffer), maxSize/4, true); } uint32 GX2WriteGather_getDisplayListWriteDistance(sint32 coreIndex) { - return (uint32)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] - memory_getPointerFromVirtualOffset(gx2WriteGatherPipe.displayListStart[coreIndex])); - } - - uint32 GX2WriteGather_getFifoWriteDistance(uint32 coreIndex) - { - uint32 writeDistance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] - gx2WriteGatherPipe.gxRingBuffer); - return writeDistance; + auto& coreCBState = s_perCoreCBState[coreIndex]; + cemu_assert_debug(coreCBState.isDisplayList); + if (coreCBState.currentWritePtr == nullptr) + return 0; + return (uint32)(coreCBState.currentWritePtr - coreCBState.bufferPtr) * 4; } uint32 GX2WriteGather_endDisplayList(PPCInterpreter_t* hCPU, MPTR buffer) { - uint32 coreIndex = PPCInterpreter_getCoreIndex(hCPU); - if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL) + uint32 coreIndex = coreinit::OSGetCoreId(); + auto& coreCBState = s_perCoreCBState[coreIndex]; + GX2Command_PadCurrentBuffer(); + uint32 finalWriteIndex = coreCBState.currentWritePtr - coreCBState.bufferPtr; + cemu_assert_debug(finalWriteIndex <= coreCBState.bufferSizeInU32s); + // if we are on the main GX2 core then restore the GPU command buffer + if (coreIndex == sGX2MainCoreIndex) { - uint32 currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex); - // pad to 32 byte - if (gx2WriteGatherPipe.displayListMaxSize[coreIndex] >= ((gx2WriteGatherPipe.displayListMaxSize[coreIndex] + 0x1F) & ~0x1F)) - { - while ((currentWriteSize & 0x1F) != 0) - { - gx2WriteGather_submitU32AsBE(pm4HeaderType2Filler()); - currentWriteSize += 4; - } - } - // get size of written data - currentWriteSize = GX2WriteGather_getDisplayListWriteDistance(coreIndex); - // disable current display list and restore write gather ptr - gx2WriteGatherPipe.displayListStart[coreIndex] = MPTR_NULL; - if (sGX2MainCoreIndex == coreIndex) - gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = &gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]; - else - gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] = NULL; - // return size of (written) display list - return currentWriteSize; + coreCBState = s_mainCoreLastCommandState; } else { - // no active display list - // return a size of 0 - return 0; + coreCBState.bufferPtr = nullptr; + coreCBState.currentWritePtr = nullptr; + coreCBState.bufferSizeInU32s = 0; + coreCBState.isDisplayList = false; } + return finalWriteIndex * 4; } - bool GX2GetCurrentDisplayList(betype* displayListAddr, uint32be* displayListSize) + bool GX2GetCurrentDisplayList(MEMPTR* displayListAddr, uint32be* displayListSize) { uint32 coreIndex = coreinit::OSGetCoreId(); - if (gx2WriteGatherPipe.displayListStart[coreIndex] == MPTR_NULL) + auto& coreCBState = s_perCoreCBState[coreIndex]; + if (!coreCBState.isDisplayList) return false; - if (displayListAddr) - *displayListAddr = gx2WriteGatherPipe.displayListStart[coreIndex]; + *displayListAddr = coreCBState.bufferPtr; if (displayListSize) - *displayListSize = gx2WriteGatherPipe.displayListMaxSize[coreIndex]; - + *displayListSize = coreCBState.bufferSizeInU32s * sizeof(uint32be); return true; } + // returns true if we are writing to a display list bool GX2GetDisplayListWriteStatus() { - // returns true if we are writing to a display list uint32 coreIndex = coreinit::OSGetCoreId(); - return gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL; - } - - uint32 GX2WriteGather_getReadWriteDistance() - { - uint32 coreIndex = sGX2MainCoreIndex; - uint32 writeDistance = (uint32)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] + GX2_COMMAND_RING_BUFFER_SIZE - gxRingBufferReadPtr); - writeDistance %= GX2_COMMAND_RING_BUFFER_SIZE; - return writeDistance; - } - - void GX2WriteGather_checkAndInsertWrapAroundMark() - { - uint32 coreIndex = coreinit::OSGetCoreId(); - if (coreIndex != sGX2MainCoreIndex) // only if main gx2 core - return; - if (gx2WriteGatherPipe.displayListStart[coreIndex] != MPTR_NULL) - return; - uint32 writeDistance = GX2WriteGather_getFifoWriteDistance(coreIndex); - if (writeDistance >= (GX2_COMMAND_RING_BUFFER_SIZE * 3 / 5)) - { - gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_HLE_FIFO_WRAP_AROUND, 1)); - gx2WriteGather_submitU32AsBE(0); // empty word since we can't send commands with zero data words - gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] = gx2WriteGatherPipe.gxRingBuffer; - } + return s_perCoreCBState[coreIndex].isDisplayList; } void GX2BeginDisplayList(MEMPTR displayListAddr, uint32 size) @@ -204,28 +423,23 @@ namespace GX2 memory_virtualToPhysical(addr), 0, // high address bits size / 4); - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); } void GX2DirectCallDisplayList(void* addr, uint32 size) { // this API submits to TCL directly and bypasses write-gatherer // its basically a way to manually submit a command buffer to the GPU - // as such it also affects the submission and retire timestamps - - uint32 coreIndex = PPCInterpreter_getCoreIndex(PPCInterpreter_getCurrentInstance()); - cemu_assert_debug(coreIndex == sGX2MainCoreIndex); - coreIndex = sGX2MainCoreIndex; // always submit to main queue which is owned by GX2 main core (TCLSubmitToRing does not need this workaround) - - uint32be* cmdStream = (uint32be*)(gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex]); - cmdStream[0] = pm4HeaderType3(IT_INDIRECT_BUFFER_PRIV, 3); - cmdStream[1] = memory_virtualToPhysical(MEMPTR(addr).GetMPTR()); - cmdStream[2] = 0; - cmdStream[3] = size / 4; - gx2WriteGatherPipe.writeGatherPtrGxBuffer[coreIndex] += 16; - - // update submission timestamp and retired timestamp - _GX2SubmitToTCL(); + uint32 coreIndex = coreinit::OSGetCoreId(); + if (coreIndex != sGX2MainCoreIndex) + { + cemuLog_logDebugOnce(LogType::Force, "GX2DirectCallDisplayList() called on non-main GX2 core"); + } + if (!s_perCoreCBState[coreIndex].isDisplayList) + { + // make sure any preceeding commands are submitted first + GX2Command_Flush(0x100, false); + } + GX2Command_SubmitCommandBuffer(static_cast(addr), size / 4, nullptr, false); } void GX2CopyDisplayList(MEMPTR addr, uint32 size) @@ -288,6 +502,12 @@ namespace GX2 void GX2CommandInit() { + cafeExportRegister("gx2", GX2Flush, LogType::GX2); + + cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2); + cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2); + cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2); + cafeExportRegister("gx2", GX2BeginDisplayList, LogType::GX2); cafeExportRegister("gx2", GX2BeginDisplayListEx, LogType::GX2); cafeExportRegister("gx2", GX2EndDisplayList, LogType::GX2); @@ -295,7 +515,6 @@ namespace GX2 cafeExportRegister("gx2", GX2GetCurrentDisplayList, LogType::GX2); cafeExportRegister("gx2", GX2GetDisplayListWriteStatus, LogType::GX2); - cafeExportRegister("gx2", GX2CallDisplayList, LogType::GX2); cafeExportRegister("gx2", GX2DirectCallDisplayList, LogType::GX2); cafeExportRegister("gx2", GX2CopyDisplayList, LogType::GX2); @@ -305,7 +524,10 @@ namespace GX2 void GX2CommandResetToDefaultState() { - GX2WriteGather_ResetToDefaultState(); + s_commandState->commandPoolBase = nullptr; + s_commandState->commandPoolSizeInU32s = 0; + s_commandState->gpuCommandReadPtr = nullptr; + s_cbBufferIsInternallyAllocated = false; } } diff --git a/src/Cafe/OS/libs/gx2/GX2_Command.h b/src/Cafe/OS/libs/gx2/GX2_Command.h index 51c04928..00f5d427 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Command.h +++ b/src/Cafe/OS/libs/gx2/GX2_Command.h @@ -2,21 +2,19 @@ #include "Cafe/HW/Latte/ISA/LatteReg.h" #include "Cafe/HW/Espresso/Const.h" -struct GX2WriteGatherPipeState +namespace GX2 { - uint8* gxRingBuffer; - // each core has it's own write gatherer and display list state (writing) - uint8* writeGatherPtrGxBuffer[Espresso::CORE_COUNT]; - uint8** writeGatherPtrWrite[Espresso::CORE_COUNT]; - uint8* writeGatherPtrDisplayList[Espresso::CORE_COUNT]; - MPTR displayListStart[Espresso::CORE_COUNT]; - uint32 displayListMaxSize[Espresso::CORE_COUNT]; + struct GX2PerCoreCBState + { + uint32be* bufferPtr; + uint32 bufferSizeInU32s; + uint32be* currentWritePtr; + bool isDisplayList; + }; + + extern GX2PerCoreCBState s_perCoreCBState[Espresso::CORE_COUNT]; }; -extern GX2WriteGatherPipeState gx2WriteGatherPipe; - -void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32); // move to GX2 namespace eventually - void gx2WriteGather_submitU32AsBE(uint32 v); void gx2WriteGather_submitU32AsLE(uint32 v); void gx2WriteGather_submitU32AsLEArray(uint32* v, uint32 numValues); @@ -27,7 +25,8 @@ uint32 PPCInterpreter_getCurrentCoreIndex(); template inline void gx2WriteGather_submit_(uint32 coreIndex, uint32be* writePtr) { - (*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]) = (uint8*)writePtr; + GX2::s_perCoreCBState[coreIndex].currentWritePtr = writePtr; + cemu_assert_debug(GX2::s_perCoreCBState[coreIndex].currentWritePtr <= (GX2::s_perCoreCBState[coreIndex].bufferPtr + GX2::s_perCoreCBState[coreIndex].bufferSizeInU32s)); } template @@ -75,17 +74,23 @@ template inline void gx2WriteGather_submit(Targs... args) { uint32 coreIndex = PPCInterpreter_getCurrentCoreIndex(); - if (gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex] == nullptr) + if (GX2::s_perCoreCBState[coreIndex].currentWritePtr == nullptr) + { + cemu_assert_suspicious(); // writing to command buffer without valid write pointer? return; - - uint32be* writePtr = (uint32be*)(*gx2WriteGatherPipe.writeGatherPtrWrite[coreIndex]); + } + uint32be* writePtr = GX2::s_perCoreCBState[coreIndex].currentWritePtr; gx2WriteGather_submit_(coreIndex, writePtr, std::forward(args)...); } namespace GX2 { - uint32 GX2WriteGather_getReadWriteDistance(); - void GX2WriteGather_checkAndInsertWrapAroundMark(); + void GX2Command_Flush(uint32 numU32sForNextBuffer, bool triggerMarkerInterrupt = true); + void GX2ReserveCmdSpace(uint32 reservedFreeSpaceInU32); + + uint64 GX2GetLastSubmittedTimeStamp(); + uint64 GX2GetRetiredTimeStamp(); + bool GX2WaitTimeStamp(uint64 tsWait); void GX2BeginDisplayList(MEMPTR displayListAddr, uint32 size); void GX2BeginDisplayListEx(MEMPTR displayListAddr, uint32 size, bool profiling); @@ -96,7 +101,8 @@ namespace GX2 bool GX2GetDisplayListWriteStatus(); - void GX2Init_writeGather(); void GX2CommandInit(); + void GX2Init_commandBufferPool(void* bufferBase, uint32 bufferSize); + void GX2Shutdown_commandBufferPool(); void GX2CommandResetToDefaultState(); } \ No newline at end of file diff --git a/src/Cafe/OS/libs/gx2/GX2_ContextState.cpp b/src/Cafe/OS/libs/gx2/GX2_ContextState.cpp index cf150b47..fb631a11 100644 --- a/src/Cafe/OS/libs/gx2/GX2_ContextState.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_ContextState.cpp @@ -168,7 +168,7 @@ uint32 _GX2Context_CalcStateSize() void _GX2Context_CreateLoadDL() { - GX2ReserveCmdSpace(3); + GX2::GX2ReserveCmdSpace(3); gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2)); gx2WriteGather_submitU32AsBE(0x80000077); gx2WriteGather_submitU32AsBE(0x80000077); @@ -176,7 +176,7 @@ void _GX2Context_CreateLoadDL() void _GX2Context_WriteCmdDisableStateShadowing() { - GX2ReserveCmdSpace(3); + GX2::GX2ReserveCmdSpace(3); gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_CONTEXT_CONTROL, 2)); gx2WriteGather_submitU32AsBE(0x80000000); gx2WriteGather_submitU32AsBE(0x80000000); @@ -184,7 +184,7 @@ void _GX2Context_WriteCmdDisableStateShadowing() void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, uint32 waitForIdle, uint32 numRegOffsetEntries, GX2RegLoadPktEntry_t* regOffsetEntries) { - GX2ReserveCmdSpace(3 + numRegOffsetEntries*2); + GX2::GX2ReserveCmdSpace(3 + numRegOffsetEntries*2); gx2WriteGather_submitU32AsBE(pm4Header); gx2WriteGather_submitU32AsBE(physAddrRegArea); gx2WriteGather_submitU32AsBE(waitForIdle); @@ -199,7 +199,6 @@ void _GX2Context_cmdLoad(void* gx2ukn, uint32 pm4Header, MPTR physAddrRegArea, u void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32 ukn) { - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); MPTR physAddrContextState = memory_virtualToPhysical(memory_getVirtualOffsetFromPointer(gx2ContextState)); _GX2Context_CreateLoadDL(); __cmdStateLoad(NULL, IT_LOAD_CONFIG_REG, gx2ContextState->hwContext.areaConfigReg, 0x80000000, configReg_loadPktEntries); @@ -212,7 +211,7 @@ void _GX2Context_WriteCmdRestoreState(GX2ContextState_t* gx2ContextState, uint32 void GX2SetDefaultState() { - GX2ReserveCmdSpace(0x100); + GX2::GX2ReserveCmdSpace(0x100); Latte::LATTE_PA_CL_VTE_CNTL reg{}; reg.set_VPORT_X_OFFSET_ENA(true).set_VPORT_X_SCALE_ENA(true); @@ -376,7 +375,6 @@ void gx2Export_GX2SetContextState(PPCInterpreter_t* hCPU) osLib_returnFromFunction(hCPU, 0); } - void gx2Export_GX2GetContextStateDisplayList(PPCInterpreter_t* hCPU) { cemuLog_log(LogType::GX2, "GX2GetContextStateDisplayList(0x{:08x}, 0x{:08x}, 0x{:08x})", hCPU->gpr[3], hCPU->gpr[4], hCPU->gpr[5]); diff --git a/src/Cafe/OS/libs/gx2/GX2_Draw.cpp b/src/Cafe/OS/libs/gx2/GX2_Draw.cpp index 053b787b..958978e1 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Draw.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Draw.cpp @@ -52,7 +52,6 @@ namespace GX2 0, count, 0); - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); } void GX2DrawIndexedEx2(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances, uint32 baseInstance) @@ -85,7 +84,6 @@ namespace GX2 pm4HeaderType3(IT_SET_CTL_CONST, 2), 1, 0 // baseInstance ); - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); } void GX2DrawEx(GX2PrimitiveMode2 primitiveMode, uint32 count, uint32 baseVertex, uint32 numInstances) @@ -109,7 +107,6 @@ namespace GX2 count, 0 // DRAW_INITIATOR ); - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); } void GX2DrawIndexedImmediateEx(GX2PrimitiveMode2 primitiveMode, uint32 count, GX2IndexType indexType, void* indexData, uint32 baseVertex, uint32 numInstances) @@ -177,7 +174,6 @@ namespace GX2 } } - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); } struct GX2DispatchComputeParam diff --git a/src/Cafe/OS/libs/gx2/GX2_Event.cpp b/src/Cafe/OS/libs/gx2/GX2_Event.cpp index 9748e20b..645f0a79 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Event.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Event.cpp @@ -16,18 +16,6 @@ namespace GX2 SysAllocator g_vsyncThreadQueue; SysAllocator g_flipThreadQueue; - SysAllocator s_updateRetirementEvent; - std::atomic s_lastRetirementTimestamp = 0; - - // called from GPU code when a command buffer is retired - void __GX2NotifyNewRetirementTimestamp(uint64 tsRetire) - { - __OSLockScheduler(); - s_lastRetirementTimestamp = tsRetire; - coreinit::OSSignalEventAllInternal(s_updateRetirementEvent.GetPtr()); - __OSUnlockScheduler(); - } - void GX2SetGPUFence(uint32be* fencePtr, uint32 mask, uint32 compareOp, uint32 compareValue) { GX2ReserveCmdSpace(7); @@ -210,16 +198,6 @@ namespace GX2 osLib_returnFromFunction(hCPU, 0); } - uint64 GX2GetLastSubmittedTimeStamp() - { - return LatteGPUState.lastSubmittedCommandBufferTimestamp.load(); - } - - uint64 GX2GetRetiredTimeStamp() - { - return s_lastRetirementTimestamp; - } - void GX2WaitForVsync() { __OSLockScheduler(); @@ -236,19 +214,6 @@ namespace GX2 __OSUnlockScheduler(); } - bool GX2WaitTimeStamp(uint64 tsWait) - { - __OSLockScheduler(); - while (tsWait > s_lastRetirementTimestamp) - { - // GPU hasn't caught up yet - coreinit::OSWaitEventInternal(s_updateRetirementEvent.GetPtr()); - } - __OSUnlockScheduler(); - // return true to indicate no timeout - return true; - } - void GX2DrawDone() { // optional force full sync (texture readback and occlusion queries) @@ -263,13 +228,10 @@ namespace GX2 gx2WriteGather_submitU32AsBE(0x00000000); // unused } // flush pipeline - if (_GX2GetUnflushedBytes(coreinit::OSGetCoreId()) > 0) - _GX2SubmitToTCL(); + GX2Command_Flush(0x100, true); uint64 ts = GX2GetLastSubmittedTimeStamp(); GX2WaitTimeStamp(ts); - - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); } void GX2Init_event() @@ -294,25 +256,19 @@ namespace GX2 cafeExportRegister("gx2", GX2SetEventCallback, LogType::GX2); cafeExportRegister("gx2", GX2GetEventCallback, LogType::GX2); - cafeExportRegister("gx2", GX2GetLastSubmittedTimeStamp, LogType::GX2); - cafeExportRegister("gx2", GX2GetRetiredTimeStamp, LogType::GX2); - cafeExportRegister("gx2", GX2WaitForVsync, LogType::GX2); cafeExportRegister("gx2", GX2WaitForFlip, LogType::GX2); - cafeExportRegister("gx2", GX2WaitTimeStamp, LogType::GX2); cafeExportRegister("gx2", GX2DrawDone, LogType::GX2); coreinit::OSInitThreadQueue(g_vsyncThreadQueue.GetPtr()); coreinit::OSInitThreadQueue(g_flipThreadQueue.GetPtr()); - coreinit::OSInitEvent(s_updateRetirementEvent, coreinit::OSEvent::EVENT_STATE::STATE_NOT_SIGNALED, coreinit::OSEvent::EVENT_MODE::MODE_AUTO); coreinit::OSInitSemaphore(s_eventCbQueueSemaphore, 0); } void GX2EventResetToDefaultState() { s_callbackThreadLaunched = false; - s_lastRetirementTimestamp = 0; for(auto& it : s_eventCallback) { it.callbackFuncPtr = nullptr; diff --git a/src/Cafe/OS/libs/gx2/GX2_Misc.cpp b/src/Cafe/OS/libs/gx2/GX2_Misc.cpp index 3c7ea3f9..e7830cd8 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Misc.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Misc.cpp @@ -81,19 +81,68 @@ namespace GX2 void _test_AddrLib(); - void GX2Init(void* initSettings) + using GX2InitArg = uint32; + enum class GX2InitArgId : GX2InitArg + { + EndOfArgs = 0, + CommandPoolBase = 1, + CommandPoolSize = 2, + UknArg7 = 7, + UknArg8 = 8, + UknArg9 = 9, + UknArg11 = 11, + }; + + void GX2Init(betype* initArgStream) { if (LatteGPUState.gx2InitCalled) { cemuLog_logDebug(LogType::Force, "GX2Init() called while already initialized"); return; } + // parse init params from the stream + MEMPTR commandPoolBase = nullptr; + uint32 commandPoolSize = 0; + if (initArgStream) + { + while (true) + { + GX2InitArgId paramId = static_cast((GX2InitArg)*initArgStream); + initArgStream++; + if (paramId == GX2InitArgId::EndOfArgs) + { + break; + } + else if (paramId == GX2InitArgId::CommandPoolBase) + { + commandPoolBase = MEMPTR(*initArgStream); + initArgStream++; + } + else if (paramId == GX2InitArgId::CommandPoolSize) + { + commandPoolSize = *initArgStream; + initArgStream++; + } + else if (paramId == GX2InitArgId::UknArg7 || + paramId == GX2InitArgId::UknArg8 || + paramId == GX2InitArgId::UknArg9 || + paramId == GX2InitArgId::UknArg11) + { + initArgStream++; + } + else + { + cemuLog_log(LogType::Force, "GX2Init: Unsupported init arg {}", (uint32)paramId); + } + } + } + // init main core uint32 coreIndex = coreinit::OSGetCoreId(); cemuLog_log(LogType::GX2, "GX2Init() on core {} by thread 0x{:08x}", coreIndex, MEMPTR(coreinit::OSGetCurrentThread()).GetMPTR()); sGX2MainCoreIndex = coreIndex; // init submodules GX2::GX2Init_event(); - GX2::GX2Init_writeGather(); + GX2::GX2Init_commandBufferPool(commandPoolBase, commandPoolSize); // init shared area if (LatteGPUState.sharedAreaAddr == MPTR_NULL) { @@ -112,6 +161,21 @@ namespace GX2 _test_AddrLib(); } + void GX2Shutdown() + { + if (!LatteGPUState.gx2InitCalled) + { + cemuLog_logDebug(LogType::Force, "GX2Shutdown() called while not initialized"); + return; + } + LatteGPUState.gx2InitCalled--; + if (LatteGPUState.gx2InitCalled != 0) + return; + GX2DrawDone(); + GX2Shutdown_commandBufferPool(); + cemuLog_log(LogType::Force, "GX2 shutdown"); + } + void _GX2DriverReset() { LatteGPUState.gx2InitCalled = 0; @@ -237,6 +301,7 @@ namespace GX2 void GX2MiscInit() { cafeExportRegister("gx2", GX2Init, LogType::GX2); + cafeExportRegister("gx2", GX2Shutdown, LogType::GX2); cafeExportRegister("gx2", GX2GetMainCoreId, LogType::GX2); cafeExportRegister("gx2", GX2ResetGPU, LogType::GX2); diff --git a/src/Cafe/OS/libs/gx2/GX2_RenderTarget.cpp b/src/Cafe/OS/libs/gx2/GX2_RenderTarget.cpp index 2a257a67..8abc3613 100644 --- a/src/Cafe/OS/libs/gx2/GX2_RenderTarget.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_RenderTarget.cpp @@ -135,7 +135,7 @@ void gx2Export_GX2InitDepthBufferRegs(PPCInterpreter_t* hCPU) void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU) { cemuLog_log(LogType::GX2, "GX2SetColorBuffer(0x{:08x}, {})", hCPU->gpr[3], hCPU->gpr[4]); - GX2ReserveCmdSpace(20); + GX2::GX2ReserveCmdSpace(20); GX2ColorBuffer* colorBufferBE = (GX2ColorBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); @@ -198,15 +198,13 @@ void gx2Export_GX2SetColorBuffer(PPCInterpreter_t* hCPU) mmCB_COLOR0_INFO - 0xA000 + hCPU->gpr[4], colorBufferBE->reg_info); - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); - osLib_returnFromFunction(hCPU, 0); } void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU) { cemuLog_log(LogType::GX2, "GX2SetDepthBuffer(0x{:08x})", hCPU->gpr[3]); - GX2ReserveCmdSpace(20); + GX2::GX2ReserveCmdSpace(20); GX2DepthBuffer* depthBufferBE = (GX2DepthBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); @@ -264,8 +262,6 @@ void gx2Export_GX2SetDepthBuffer(PPCInterpreter_t* hCPU) gx2WriteGather_submitU32AsBE(mmDB_DEPTH_VIEW - 0xA000); gx2WriteGather_submitU32AsBE(db_view); - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); - osLib_returnFromFunction(hCPU, 0); } @@ -281,7 +277,7 @@ void gx2Export_GX2MarkScanBufferCopied(PPCInterpreter_t* hCPU) uint32 scanTarget = hCPU->gpr[3]; if( scanTarget == GX2_SCAN_TARGET_TV ) { - GX2ReserveCmdSpace(10); + GX2::GX2ReserveCmdSpace(10); uint32 physAddr = (MEMORY_TILINGAPERTURE_AREA_ADDR+0x200000); diff --git a/src/Cafe/OS/libs/gx2/GX2_Shader.cpp b/src/Cafe/OS/libs/gx2/GX2_Shader.cpp index 7a153737..20a773e0 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Shader.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Shader.cpp @@ -303,7 +303,27 @@ namespace GX2 void GX2SetVertexShader(GX2VertexShader* vertexShader) { - GX2ReserveCmdSpace(100); + uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize; + numOutputIds = std::min(numOutputIds, 0xA); + uint32 vsSemanticTableSize = vertexShader->regs.semanticTableSize; + + uint32 reserveSize = 31; + if (vertexShader->shaderMode == GX2_SHADER_MODE::GEOMETRY_SHADER) + { + reserveSize += 7; + } + else + { + reserveSize += 18; + reserveSize += numOutputIds; + if (vertexShader->usesStreamOut != 0) + reserveSize += 2+12; + } + if (vsSemanticTableSize > 0) + { + reserveSize += 5 + vsSemanticTableSize; + } + GX2ReserveCmdSpace(reserveSize); MPTR shaderProgramAddr; uint32 shaderProgramSize; @@ -361,8 +381,6 @@ namespace GX2 cemu_assert_debug(vertexShader->regs.SPI_VS_OUT_CONFIG.value().get_VS_PER_COMPONENT() == false); // not handled on the GPU side - uint32 numOutputIds = vertexShader->regs.vsOutIdTableSize; - numOutputIds = std::min(numOutputIds, 0xA); gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds)); gx2WriteGather_submitU32AsBE(Latte::REGADDR::SPI_VS_OUT_ID_0-0xA000); for(uint32 i=0; iregs.semanticTableSize; if (vsSemanticTableSize > 0) { gx2WriteGather_submit( diff --git a/src/Cafe/OS/libs/gx2/GX2_State.cpp b/src/Cafe/OS/libs/gx2/GX2_State.cpp index d9c0420f..795ff527 100644 --- a/src/Cafe/OS/libs/gx2/GX2_State.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_State.cpp @@ -213,7 +213,6 @@ namespace GX2 void GX2SetViewportReg(GX2ViewportReg* viewportReg) { - GX2::GX2WriteGather_checkAndInsertWrapAroundMark(); GX2ReserveCmdSpace(2 + 6); gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 1 + 6), diff --git a/src/Cafe/OS/libs/gx2/GX2_Surface_Copy.cpp b/src/Cafe/OS/libs/gx2/GX2_Surface_Copy.cpp index fe785d61..ce85048e 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Surface_Copy.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Surface_Copy.cpp @@ -264,7 +264,7 @@ void gx2Surface_GX2CopySurface(GX2Surface* srcSurface, uint32 srcMip, uint32 src // send copy command to GPU if( srcHwTileMode > 0 && srcHwTileMode < 16 && dstHwTileMode > 0 && dstHwTileMode < 16 || requestGPURAMCopy ) { - GX2ReserveCmdSpace(1+13*2); + GX2::GX2ReserveCmdSpace(1+13*2); gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13*2), // src @@ -540,7 +540,7 @@ void gx2Export_GX2ResolveAAColorBuffer(PPCInterpreter_t* hCPU) uint32 dstDepth = std::max(surfOutDst.depth, 1); // send copy command to GPU - GX2ReserveCmdSpace(1 + 13 * 2); + GX2::GX2ReserveCmdSpace(1 + 13 * 2); gx2WriteGather_submit(pm4HeaderType3(IT_HLE_COPY_SURFACE_NEW, 13 * 2), // src (uint32)srcSurface->imagePtr, @@ -619,7 +619,7 @@ void gx2Export_GX2ConvertDepthBufferToTextureSurface(PPCInterpreter_t* hCPU) sint32 srcMip = 0; uint32 numSlices = std::max(_swapEndianU32(depthBuffer->viewNumSlices), 1); - GX2ReserveCmdSpace((1 + 13 * 2) * numSlices); + GX2::GX2ReserveCmdSpace((1 + 13 * 2) * numSlices); for (uint32 subSliceIndex = 0; subSliceIndex < numSlices; subSliceIndex++) { // send copy command to GPU diff --git a/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp b/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp index b0a5d2fa..d91a8529 100644 --- a/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_shader_legacy.cpp @@ -11,9 +11,14 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU) { cemuLog_log(LogType::GX2, "GX2SetPixelShader(0x{:08x})", hCPU->gpr[3]); - GX2ReserveCmdSpace(100); - GX2PixelShader_t* pixelShader = (GX2PixelShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); + + uint32 numInputs = _swapEndianU32(pixelShader->regs[4]); + if( numInputs > 0x20 ) + numInputs = 0x20; + + GX2::GX2ReserveCmdSpace(26 + numInputs); + MPTR shaderProgramAddr; uint32 shaderProgramSize; @@ -44,9 +49,6 @@ void gx2Export_GX2SetPixelShader(PPCInterpreter_t* hCPU) _swapEndianU32(pixelShader->regs[2]), _swapEndianU32(pixelShader->regs[3])); // setup pixel shader extended inputs control - uint32 numInputs = _swapEndianU32(pixelShader->regs[4]); - if( numInputs > 0x20 ) - numInputs = 0x20; gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numInputs)); gx2WriteGather_submitU32AsBE(mmSPI_PS_INPUT_CNTL_0-0xA000); for(uint32 i=0; igpr[3]); - GX2ReserveCmdSpace(100); GX2GeometryShader_t* geometryShader = (GX2GeometryShader_t*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); + uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]); + numOutputIds = std::min(numOutputIds, 0xA); + uint32 reserveSize = 38; // 38 fixed parameters + if (numOutputIds != 0) + reserveSize += 2 + numOutputIds; + if( _swapEndianU32(geometryShader->useStreamout) != 0 ) + reserveSize += 2 + 12; + + GX2::GX2ReserveCmdSpace(reserveSize); MPTR shaderProgramAddr; uint32 shaderProgramSize; @@ -128,6 +138,7 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU) if( _swapEndianU32(geometryShader->useStreamout) != 0 ) { + // todo - IT_EVENT_WRITE packet here // stride 0 gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 2)); gx2WriteGather_submitU32AsBE(mmVGT_STRMOUT_VTX_STRIDE_0-0xA000); @@ -180,8 +191,6 @@ void gx2Export_GX2SetGeometryShader(PPCInterpreter_t* hCPU) gx2WriteGather_submitU32AsBE(_swapEndianU32(geometryShader->regs[3])); // GS outputs - uint32 numOutputIds = _swapEndianU32(geometryShader->regs[7]); - numOutputIds = std::min(numOutputIds, 0xA); if( numOutputIds != 0 ) { gx2WriteGather_submitU32AsBE(pm4HeaderType3(IT_SET_CONTEXT_REG, 1+numOutputIds)); @@ -254,8 +263,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU) shaderPtr = computeShader->rBuffer.GetVirtualAddr(); shaderSize = computeShader->rBuffer.GetSize(); } - - GX2ReserveCmdSpace(0x11); + GX2::GX2ReserveCmdSpace(0x11); gx2WriteGather_submit(pm4HeaderType3(IT_SET_CONTEXT_REG, 6), mmSQ_PGM_START_ES-0xA000, @@ -272,7 +280,7 @@ void gx2Export_GX2SetComputeShader(PPCInterpreter_t* hCPU) void _GX2SubmitUniformBlock(uint32 registerBase, uint32 index, MPTR virtualAddress, uint32 size) { - GX2ReserveCmdSpace(9); + GX2::GX2ReserveCmdSpace(9); gx2WriteGather_submit(pm4HeaderType3(IT_SET_RESOURCE, 8), registerBase + index * 7, memory_virtualToPhysical(virtualAddress), @@ -307,7 +315,7 @@ void gx2Export_GX2SetGeometryUniformBlock(PPCInterpreter_t* hCPU) void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU) { - GX2ReserveCmdSpace(9); + GX2::GX2ReserveCmdSpace(9); GX2RBuffer* bufferPtr = (GX2RBuffer*)memory_getPointerFromVirtualOffset(hCPU->gpr[3]); uint32 index = hCPU->gpr[4]; @@ -320,7 +328,7 @@ void gx2Export_GX2RSetVertexUniformBlock(PPCInterpreter_t* hCPU) void gx2Export_GX2SetShaderModeEx(PPCInterpreter_t* hCPU) { - GX2ReserveCmdSpace(8+4); + GX2::GX2ReserveCmdSpace(8+4); uint32 mode = hCPU->gpr[3]; uint32 sqConfig = hCPU->gpr[3] == 0 ? 4 : 0; diff --git a/src/Common/precompiled.h b/src/Common/precompiled.h index bda75cef..9e5c60f5 100644 --- a/src/Common/precompiled.h +++ b/src/Common/precompiled.h @@ -616,4 +616,36 @@ namespace stdx scope_exit& operator=(scope_exit) = delete; void release() { m_released = true;} }; + + // Xcode 16 doesn't have std::atomic_ref support and we provide a minimalist reimplementation as fallback +#ifdef __cpp_lib_atomic_ref + #include + template + using atomic_ref = std::atomic_ref; +#else + template + class atomic_ref + { + static_assert(std::is_trivially_copyable::value, "atomic_ref requires trivially copyable types"); + public: + using value_type = T; + + explicit atomic_ref(T& obj) noexcept : ptr_(std::addressof(obj)) {} + + T load(std::memory_order order = std::memory_order_seq_cst) const noexcept + { + auto aptr = reinterpret_cast*>(ptr_); + return aptr->load(order); + } + + void store(T desired, std::memory_order order = std::memory_order_seq_cst) const noexcept + { + auto aptr = reinterpret_cast*>(ptr_); + aptr->store(desired, order); + } + + private: + T* ptr_; + }; +#endif }