From 5a4731f919db19b4d5ba07f70b39c756c805b825 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Sun, 22 Jun 2025 20:56:47 +0200 Subject: [PATCH 1/3] HLE: Make HLE table access thread-safe Previous code could sometimes resize the vector while a read access was happening --- .../Interpreter/PPCInterpreterHLE.cpp | 52 +++++++++++-------- src/Cafe/HW/Espresso/PPCState.h | 4 +- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp b/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp index 24219e66..cf7ba195 100644 --- a/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp +++ b/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp @@ -2,62 +2,70 @@ #include "PPCInterpreterInternal.h" #include "PPCInterpreterHelper.h" -std::unordered_set sUnsupportedHLECalls; +std::unordered_set s_unsupportedHLECalls; void PPCInterpreter_handleUnsupportedHLECall(PPCInterpreter_t* hCPU) { const char* libFuncName = (char*)memory_getPointerFromVirtualOffset(hCPU->instructionPointer + 8); std::string tempString = fmt::format("Unsupported lib call: {}", libFuncName); - if (sUnsupportedHLECalls.find(tempString) == sUnsupportedHLECalls.end()) + if (s_unsupportedHLECalls.find(tempString) == s_unsupportedHLECalls.end()) { cemuLog_log(LogType::UnsupportedAPI, "{}", tempString); - sUnsupportedHLECalls.emplace(tempString); + s_unsupportedHLECalls.emplace(tempString); } hCPU->gpr[3] = 0; PPCInterpreter_nextInstruction(hCPU); } -std::vector* sPPCHLETable{}; +static constexpr size_t HLE_TABLE_CAPACITY = 0x4000; +HLECALL s_ppcHleTable[HLE_TABLE_CAPACITY]{}; +sint32 s_ppcHleTableWriteIndex = 0; +std::mutex s_ppcHleTableMutex; HLEIDX PPCInterpreter_registerHLECall(HLECALL hleCall, std::string hleName) { - if (!sPPCHLETable) - sPPCHLETable = new std::vector(); - for (sint32 i = 0; i < sPPCHLETable->size(); i++) + std::unique_lock _l(s_ppcHleTableMutex); + if (s_ppcHleTableWriteIndex >= 
HLE_TABLE_CAPACITY) { - if ((*sPPCHLETable)[i] == hleCall) - return i; + cemuLog_log(LogType::Force, "HLE table is full"); + cemu_assert(false); } - HLEIDX newFuncIndex = (sint32)sPPCHLETable->size(); - sPPCHLETable->resize(sPPCHLETable->size() + 1); - (*sPPCHLETable)[newFuncIndex] = hleCall; - return newFuncIndex; + for (sint32 i = 0; i < s_ppcHleTableWriteIndex; i++) + { + if (s_ppcHleTable[i] == hleCall) + { + return i; + } + } + cemu_assert(s_ppcHleTableWriteIndex < HLE_TABLE_CAPACITY); + s_ppcHleTable[s_ppcHleTableWriteIndex] = hleCall; + HLEIDX funcIndex = s_ppcHleTableWriteIndex; + s_ppcHleTableWriteIndex++; + return funcIndex; } HLECALL PPCInterpreter_getHLECall(HLEIDX funcIndex) { - if (funcIndex < 0 || funcIndex >= sPPCHLETable->size()) + if (funcIndex < 0 || funcIndex >= HLE_TABLE_CAPACITY) return nullptr; - return sPPCHLETable->data()[funcIndex]; + return s_ppcHleTable[funcIndex]; } -std::mutex g_hleLogMutex; +std::mutex s_hleLogMutex; void PPCInterpreter_virtualHLE(PPCInterpreter_t* hCPU, unsigned int opcode) { uint32 hleFuncId = opcode & 0xFFFF; - if (hleFuncId == 0xFFD0) + if (hleFuncId == 0xFFD0) [[unlikely]] { - g_hleLogMutex.lock(); + s_hleLogMutex.lock(); PPCInterpreter_handleUnsupportedHLECall(hCPU); - g_hleLogMutex.unlock(); - return; + s_hleLogMutex.unlock(); } else { // os lib function - cemu_assert(hleFuncId < sPPCHLETable->size()); - auto hleCall = (*sPPCHLETable)[hleFuncId]; + auto hleCall = PPCInterpreter_getHLECall(hleFuncId); cemu_assert(hleCall); hleCall(hCPU); } diff --git a/src/Cafe/HW/Espresso/PPCState.h b/src/Cafe/HW/Espresso/PPCState.h index 179e2687..fd943d39 100644 --- a/src/Cafe/HW/Espresso/PPCState.h +++ b/src/Cafe/HW/Espresso/PPCState.h @@ -230,9 +230,9 @@ static inline float flushDenormalToZero(float f) // HLE interface -typedef void(*HLECALL)(PPCInterpreter_t* hCPU); +using HLECALL = void(*)(PPCInterpreter_t*); +using HLEIDX = sint32; -typedef sint32 HLEIDX; HLEIDX PPCInterpreter_registerHLECall(HLECALL hleCall, std::string 
hleName); HLECALL PPCInterpreter_getHLECall(HLEIDX funcIndex); From 4f4c9594ac77c74ef63a8b4208343ddf06669797 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Sun, 22 Jun 2025 22:17:29 +0200 Subject: [PATCH 2/3] GX2: Fix command buffer padding writing out of bounds --- src/Cafe/OS/libs/gx2/GX2_Command.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Cafe/OS/libs/gx2/GX2_Command.cpp b/src/Cafe/OS/libs/gx2/GX2_Command.cpp index 6699e1e1..d12bf210 100644 --- a/src/Cafe/OS/libs/gx2/GX2_Command.cpp +++ b/src/Cafe/OS/libs/gx2/GX2_Command.cpp @@ -144,6 +144,11 @@ namespace GX2 void GX2Command_StartNewCommandBuffer(uint32 numU32s) { + // On submission command buffers are padded to 32 byte alignment + // but nowhere is it guaranteed that internal command buffers have their size aligned to 32 bytes (even on console, but testing is required) + // Thus the padding can write out of bounds but this seems to trigger only very rarely in practice. As a workaround we always pad the command buffer size to 32 bytes here + numU32s = (numU32s + 7) & ~0x7; + uint32 coreIndex = coreinit::OSGetCoreId(); auto& coreCBState = s_perCoreCBState[coreIndex]; numU32s = std::max(numU32s, 0x100); From e91740cf29248bfbf2f059ac7e42159e8e7e9e9a Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Sun, 22 Jun 2025 23:34:14 +0200 Subject: [PATCH 3/3] coreinit: Make sure thread deallocation runs before join returns Fixes crash in Coaster Crazy Deluxe --- src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp b/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp index 2eef929d..2f89000b 100644 --- a/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp +++ b/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp @@ -717,7 +717,10 @@ namespace coreinit thread->id = 0x8000; if (!thread->deallocatorFunc.IsNull()) + {
__OSQueueThreadDeallocation(thread); + PPCCore_switchToSchedulerWithLock(); // make sure the deallocation function runs before we return + } __OSUnlockScheduler(); @@ -1525,7 +1528,7 @@ namespace coreinit } // queue thread deallocation to run after current thread finishes - // the termination threads run at a higher priority on the same threads + // the termination threads run at a higher priority on the same core void __OSQueueThreadDeallocation(OSThread_t* thread) { uint32 coreIndex = OSGetCoreId();