coreinit: Make sure thread deallocation runs before join returns

Fixes crash in Coaster Crazy Deluxe
GX2: Fix command buffer padding writing out of bounds
2025-07-02 13:01:18 +12:00 · 2025-06-22 23:34:41 +02:00 · 2025-06-22 22:17:29 +02:00 · 2025-06-22 20:56:47 +02:00
4 changed files with 41 additions and 25 deletions
--- a/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp
+++ b/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp
@ -2,62 +2,70 @@
 #include "PPCInterpreterInternal.h"
 #include "PPCInterpreterHelper.h"

-std::unordered_set<std::string> sUnsupportedHLECalls;
+std::unordered_set<std::string> s_unsupportedHLECalls;

 void PPCInterpreter_handleUnsupportedHLECall(PPCInterpreter_t* hCPU)
 {
 	const char* libFuncName = (char*)memory_getPointerFromVirtualOffset(hCPU->instructionPointer + 8);
 	std::string tempString = fmt::format("Unsupported lib call: {}", libFuncName);
-	if (sUnsupportedHLECalls.find(tempString) == sUnsupportedHLECalls.end())
+	if (s_unsupportedHLECalls.find(tempString) == s_unsupportedHLECalls.end())
 	{
 		cemuLog_log(LogType::UnsupportedAPI, "{}", tempString);
-		sUnsupportedHLECalls.emplace(tempString);
+		s_unsupportedHLECalls.emplace(tempString);
 	}
 	hCPU->gpr[3] = 0;
 	PPCInterpreter_nextInstruction(hCPU);
 }

-std::vector<void(*)(PPCInterpreter_t* hCPU)>* sPPCHLETable{};
+static constexpr size_t HLE_TABLE_CAPACITY = 0x4000;
+HLECALL s_ppcHleTable[HLE_TABLE_CAPACITY]{};
+sint32 s_ppcHleTableWriteIndex = 0;
+std::mutex s_ppcHleTableMutex;

 HLEIDX PPCInterpreter_registerHLECall(HLECALL hleCall, std::string hleName)
 {
-	if (!sPPCHLETable)
-		sPPCHLETable = new std::vector<void(*)(PPCInterpreter_t* hCPU)>();
-	for (sint32 i = 0; i < sPPCHLETable->size(); i++)
+	std::unique_lock _l(s_ppcHleTableMutex);
+	if (s_ppcHleTableWriteIndex >= HLE_TABLE_CAPACITY)
+	{
+		cemuLog_log(LogType::Force, "HLE table is full");
+		cemu_assert(false);
+	}
+	for (sint32 i = 0; i < s_ppcHleTableWriteIndex; i++)
+	{
+		if (s_ppcHleTable[i] == hleCall)
 		{
-		if ((*sPPCHLETable)[i] == hleCall)
 			return i;
 		}
-	HLEIDX newFuncIndex = (sint32)sPPCHLETable->size();
-	sPPCHLETable->resize(sPPCHLETable->size() + 1);
-	(*sPPCHLETable)[newFuncIndex] = hleCall;
-	return newFuncIndex;
+	}
+	cemu_assert(s_ppcHleTableWriteIndex < HLE_TABLE_CAPACITY);
+	s_ppcHleTable[s_ppcHleTableWriteIndex] = hleCall;
+	HLEIDX funcIndex = s_ppcHleTableWriteIndex;
+	s_ppcHleTableWriteIndex++;
+	return funcIndex;
 }

 HLECALL PPCInterpreter_getHLECall(HLEIDX funcIndex)
 {
-	if (funcIndex < 0 || funcIndex >= sPPCHLETable->size())
+	if (funcIndex < 0 || funcIndex >= HLE_TABLE_CAPACITY)
 		return nullptr;
-	return sPPCHLETable->data()[funcIndex];
+	return s_ppcHleTable[funcIndex];
 }

-std::mutex g_hleLogMutex;
+std::mutex s_hleLogMutex;

 void PPCInterpreter_virtualHLE(PPCInterpreter_t* hCPU, unsigned int opcode)
 {
 	uint32 hleFuncId = opcode & 0xFFFF;
-	if (hleFuncId == 0xFFD0)
+	if (hleFuncId == 0xFFD0) [[unlikely]]
 	{
-		g_hleLogMutex.lock();
+		s_hleLogMutex.lock();
 		PPCInterpreter_handleUnsupportedHLECall(hCPU);
-		g_hleLogMutex.unlock();
-		return;
+		s_hleLogMutex.unlock();
 	}
 	else
 	{
 		// os lib function
-		cemu_assert(hleFuncId < sPPCHLETable->size());
-		auto hleCall = (*sPPCHLETable)[hleFuncId];
+		auto hleCall = PPCInterpreter_getHLECall(hleFuncId);
 		cemu_assert(hleCall);
 		hleCall(hCPU);
 	}
--- a/src/Cafe/HW/Espresso/PPCState.h
+++ b/src/Cafe/HW/Espresso/PPCState.h
@ -230,9 +230,9 @@ static inline float flushDenormalToZero(float f)

 // HLE interface

-typedef void(*HLECALL)(PPCInterpreter_t* hCPU);
+using HLECALL = void(*)(PPCInterpreter_t*);
+using HLEIDX = sint32;

-typedef sint32 HLEIDX;
 HLEIDX PPCInterpreter_registerHLECall(HLECALL hleCall, std::string hleName);
 HLECALL PPCInterpreter_getHLECall(HLEIDX funcIndex);

--- a/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp
+++ b/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp
@ -717,7 +717,10 @@ namespace coreinit
 		thread->id = 0x8000;

 		if (!thread->deallocatorFunc.IsNull())
+		{
 			__OSQueueThreadDeallocation(thread);
+			PPCCore_switchToSchedulerWithLock(); // make sure the deallocation function runs before we return
+		}

 		__OSUnlockScheduler();

@ -1525,7 +1528,7 @@ namespace coreinit
 	}

 	// queue thread deallocation to run after current thread finishes
-	// the termination threads run at a higher priority on the same threads
+	// the termination threads run at a higher priority on the same core
 	void __OSQueueThreadDeallocation(OSThread_t* thread)
 	{
 		uint32 coreIndex = OSGetCoreId();
--- a/src/Cafe/OS/libs/gx2/GX2_Command.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_Command.cpp
@ -144,6 +144,11 @@ namespace GX2

 	void GX2Command_StartNewCommandBuffer(uint32 numU32s)
 	{
+		// On submission, command buffers are padded to 32-byte alignment,
+		// but nowhere is it guaranteed that internal command buffers have their size aligned to 32 bytes (even on console, but testing is required).
+		// Thus the padding can write out of bounds, although this seems to trigger only very rarely in practice. As a workaround we always pad the command buffer size to 32 bytes here.
+		numU32s = (numU32s + 7) & ~0x7;
+
 		uint32 coreIndex = coreinit::OSGetCoreId();
 		auto& coreCBState = s_perCoreCBState[coreIndex];
 		numU32s = std::max<uint32>(numU32s, 0x100);
Author	SHA1	Message	Date
Exzap	e91740cf29	coreinit: Make sure thread deallocation runs before join returns Some checks failed Generate translation template / generate-pot (push) Failing after 36s Details Build check / build (push) Has been cancelled Details Fixes crash in Coaster Crazy Deluxe	2025-06-22 23:34:41 +02:00
Exzap	4f4c9594ac	GX2: Fix command buffer padding writing out of bounds	2025-06-22 22:17:29 +02:00
Exzap	5a4731f919	HLE: Make HLE table access thread-safe Previous code could sometimes resize the vector while a read access was happening	2025-06-22 20:56:47 +02:00