From 5a4731f919db19b4d5ba07f70b39c756c805b825 Mon Sep 17 00:00:00 2001
From: Exzap <13877693+Exzap@users.noreply.github.com>
Date: Sun, 22 Jun 2025 20:56:47 +0200
Subject: [PATCH 1/3] HLE: Make HLE table access thread-safe

Previous code could sometimes resize the vector while a read access was happening
---
 .../Interpreter/PPCInterpreterHLE.cpp         | 52 +++++++++++--------
 src/Cafe/HW/Espresso/PPCState.h               |  4 +-
 2 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp b/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp
index 24219e66..cf7ba195 100644
--- a/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp
+++ b/src/Cafe/HW/Espresso/Interpreter/PPCInterpreterHLE.cpp
@@ -2,62 +2,70 @@
 #include "PPCInterpreterInternal.h"
 #include "PPCInterpreterHelper.h"
 
-std::unordered_set<std::string> sUnsupportedHLECalls;
+std::unordered_set<std::string> s_unsupportedHLECalls;
 
 void PPCInterpreter_handleUnsupportedHLECall(PPCInterpreter_t* hCPU)
 {
 	const char* libFuncName = (char*)memory_getPointerFromVirtualOffset(hCPU->instructionPointer + 8);
 	std::string tempString = fmt::format("Unsupported lib call: {}", libFuncName);
-	if (sUnsupportedHLECalls.find(tempString) == sUnsupportedHLECalls.end())
+	if (s_unsupportedHLECalls.find(tempString) == s_unsupportedHLECalls.end())
 	{
 		cemuLog_log(LogType::UnsupportedAPI, "{}", tempString);
-		sUnsupportedHLECalls.emplace(tempString);
+		s_unsupportedHLECalls.emplace(tempString);
 	}
 	hCPU->gpr[3] = 0;
 	PPCInterpreter_nextInstruction(hCPU);
 }
 
-std::vector<void(*)(PPCInterpreter_t* hCPU)>* sPPCHLETable{};
+static constexpr size_t HLE_TABLE_CAPACITY = 0x4000;
+HLECALL s_ppcHleTable[HLE_TABLE_CAPACITY]{};
+sint32 s_ppcHleTableWriteIndex = 0;
+std::mutex s_ppcHleTableMutex;
 
 HLEIDX PPCInterpreter_registerHLECall(HLECALL hleCall, std::string hleName)
 {
-	if (!sPPCHLETable)
-		sPPCHLETable = new std::vector<void(*)(PPCInterpreter_t* hCPU)>();
-	for (sint32 i = 0; i < sPPCHLETable->size(); i++)
+	std::unique_lock _l(s_ppcHleTableMutex);
+	if (s_ppcHleTableWriteIndex >= HLE_TABLE_CAPACITY)
 	{
-		if ((*sPPCHLETable)[i] == hleCall)
-			return i;
+		cemuLog_log(LogType::Force, "HLE table is full");
+		cemu_assert(false);
 	}
-	HLEIDX newFuncIndex = (sint32)sPPCHLETable->size();
-	sPPCHLETable->resize(sPPCHLETable->size() + 1);
-	(*sPPCHLETable)[newFuncIndex] = hleCall;
-	return newFuncIndex;
+	for (sint32 i = 0; i < s_ppcHleTableWriteIndex; i++)
+	{
+		if (s_ppcHleTable[i] == hleCall)
+		{
+			return i;
+		}
+	}
+	cemu_assert(s_ppcHleTableWriteIndex < HLE_TABLE_CAPACITY);
+	s_ppcHleTable[s_ppcHleTableWriteIndex] = hleCall;
+	HLEIDX funcIndex = s_ppcHleTableWriteIndex;
+	s_ppcHleTableWriteIndex++;
+	return funcIndex;
 }
 
 HLECALL PPCInterpreter_getHLECall(HLEIDX funcIndex)
 {
-	if (funcIndex < 0 || funcIndex >= sPPCHLETable->size())
+	if (funcIndex < 0 || funcIndex >= HLE_TABLE_CAPACITY)
 		return nullptr;
-	return sPPCHLETable->data()[funcIndex];
+	return s_ppcHleTable[funcIndex];
 }
 
-std::mutex g_hleLogMutex;
+std::mutex s_hleLogMutex;
 
 void PPCInterpreter_virtualHLE(PPCInterpreter_t* hCPU, unsigned int opcode)
 {
 	uint32 hleFuncId = opcode & 0xFFFF;
-	if (hleFuncId == 0xFFD0)
+	if (hleFuncId == 0xFFD0) [[unlikely]]
 	{
-		g_hleLogMutex.lock();
+		s_hleLogMutex.lock();
 		PPCInterpreter_handleUnsupportedHLECall(hCPU);
-		g_hleLogMutex.unlock();
-		return;
+		s_hleLogMutex.unlock();
 	}
 	else
 	{
 		// os lib function
-		cemu_assert(hleFuncId < sPPCHLETable->size());
-		auto hleCall = (*sPPCHLETable)[hleFuncId];
+		auto hleCall = PPCInterpreter_getHLECall(hleFuncId);
 		cemu_assert(hleCall);
 		hleCall(hCPU);
 	}
diff --git a/src/Cafe/HW/Espresso/PPCState.h b/src/Cafe/HW/Espresso/PPCState.h
index 179e2687..fd943d39 100644
--- a/src/Cafe/HW/Espresso/PPCState.h
+++ b/src/Cafe/HW/Espresso/PPCState.h
@@ -230,9 +230,9 @@ static inline float flushDenormalToZero(float f)
 
 // HLE interface
 
-typedef void(*HLECALL)(PPCInterpreter_t* hCPU);
+using HLECALL = void(*)(PPCInterpreter_t*);
+using HLEIDX = sint32;
 
-typedef sint32 HLEIDX;
 HLEIDX PPCInterpreter_registerHLECall(HLECALL hleCall, std::string hleName);
 HLECALL PPCInterpreter_getHLECall(HLEIDX funcIndex);
 

From 4f4c9594ac77c74ef63a8b4208343ddf06669797 Mon Sep 17 00:00:00 2001
From: Exzap <13877693+Exzap@users.noreply.github.com>
Date: Sun, 22 Jun 2025 22:17:29 +0200
Subject: [PATCH 2/3] GX2: Fix command buffer padding writing out of bounds

---
 src/Cafe/OS/libs/gx2/GX2_Command.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/Cafe/OS/libs/gx2/GX2_Command.cpp b/src/Cafe/OS/libs/gx2/GX2_Command.cpp
index 6699e1e1..d12bf210 100644
--- a/src/Cafe/OS/libs/gx2/GX2_Command.cpp
+++ b/src/Cafe/OS/libs/gx2/GX2_Command.cpp
@@ -144,6 +144,11 @@ namespace GX2
 
 	void GX2Command_StartNewCommandBuffer(uint32 numU32s)
 	{
+		// On submission command buffers are padded to 32 byte alignment
+		// but nowhere is it guaranteed that internal command buffers have their size aligned to 32 bytes (even on console, but testing is required)
+		// Thus the padding can write out of bounds, but this seems to trigger only very rarely in practice. As a workaround we always pad the command buffer size to 32 bytes here
+		numU32s = (numU32s + 7) & ~0x7;
+
 		uint32 coreIndex = coreinit::OSGetCoreId();
 		auto& coreCBState = s_perCoreCBState[coreIndex];
 		numU32s = std::max<uint32>(numU32s, 0x100);

From e91740cf29248bfbf2f059ac7e42159e8e7e9e9a Mon Sep 17 00:00:00 2001
From: Exzap <13877693+Exzap@users.noreply.github.com>
Date: Sun, 22 Jun 2025 23:34:14 +0200
Subject: [PATCH 3/3] coreinit: Make sure thread deallocation runs before join
 returns

Fixes crash in Coaster Crazy Deluxe
---
 src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp b/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp
index 2eef929d..2f89000b 100644
--- a/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp
+++ b/src/Cafe/OS/libs/coreinit/coreinit_Thread.cpp
@@ -717,7 +717,10 @@ namespace coreinit
 		thread->id = 0x8000;
 
 		if (!thread->deallocatorFunc.IsNull())
+		{
 			__OSQueueThreadDeallocation(thread);
+			PPCCore_switchToSchedulerWithLock(); // make sure the deallocation function runs before we return
+		}
 
 		__OSUnlockScheduler();
 
@@ -1525,7 +1528,7 @@ namespace coreinit
 	}
 
 	// queue thread deallocation to run after current thread finishes
-	// the termination threads run at a higher priority on the same threads
+	// the termination threads run at a higher priority on the same core
 	void __OSQueueThreadDeallocation(OSThread_t* thread)
 	{
 		uint32 coreIndex = OSGetCoreId();