From 5693cc9eb33522bf717a6b259b7e616dfa58e573 Mon Sep 17 00:00:00 2001
From: Elad Ashkenazi <18193363+elad335@users.noreply.github.com>
Date: Fri, 24 May 2024 17:42:05 +0300
Subject: [PATCH] SPU: smart GETLLAR spin detection

---
 rpcs3/Emu/Cell/SPUThread.cpp | 145 ++++++++++++++++++++++++++++++++---
 rpcs3/Emu/Cell/SPUThread.h   |   5 ++
 rpcs3/Emu/system_config.h    |   2 +-
 3 files changed, 139 insertions(+), 13 deletions(-)
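Notes:

GETLLAR spinning is now validated against the effective address as well as the
PC and MFC command id, and the busy-wait/OS-sleep decision is predicted from
per-call-site history instead of a single random roll. Each 32-byte slot of
local storage keeps the lengths of its last four timed waits in
getllar_wait_time; consistently long waits produce an add_count bias toward
sleeping. Worked example: a history of {20, 15, 10, 5} gives total_wait = 50
with zero_count = 0, so add_count = (50 - 5) * 40 = 1800 and the test
((perf0.get() >> 8) % 100 + add_count < percent) fails even at the maximum
setting of 100, i.e. the thread sleeps; a history of {9, 0, 0, 0} stays below
the 40 threshold that applies at zero_count = 3, so add_count = 0 and the
configured percentage applies unchanged. A thread that chose busy waiting
re-evaluates after 400'000 ticks of perf0, the hidden config value 101 forces
busy waiting outright, and sleeping threads are counted in
g_reservation_waiters (bucketed by popcount of the address) so that a
reservation writer only bumps the reservation time and notifies when a waiter
may actually exist.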
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 6c960f0988..b98dd8b463 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -4422,7 +4422,10 @@ bool spu_thread::process_mfc_cmd()
 	else
 	{
 		// Check if we can reuse our existing reservation
-		if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data))
+		auto& res = vm::reservation_acquire(addr);
+		const u64 this_time = res;
+
+		if (this_time % 128 == 0 && cmp_rdata(rdata, data))
 		{
 			mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
 			ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
@@ -4430,27 +4433,105 @@ bool spu_thread::process_mfc_cmd()
 			// Need to check twice for it to be accurate, the code is before and not after this check for:
 			// 1. Reduce time between reservation accesses so TSX penalty would be lowered
 			// 2. Increase the chance of change detection: if GETLLAR has been called again new data is probably wanted
-			if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data))
+			if (this_time == res && cmp_rdata(rdata, data))
 			{
+				if (this_time != rtime)
+				{
+					// Reservation was lost but the data itself remains unchanged so try to ignore it
+					set_events(SPU_EVENT_LR);
+					rtime = this_time;
+				}
+
 				if ([&]() -> bool
 				{
 					// Validation that it is indeed GETLLAR spinning (large time window is intentional)
-					if (last_getllar != pc || mfc_cmd_id - 1 != last_getllar_id || perf0.get() - last_gtsc >= 10'000)
+					if (last_getllar_addr != addr || last_getllar != pc || mfc_cmd_id - 1 != last_getllar_id || perf0.get() - last_gtsc >= 15'000)
 					{
 						// Seemingly not
 						getllar_busy_waiting_switch = umax;
+						getllar_spin_count = 0;
 						return true;
 					}
 
-					getllar_spin_count++;
+					getllar_spin_count = std::min<u32>(getllar_spin_count + 1, u16{umax});
 
-					if (getllar_busy_waiting_switch == umax)
+					static atomic_t<u64> g_ok = 0, g_fail = 0;
+
+					if (getllar_busy_waiting_switch == umax && getllar_spin_count == 4)
 					{
-						// Evalute its value (shift-right to ensure its randomness with different CPUs)
-						getllar_busy_waiting_switch = ((perf0.get() >> 8) % 100 < g_cfg.core.spu_getllar_busy_waiting_percentage) ? 1 : 0;
+						const u32 percent = g_cfg.core.spu_getllar_busy_waiting_percentage;
+
+						// Hidden value to force busy waiting (100 to 1 are dynamically adjusted, 0 is not)
+						if (percent != 101)
+						{
+							// Predict whether or not to use operating system sleep based on history
+							auto& stats = getllar_wait_time[pc / 32];
+
+							const auto old_stats = stats;
+							std::array<u8, 4> new_stats{};
+
+							// Rotate history (prepare newest entry)
+							new_stats[0] = 0;
+							new_stats[1] = old_stats[0];
+							new_stats[2] = old_stats[1];
+							new_stats[3] = old_stats[2];
+
+							stats = new_stats;
+
+							u32 total_wait = 0;
+							u32 zero_count = 0; // Try to ignore major inconsistencies
+
+							for (u8 val : old_stats)
+							{
+								total_wait += val;
+								zero_count += (val == 0 ? 1 : 0);
+							}
+
+							// Add to chance if previous wait was long enough
+							const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40
+								: zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40
+								: zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40
+								: zero_count == 0 && total_wait >= 6 ? (total_wait - 5) * 40
+								: 0;
+
+							// Evaluate its value (shift-right to ensure its randomness with different CPUs)
+							getllar_busy_waiting_switch = ((perf0.get() >> 8) % 100 + add_count < percent) ? 1 : 0;
+
+							getllar_evaluate_time = perf0.get();
+
+							if (getllar_busy_waiting_switch)
+							{
+								g_fail++;
+							}
+							else
+							{
+								g_ok++;
+							}
+
+							if ((g_ok + g_fail) % 20 == 0 && !getllar_busy_waiting_switch)
+								spu_log.trace("SPU wait: count=%d, switch=%d, spin=%d, fail=%d, ok=%d, {%d, %d, %d, %d}", total_wait, getllar_busy_waiting_switch, getllar_spin_count, +g_fail, +g_ok, old_stats[0], old_stats[1], old_stats[2], old_stats[3]);
+						}
+						else
+						{
+							getllar_busy_waiting_switch = 1;
+						}
 					}
+					// Don't be stubborn, force operating system sleep if too much time has passed
+					else if (getllar_busy_waiting_switch == 1 && perf0.get() > getllar_evaluate_time && perf0.get() - getllar_evaluate_time >= 400'000)
+					{
+						const u32 percent = g_cfg.core.spu_getllar_busy_waiting_percentage;
+
+						// Hidden value to force busy waiting
+						if (percent != 101)
+						{
+							spu_log.trace("SPU wait for 0x%x", addr);
+							getllar_wait_time[pc / 32].front() = 1;
+							getllar_busy_waiting_switch = 0;
+						}
+					}
 
-					return !!getllar_busy_waiting_switch || getllar_spin_count < 3;
+					// Either 1 or umax
+					return getllar_busy_waiting_switch != 0;
 				}())
 				{
 					if (g_cfg.core.mfc_debug)
@@ -4468,6 +4549,8 @@ bool spu_thread::process_mfc_cmd()
 
 				if (getllar_busy_waiting_switch == 1)
 				{
+					getllar_wait_time[pc / 32].front() = 0;
+
 #if defined(ARCH_X64)
 					if (utils::has_um_wait())
 					{
@@ -4501,13 +4584,21 @@ bool spu_thread::process_mfc_cmd()
 
 				// Spinning, might as well yield cpu resources
 				state += cpu_flag::wait;
-				vm::reservation_notifier(addr).wait(rtime, atomic_wait_timeout{50'000});
+
+				// Storage-efficient method to distinguish different nearby addresses (which are likely)
+				g_reservation_waiters[std::popcount(addr)]++;
+
+				vm::reservation_notifier(addr).wait(this_time, atomic_wait_timeout{100'000});
+
+				g_reservation_waiters[std::popcount(addr)]--;
 
 				// Reset perf
 				perf0.restart();
 
 				// Quick check if there were reservation changes
-				if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data))
+				const u64 new_time = res;
+
+				if (new_time % 128 == 0 && cmp_rdata(rdata, data) && res == new_time && cmp_rdata(rdata, data))
 				{
 					if (g_cfg.core.mfc_debug)
 					{
@@ -4518,12 +4609,40 @@ bool spu_thread::process_mfc_cmd()
 						std::memcpy(dump.data, rdata, 128);
 					}
 
-					// Let the game recheck its state, maybe after a long period of time something else changed which satisfies its waiting condition
-					getllar_spin_count = 1;
+					if (new_time != rtime)
+					{
+						// Reservation was lost but the data itself remains unchanged so try to ignore it
+						set_events(SPU_EVENT_LR);
+						rtime = new_time;
+					}
+
+					u8& val = getllar_wait_time[pc / 32].front();
+					val = static_cast<u8>(std::min<u32>(val + 1, u8{umax}));
+
 					last_getllar_id = mfc_cmd_id;
 					last_gtsc = perf0.get();
 					return true;
 				}
+
+				if (new_time == this_time && res == this_time)
+				{
+					spu_log.trace("RTIME unchanged on address 0x%x", addr);
+
+					// Try to forcefully change in order to notify threads
+					if (g_reservation_waiters[std::popcount(addr)] && res.compare_and_swap_test(this_time, this_time + 128))
+					{
+						vm::reservation_notifier(addr).notify_all();
+					}
+				}
+			}
+		}
+
+		if (this_time == rtime)
+		{
+			// Try to forcefully change in order to notify threads
+			if (g_reservation_waiters[std::popcount(addr)] && res.compare_and_swap_test(this_time, this_time + 128))
+			{
+				vm::reservation_notifier(addr).notify_all();
+			}
+		}
@@ -4535,6 +4654,7 @@ bool spu_thread::process_mfc_cmd()
 
 	last_getllar_id = mfc_cmd_id;
 	last_getllar = pc;
+	last_getllar_addr = addr;
 	last_gtsc = perf0.get();
 	getllar_spin_count = 0;
 	getllar_busy_waiting_switch = umax;
@@ -6855,3 +6975,4 @@ void fmt_class_string::format(std::string& out, u64 arg)
 DECLARE(spu_thread::g_raw_spu_ctr){};
 DECLARE(spu_thread::g_raw_spu_id){};
 DECLARE(spu_thread::g_spu_work_count){};
+DECLARE(spu_thread::g_reservation_waiters){};
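The g_reservation_waiters bookkeeping above can be read in isolation. Instead
of hashing every reservation address, waiters are bucketed by
std::popcount(addr); nearby 128-byte lines usually differ in population count,
and a collision only costs a redundant notify. A minimal standalone sketch of
the idea (the names below are local to this sketch, not RPCS3 API):

#include <atomic>
#include <bit>
#include <cstdint>

// One counter per possible popcount of a 32-bit address (0..32).
// Reservation addresses are 128-byte aligned, so in practice at most
// 25 bits are set, which is why the patch's 32-entry table also suffices.
std::atomic<std::uint32_t> waiters_by_popcount[33]{};

void begin_wait(std::uint32_t addr) { waiters_by_popcount[std::popcount(addr)]++; }
void end_wait(std::uint32_t addr) { waiters_by_popcount[std::popcount(addr)]--; }

// A writer bumps the reservation and notifies only when this bucket is
// non-empty; a false positive merely causes one extra notification.
bool may_have_waiters(std::uint32_t addr)
{
	return waiters_by_popcount[std::popcount(addr)].load() != 0;
}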
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 539617f30d..9df636fc7e 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -798,8 +798,10 @@ public:
 	u64 last_gtsc = 0;
 	u32 last_getllar = umax; // LS address of last GETLLAR (if matches current GETLLAR we can let the thread rest)
 	u32 last_getllar_id = umax;
+	u32 last_getllar_addr = umax;
 	u32 getllar_spin_count = 0;
 	u32 getllar_busy_waiting_switch = umax; // umax means the test needs evaluation, otherwise it's a boolean
+	u64 getllar_evaluate_time = 0;
 
 	std::vector<mfc_cmd_dump> mfc_history;
 	u64 mfc_dump_idx = 0;
@@ -823,6 +825,8 @@ public:
 	u32 current_bp_pc = umax;
 	bool stop_flag_removal_protection = false;
 
+	std::array<std::array<u8, 4>, SPU_LS_SIZE / 32> getllar_wait_time{};
+
 	void push_snr(u32 number, u32 value);
 	static void do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* ls);
 	bool do_dma_check(const spu_mfc_cmd& args);
@@ -892,6 +896,7 @@ public:
 	static atomic_t<u64> g_raw_spu_ctr;
 	static atomic_t<u64> g_raw_spu_id[5];
 	static atomic_t<u32> g_spu_work_count;
+	static atomic_t<u32> g_reservation_waiters[32];
 
 	static u32 find_raw_spu(u32 id)
 	{
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index 25b82427ab..4d470363f8 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -33,7 +33,7 @@ struct cfg_root : cfg::node
 		cfg::_bool set_daz_and_ftz{ this, "Set DAZ and FTZ", false };
 		cfg::_enum<spu_decoder_type> spu_decoder{ this, "SPU Decoder", spu_decoder_type::llvm };
 		cfg::uint<0, 100> spu_reservation_busy_waiting_percentage{ this, "SPU Reservation Busy Waiting Percentage", 0, true };
-		cfg::uint<0, 100> spu_getllar_busy_waiting_percentage{ this, "SPU GETLLAR Busy Waiting Percentage", 100, true };
+		cfg::uint<0, 101> spu_getllar_busy_waiting_percentage{ this, "SPU GETLLAR Busy Waiting Percentage", 100, true };
 		cfg::_bool spu_debug{ this, "SPU Debug" };
 		cfg::_bool mfc_debug{ this, "MFC Debug" };
 		cfg::_int<0, 6> preferred_spu_threads{ this, "Preferred SPU Threads", 0, true }; // Number of hardware threads dedicated to heavy simultaneous spu tasks
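For reference, the history heuristic from the SPUThread.cpp hunks can also be
lifted into a self-contained sketch. This is an illustration under assumptions,
not RPCS3 code: predict_busy_wait and its random parameter (standing in for
(perf0.get() >> 8) % 100) are names local to this sketch.

#include <array>
#include <cstdint>
#include <cstdio>

using u8 = std::uint8_t;
using u32 = std::uint32_t;

// Returns true to keep busy waiting, false to sleep. "percent" mirrors the
// "SPU GETLLAR Busy Waiting Percentage" setting (0..100).
bool predict_busy_wait(std::array<u8, 4>& stats, u32 percent, u32 random)
{
	const auto old_stats = stats;
	stats = {0, old_stats[0], old_stats[1], old_stats[2]}; // rotate history

	u32 total_wait = 0;
	u32 zero_count = 0; // empty slots are treated as inconsistencies

	for (u8 val : old_stats)
	{
		total_wait += val;
		zero_count += (val == 0 ? 1 : 0);
	}

	// Same thresholds as the patch: the fewer empty history slots,
	// the smaller the total wait needed before sleeping wins
	const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40
		: zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40
		: zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40
		: zero_count == 0 && total_wait >= 6 ? (total_wait - 5) * 40
		: 0;

	return random + add_count < percent;
}

int main()
{
	std::array<u8, 4> history{20, 15, 10, 5}; // four long waits recorded earlier
	// add_count = (50 - 5) * 40 = 1800, so even percent = 100 chooses sleep
	std::printf("busy wait? %d\n", predict_busy_wait(history, 100, 50)); // prints 0
}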