SPU: smart GETLLAR spin detection

This commit is contained in:
Elad Ashkenazi 2024-05-24 17:42:05 +03:00
parent feff2ba09e
commit 5693cc9eb3
3 changed files with 139 additions and 13 deletions

View file

@ -4422,7 +4422,10 @@ bool spu_thread::process_mfc_cmd()
else else
{ {
// Check if we can reuse our existing reservation // Check if we can reuse our existing reservation
if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data)) auto& res = vm::reservation_acquire(addr);
const u64 this_time = res;
if (this_time % 128 == 0 && cmp_rdata(rdata, data))
{ {
mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata); mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS); ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
@ -4430,27 +4433,105 @@ bool spu_thread::process_mfc_cmd()
// Need to check twice for it to be accurate, the code is before and not after this check for: // Need to check twice for it to be accurate, the code is before and not after this check for:
// 1. Reduce time between reservation accesses so TSX penalty would be lowered // 1. Reduce time between reservation accesses so TSX penalty would be lowered
// 2. Increase the chance of change detection: if GETLLAR has been called again new data is probably wanted // 2. Increase the chance of change detection: if GETLLAR has been called again new data is probably wanted
if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data)) if (this_time == res && cmp_rdata(rdata, data))
{ {
if (this_time != rtime)
{
// Reservation was lost but the data itself remains unchanged so try to ignore it
set_events(SPU_EVENT_LR);
rtime = this_time;
}
if ([&]() -> bool if ([&]() -> bool
{ {
// Validation that it is indeed GETLLAR spinning (large time window is intentional) // Validation that it is indeed GETLLAR spinning (large time window is intentional)
if (last_getllar != pc || mfc_cmd_id - 1 != last_getllar_id || perf0.get() - last_gtsc >= 10'000) if (last_getllar_addr != addr || last_getllar != pc || mfc_cmd_id - 1 != last_getllar_id || perf0.get() - last_gtsc >= 15'000)
{ {
// Seemingly not // Seemingly not
getllar_busy_waiting_switch = umax; getllar_busy_waiting_switch = umax;
getllar_spin_count = 0;
return true; return true;
} }
getllar_spin_count++; getllar_spin_count = std::min<u32>(getllar_spin_count + 1, u16{umax});
if (getllar_busy_waiting_switch == umax) static atomic_t<usz> g_ok = 0, g_fail = 0;
if (getllar_busy_waiting_switch == umax && getllar_spin_count == 4)
{ {
// Evaluate its value (shift-right to ensure its randomness with different CPUs) const u32 percent = g_cfg.core.spu_getllar_busy_waiting_percentage;
getllar_busy_waiting_switch = ((perf0.get() >> 8) % 100 < g_cfg.core.spu_getllar_busy_waiting_percentage) ? 1 : 0;
// Hidden value to force busy waiting (100 to 1 are dynamically adjusted, 0 is not)
if (percent != 101)
{
// Predict whether or not to use operating system sleep based on history
auto& stats = getllar_wait_time[pc / 32];
const auto old_stats = stats;
std::array<u8, 4> new_stats{};
// Rotate history (prepare newest entry)
new_stats[0] = 0;
new_stats[1] = old_stats[0];
new_stats[2] = old_stats[1];
new_stats[3] = old_stats[2];
stats = new_stats;
u32 total_wait = 0;
u32 zero_count = 0; // Try to ignore major inconsistencies
for (u8 val : old_stats)
{
total_wait += val;
zero_count += (val == 0 ? 1 : 0);
} }
return !!getllar_busy_waiting_switch || getllar_spin_count < 3; // Add to chance if previous wait was long enough
const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40
: zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40
: zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40
: zero_count == 0 && total_wait >= 6 ? (total_wait - 5) * 40
: 0;
// Evaluate its value (shift-right to ensure its randomness with different CPUs)
getllar_busy_waiting_switch = ((perf0.get() >> 8) % 100 + add_count < percent) ? 1 : 0;
getllar_evaluate_time = perf0.get();
if (getllar_busy_waiting_switch)
{
g_fail++;
}
else
{
g_ok++;
}
if ((g_ok + g_fail) % 20 == 0 && !getllar_busy_waiting_switch)
spu_log.trace("SPU wait: count=%d. switch=%d, spin=%d, fail=%d, ok=%d, {%d, %d, %d, %d}", total_wait, getllar_busy_waiting_switch, getllar_spin_count, +g_fail, +g_ok, old_stats[0], old_stats[1], old_stats[2], old_stats[3] );
}
else
{
getllar_busy_waiting_switch = 1;
}
}
// Don't be stubborn, force operating system sleep if too much time has passed
else if (getllar_busy_waiting_switch == 1 && perf0.get() > getllar_evaluate_time && perf0.get() - getllar_evaluate_time >= 400'000)
{
const u32 percent = g_cfg.core.spu_getllar_busy_waiting_percentage;
// Hidden value to force busy waiting
if (percent != 101)
{
spu_log.trace("SPU wait for 0x%x", addr);
getllar_wait_time[pc / 32].front() = 1;
getllar_busy_waiting_switch = 0;
}
}
// Either 1 or umax
return getllar_busy_waiting_switch != 0;
}()) }())
{ {
if (g_cfg.core.mfc_debug) if (g_cfg.core.mfc_debug)
@ -4468,6 +4549,8 @@ bool spu_thread::process_mfc_cmd()
if (getllar_busy_waiting_switch == 1) if (getllar_busy_waiting_switch == 1)
{ {
getllar_wait_time[pc / 32].front() = 0;
#if defined(ARCH_X64) #if defined(ARCH_X64)
if (utils::has_um_wait()) if (utils::has_um_wait())
{ {
@ -4501,13 +4584,21 @@ bool spu_thread::process_mfc_cmd()
// Spinning, might as well yield cpu resources // Spinning, might as well yield cpu resources
state += cpu_flag::wait; state += cpu_flag::wait;
vm::reservation_notifier(addr).wait(rtime, atomic_wait_timeout{50'000});
// Storage efficient method to distinguish different nearby addresses (which are likely)
g_reservation_waiters[std::popcount(addr)]++;
vm::reservation_notifier(addr).wait(this_time, atomic_wait_timeout{100'000});
g_reservation_waiters[std::popcount(addr)]--;
// Reset perf // Reset perf
perf0.restart(); perf0.restart();
// Quick check if there were reservation changes // Quick check if there were reservation changes
if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data)) const u64 new_time = res;
if (new_time % 128 == 0 && cmp_rdata(rdata, data) && res == new_time && cmp_rdata(rdata, data))
{ {
if (g_cfg.core.mfc_debug) if (g_cfg.core.mfc_debug)
{ {
@ -4518,12 +4609,40 @@ bool spu_thread::process_mfc_cmd()
std::memcpy(dump.data, rdata, 128); std::memcpy(dump.data, rdata, 128);
} }
// Let the game recheck its state, maybe after a long period of time something else changed which satisfies its waiting condition if (new_time != rtime)
getllar_spin_count = 1; {
// Reservation was lost but the data itself remains unchanged so try to ignore it
set_events(SPU_EVENT_LR);
rtime = this_time;
}
u8& val = getllar_wait_time[pc / 32].front();
val = static_cast<u8>(std::min<u32>(val + 1, u8{umax}));
last_getllar_id = mfc_cmd_id; last_getllar_id = mfc_cmd_id;
last_gtsc = perf0.get(); last_gtsc = perf0.get();
return true; return true;
} }
if (new_time == this_time && res == this_time)
{
spu_log.trace("RTIME unchanged on address 0x%x", addr);
// Try to forcefully change in order to notify threads
if (g_reservation_waiters[std::popcount(addr)] && res.compare_and_swap_test(this_time, this_time + 128))
{
vm::reservation_notifier(addr).notify_all();
}
}
}
}
if (this_time == rtime)
{
// Try to forcefully change in order to notify threads
if (g_reservation_waiters[std::popcount(addr)] && res.compare_and_swap_test(this_time, this_time + 128))
{
vm::reservation_notifier(addr).notify_all();
} }
} }
@ -4535,6 +4654,7 @@ bool spu_thread::process_mfc_cmd()
last_getllar_id = mfc_cmd_id; last_getllar_id = mfc_cmd_id;
last_getllar = pc; last_getllar = pc;
last_getllar_addr = addr;
last_gtsc = perf0.get(); last_gtsc = perf0.get();
getllar_spin_count = 0; getllar_spin_count = 0;
getllar_busy_waiting_switch = umax; getllar_busy_waiting_switch = umax;
@ -6855,3 +6975,4 @@ void fmt_class_string<spu_channel_4_t>::format(std::string& out, u64 arg)
DECLARE(spu_thread::g_raw_spu_ctr){}; DECLARE(spu_thread::g_raw_spu_ctr){};
DECLARE(spu_thread::g_raw_spu_id){}; DECLARE(spu_thread::g_raw_spu_id){};
DECLARE(spu_thread::g_spu_work_count){}; DECLARE(spu_thread::g_spu_work_count){};
DECLARE(spu_thread::g_reservation_waiters){};

View file

@ -798,8 +798,10 @@ public:
u64 last_gtsc = 0; u64 last_gtsc = 0;
u32 last_getllar = umax; // LS address of last GETLLAR (if matches current GETLLAR we can let the thread rest) u32 last_getllar = umax; // LS address of last GETLLAR (if matches current GETLLAR we can let the thread rest)
u32 last_getllar_id = umax; u32 last_getllar_id = umax;
u32 last_getllar_addr = umax;
u32 getllar_spin_count = 0; u32 getllar_spin_count = 0;
u32 getllar_busy_waiting_switch = umax; // umax means the test needs evaluation, otherwise it's a boolean u32 getllar_busy_waiting_switch = umax; // umax means the test needs evaluation, otherwise it's a boolean
u64 getllar_evaluate_time = 0;
std::vector<mfc_cmd_dump> mfc_history; std::vector<mfc_cmd_dump> mfc_history;
u64 mfc_dump_idx = 0; u64 mfc_dump_idx = 0;
@ -823,6 +825,8 @@ public:
u32 current_bp_pc = umax; u32 current_bp_pc = umax;
bool stop_flag_removal_protection = false; bool stop_flag_removal_protection = false;
std::array<std::array<u8, 4>, SPU_LS_SIZE / 32> getllar_wait_time{};
void push_snr(u32 number, u32 value); void push_snr(u32 number, u32 value);
static void do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* ls); static void do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* ls);
bool do_dma_check(const spu_mfc_cmd& args); bool do_dma_check(const spu_mfc_cmd& args);
@ -892,6 +896,7 @@ public:
static atomic_t<u32> g_raw_spu_ctr; static atomic_t<u32> g_raw_spu_ctr;
static atomic_t<u32> g_raw_spu_id[5]; static atomic_t<u32> g_raw_spu_id[5];
static atomic_t<u32> g_spu_work_count; static atomic_t<u32> g_spu_work_count;
static atomic_t<u8> g_reservation_waiters[32];
static u32 find_raw_spu(u32 id) static u32 find_raw_spu(u32 id)
{ {

View file

@ -33,7 +33,7 @@ struct cfg_root : cfg::node
cfg::_bool set_daz_and_ftz{ this, "Set DAZ and FTZ", false }; cfg::_bool set_daz_and_ftz{ this, "Set DAZ and FTZ", false };
cfg::_enum<spu_decoder_type> spu_decoder{ this, "SPU Decoder", spu_decoder_type::llvm }; cfg::_enum<spu_decoder_type> spu_decoder{ this, "SPU Decoder", spu_decoder_type::llvm };
cfg::uint<0, 100> spu_reservation_busy_waiting_percentage{ this, "SPU Reservation Busy Waiting Percentage", 0, true }; cfg::uint<0, 100> spu_reservation_busy_waiting_percentage{ this, "SPU Reservation Busy Waiting Percentage", 0, true };
cfg::uint<0, 100> spu_getllar_busy_waiting_percentage{ this, "SPU GETLLAR Busy Waiting Percentage", 100, true }; cfg::uint<0, 101> spu_getllar_busy_waiting_percentage{ this, "SPU GETLLAR Busy Waiting Percentage", 100, true };
cfg::_bool spu_debug{ this, "SPU Debug" }; cfg::_bool spu_debug{ this, "SPU Debug" };
cfg::_bool mfc_debug{ this, "MFC Debug" }; cfg::_bool mfc_debug{ this, "MFC Debug" };
cfg::_int<0, 6> preferred_spu_threads{ this, "Preferred SPU Threads", 0, true }; // Number of hardware threads dedicated to heavy simultaneous spu tasks cfg::_int<0, 6> preferred_spu_threads{ this, "Preferred SPU Threads", 0, true }; // Number of hardware threads dedicated to heavy simultaneous spu tasks