CPU: improve cpu_thread::suspend_all for cache efficiency (TSX)

Add a prefetch hint list parameter. Workloads may be executed by another thread on another CPU core, so they may benefit from directly prefetching the hinted data. Also implement mov_rdata_nt for "streaming" data from such workloads.
2025-07-07 23:41:26 +12:00 · 2020-10-30 05:17:00 +03:00 · 2020-10-30 05:17:00 +03:00 · 0da24f21d6
commit 0da24f21d6
parent e794109a67
6 changed files with 132 additions and 32 deletions
--- a/rpcs3/Emu/CPU/CPUThread.cpp
+++ b/rpcs3/Emu/CPU/CPUThread.cpp
@@ -889,7 +889,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
 			}
 		});

-		while (std::accumulate(std::begin(ctr->cpu_copy_bits), std::end(ctr->cpu_copy_bits), u64{0}, std::bit_or()))
+		while (true)
 		{
 			// Check only CPUs which haven't acknowledged their waiting state yet
 			for_all_cpu<true>([&](cpu_thread* cpu, u64 index)
@@ -900,6 +900,11 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
 				}
 			});

+			if (!std::accumulate(std::begin(ctr->cpu_copy_bits), std::end(ctr->cpu_copy_bits), u64{0}, std::bit_or()))
+			{
+				break;
+			}
+
 			_mm_pause();
 		}

@@ -927,13 +932,20 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
 			while (prev);
 		}

+		// Execute prefetch hint(s): issue a write-prefetch for every hinted address of each workload in the list
+		for (auto work = head; work; work = work->next)
+		{
+			for (u32 i = 0; i < work->prf_size; i++)
+			{
+				_m_prefetchw(work->prf_list[i]);
+			}
+		}
+
 		for_all_cpu<true>([&](cpu_thread* cpu)
 		{
 			_m_prefetchw(&cpu->state);
 		});

-		_m_prefetchw(&g_suspend_counter);
-
 		// Execute all stored workload
 		for (s32 prio = max_prio; prio >= min_prio; prio--)
 		{
@@ -948,6 +960,9 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this, bool cancel_if_not_suspen
 			}
 		}

+		// Not sure if needed, may be overkill. Some workloads may execute instructions with non-temporal hint.
+		_mm_sfence();
+
 		// Finalization
 		g_suspend_counter++;