vm: improve range_lock and shareable cache (Non-TSX)

Allocate "personal" range lock variable for each spu_thread.
Switch from reservation_lock to range lock for all stores.
Detect actual memory mirrors in shareable cache setup logic.
Nekotekina 2020-10-26 04:05:17 +03:00
parent 6806e3d5c7
commit 4966f6de73
5 changed files with 343 additions and 212 deletions
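
The pattern the commit message describes, sketched as a small self-contained C++ example: each spu_thread owns one range-lock slot for its whole lifetime and brackets every plain store to main memory with it. The names vm_alloc_range_lock / vm_range_lock / vm_free_range_lock mirror the vm::alloc_range_lock / vm::range_lock / vm::free_range_lock calls visible in the hunks below; the 64-slot pool, the ownership bitmap, and the <addr, size> encoding are assumptions made only to keep the sketch compilable, not the actual vm.cpp implementation.

#include <atomic>
#include <bit>
#include <cstdint>
#include <cstring>

// Stand-ins for the vm:: range-lock API used in the hunks below. Only the
// calling convention is modelled: one slot per thread, publish <addr, size>
// while a store is in flight, write 0 to release.
using range_lock_slot = std::atomic<std::uint64_t>;

static range_lock_slot g_pool[64] {};
static std::atomic<std::uint64_t> g_pool_bits{0};        // one ownership bit per slot (assumption)

range_lock_slot* vm_alloc_range_lock()                   // models vm::alloc_range_lock()
{
    for (std::uint64_t bits = g_pool_bits.load();;)
    {
        const int i = std::countr_one(bits);             // index of the first free slot
        if (i == 64)
            return nullptr;                              // pool exhausted (sketch only)
        if (g_pool_bits.compare_exchange_weak(bits, bits | (1ull << i)))
            return &g_pool[i];
    }
}

void vm_range_lock(range_lock_slot* lock, std::uint32_t addr, std::uint32_t size)
{
    // Publish the locked range; the real vm::range_lock additionally waits
    // until no conflicting access overlaps the published range.
    lock->store(std::uint64_t{addr} | (std::uint64_t{size} << 32));
}

void vm_free_range_lock(range_lock_slot* lock)           // models vm::free_range_lock()
{
    lock->store(0, std::memory_order_release);
    g_pool_bits &= ~(1ull << (lock - g_pool));
}

// Per-thread usage, mirroring the spu_thread hunks and the new small-store path.
struct spu_thread_sketch
{
    range_lock_slot* range_lock = vm_alloc_range_lock();     // constructor: range_lock = vm::alloc_range_lock();
    ~spu_thread_sketch() { vm_free_range_lock(range_lock); } // destructor:  vm::free_range_lock(range_lock);

    void put_u32(void* dst, const void* src, std::uint32_t eal)
    {
        vm_range_lock(range_lock, eal, 4);                    // vm::range_lock(range_lock, eal, 4);
        std::memcpy(dst, src, 4);                             // *reinterpret_cast<u32*>(dst) = ...;
        range_lock->store(0, std::memory_order_release);      // range_lock->release(0);
    }
};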

@@ -1368,6 +1368,9 @@ spu_thread::~spu_thread()
g_raw_spu_id[index] = 0;
g_raw_spu_ctr--;
}
// Free range lock
vm::free_range_lock(range_lock);
}
spu_thread::spu_thread(vm::addr_t _ls, lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated, u32 option)
@@ -1418,6 +1421,8 @@ spu_thread::spu_thread(vm::addr_t _ls, lv2_spu_group* group, u32 index, std::str
{
cpu_init();
}
range_lock = vm::alloc_range_lock();
}
void spu_thread::push_snr(u32 number, u32 value)
@@ -1704,101 +1709,121 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
}
switch (u32 size = args.size)
if (g_cfg.core.spu_accurate_dma) [[unlikely]]
{
case 1:
{
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
res += 64;
break;
}
case 2:
{
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
res += 64;
break;
}
case 4:
{
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
res += 64;
break;
}
case 8:
{
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
res += 64;
break;
}
default:
{
if (g_cfg.core.spu_accurate_dma)
for (u32 size0, size = args.size;; size -= size0, dst += size0, src += size0, eal += size0)
{
for (u32 size0;; size -= size0, dst += size0, src += size0, eal += size0)
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
if (size0 == 128u && g_cfg.core.accurate_cache_line_stores)
{
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
if (size0 == 128u && g_cfg.core.accurate_cache_line_stores)
{
// As atomic as PUTLLUC
do_cell_atomic_128_store(eal, src);
if (size == size0)
{
break;
}
continue;
}
// Lock each cache line exclusively
auto [res, time0] = vm::reservation_lock(eal);
switch (size0)
{
case 128:
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
break;
}
default:
{
auto _dst = dst;
auto _src = src;
auto _size = size0;
while (_size)
{
*reinterpret_cast<v128*>(_dst) = *reinterpret_cast<const v128*>(_src);
_dst += 16;
_src += 16;
_size -= 16;
}
break;
}
}
res += 64;
// As atomic as PUTLLUC
do_cell_atomic_128_store(eal, src);
if (size == size0)
{
break;
}
continue;
}
break;
// Lock each cache line exclusively
auto [res, time0] = vm::reservation_lock(eal);
switch (size0)
{
case 1:
{
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
case 128:
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
break;
}
default:
{
auto _dst = dst;
auto _src = src;
auto _size = size0;
while (_size)
{
*reinterpret_cast<v128*>(_dst) = *reinterpret_cast<const v128*>(_src);
_dst += 16;
_src += 16;
_size -= 16;
}
break;
}
}
res += 64;
if (size == size0)
{
break;
}
}
std::atomic_thread_fence(std::memory_order_seq_cst);
return;
}
switch (u32 size = args.size)
{
case 1:
{
vm::range_lock(range_lock, eal, 1);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
range_lock->release(0);
break;
}
case 2:
{
vm::range_lock(range_lock, eal, 2);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
range_lock->release(0);
break;
}
case 4:
{
vm::range_lock(range_lock, eal, 4);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
range_lock->release(0);
break;
}
case 8:
{
vm::range_lock(range_lock, eal, 8);
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
range_lock->release(0);
break;
}
default:
{
if (((eal & 127) + size) <= 128)
{
// Lock one cache line
auto [res, time0] = vm::reservation_lock(eal);
vm::range_lock(range_lock, eal, size);
while (size)
{
@@ -1809,14 +1834,14 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size -= 16;
}
res += 64;
range_lock->release(0);
break;
}
u32 range_addr = eal & -128;
u32 range_end = ::align(eal + size, 128);
// Handle the case of crossing 64K page borders
// Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?)
if (range_addr >> 16 != (range_end - 1) >> 16)
{
u32 nexta = range_end & -65536;
@@ -1824,7 +1849,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size -= size0;
// Split locking + transfer in two parts (before 64K border, and after it)
const auto lock = vm::range_lock(range_addr, nexta);
vm::range_lock(range_lock, range_addr, size0);
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
@@ -1854,11 +1879,11 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size0 -= 16;
}
lock->release(0);
range_lock->release(0);
range_addr = nexta;
}
const auto lock = vm::range_lock(range_addr, range_end);
vm::range_lock(range_lock, range_addr, range_end - range_addr);
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
@@ -1888,16 +1913,11 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size -= 16;
}
lock->release(0);
range_lock->release(0);
break;
}
}
if (g_cfg.core.spu_accurate_dma)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
}
return;
}
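
For reference, a condensed sketch of the 64K-page-border handling in the new default path shown above: lock and copy the part of the transfer before the border, then the remainder. The helper name, the plain memcpy bodies, and the size0 = nexta - eal derivation are illustrative assumptions (the line deriving size0 falls outside the hunk above); the real code moves data in 16- and 128-byte chunks through the per-thread range_lock slot.

#include <atomic>
#include <cstdint>
#include <cstring>

// Stand-ins shared with the sketch near the top of this page.
using range_lock_slot = std::atomic<std::uint64_t>;
void vm_range_lock(range_lock_slot* lock, std::uint32_t addr, std::uint32_t size);

// Hypothetical helper: a transfer whose destination crosses a 64K page border
// is split into two separately range-locked parts, as in the diff above.
void put_split_at_64k(range_lock_slot* range_lock,
                      std::uint8_t* dst, const std::uint8_t* src,
                      std::uint32_t eal, std::uint32_t size)
{
    std::uint32_t range_addr = eal & -128;                       // cache-line-aligned start
    const std::uint32_t range_end = (eal + size + 127) & -128;   // cache-line-aligned end (::align(eal + size, 128))

    if (range_addr >> 16 != (range_end - 1) >> 16)               // transfer touches two 64K pages
    {
        const std::uint32_t nexta = range_end & -65536;          // start of the 64K page holding the tail
        const std::uint32_t size0 = nexta - eal;                 // assumption: bytes to move before the border

        vm_range_lock(range_lock, range_addr, size0);            // vm::range_lock(range_lock, range_addr, size0);
        std::memcpy(dst, src, size0);                            // real code: 16/128-byte vector moves
        range_lock->store(0, std::memory_order_release);         // range_lock->release(0);

        dst += size0;
        src += size0;
        size -= size0;
        range_addr = nexta;                                      // second part starts at the border
    }

    vm_range_lock(range_lock, range_addr, range_end - range_addr);
    std::memcpy(dst, src, size);
    range_lock->store(0, std::memory_order_release);
}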