From 346a1d4433621db384005eff587e69dceb46dd47 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Thu, 8 Oct 2020 16:13:55 +0300
Subject: [PATCH] vm: rewrite reservation bits

Implement classic unique/shared locking concept.
Implement vm::reservation_light_op.

---
 rpcs3/Emu/CPU/CPUThread.cpp             |   3 +-
 rpcs3/Emu/CPU/CPUThread.h               |   1 +
 rpcs3/Emu/Cell/Modules/cellSpurs.cpp    |  92 ++++++++--------
 rpcs3/Emu/Cell/Modules/cellSpurs.h      |  16 +++
 rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp |  14 +--
 rpcs3/Emu/Cell/PPUThread.cpp            | 131 +++++++++++++++++-----
 rpcs3/Emu/Cell/SPUThread.cpp            | 137 +++++++++++++-----------
 rpcs3/Emu/Memory/vm.cpp                 |  34 +++++-
 rpcs3/Emu/Memory/vm_reservation.h       |  90 ++++++++++++----
 rpcs3/Emu/RSX/rsx_methods.cpp           |   8 +-
 10 files changed, 356 insertions(+), 170 deletions(-)

diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp
index 604d0429f8..2ad974fb0f 100644
--- a/rpcs3/Emu/CPU/CPUThread.cpp
+++ b/rpcs3/Emu/CPU/CPUThread.cpp
@@ -338,7 +338,7 @@ void cpu_thread::operator()()
 	{
 		thread_ctrl::set_native_priority(-1);
 	}
-	
+
 	// force input/output denormals to zero for SPU threads (FTZ/DAZ)
 	_mm_setcsr( _mm_getcsr() | 0x8040 );
 
@@ -653,6 +653,7 @@ cpu_thread::suspend_all::suspend_all(cpu_thread* _this) noexcept
 
 	for_all_cpu([](cpu_thread* cpu)
 	{
+		// Should be atomic
 		if (!(cpu->state & cpu_flag::pause))
 		{
 			cpu->state += cpu_flag::pause;
diff --git a/rpcs3/Emu/CPU/CPUThread.h b/rpcs3/Emu/CPU/CPUThread.h
index 853d967eec..0aebb8c345 100644
--- a/rpcs3/Emu/CPU/CPUThread.h
+++ b/rpcs3/Emu/CPU/CPUThread.h
@@ -12,6 +12,7 @@ enum class cpu_flag : u32
 	exit, // Irreversible exit
 	wait, // Indicates waiting state, set by the thread itself
 	pause, // Thread suspended by suspend_all technique
+	pause2, // Used by suspend_all internally
 	suspend, // Thread suspended
 	ret, // Callback return requested
 	signal, // Thread received a signal (HLE)
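
Note: every hunk below migrates from the old per-purpose lock bits (stcx_lockb, dma_lockb, putlluc_lockb) to the unique/shared scheme declared in vm_reservation.h further down. A sketch of how the 64-bit reservation word is divided under this patch (the comments are mine, derived from the enum and the arithmetic in the hunks):

    // bits 7..63 : timestamp; every completed write advances it by 128
    // bit  6     : rsrv_unique_lock - exclusive writers (PUTLLUC, DMA, reservation_op)
    // bits 0..5  : shared writer count - "light" ops enter with fetch_add(1)
    //
    // shared writer: res.fetch_add(1); ...modify...; res += 127;        // net +128
    // unique writer: CAS time -> time | 64; ...modify...; res += 64;    // net +128
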
diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
index f9a34c4a4e..f7a135a26c 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpurs.cpp
@@ -2536,9 +2536,10 @@ s32 cellSpursShutdownWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid
 		return CELL_OK;
 	}
 
-	auto [res, rtime] = vm::reservation_lock(vm::get_addr(&spurs->wklEvent(wid)), 1, vm::dma_lockb);
-	const auto old = spurs->wklEvent(wid).fetch_or(1);
-	res.release(rtime + (old & 1 ? 0 : 128));
+	const auto old = vm::reservation_light_op(spurs->wklEvent(wid), [](atomic_t<u8>& v)
+	{
+		return v.fetch_or(1);
+	});
 
 	if (old & 0x12 && !(old & 1) && sys_event_port_send(spurs->eventPort, 0, 0, (1u << 31) >> wid))
 	{
@@ -2693,9 +2694,11 @@ s32 cellSpursReadyCountStore(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid,
 		return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
 	}
 
-	auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-	spurs->readyCount(wid).release(static_cast<u8>(value));
-	res.store(rtime + 128);
+	vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
+	{
+		v.release(static_cast<u8>(value));
+	});
+
 	return CELL_OK;
 }
 
@@ -2729,11 +2732,11 @@ s32 cellSpursReadyCountSwap(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid,
 		return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
 	}
 
-	auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-	u32 temp = spurs->readyCount(wid).exchange(static_cast<u8>(swap));
-	res.release(rtime + 128);
+	*old = vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
+	{
+		return v.exchange(static_cast<u8>(swap));
+	});
 
-	*old = temp;
 	return CELL_OK;
 }
 
@@ -2769,9 +2772,10 @@ s32 cellSpursReadyCountCompareAndSwap(ppu_thread& ppu, vm::ptr<CellSpurs> spurs,
 
 	u8 temp = static_cast<u8>(compare);
 
-	auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-	spurs->readyCount(wid).compare_exchange(temp, static_cast<u8>(swap));
-	res.release(rtime + 128);
+	vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
+	{
+		v.compare_exchange(temp, static_cast<u8>(swap));
+	});
 
 	*old = temp;
 	return CELL_OK;
@@ -2807,17 +2811,15 @@ s32 cellSpursReadyCountAdd(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid, v
 		return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
 	}
 
-	auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-
-	u32 temp = spurs->readyCount(wid).fetch_op([&](u8& val)
+	*old = vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
 	{
-		const s32 _new = val + value;
-		val = static_cast<u8>(std::clamp(_new, 0, 0xFF));
+		return v.fetch_op([&](u8& val)
+		{
+			const s32 _new = val + value;
+			val = static_cast<u8>(std::clamp(_new, 0, 255));
+		});
 	});
 
-	res.release(rtime + 128);
-
-	*old = temp;
 	return CELL_OK;
 }
 
@@ -3833,13 +3835,12 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
 	// TODO: Verify the ELF header is proper and all its load segments are at address >= 0x3000
 
 	u32 tmp_task_id;
-	{
-		auto addr = taskset.ptr(&CellSpursTaskset::enabled).addr();
-		auto [res, rtime] = vm::reservation_lock(addr, 16, vm::dma_lockb);
 
+	vm::reservation_light_op(vm::_ref<atomic_be_t<v128>>(taskset.ptr(&CellSpursTaskset::enabled).addr()), [&](atomic_be_t<v128>& ptr)
+	{
 		// NOTE: Realfw processes this using 4 32-bits atomic loops
 		// But here its processed within a single 128-bit atomic op
-		vm::_ref<atomic_be_t<v128>>(addr).fetch_op([&](be_t<v128>& value)
+		ptr.fetch_op([&](be_t<v128>& value)
 		{
 			auto value0 = value.value();
 
@@ -3862,9 +3863,7 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
 			tmp_task_id = CELL_SPURS_MAX_TASK;
 			return false;
 		});
-
-		res.release(rtime + 128);
-	}
+	});
 
 	if (tmp_task_id >= CELL_SPURS_MAX_TASK)
 	{
@@ -3885,9 +3884,10 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
 
 s32 _spurs::task_start(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32 taskId)
 {
-	auto [res, rtime] = vm::reservation_lock(taskset.ptr(&CellSpursTaskset::pending_ready).addr(), 16, vm::dma_lockb);
-	taskset->pending_ready.values[taskId / 32] |= (1u << 31) >> (taskId % 32);
-	res.release(rtime + 128);
+	vm::reservation_light_op(taskset->pending_ready, [&](CellSpursTaskset::atomic_tasks_bitset& v)
+	{
+		v.values[taskId / 32] |= (1u << 31) >> (taskId % 32);
+	});
 
 	auto spurs = +taskset->spurs;
 	ppu_execute<&cellSpursSendWorkloadSignal>(ppu, spurs, +taskset->wid);
@@ -4706,25 +4706,23 @@ s32 cellSpursJobGuardNotify(ppu_thread& ppu, vm::ptr<CellSpursJobGuard> jobGuard
 	if (!jobGuard.aligned())
 		return CELL_SPURS_JOB_ERROR_ALIGN;
 
-	auto [res, rtime] = vm::reservation_lock(jobGuard.addr(), 128, vm::dma_lockb);
-
 	u32 allow_jobchain_run = 0; // Affects cellSpursJobChainRun execution
+	u32 old = 0;
 
-	auto [old, ok] = jobGuard->ncount0.fetch_op([&](be_t<u32>& value)
+	const bool ok = vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
 	{
-		allow_jobchain_run = jobGuard->zero;
+		allow_jobchain_run = jg.zero;
+		old = jg.ncount0;
 
-		if (!value)
+		if (!jg.ncount0)
 		{
 			return false;
 		}
 
-		--value;
+		jg.ncount0--;
 		return true;
 	});
 
-	res.release(rtime + (ok ? 128 : 0));
-
 	if (!ok)
 	{
 		return CELL_SPURS_CORE_ERROR_STAT;
@@ -4759,9 +4757,11 @@ s32 cellSpursJobGuardReset(vm::ptr<CellSpursJobGuard> jobGuard)
 	if (!jobGuard.aligned())
 		return CELL_SPURS_JOB_ERROR_ALIGN;
 
-	auto [res, rtime] = vm::reservation_lock(jobGuard.addr(), 128, vm::dma_lockb);
-	jobGuard->ncount0 = jobGuard->ncount1;
-	res.release(rtime + 128);
+	vm::reservation_light_op(jobGuard->ncount0, [&](atomic_be_t<u32>& ncount0)
+	{
+		ncount0 = jobGuard->ncount1;
+	});
+
 	return CELL_OK;
 }
 
@@ -4844,9 +4844,11 @@ s32 cellSpursJobSetMaxGrab(vm::ptr<CellSpursJobChain> jobChain, u32 maxGrabbedJo
 	if ((spurs->wklEnabled & (0x80000000u >> wid)) == 0u)
 		return CELL_SPURS_JOB_ERROR_STAT;
 
-	auto [res, rtime] = vm::reservation_lock(jobChain.addr(), 128, vm::dma_lockb);
-	jobChain->maxGrabbedJob.release(static_cast<u16>(maxGrabbedJob));
-	res.store(rtime + 128);
+	vm::reservation_light_op(jobChain->maxGrabbedJob, [&](atomic_be_t<u16>& v)
+	{
+		v.release(static_cast<u16>(maxGrabbedJob));
+	});
+
 	return CELL_OK;
 }
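
The conversions above all follow one pattern: a field living inside a reserved 128-byte line is updated through vm::reservation_light_op instead of a full reservation_lock/release pair. A minimal usage sketch (the wrapper function is hypothetical; the API is the one added in vm_reservation.h below):

    // Atomically store a byte that lives in vm memory, bumping the line's timestamp.
    static void store_ready_count(atomic_t<u8>& slot, u8 value)
    {
        vm::reservation_light_op(slot, [&](atomic_t<u8>& v)
        {
            v.release(value); // the lambda runs while the shared lock is held
        });
    }
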
diff --git a/rpcs3/Emu/Cell/Modules/cellSpurs.h b/rpcs3/Emu/Cell/Modules/cellSpurs.h
index 4adfff87ae..b602bf97fd 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpurs.h
+++ b/rpcs3/Emu/Cell/Modules/cellSpurs.h
@@ -561,6 +561,22 @@ struct alignas(128) CellSpursJobGuard
 
 CHECK_SIZE_ALIGN(CellSpursJobGuard, 128, 128);
 
+struct alignas(128) CellSpursJobGuard_x00
+{
+	be_t<u32> ncount0; // 0x00
+	be_t<u32> ncount1; // 0x04
+	vm::bptr<CellSpursJobChain> jobChain; // 0x0C
+	be_t<u32> unk0;
+	be_t<u32> requestSpuCount; // 0x10
+	be_t<u32> unk1[3];
+	be_t<u32> autoReset; // 0x20
+	be_t<u32> unk2[3];
+	be_t<u32> zero; // 0x30
+	u8 unk3[0x80 - 0x34];
+};
+
+CHECK_SIZE_ALIGN(CellSpursJobGuard_x00, 128, 128);
+
 // Core CellSpurs structures
 struct alignas(128) CellSpurs
 {
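
CellSpursJobGuard_x00 exists only to give vm::reservation_op a plain 128-byte view of the guard, so the whole cache line can be processed as one atomic operation. If the offset comments are right, they could be pinned down with static_asserts like these (a sketch, not part of the patch):

    static_assert(offsetof(CellSpursJobGuard_x00, ncount1) == 0x04);
    static_assert(offsetof(CellSpursJobGuard_x00, autoReset) == 0x20);
    static_assert(offsetof(CellSpursJobGuard_x00, zero) == 0x30);
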
diff --git a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
index ebe3f7b0c8..ab4b2f920a 100644
--- a/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp
@@ -1431,7 +1431,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
 	// Find the number of tasks that have become ready since the last iteration
 	{
 		auto newlyReadyTasks = v128::andnot(ready, signalled | pready);
-		
+
 		// TODO: Optimize this shit with std::popcount when it's known to be fixed
 		for (auto i = 0; i < 128; i++)
 		{
@@ -1597,14 +1597,14 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
 	{
 		auto spurs = kernelCtxt->spurs;
 
-		auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-		spurs->readyCount(kernelCtxt->wklCurrentId).fetch_op([&](u8& val)
+		vm::reservation_light_op(spurs->readyCount(kernelCtxt->wklCurrentId), [&](atomic_t<u8>& val)
 		{
-			const s32 _new = val + numNewlyReadyTasks;
-			val = static_cast<u8>(std::clamp(_new, 0, 0xFF));
+			val.fetch_op([&](u8& val)
+			{
+				const s32 _new = val + numNewlyReadyTasks;
+				val = static_cast<u8>(std::clamp(_new, 0, 0xFF));
+			});
 		});
-
-		res.release(rtime + 128);
 	}
 
 	return rc;
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 915e9fa5f3..66a49f1340 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -946,7 +946,7 @@ void ppu_thread::fast_call(u32 addr, u32 rtoc)
 
 		if (_this->current_function && vm::read32(cia) != ppu_instructions::SC(0))
 		{
-			return fmt::format("PPU[0x%x] Thread (%s) [HLE:0x%08x, LR:0x%08x]", _this->id, *name_cache.get(), cia, _this->lr); 
+			return fmt::format("PPU[0x%x] Thread (%s) [HLE:0x%08x, LR:0x%08x]", _this->id, *name_cache.get(), cia, _this->lr);
 		}
 
 		return fmt::format("PPU[0x%x] Thread (%s) [0x%08x]", _this->id, *name_cache.get(), cia);
@@ -1103,7 +1103,6 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 	const u64 data_off = (addr & 7) * 8;
 
 	ppu.raddr = addr;
-	const u64 mask_res = g_use_rtm ? (-128 | vm::dma_lockb) : -1;
 
 	if (const s32 max = g_cfg.core.ppu_128_reservations_loop_max_length)
 	{
@@ -1160,7 +1159,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 	for (u64 count = 0;; [&]()
 	{
 		if (ppu.state)
-		{ 
+		{
 			ppu.check_state();
 		}
 		else if (++count < 20) [[likely]]
@@ -1175,7 +1174,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 		}
 	}())
 	{
-		ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & mask_res;
+		ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
 
 		if (ppu.rtime & 127)
 		{
@@ -1189,7 +1188,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 			mov_rdata(ppu.rdata, vm::_ref<decltype(ppu.rdata)>(addr & -128));
 		}
 
-		if ((vm::reservation_acquire(addr, sizeof(T)) & mask_res) == ppu.rtime) [[likely]]
+		if (vm::reservation_acquire(addr, sizeof(T)) == ppu.rtime) [[likely]]
 		{
 			if (count >= 15) [[unlikely]]
 			{
@@ -1218,6 +1217,7 @@ const auto ppu_stcx_tx = build_function_asm(addr & -128)))
+			// Obtain unique lock
+			while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
+			{
+				busy_wait(100);
+
+				// Give up if reservation has been updated
+				if ((res & -128) != rtime)
+				{
+					res -= 1;
+					return false;
+				}
+			}
+
+			if ((res & -128) == rtime && cmp_rdata(ppu.rdata, vm::_ref<decltype(ppu.rdata)>(addr & -128)))
 			{
 				data.release(reg_value);
-				res.release(rtime + 128);
+				res += 63;
 				return true;
 			}
 
-			res.release(rtime);
+			res -= (vm::rsrv_unique_lock + 1);
 			return false;
 		}
 
-		if (!vm::reservation_trylock(res, rtime))
+		while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
 		{
+			// Give up if reservation has been updated
+			if ((res & -128) != rtime)
+			{
+				return false;
+			}
+
+			if (ppu.state && ppu.check_state())
+			{
+				return false;
+			}
+			else
+			{
+				busy_wait(100);
+			}
+		}
+
+		if ((res & -128) != rtime)
+		{
+			res -= vm::rsrv_unique_lock;
 			return false;
 		}
 
@@ -1654,24 +1693,64 @@
 		default: break;
 		}
 
-		if (res == rtime && vm::reservation_trylock(res, rtime))
+		if (res.fetch_add(1) & vm::rsrv_unique_lock)
 		{
-			const bool ret = data.compare_and_swap_test(old_data, reg_value);
-			res.release(rtime + 128);
-			return ret;
+			res -= 1;
+			return false;
 		}
 
+		if (data.compare_and_swap_test(old_data, reg_value))
+		{
+			res += 127;
+			return true;
+		}
+
+		res -= 1;
 		return false;
 	}
 
-	if (!vm::reservation_trylock(res, rtime))
+	while (true)
 	{
-		return false;
+		auto [_old, _ok] = res.fetch_op([&](u64& r)
+		{
+			if ((r & -128) != rtime || (r & vm::rsrv_unique_lock))
+			{
+				return false;
+			}
+
+			r += 1;
+			return true;
+		});
+
+		// Give up if reservation has been updated
+		if ((_old & -128) != rtime)
+		{
+			return false;
+		}
+
+		if (_ok)
+		{
+			break;
+		}
+
+		if (ppu.state && ppu.check_state())
+		{
+			return false;
+		}
+		else
+		{
+			busy_wait(100);
+		}
 	}
 
-	const bool ret = data.compare_and_swap_test(old_data, reg_value);
-	res.release(rtime + 128);
-	return ret;
+	if (data.compare_and_swap_test(old_data, reg_value))
+	{
+		res += 127;
+		return true;
+	}
+
+	res -= 1;
+	return false;
 }())
 {
 	res.notify_all();
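
Dropping mask_res works because lock bits and timestamp now share one encoding: any lock activity leaves the low 7 bits nonzero, so the reader simply retries. The load path is essentially a seqlock read side; a self-contained model with std::atomic stand-ins (not emulator code, and it glosses over the formal data race on the unsynchronized copy that a real seqlock must address):

    #include <atomic>
    #include <cstdint>
    #include <cstring>

    struct reserved_line
    {
        std::atomic<uint64_t> time{0}; // low 7 bits: locks, upper bits: timestamp
        unsigned char data[128]{};
    };

    // Returns true if out[] received a consistent snapshot of the line
    bool try_read(const reserved_line& line, unsigned char (&out)[128])
    {
        const uint64_t t0 = line.time.load(std::memory_order_acquire);

        if (t0 & 127)
            return false; // writer in progress, retry later

        std::memcpy(out, line.data, sizeof(out)); // racy copy, validated below
        std::atomic_thread_fence(std::memory_order_acquire);
        return line.time.load(std::memory_order_relaxed) == t0;
    }
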
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 15be3b017f..facba04dd8 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -378,11 +378,11 @@ const auto spu_putllc_tx = build_function_asm(vm::putlluc_lockb));
-	c.jc(fail2);
+	// Lock reservation
+	c.mov(x86::eax, 1);
+	c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
+	c.test(x86::eax, vm::rsrv_unique_lock);
+	c.jnz(fall2);
 
 	build_transaction_enter(c, fall2, x86::r12, 666);
 
@@ -719,16 +725,12 @@ const auto spu_putlluc_tx = build_function_asm(dst) = *reinterpret_cast(src);
-		res.release(time0 + 128);
+		res += 64;
 		break;
 	}
 	case 2:
 	{
-		auto [res, time0] = vm::reservation_lock(eal, 2, vm::dma_lockb);
+		auto [res, time0] = vm::reservation_lock(eal);
 		*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
-		res.release(time0 + 128);
+		res += 64;
 		break;
 	}
 	case 4:
 	{
-		auto [res, time0] = vm::reservation_lock(eal, 4, vm::dma_lockb);
+		auto [res, time0] = vm::reservation_lock(eal);
 		*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
-		res.release(time0 + 128);
+		res += 64;
 		break;
 	}
 	case 8:
 	{
-		auto [res, time0] = vm::reservation_lock(eal, 8, vm::dma_lockb);
+		auto [res, time0] = vm::reservation_lock(eal);
 		*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
-		res.release(time0 + 128);
+		res += 64;
 		break;
 	}
 	default:
 	{
@@ -1463,7 +1465,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 			}
 
 			// Lock each cache line exclusively
-			auto [res, time0] = vm::reservation_lock(eal, size0, vm::dma_lockb);
+			auto [res, time0] = vm::reservation_lock(eal);
 
 			switch (size0)
 			{
@@ -1491,7 +1493,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 				}
 			}
 
-			res.release(time0 + 128);
+			res += 64;
 
 			if (size == size0)
 			{
@@ -1505,7 +1507,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 		if (((eal & 127) + size) <= 128)
 		{
 			// Lock one cache line
-			auto [res, time0] = vm::reservation_lock(eal, 128);
+			auto [res, time0] = vm::reservation_lock(eal);
 
 			while (size)
 			{
@@ -1516,7 +1518,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 				size -= 16;
 			}
 
-			res.release(time0);
+			res += 64;
 
 			break;
 		}
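
The DMA path above takes the unique bit per cache line through vm::reservation_lock and releases with += 64; together with the CAS on entry this still advances the timestamp by a full 128. The accounting, as I read it from the hunks:

    // vm::reservation_lock: CAS  time -> time | 64   (only succeeds when the low 7 bits are 0)
    // ... store up to 128 bytes into the locked line ...
    // res += 64;           // 64 + 64 = 128: lock cleared, timestamp advanced
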
@@ -1848,21 +1850,34 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 
 			cpu_thread::suspend_all cpu_lock(this);
 
-			// Give up if PUTLLUC happened
-			if (res == (rtime | 1))
+			// Obtain unique lock
+			while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
+			{
+				busy_wait(100);
+
+				// Give up if reservation has been updated
+				if ((res & -128) != rtime)
+				{
+					res -= 1;
+					if (render) render->unpause();
+					return false;
+				}
+			}
+
+			if ((res & -128) == rtime)
 			{
 				auto& data = vm::_ref<decltype(rdata)>(addr);
 
 				if (cmp_rdata(rdata, data))
 				{
					mov_rdata(data, to_write);
-					res += 127;
+					res += 63;
 					if (render) render->unpause();
 					return true;
 				}
 			}
 
-			res -= 1;
+			res -= (vm::rsrv_unique_lock | 1);
 			if (render) render->unpause();
 			return false;
 		}
@@ -1872,8 +1887,27 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 		}
 	}
 
-	if (!vm::reservation_trylock(res, rtime))
+	while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
 	{
+		// Give up if reservation has been updated
+		if ((res & -128) != rtime)
+		{
+			return false;
+		}
+
+		if (state && check_state())
+		{
+			return false;
+		}
+		else
+		{
+			busy_wait(100);
+		}
+	}
+
+	if ((res & -128) != rtime)
+	{
+		res -= vm::rsrv_unique_lock;
 		return false;
 	}
 
@@ -1914,7 +1948,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 	if (raddr)
 	{
 		// Last check for event before we clear the reservation
-		if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
+		if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || !cmp_rdata(rdata, vm::_ref<decltype(rdata)>(raddr)))
 		{
 			set_events(SPU_EVENT_LR);
 		}
@@ -1937,38 +1971,17 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 
 		if (render) render->pause();
 
-		if (result == 2)
+		if (result == 0)
 		{
 			cpu_thread::suspend_all cpu_lock(cpu);
 
-			if (vm::reservation_acquire(addr, 128) & 64)
-			{
-				// Wait for PUTLLC to complete
-				while (vm::reservation_acquire(addr, 128) & 63)
-				{
-					busy_wait(100);
-				}
-
-				mov_rdata(vm::_ref<decltype(spu_thread::rdata)>(addr), *static_cast<const decltype(spu_thread::rdata)*>(to_write));
-				vm::reservation_acquire(addr, 128) += 64;
-			}
-		}
-		else if (result == 0)
-		{
-			cpu_thread::suspend_all cpu_lock(cpu);
-
-			while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
-			{
-				busy_wait(100);
-			}
-
-			while (vm::reservation_acquire(addr, 128) & 63)
+			while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
 			{
 				busy_wait(100);
 			}
 
 			mov_rdata(vm::_ref<decltype(spu_thread::rdata)>(addr), *static_cast<const decltype(spu_thread::rdata)*>(to_write));
-			vm::reservation_acquire(addr, 128) += 64;
+			vm::reservation_acquire(addr, 128) += 63;
 		}
 
 		if (render) render->unpause();
@@ -1977,7 +1990,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
 	else
 	{
 		auto& data = vm::_ref<decltype(spu_thread::rdata)>(addr);
-		auto [res, time0] = vm::reservation_lock(addr, 128);
+		auto [res, time0] = vm::reservation_lock(addr);
 
 		*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
 
@@ -2247,7 +2260,7 @@ bool spu_thread::process_mfc_cmd()
 		if (raddr && raddr != addr)
 		{
 			// Last check for event before we replace the reservation with a new one
-			if ((vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) != rtime || !cmp_rdata(temp, vm::_ref<decltype(rdata)>(raddr)))
+			if ((vm::reservation_acquire(raddr, 128) & -128) != rtime || !cmp_rdata(temp, vm::_ref<decltype(rdata)>(raddr)))
 			{
 				set_events(SPU_EVENT_LR);
 			}
@@ -3583,7 +3596,7 @@ bool spu_thread::capture_local_storage() const
 			if (name.empty())
 			{
 				// TODO: Maybe add thread group name here
-				fmt::append(name, "SPU.%u", lv2_id); 
+				fmt::append(name, "SPU.%u", lv2_id);
 			}
 		}
 		else
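
In the suspend_all fallbacks above the thread already holds a +1 from the failed transaction attempt (the lock xadd in the rewritten asm), which is why success releases with += 63 rather than += 127. My reading of the arithmetic, as comments:

    // held on entry : +1  (lock xadd from the transaction path)
    // acquire       : bts bit 6, i.e. +64
    // publish       : res += 63;                     // 1 + 64 + 63 = 128
    // give up       : res -= (rsrv_unique_lock | 1); // drop both, timestamp untouched
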
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index 0251f28dc8..a15b805f0d 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -441,11 +441,11 @@ namespace vm
 		g_mutex.unlock();
 	}
 
-	u64 reservation_lock_internal(u32 addr, atomic_t<u64>& res, u64 lock_bits)
+	u64 reservation_lock_internal(u32 addr, atomic_t<u64>& res)
 	{
 		for (u64 i = 0;; i++)
 		{
-			if (u64 rtime = res; !(rtime & 127) && reservation_trylock(res, rtime, lock_bits)) [[likely]]
+			if (u64 rtime = res; !(rtime & 127) && reservation_try_lock(res, rtime)) [[likely]]
 			{
 				return rtime;
 			}
@@ -471,6 +471,30 @@ namespace vm
 		}
 	}
 
+	void reservation_shared_lock_internal(atomic_t<u64>& res)
+	{
+		for (u64 i = 0;; i++)
+		{
+			if (!(res & rsrv_unique_lock)) [[likely]]
+			{
+				return;
+			}
+
+			if (auto cpu = get_current_cpu_thread(); cpu && cpu->state)
+			{
+				cpu->check_state();
+			}
+			else if (i < 15)
+			{
+				busy_wait(500);
+			}
+			else
+			{
+				std::this_thread::yield();
+			}
+		}
+	}
+
 	void reservation_op_internal(u32 addr, std::function<bool()> func)
 	{
 		const auto _cpu = get_current_cpu_thread();
@@ -481,15 +505,15 @@ namespace vm
 		{
 			cpu_thread::suspend_all cpu_lock(_cpu);
 
-			// Wait to acquire PUTLLUC lock
-			while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
+			// Wait to acquire unique lock
+			while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
 			{
 				busy_wait(100);
 			}
 
 			if (func())
 			{
-				// Success, release PUTLLUC and PUTLLC locks if necessary
+				// Success, release all locks if necessary
 				vm::reservation_acquire(addr, 128) += 63;
 			}
 			else
diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h
index fd593c28db..2b022659f6 100644
--- a/rpcs3/Emu/Memory/vm_reservation.h
+++ b/rpcs3/Emu/Memory/vm_reservation.h
@@ -10,11 +10,11 @@ extern bool g_use_rtm;
 
 namespace vm
 {
-	enum reservation_lock_bit : u64
+	enum : u64
 	{
-		stcx_lockb = 1 << 0, // Exclusive conditional reservation lock
-		dma_lockb = 1 << 5, // Exclusive unconditional reservation lock
-		putlluc_lockb = 1 << 6, // Exclusive unconditional reservation lock
+		rsrv_lock_mask = 127,
+		rsrv_unique_lock = 64,
+		rsrv_shared_mask = 63,
 	};
 
 	// Get reservation status for further atomic update: last update timestamp
@@ -42,11 +42,13 @@ namespace vm
 		return *reinterpret_cast<atomic_t<u64>*>(g_reservations + (addr & 0xff80) / 2);
 	}
 
-	u64 reservation_lock_internal(u32, atomic_t<u64>&, u64);
+	u64 reservation_lock_internal(u32, atomic_t<u64>&);
 
-	inline bool reservation_trylock(atomic_t<u64>& res, u64 rtime, u64 lock_bits = stcx_lockb)
+	void reservation_shared_lock_internal(atomic_t<u64>&);
+
+	inline bool reservation_try_lock(atomic_t<u64>& res, u64 rtime)
 	{
-		if (res.compare_and_swap_test(rtime, rtime + lock_bits)) [[likely]]
+		if (res.compare_and_swap_test(rtime, rtime | rsrv_unique_lock)) [[likely]]
 		{
 			return true;
 		}
@@ -54,16 +56,16 @@ namespace vm
 		return false;
 	}
 
-	inline std::pair<atomic_t<u64>&, u64> reservation_lock(u32 addr, u32 size, u64 lock_bits = stcx_lockb)
+	inline std::pair<atomic_t<u64>&, u64> reservation_lock(u32 addr)
 	{
-		auto res = &vm::reservation_acquire(addr, size);
+		auto res = &vm::reservation_acquire(addr, 1);
 		auto rtime = res->load();
 
-		if (rtime & 127 || !reservation_trylock(*res, rtime, lock_bits)) [[unlikely]]
+		if (rtime & 127 || !reservation_try_lock(*res, rtime)) [[unlikely]]
 		{
 			static atomic_t<u64> no_lock{};
 
-			rtime = reservation_lock_internal(addr, *res, lock_bits);
+			rtime = reservation_lock_internal(addr, *res);
 
 			if (rtime == umax)
 			{
@@ -95,6 +97,7 @@ namespace vm
 
 		// Stage 1: single optimistic transaction attempt
 		unsigned status = _XBEGIN_STARTED;
+		u64 _old = 0;
 
 #ifndef _MSC_VER
 		__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
@@ -103,6 +106,15 @@ namespace vm
 		if (status == _XBEGIN_STARTED)
 #endif
 		{
+			if (res & rsrv_unique_lock)
+			{
+#ifndef _MSC_VER
+				__asm__ volatile ("xabort $0;" ::: "memory");
+#else
+				_xabort(0);
+#endif
+			}
+
 			if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
 			{
 				res += 128;
@@ -161,10 +173,10 @@ namespace vm
 		}
 
 		// Stage 2: try to lock reservation first
-		res += stcx_lockb;
+		_old = res.fetch_add(1);
 
 		// Start lightened transaction (TODO: tweaking)
-		while (true)
+		while (!(_old & rsrv_unique_lock))
 		{
 #ifndef _MSC_VER
 			__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
@@ -263,11 +275,8 @@ namespace vm
 			}
 		}
 
-		// Perform under heavyweight lock
-		auto& res = vm::reservation_acquire(addr, 128);
-
-		res += stcx_lockb;
+		// Perform heavyweight lock
+		auto [res, rtime] = vm::reservation_lock(addr);
 
 		// Write directly if the op cannot fail
 		if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
@@ -294,12 +303,12 @@ namespace vm
 			{
 				// If operation succeeds, write the data back
 				*sptr = buf;
-				res += 127;
+				res.release(rtime + 128);
 			}
 			else
 			{
 				// Operation failed, no memory has been modified
-				res -= 1;
+				res.release(rtime);
 				return std::invoke_result_t<F, T&>();
 			}
 		}
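
The light op added below enters with fetch_add(1) first and only then checks the unique bit, so a unique locker using reservation_try_lock can never miss an in-flight shared writer (the +1 keeps the low 7 bits nonzero). The reverse direction is safe because the bts-based unique fallbacks run under cpu_thread::suspend_all, so no guest thread can be parked mid-light-op. The acquire side in a sketch (names from this patch):

    if (res.fetch_add(1) & vm::rsrv_unique_lock) [[unlikely]]
    {
        // A unique writer owns the line: spin until bit 6 clears,
        // keeping our +1 so that late CAS-based unique lockers back off.
        vm::reservation_shared_lock_internal(res);
    }
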
@@ -363,4 +372,45 @@ namespace vm
 			}
 		}
 	}
+
+	template <bool Ack = false, typename T, typename F>
+	SAFE_BUFFERS inline auto reservation_light_op(T& data, F op)
+	{
+		// Optimized real ptr -> vm ptr conversion, simply UB if out of range
+		const u32 addr = static_cast<u32>(reinterpret_cast<const u8*>(&data) - g_base_addr);
+
+		// Use "super" pointer to prevent access violation handling during atomic op
+		const auto sptr = vm::get_super_ptr<T>(addr);
+
+		// "Lock" reservation
+		auto& res = vm::reservation_acquire(addr, 128);
+
+		if (res.fetch_add(1) & vm::rsrv_unique_lock) [[unlikely]]
+		{
+			vm::reservation_shared_lock_internal(res);
+		}
+
+		if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
+		{
+			std::invoke(op, *sptr);
+			res += 127;
+
+			if constexpr (Ack)
+			{
+				res.notify_all();
+			}
+		}
+		else
+		{
+			auto result = std::invoke(op, *sptr);
+			res += 127;
+
+			if constexpr (Ack)
+			{
+				res.notify_all();
+			}
+
+			return result;
+		}
+	}
 } // namespace vm
diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp
index 007bee6f5c..1f0318ea52 100644
--- a/rpcs3/Emu/RSX/rsx_methods.cpp
+++ b/rpcs3/Emu/RSX/rsx_methods.cpp
@@ -151,14 +151,14 @@ namespace rsx
 			// TODO: Check if possible to write on reservations
 			if (!g_use_rtm && rsx->label_addr >> 28 != addr >> 28) [[likely]]
 			{
-				res = &vm::reservation_lock(addr, 4).first;
+				res = &vm::reservation_lock(addr).first;
 			}
 
 			vm::_ref(addr).val = arg;
 
 			if (res)
 			{
-				res->release(*res + 127);
+				*res += 127;
 			}
 
 			vm::reservation_notifier(addr, 4).notify_all();
@@ -818,7 +818,7 @@ namespace rsx
 		case CELL_GCM_FUNC_ADD_SIGNED:
 		case CELL_GCM_FUNC_REVERSE_ADD_SIGNED:
 			break;
-		
+
 		default:
 		{
 			// Ignore invalid values as a whole
@@ -1513,7 +1513,7 @@ namespace rsx
 			const auto data_length = in_pitch * (line_count - 1) + line_length;
 
 			rsx->invalidate_fragment_program(dst_dma, dst_offset, data_length);
-			
+
 			if (const auto result = rsx->read_barrier(read_address, data_length, !is_block_transfer);
 				result == rsx::result_zcull_intr)
 			{
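
For reference, the whole locking scheme can be modeled in isolation. The sketch below uses std::atomic and invented helper names; it is not emulator code, only a distillation of the arithmetic used throughout this patch:

    #include <atomic>
    #include <cstdint>

    using rsrv_word = std::atomic<uint64_t>;
    constexpr uint64_t unique_bit = 64; // vm::rsrv_unique_lock

    // Shared ("light") writer: bits 0..5 count concurrent holders
    inline uint64_t shared_acquire(rsrv_word& r) { return r.fetch_add(1); } // caller checks bit 6 in the result
    inline void shared_publish(rsrv_word& r) { r.fetch_add(127); }          // +1 + 127 = +128 (timestamp bump)
    inline void shared_abort(rsrv_word& r) { r.fetch_sub(1); }

    // Unique writer: expected must be a timestamp with all low 7 bits clear
    inline bool unique_try_acquire(rsrv_word& r, uint64_t expected)
    {
        return r.compare_exchange_strong(expected, expected | unique_bit);
    }
    inline void unique_publish(rsrv_word& r) { r.fetch_add(64); }           // 64 + 64 = +128
    inline void unique_abort(rsrv_word& r) { r.fetch_sub(unique_bit); }
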