vm: rewrite reservation bits

Implement a classic unique/shared locking concept.
Implement vm::reservation_light_op.
Nekotekina 2020-10-08 16:13:55 +03:00
parent d962bb018d
commit 346a1d4433
10 changed files with 356 additions and 170 deletions
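
Summary of the new scheme, for orientation while reading the hunks below: each 128-byte line is guarded by a 64-bit reservation word whose bits 7+ hold a version stamp and whose low 7 bits hold transient lock state. The following standalone model illustrates the protocol; it was written for this review (the constants mirror the new enum in vm_reservation.h) and is not RPCS3 code.

    #include <atomic>
    #include <cstdint>

    // Low 7 bits of the reservation word:
    //   bit 6     - unique (writer) lock: rsrv_unique_lock = 64
    //   bits 0..5 - count of shared ("light") holders: rsrv_shared_mask = 63
    // Every committed store advances the word by exactly 128 (one version).
    std::atomic<std::uint64_t> res{0};

    void light_op_model()
    {
        if (res.fetch_add(1) & 64)  // shared acquire: +1
        {
            while (res.load() & 64) // a writer holds the unique bit: wait it out
            {
                // the real code busy-waits and polls cpu->check_state()
            }
        }
        // ... small atomic update on the guarded data itself ...
        res.fetch_add(127);         // release: +1 +127 = +128, a clean version bump
    }

    bool unique_op_model(std::uint64_t rtime) // rtime: version observed at load time
    {
        std::uint64_t expected = rtime;
        if (!res.compare_exchange_strong(expected, rtime | 64))
        {
            return false;           // line changed, or already locked/shared
        }
        // ... full-line store ...
        res.fetch_add(64);          // release: 64 + 64 = 128, next version
        return true;
    }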

rpcs3/Emu/CPU/CPUThread.cpp

@@ -338,7 +338,7 @@ void cpu_thread::operator()()
     {
         thread_ctrl::set_native_priority(-1);
     }

     // force input/output denormals to zero for SPU threads (FTZ/DAZ)
     _mm_setcsr( _mm_getcsr() | 0x8040 );
@@ -653,6 +653,7 @@ cpu_thread::suspend_all::suspend_all(cpu_thread* _this) noexcept
     for_all_cpu([](cpu_thread* cpu)
     {
+        // Should be atomic
         if (!(cpu->state & cpu_flag::pause))
         {
             cpu->state += cpu_flag::pause;

rpcs3/Emu/CPU/CPUThread.h

@@ -12,6 +12,7 @@ enum class cpu_flag : u32
     exit, // Irreversible exit
     wait, // Indicates waiting state, set by the thread itself
     pause, // Thread suspended by suspend_all technique
+    pause2, // Used by suspend_all internally
     suspend, // Thread suspended
     ret, // Callback return requested
     signal, // Thread received a signal (HLE)

rpcs3/Emu/Cell/Modules/cellSpurs.cpp

@@ -2536,9 +2536,10 @@ s32 cellSpursShutdownWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid
         return CELL_OK;
     }

-    auto [res, rtime] = vm::reservation_lock(vm::get_addr(&spurs->wklEvent(wid)), 1, vm::dma_lockb);
-    const auto old = spurs->wklEvent(wid).fetch_or(1);
-    res.release(rtime + (old & 1 ? 0 : 128));
+    const auto old = vm::reservation_light_op(spurs->wklEvent(wid), [](atomic_t<u8>& v)
+    {
+        return v.fetch_or(1);
+    });

     if (old & 0x12 && !(old & 1) && sys_event_port_send(spurs->eventPort, 0, 0, (1u << 31) >> wid))
     {
@@ -2693,9 +2694,11 @@ s32 cellSpursReadyCountStore(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid,
         return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
     }

-    auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-    spurs->readyCount(wid).release(static_cast<u8>(value));
-    res.store(rtime + 128);
+    vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
+    {
+        v.release(static_cast<u8>(value));
+    });

     return CELL_OK;
 }
@@ -2729,11 +2732,11 @@ s32 cellSpursReadyCountSwap(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid,
         return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
     }

-    auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-    u32 temp = spurs->readyCount(wid).exchange(static_cast<u8>(swap));
-    res.release(rtime + 128);
-
-    *old = temp;
+    *old = vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
+    {
+        return v.exchange(static_cast<u8>(swap));
+    });

     return CELL_OK;
 }
@@ -2769,9 +2772,10 @@ s32 cellSpursReadyCountCompareAndSwap(ppu_thread& ppu, vm::ptr<CellSpurs> spurs,
     u8 temp = static_cast<u8>(compare);

-    auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-    spurs->readyCount(wid).compare_exchange(temp, static_cast<u8>(swap));
-    res.release(rtime + 128);
+    vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
+    {
+        v.compare_exchange(temp, static_cast<u8>(swap));
+    });

     *old = temp;
     return CELL_OK;
@@ -2807,17 +2811,15 @@ s32 cellSpursReadyCountAdd(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid, v
         return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
     }

-    auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-    u32 temp = spurs->readyCount(wid).fetch_op([&](u8& val)
+    *old = vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
     {
-        const s32 _new = val + value;
-        val = static_cast<u8>(std::clamp<s32>(_new, 0, 0xFF));
+        return v.fetch_op([&](u8& val)
+        {
+            const s32 _new = val + value;
+            val = static_cast<u8>(std::clamp<s32>(_new, 0, 255));
+        });
     });
-
-    res.release(rtime + 128);
-    *old = temp;

     return CELL_OK;
 }
@@ -3833,13 +3835,12 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
     // TODO: Verify the ELF header is proper and all its load segments are at address >= 0x3000

     u32 tmp_task_id;
-    {
-        auto addr = taskset.ptr(&CellSpursTaskset::enabled).addr();
-        auto [res, rtime] = vm::reservation_lock(addr, 16, vm::dma_lockb);

+    vm::reservation_light_op(vm::_ref<atomic_be_t<v128>>(taskset.ptr(&CellSpursTaskset::enabled).addr()), [&](atomic_be_t<v128>& ptr)
+    {
         // NOTE: Realfw processes this using 4 32-bits atomic loops
         // But here its processed within a single 128-bit atomic op
-        vm::_ref<atomic_be_t<v128>>(addr).fetch_op([&](be_t<v128>& value)
+        ptr.fetch_op([&](be_t<v128>& value)
         {
             auto value0 = value.value();
@@ -3862,9 +3863,7 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
             tmp_task_id = CELL_SPURS_MAX_TASK;
             return false;
         });
-
-        res.release(rtime + 128);
-    }
+    });

     if (tmp_task_id >= CELL_SPURS_MAX_TASK)
     {
@@ -3885,9 +3884,10 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
 s32 _spurs::task_start(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32 taskId)
 {
-    auto [res, rtime] = vm::reservation_lock(taskset.ptr(&CellSpursTaskset::pending_ready).addr(), 16, vm::dma_lockb);
-    taskset->pending_ready.values[taskId / 32] |= (1u << 31) >> (taskId % 32);
-    res.release(rtime + 128);
+    vm::reservation_light_op(taskset->pending_ready, [&](CellSpursTaskset::atomic_tasks_bitset& v)
+    {
+        v.values[taskId / 32] |= (1u << 31) >> (taskId % 32);
+    });

     auto spurs = +taskset->spurs;
     ppu_execute<&cellSpursSendWorkloadSignal>(ppu, spurs, +taskset->wid);
@@ -4706,25 +4706,23 @@ s32 cellSpursJobGuardNotify(ppu_thread& ppu, vm::ptr<CellSpursJobGuard> jobGuard
     if (!jobGuard.aligned())
         return CELL_SPURS_JOB_ERROR_ALIGN;

-    auto [res, rtime] = vm::reservation_lock(jobGuard.addr(), 128, vm::dma_lockb);
-
     u32 allow_jobchain_run = 0; // Affects cellSpursJobChainRun execution
+    u32 old = 0;

-    auto [old, ok] = jobGuard->ncount0.fetch_op([&](be_t<u32>& value)
+    const bool ok = vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
     {
-        allow_jobchain_run = jobGuard->zero;
+        allow_jobchain_run = jg.zero;
+        old = jg.ncount0;

-        if (!value)
+        if (!jg.ncount0)
         {
             return false;
         }

-        --value;
+        jg.ncount0--;
         return true;
     });

-    res.release(rtime + (ok ? 128 : 0));
-
     if (!ok)
     {
         return CELL_SPURS_CORE_ERROR_STAT;
@@ -4759,9 +4757,11 @@ s32 cellSpursJobGuardReset(vm::ptr<CellSpursJobGuard> jobGuard)
     if (!jobGuard.aligned())
         return CELL_SPURS_JOB_ERROR_ALIGN;

-    auto [res, rtime] = vm::reservation_lock(jobGuard.addr(), 128, vm::dma_lockb);
-    jobGuard->ncount0 = jobGuard->ncount1;
-    res.release(rtime + 128);
+    vm::reservation_light_op(jobGuard->ncount0, [&](atomic_be_t<u32>& ncount0)
+    {
+        ncount0 = jobGuard->ncount1;
+    });

     return CELL_OK;
 }
@@ -4844,9 +4844,11 @@ s32 cellSpursJobSetMaxGrab(vm::ptr<CellSpursJobChain> jobChain, u32 maxGrabbedJo
     if ((spurs->wklEnabled & (0x80000000u >> wid)) == 0u)
         return CELL_SPURS_JOB_ERROR_STAT;

-    auto [res, rtime] = vm::reservation_lock(jobChain.addr(), 128, vm::dma_lockb);
-    jobChain->maxGrabbedJob.release(static_cast<u16>(maxGrabbedJob));
-    res.store(rtime + 128);
+    vm::reservation_light_op(jobChain->maxGrabbedJob, [&](atomic_be_t<u16>& v)
+    {
+        v.release(static_cast<u16>(maxGrabbedJob));
+    });

     return CELL_OK;
 }
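
All six SPURS call sites above follow the same mechanical rewrite; schematically (names here are illustrative, not a specific call site):

    // Before: take the reservation lock by hand, update, publish a new version.
    auto [res, rtime] = vm::reservation_lock(addr, 128, vm::dma_lockb);
    counter.release(value);
    res.release(rtime + 128);

    // After: reservation_light_op takes a shared slot on the reservation word,
    // runs the update, and releases with the +127 bump (net +128).
    vm::reservation_light_op(counter, [&](atomic_t<u8>& v)
    {
        v.release(value);
    });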

rpcs3/Emu/Cell/Modules/cellSpurs.h

@@ -561,6 +561,22 @@ struct alignas(128) CellSpursJobGuard
 CHECK_SIZE_ALIGN(CellSpursJobGuard, 128, 128);

+struct alignas(128) CellSpursJobGuard_x00
+{
+    be_t<u32> ncount0; // 0x00
+    be_t<u32> ncount1; // 0x04
+    vm::bptr<CellSpursJobChain> jobChain; // 0x0C
+    be_t<u32> unk0;
+    be_t<u32> requestSpuCount; // 0x10
+    be_t<u32> unk1[3];
+    be_t<u32> autoReset; // 0x20
+    be_t<u32> unk2[3];
+    be_t<u32> zero; // 0x30
+    u8 unk3[0x80 - 0x34];
+};
+
+CHECK_SIZE_ALIGN(CellSpursJobGuard_x00, 128, 128);
+
 // Core CellSpurs structures
 struct alignas(128) CellSpurs
 {
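
A minimal usage sketch of the new overlay type, mirroring cellSpursJobGuardNotify above: vm::reservation_op runs the lambda atomically over the whole 128-byte line, and returning false aborts without modifying memory (the jobGuard pointer is assumed valid and 128-byte aligned).

    const bool ok = vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [](CellSpursJobGuard_x00& jg)
    {
        if (!jg.ncount0)
        {
            return false; // nothing to consume: do not commit
        }

        jg.ncount0--;
        return true;      // commit the modified line
    });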

rpcs3/Emu/Cell/Modules/cellSpursSpu.cpp

@@ -1431,7 +1431,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
     // Find the number of tasks that have become ready since the last iteration
     {
         auto newlyReadyTasks = v128::andnot(ready, signalled | pready);

         // TODO: Optimize this shit with std::popcount when it's known to be fixed
         for (auto i = 0; i < 128; i++)
         {
@@ -1597,14 +1597,14 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
     {
         auto spurs = kernelCtxt->spurs;
-        auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
-        spurs->readyCount(kernelCtxt->wklCurrentId).fetch_op([&](u8& val)
+        vm::reservation_light_op(spurs->readyCount(kernelCtxt->wklCurrentId), [&](atomic_t<u8>& val)
         {
-            const s32 _new = val + numNewlyReadyTasks;
-            val = static_cast<u8>(std::clamp<s32>(_new, 0, 0xFF));
+            val.fetch_op([&](u8& val)
+            {
+                const s32 _new = val + numNewlyReadyTasks;
+                val = static_cast<u8>(std::clamp<s32>(_new, 0, 0xFF));
+            });
         });
-        res.release(rtime + 128);
     }

     return rc;

rpcs3/Emu/Cell/PPUThread.cpp

@@ -946,7 +946,7 @@ void ppu_thread::fast_call(u32 addr, u32 rtoc)
         if (_this->current_function && vm::read32(cia) != ppu_instructions::SC(0))
         {
             return fmt::format("PPU[0x%x] Thread (%s) [HLE:0x%08x, LR:0x%08x]", _this->id, *name_cache.get(), cia, _this->lr);
         }

         return fmt::format("PPU[0x%x] Thread (%s) [0x%08x]", _this->id, *name_cache.get(), cia);
@@ -1103,7 +1103,6 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
     const u64 data_off = (addr & 7) * 8;

     ppu.raddr = addr;

-    const u64 mask_res = g_use_rtm ? (-128 | vm::dma_lockb) : -1;
-
     if (const s32 max = g_cfg.core.ppu_128_reservations_loop_max_length)
     {
@@ -1160,7 +1159,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
     for (u64 count = 0;; [&]()
     {
         if (ppu.state)
         {
             ppu.check_state();
         }
         else if (++count < 20) [[likely]]
@@ -1175,7 +1174,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
         }
     }())
     {
-        ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & mask_res;
+        ppu.rtime = vm::reservation_acquire(addr, sizeof(T));

         if (ppu.rtime & 127)
         {
@@ -1189,7 +1188,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
             mov_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128));
         }

-        if ((vm::reservation_acquire(addr, sizeof(T)) & mask_res) == ppu.rtime) [[likely]]
+        if (vm::reservation_acquire(addr, sizeof(T)) == ppu.rtime) [[likely]]
         {
             if (count >= 15) [[unlikely]]
             {
@@ -1218,6 +1217,7 @@ const auto ppu_stcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rda
     Label fall = c.newLabel();
     Label fail = c.newLabel();
+    Label fail2 = c.newLabel();

     // Prepare registers
     c.mov(x86::r10, imm_ptr(+vm::g_reservations));
@@ -1234,7 +1234,9 @@ const auto ppu_stcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rda
     // Begin transaction
     build_transaction_enter(c, fall, args[0], 16);
     c.mov(x86::rax, x86::qword_ptr(x86::r10));
-    c.and_(x86::rax, -128 | vm::dma_lockb);
+    c.test(x86::eax, vm::rsrv_unique_lock);
+    c.jnz(fail2);
+    c.and_(x86::rax, -128);
     c.cmp(x86::rax, args[1]);
     c.jne(fail);
     c.cmp(x86::qword_ptr(x86::r11), args[2]);
@@ -1249,6 +1251,7 @@ const auto ppu_stcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rda
     c.bind(fall);
     c.sar(x86::eax, 24);
     c.js(fail);
+    c.bind(fail2);
     c.mov(x86::eax, 2);
     c.ret();
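
In rough C++ terms, the regenerated transaction body now behaves as below; this is a sketch of the emitted code written for this review, not literal source.

    #include <atomic>
    #include <cstdint>

    // Inside the hardware transaction (between xbegin and xend):
    std::uint32_t stcx_tx_model(const std::atomic<std::uint64_t>& word, std::uint64_t rtime)
    {
        std::uint64_t r = word.load();          // c.mov(rax, qword_ptr(r10))
        if (r & 64)                             // c.test(eax, vm::rsrv_unique_lock)
        {
            return 2;                           // c.jnz(fail2): writer active, caller retries
        }
        if ((r & ~std::uint64_t{127}) != rtime) // c.and_(rax, -128); c.cmp(rax, args[1])
        {
            return 0;                           // c.jne(fail): reservation lost
        }
        // ... compare old data, store the new value, then commit with +128 ...
        return 1;
    }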
@@ -1324,11 +1327,11 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
     // Begin transaction
     build_transaction_enter(c, fall, x86::r12, 4);
     c.mov(x86::rax, x86::qword_ptr(x86::rbx));
+    c.test(x86::eax, vm::rsrv_unique_lock);
+    c.jnz(skip);
     c.and_(x86::rax, -128);
     c.cmp(x86::rax, x86::r13);
     c.jne(fail);
-    c.test(x86::qword_ptr(x86::rbx), 127);
-    c.jnz(skip);

     if (s_tsx_avx)
     {
@@ -1394,15 +1397,19 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
     Label fall2 = c.newLabel();
     Label fail2 = c.newLabel();
+    Label fail3 = c.newLabel();

     // Lightened transaction: only compare and swap data
     c.bind(next);

     // Try to "lock" reservation
-    c.mov(x86::rax, x86::r13);
-    c.add(x86::r13, 1);
-    c.lock().cmpxchg(x86::qword_ptr(x86::rbx), x86::r13);
-    c.jne(fail);
+    c.mov(x86::eax, 1);
+    c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
+    c.test(x86::eax, vm::rsrv_unique_lock);
+    c.jnz(fail3);
+    c.and_(x86::rax, -128);
+    c.cmp(x86::rax, x86::r13);
+    c.jne(fail2);

     build_transaction_enter(c, fall2, x86::r12, 666);
@@ -1453,6 +1460,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
     c.bind(fall2);
     c.sar(x86::eax, 24);
     c.js(fail2);
+    c.bind(fail3);
     c.mov(x86::eax, 2);
     c.jmp(_ret);
@@ -1579,20 +1587,51 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
             cpu_thread::suspend_all cpu_lock(&ppu);

-            // Give up if PUTLLUC happened
-            if (res == (rtime | 1) && cmp_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128)))
+            // Obtain unique lock
+            while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
+            {
+                busy_wait(100);
+
+                // Give up if reservation has been updated
+                if ((res & -128) != rtime)
+                {
+                    res -= 1;
+                    return false;
+                }
+            }
+
+            if ((res & -128) == rtime && cmp_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128)))
             {
                 data.release(reg_value);
-                res.release(rtime + 128);
+                res += 63;
                 return true;
             }

-            res.release(rtime);
+            res -= (vm::rsrv_unique_lock + 1);
             return false;
         }

-        if (!vm::reservation_trylock(res, rtime))
+        while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
         {
+            // Give up if reservation has been updated
+            if ((res & -128) != rtime)
+            {
+                return false;
+            }
+
+            if (ppu.state && ppu.check_state())
+            {
+                return false;
+            }
+            else
+            {
+                busy_wait(100);
+            }
+        }
+
+        if ((res & -128) != rtime)
+        {
+            res -= vm::rsrv_unique_lock;
             return false;
         }
@@ -1654,24 +1693,64 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
         default: break;
         }

-        if (res == rtime && vm::reservation_trylock(res, rtime))
+        if (res.fetch_add(1) & vm::rsrv_unique_lock)
         {
-            const bool ret = data.compare_and_swap_test(old_data, reg_value);
-            res.release(rtime + 128);
-            return ret;
+            res -= 1;
+            return false;
         }

+        if (data.compare_and_swap_test(old_data, reg_value))
+        {
+            res += 127;
+            return true;
+        }
+
+        res -= 1;
         return false;
     }

-    if (!vm::reservation_trylock(res, rtime))
+    while (true)
     {
-        return false;
+        auto [_old, _ok] = res.fetch_op([&](u64& r)
+        {
+            if ((r & -128) != rtime || (r & vm::rsrv_unique_lock))
+            {
+                return false;
+            }
+
+            r += 1;
+            return true;
+        });
+
+        // Give up if reservation has been updated
+        if ((_old & -128) != rtime)
+        {
+            return false;
+        }
+
+        if (_ok)
+        {
+            break;
+        }
+
+        if (ppu.state && ppu.check_state())
+        {
+            return false;
+        }
+        else
+        {
+            busy_wait(100);
+        }
     }

-    const bool ret = data.compare_and_swap_test(old_data, reg_value);
-    res.release(rtime + 128);
-    return ret;
+    if (data.compare_and_swap_test(old_data, reg_value))
+    {
+        res += 127;
+        return true;
+    }
+
+    res -= 1;
+    return false;
 }())
 {
     res.notify_all();
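
The non-TSX path above now joins the reservation as a shared holder, but only while the version still matches and no writer is active. A simplified standalone model of that acquire step (the real loop also polls ppu.check_state() and busy-waits instead of giving up when only the unique bit blocks it):

    #include <atomic>
    #include <cstdint>

    bool try_shared_acquire(std::atomic<std::uint64_t>& res, std::uint64_t rtime)
    {
        std::uint64_t r = res.load();
        while ((r & ~std::uint64_t{127}) == rtime && !(r & 64))
        {
            if (res.compare_exchange_weak(r, r + 1))
            {
                return true; // commit later with += 127, or roll back with -= 1
            }
        }
        return false; // reservation updated, or a unique holder is active
    }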

rpcs3/Emu/Cell/SPUThread.cpp

@@ -378,11 +378,11 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
     // Begin transaction
     build_transaction_enter(c, fall, x86::r12, 4);
     c.mov(x86::rax, x86::qword_ptr(x86::rbx));
+    c.test(x86::eax, vm::rsrv_unique_lock);
+    c.jnz(skip);
     c.and_(x86::rax, -128);
     c.cmp(x86::rax, x86::r13);
     c.jne(fail);
-    c.test(x86::qword_ptr(x86::rbx), 127);
-    c.jnz(skip);

     if (s_tsx_avx)
     {
@@ -461,15 +461,19 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
     Label fall2 = c.newLabel();
     Label fail2 = c.newLabel();
+    Label fail3 = c.newLabel();

     // Lightened transaction: only compare and swap data
     c.bind(next);

     // Try to "lock" reservation
-    c.mov(x86::rax, x86::r13);
-    c.add(x86::r13, 1);
-    c.lock().cmpxchg(x86::qword_ptr(x86::rbx), x86::r13);
-    c.jne(fail);
+    c.mov(x86::eax, 1);
+    c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
+    c.test(x86::eax, vm::rsrv_unique_lock);
+    c.jnz(fail3);
+    c.and_(x86::rax, -128);
+    c.cmp(x86::rax, x86::r13);
+    c.jne(fail2);

     build_transaction_enter(c, fall2, x86::r12, 666);
@@ -533,6 +537,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
     c.bind(fall2);
     c.sar(x86::eax, 24);
     c.js(fail2);
+    c.bind(fail3);
     c.mov(x86::eax, 2);
     c.jmp(_ret);
@@ -644,7 +649,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
     // Begin transaction
     build_transaction_enter(c, fall, x86::r12, 8);
-    c.test(x86::dword_ptr(x86::rbx), 127);
+    c.test(x86::dword_ptr(x86::rbx), vm::rsrv_unique_lock);
     c.jnz(skip);

     if (s_tsx_avx)
@@ -688,14 +693,15 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
     c.xor_(x86::rbp, 0xf80);

     Label fall2 = c.newLabel();
-    Label fail2 = c.newLabel();

     // Lightened transaction
     c.bind(next);

-    // Try to acquire "PUTLLUC lock"
-    c.lock().bts(x86::qword_ptr(x86::rbx), std::countr_zero<u32>(vm::putlluc_lockb));
-    c.jc(fail2);
+    // Lock reservation
+    c.mov(x86::eax, 1);
+    c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
+    c.test(x86::eax, vm::rsrv_unique_lock);
+    c.jnz(fall2);

     build_transaction_enter(c, fall2, x86::r12, 666);
@@ -719,16 +725,12 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
     }

     c.xend();
-    c.lock().add(x86::qword_ptr(x86::rbx), 64);
+    c.lock().add(x86::qword_ptr(x86::rbx), 127);
     c.mov(x86::eax, 1);
     c.jmp(_ret);

-    c.bind(fail2);
-    c.xor_(x86::eax, x86::eax);
-    c.jmp(_ret);
-
     c.bind(fall2);
-    c.mov(x86::eax, 2);
+    c.xor_(x86::eax, x86::eax);
     //c.jmp(_ret);
     c.bind(_ret);
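
The lightened PUTLLUC path now joins the line as a shared holder via xadd instead of claiming a dedicated PUTLLUC bit. A model of the control flow (written for this review, not literal source):

    // Returns 1 on success, 0 to fall back to do_cell_atomic_128_store.
    std::uint32_t putlluc_tx_model(std::atomic<std::uint64_t>& res)
    {
        if (res.fetch_add(1) & 64)  // lock xadd [rbx], rax with rax = 1
        {
            // Unique lock held. Note the +1 is deliberately left in place:
            // the fallback completes the bump with bts (+64) and += 63.
            return 0;
        }
        // ... transactional 128-byte store (xbegin/xend) ...
        res.fetch_add(127);         // lock add [rbx], 127: +1 +127 = +128
        return 1;
    }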
@@ -1415,30 +1417,30 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
     {
     case 1:
     {
-        auto [res, time0] = vm::reservation_lock(eal, 1, vm::dma_lockb);
+        auto [res, time0] = vm::reservation_lock(eal);
         *reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
-        res.release(time0 + 128);
+        res += 64;
         break;
     }
     case 2:
     {
-        auto [res, time0] = vm::reservation_lock(eal, 2, vm::dma_lockb);
+        auto [res, time0] = vm::reservation_lock(eal);
         *reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
-        res.release(time0 + 128);
+        res += 64;
         break;
     }
     case 4:
     {
-        auto [res, time0] = vm::reservation_lock(eal, 4, vm::dma_lockb);
+        auto [res, time0] = vm::reservation_lock(eal);
         *reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
-        res.release(time0 + 128);
+        res += 64;
         break;
     }
     case 8:
     {
-        auto [res, time0] = vm::reservation_lock(eal, 8, vm::dma_lockb);
+        auto [res, time0] = vm::reservation_lock(eal);
         *reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
-        res.release(time0 + 128);
+        res += 64;
         break;
     }
     default:
@@ -1463,7 +1465,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
     }

     // Lock each cache line exclusively
-    auto [res, time0] = vm::reservation_lock(eal, size0, vm::dma_lockb);
+    auto [res, time0] = vm::reservation_lock(eal);

     switch (size0)
     {
@@ -1491,7 +1493,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
         }
     }

-    res.release(time0 + 128);
+    res += 64;

     if (size == size0)
     {
@@ -1505,7 +1507,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
     if (((eal & 127) + size) <= 128)
     {
         // Lock one cache line
-        auto [res, time0] = vm::reservation_lock(eal, 128);
+        auto [res, time0] = vm::reservation_lock(eal);

         while (size)
         {
@@ -1516,7 +1518,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
             size -= 16;
         }

-        res.release(time0);
+        res += 64;

         break;
     }
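
All the small-DMA stores above reduce to one pattern: reservation_lock installs the unique bit by compare-and-swap, and the += 64 release completes the +128 version bump (dst/src here are illustrative):

    auto [res, time0] = vm::reservation_lock(eal); // CAS: time0 -> time0 | 64
    *reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src); // guarded store
    res += 64;                                     // 64 + 64 = 128: unlock and publish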
@@ -1848,21 +1850,34 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
             cpu_thread::suspend_all cpu_lock(this);

-            // Give up if PUTLLUC happened
-            if (res == (rtime | 1))
+            // Obtain unique lock
+            while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
+            {
+                busy_wait(100);
+
+                // Give up if reservation has been updated
+                if ((res & -128) != rtime)
+                {
+                    res -= 1;
+                    if (render) render->unpause();
+                    return false;
+                }
+            }
+
+            if ((res & -128) == rtime)
             {
                 auto& data = vm::_ref<spu_rdata_t>(addr);

                 if (cmp_rdata(rdata, data))
                 {
                     mov_rdata(data, to_write);
-                    res += 127;
+                    res += 63;
                     if (render) render->unpause();
                     return true;
                 }
             }

-            res -= 1;
+            res -= (vm::rsrv_unique_lock | 1);
             if (render) render->unpause();
             return false;
         }
@@ -1872,8 +1887,27 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
         }
     }

-    if (!vm::reservation_trylock(res, rtime))
+    while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
     {
+        // Give up if reservation has been updated
+        if ((res & -128) != rtime)
+        {
+            return false;
+        }
+
+        if (state && check_state())
+        {
+            return false;
+        }
+        else
+        {
+            busy_wait(100);
+        }
+    }
+
+    if ((res & -128) != rtime)
+    {
+        res -= vm::rsrv_unique_lock;
         return false;
     }
@@ -1914,7 +1948,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
     if (raddr)
     {
         // Last check for event before we clear the reservation
-        if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) || !cmp_rdata(rdata, vm::_ref<spu_rdata_t>(raddr)))
+        if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || !cmp_rdata(rdata, vm::_ref<spu_rdata_t>(raddr)))
         {
             set_events(SPU_EVENT_LR);
         }
@@ -1937,38 +1971,17 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
         if (render) render->pause();

-        if (result == 2)
-        {
-            cpu_thread::suspend_all cpu_lock(cpu);
-
-            if (vm::reservation_acquire(addr, 128) & 64)
-            {
-                // Wait for PUTLLC to complete
-                while (vm::reservation_acquire(addr, 128) & 63)
-                {
-                    busy_wait(100);
-                }
-
-                mov_rdata(vm::_ref<spu_rdata_t>(addr), *static_cast<const spu_rdata_t*>(to_write));
-                vm::reservation_acquire(addr, 128) += 64;
-            }
-        }
-        else if (result == 0)
+        if (result == 0)
         {
             cpu_thread::suspend_all cpu_lock(cpu);

-            while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
-            {
-                busy_wait(100);
-            }
-
-            while (vm::reservation_acquire(addr, 128) & 63)
+            while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
             {
                 busy_wait(100);
             }

             mov_rdata(vm::_ref<spu_rdata_t>(addr), *static_cast<const spu_rdata_t*>(to_write));
-            vm::reservation_acquire(addr, 128) += 64;
+            vm::reservation_acquire(addr, 128) += 63;
         }

         if (render) render->unpause();
@@ -1977,7 +1990,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
     else
     {
         auto& data = vm::_ref<spu_rdata_t>(addr);
-        auto [res, time0] = vm::reservation_lock(addr, 128);
+        auto [res, time0] = vm::reservation_lock(addr);

         *reinterpret_cast<atomic_t<u32>*>(&data) += 0;
@@ -2247,7 +2260,7 @@ bool spu_thread::process_mfc_cmd()
     if (raddr && raddr != addr)
     {
         // Last check for event before we replace the reservation with a new one
-        if ((vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) != rtime || !cmp_rdata(temp, vm::_ref<spu_rdata_t>(raddr)))
+        if ((vm::reservation_acquire(raddr, 128) & -128) != rtime || !cmp_rdata(temp, vm::_ref<spu_rdata_t>(raddr)))
         {
             set_events(SPU_EVENT_LR);
         }
@@ -3583,7 +3596,7 @@ bool spu_thread::capture_local_storage() const
         if (name.empty())
         {
             // TODO: Maybe add thread group name here
             fmt::append(name, "SPU.%u", lv2_id);
         }
     }
     else

rpcs3/Emu/Memory/vm.cpp

@@ -441,11 +441,11 @@ namespace vm
         g_mutex.unlock();
     }

-    u64 reservation_lock_internal(u32 addr, atomic_t<u64>& res, u64 lock_bits)
+    u64 reservation_lock_internal(u32 addr, atomic_t<u64>& res)
     {
         for (u64 i = 0;; i++)
         {
-            if (u64 rtime = res; !(rtime & 127) && reservation_trylock(res, rtime, lock_bits)) [[likely]]
+            if (u64 rtime = res; !(rtime & 127) && reservation_try_lock(res, rtime)) [[likely]]
             {
                 return rtime;
             }
@@ -471,6 +471,30 @@ namespace vm
         }
     }

+    void reservation_shared_lock_internal(atomic_t<u64>& res)
+    {
+        for (u64 i = 0;; i++)
+        {
+            if (!(res & rsrv_unique_lock)) [[likely]]
+            {
+                return;
+            }
+
+            if (auto cpu = get_current_cpu_thread(); cpu && cpu->state)
+            {
+                cpu->check_state();
+            }
+            else if (i < 15)
+            {
+                busy_wait(500);
+            }
+            else
+            {
+                std::this_thread::yield();
+            }
+        }
+    }
+
     void reservation_op_internal(u32 addr, std::function<bool()> func)
     {
         const auto _cpu = get_current_cpu_thread();
@@ -481,15 +505,15 @@ namespace vm
         {
             cpu_thread::suspend_all cpu_lock(_cpu);

-            // Wait to acquire PUTLLUC lock
-            while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
+            // Wait to acquire unique lock
+            while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
             {
                 busy_wait(100);
             }

             if (func())
             {
-                // Success, release PUTLLUC and PUTLLC locks if necessary
+                // Success, release all locks if necessary
                 vm::reservation_acquire(addr, 128) += 63;
             }
             else

rpcs3/Emu/Memory/vm_reservation.h

@@ -10,11 +10,11 @@ extern bool g_use_rtm;
 namespace vm
 {
-    enum reservation_lock_bit : u64
+    enum : u64
     {
-        stcx_lockb = 1 << 0, // Exclusive conditional reservation lock
-        dma_lockb = 1 << 5, // Exclusive unconditional reservation lock
-        putlluc_lockb = 1 << 6, // Exclusive unconditional reservation lock
+        rsrv_lock_mask = 127,
+        rsrv_unique_lock = 64,
+        rsrv_shared_mask = 63,
     };

     // Get reservation status for further atomic update: last update timestamp
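
The three constants partition the low seven bits of the reservation word, and every release increment in this commit preserves the invariant that a committed store advances the word by exactly 128. As a sanity check (written for this review, not part of the commit):

    static_assert(vm::rsrv_lock_mask == (vm::rsrv_unique_lock | vm::rsrv_shared_mask));
    static_assert(64 + 64 == 128);     // reservation_lock (+64), then res += 64
    static_assert(1 + 127 == 128);     // light op: fetch_add(1), then res += 127
    static_assert(1 + 64 + 63 == 128); // shared +1, bts upgrade (+64), then res += 63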
@@ -42,11 +42,13 @@ namespace vm
         return *reinterpret_cast<atomic_t<u64>*>(g_reservations + (addr & 0xff80) / 2);
     }

-    u64 reservation_lock_internal(u32, atomic_t<u64>&, u64);
+    u64 reservation_lock_internal(u32, atomic_t<u64>&);
+
+    void reservation_shared_lock_internal(atomic_t<u64>&);

-    inline bool reservation_trylock(atomic_t<u64>& res, u64 rtime, u64 lock_bits = stcx_lockb)
+    inline bool reservation_try_lock(atomic_t<u64>& res, u64 rtime)
     {
-        if (res.compare_and_swap_test(rtime, rtime + lock_bits)) [[likely]]
+        if (res.compare_and_swap_test(rtime, rtime | rsrv_unique_lock)) [[likely]]
         {
             return true;
         }
@@ -54,16 +56,16 @@ namespace vm
         return false;
     }

-    inline std::pair<atomic_t<u64>&, u64> reservation_lock(u32 addr, u32 size, u64 lock_bits = stcx_lockb)
+    inline std::pair<atomic_t<u64>&, u64> reservation_lock(u32 addr)
     {
-        auto res = &vm::reservation_acquire(addr, size);
+        auto res = &vm::reservation_acquire(addr, 1);
         auto rtime = res->load();

-        if (rtime & 127 || !reservation_trylock(*res, rtime, lock_bits)) [[unlikely]]
+        if (rtime & 127 || !reservation_try_lock(*res, rtime)) [[unlikely]]
         {
             static atomic_t<u64> no_lock{};

-            rtime = reservation_lock_internal(addr, *res, lock_bits);
+            rtime = reservation_lock_internal(addr, *res);

             if (rtime == umax)
             {
@@ -95,6 +97,7 @@ namespace vm
     // Stage 1: single optimistic transaction attempt
     unsigned status = _XBEGIN_STARTED;
+    u64 _old = 0;

 #ifndef _MSC_VER
     __asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
@@ -103,6 +106,15 @@ namespace vm
     if (status == _XBEGIN_STARTED)
 #endif
     {
+        if (res & rsrv_unique_lock)
+        {
+#ifndef _MSC_VER
+            __asm__ volatile ("xabort $0;" ::: "memory");
+#else
+            _xabort(0);
+#endif
+        }
+
         if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
         {
             res += 128;
@@ -161,10 +173,10 @@ namespace vm
     }

     // Stage 2: try to lock reservation first
-    res += stcx_lockb;
+    _old = res.fetch_add(1);

     // Start lightened transaction (TODO: tweaking)
-    while (true)
+    while (!(_old & rsrv_unique_lock))
     {
 #ifndef _MSC_VER
         __asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
@@ -263,11 +275,8 @@ namespace vm
         }
     }

-    // Perform under heavyweight lock
-    auto& res = vm::reservation_acquire(addr, 128);
-    res += stcx_lockb;
+    // Perform heavyweight lock
+    auto [res, rtime] = vm::reservation_lock(addr);

     // Write directly if the op cannot fail
     if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
@@ -294,12 +303,12 @@ namespace vm
     {
         // If operation succeeds, write the data back
         *sptr = buf;
-        res += 127;
+        res.release(rtime + 128);
     }
     else
     {
         // Operation failed, no memory has been modified
-        res -= 1;
+        res.release(rtime);
         return std::invoke_result_t<F, T&>();
     }
 }
@@ -363,4 +372,45 @@ namespace vm
             }
         }
     }
+
+    template <bool Ack = false, typename T, typename F>
+    SAFE_BUFFERS inline auto reservation_light_op(T& data, F op)
+    {
+        // Optimized real ptr -> vm ptr conversion, simply UB if out of range
+        const u32 addr = static_cast<u32>(reinterpret_cast<const u8*>(&data) - g_base_addr);
+
+        // Use "super" pointer to prevent access violation handling during atomic op
+        const auto sptr = vm::get_super_ptr<T>(addr);
+
+        // "Lock" reservation
+        auto& res = vm::reservation_acquire(addr, 128);
+
+        if (res.fetch_add(1) & vm::rsrv_unique_lock) [[unlikely]]
+        {
+            vm::reservation_shared_lock_internal(res);
+        }
+
+        if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
+        {
+            std::invoke(op, *sptr);
+
+            res += 127;
+
+            if constexpr (Ack)
+            {
+                res.notify_all();
+            }
+        }
+        else
+        {
+            auto result = std::invoke(op, *sptr);
+
+            res += 127;
+
+            if constexpr (Ack)
+            {
+                res.notify_all();
+            }
+
+            return result;
+        }
+    }
 } // namespace vm
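
A hypothetical call site for the new helper (the SPURS rewrites above all take this shape; some_addr is illustrative). With Ack = true, waiters parked on the reservation word are notified after the release:

    auto& flags = vm::_ref<atomic_t<u8>>(some_addr);

    const u8 old = vm::reservation_light_op<true>(flags, [](atomic_t<u8>& v)
    {
        return v.fetch_or(1); // the lambda's result propagates out of reservation_light_op
    });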

rpcs3/Emu/RSX/rsx_methods.cpp

@@ -151,14 +151,14 @@ namespace rsx
         // TODO: Check if possible to write on reservations
         if (!g_use_rtm && rsx->label_addr >> 28 != addr >> 28) [[likely]]
         {
-            res = &vm::reservation_lock(addr, 4).first;
+            res = &vm::reservation_lock(addr).first;
         }

         vm::_ref<RsxSemaphore>(addr).val = arg;

         if (res)
         {
-            res->release(*res + 127);
+            *res += 127;
         }

         vm::reservation_notifier(addr, 4).notify_all();
@@ -818,7 +818,7 @@ namespace rsx
     case CELL_GCM_FUNC_ADD_SIGNED:
     case CELL_GCM_FUNC_REVERSE_ADD_SIGNED:
         break;
     default:
     {
         // Ignore invalid values as a whole
@@ -1513,7 +1513,7 @@ namespace rsx
     const auto data_length = in_pitch * (line_count - 1) + line_length;
     rsx->invalidate_fragment_program(dst_dma, dst_offset, data_length);

     if (const auto result = rsx->read_barrier(read_address, data_length, !is_block_transfer);
         result == rsx::result_zcull_intr)
     {