vm: rewrite reservation bits

Implement classic unique/shared locking concept.
Implement vm::reservation_light_op.
Nekotekina 2020-10-08 16:13:55 +03:00
parent d962bb018d
commit 346a1d4433
10 changed files with 356 additions and 170 deletions
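This commit replaces the old per-purpose lock bits (stcx_lockb, dma_lockb, putlluc_lockb) with a single unique/shared scheme packed into each 64-bit reservation word: bit 6 (rsrv_unique_lock = 64) marks an exclusive holder, bits 0-5 (rsrv_shared_mask = 63) count lightweight shared holders, and the bits above the low 7 (rsrv_lock_mask = 127) form a timestamp that advances by 128 on every committed store. The sketch below only illustrates that arithmetic and is not the emulator's code; the name light_write and the back-off loop are assumptions, and plain std::atomic stands in for the project's atomic_t.

#include <atomic>
#include <cstdint>

// Illustrative constants mirroring the enum introduced in vm_reservations.h.
constexpr std::uint64_t rsrv_unique_lock = 64;  // bit 6: exclusive (unique) holder
constexpr std::uint64_t rsrv_shared_mask = 63;  // bits 0-5: shared holder count

// Hypothetical "light" write under a shared reservation lock:
// +1 to enter, +127 to leave, so a successful store advances the
// timestamp (value & -128) by exactly 128 with the low bits cleared.
inline void light_write(std::atomic<std::uint64_t>& res, std::atomic<std::uint8_t>& data, std::uint8_t value)
{
    while (res.fetch_add(1) & rsrv_unique_lock)
    {
        res.fetch_sub(1);                         // a unique writer owns the line: back off
        while (res.load() & rsrv_unique_lock) {}  // wait for it to finish, then retry
    }

    data.store(value, std::memory_order_release); // the guarded small store
    res.fetch_add(127);                           // release: net +128 per committed write
}

// A unique holder would instead set bit 6 (e.g. via bts on the reservation word) and
// release with +64 once shared holders have drained, for the same net +128; the diff
// below uses +63 in paths where an extra +1 was already taken.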

View file

@ -338,7 +338,7 @@ void cpu_thread::operator()()
{
thread_ctrl::set_native_priority(-1);
}
// force input/output denormals to zero for SPU threads (FTZ/DAZ)
_mm_setcsr( _mm_getcsr() | 0x8040 );
@ -653,6 +653,7 @@ cpu_thread::suspend_all::suspend_all(cpu_thread* _this) noexcept
for_all_cpu([](cpu_thread* cpu)
{
// Should be atomic
if (!(cpu->state & cpu_flag::pause))
{
cpu->state += cpu_flag::pause;

View file

@ -12,6 +12,7 @@ enum class cpu_flag : u32
exit, // Irreversible exit
wait, // Indicates waiting state, set by the thread itself
pause, // Thread suspended by suspend_all technique
pause2, // Used by suspend_all internally
suspend, // Thread suspended
ret, // Callback return requested
signal, // Thread received a signal (HLE)

View file

@ -2536,9 +2536,10 @@ s32 cellSpursShutdownWorkload(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid
return CELL_OK;
}
auto [res, rtime] = vm::reservation_lock(vm::get_addr(&spurs->wklEvent(wid)), 1, vm::dma_lockb);
const auto old = spurs->wklEvent(wid).fetch_or(1);
res.release(rtime + (old & 1 ? 0 : 128));
const auto old = vm::reservation_light_op(spurs->wklEvent(wid), [](atomic_t<u8>& v)
{
return v.fetch_or(1);
});
if (old & 0x12 && !(old & 1) && sys_event_port_send(spurs->eventPort, 0, 0, (1u << 31) >> wid))
{
@ -2693,9 +2694,11 @@ s32 cellSpursReadyCountStore(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid,
return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
}
auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
spurs->readyCount(wid).release(static_cast<u8>(value));
res.store(rtime + 128);
vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
{
v.release(static_cast<u8>(value));
});
return CELL_OK;
}
@ -2729,11 +2732,11 @@ s32 cellSpursReadyCountSwap(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid,
return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
}
auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
u32 temp = spurs->readyCount(wid).exchange(static_cast<u8>(swap));
res.release(rtime + 128);
*old = vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
{
return v.exchange(static_cast<u8>(swap));
});
*old = temp;
return CELL_OK;
}
@ -2769,9 +2772,10 @@ s32 cellSpursReadyCountCompareAndSwap(ppu_thread& ppu, vm::ptr<CellSpurs> spurs,
u8 temp = static_cast<u8>(compare);
auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
spurs->readyCount(wid).compare_exchange(temp, static_cast<u8>(swap));
res.release(rtime + 128);
vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
{
v.compare_exchange(temp, static_cast<u8>(swap));
});
*old = temp;
return CELL_OK;
@ -2807,17 +2811,15 @@ s32 cellSpursReadyCountAdd(ppu_thread& ppu, vm::ptr<CellSpurs> spurs, u32 wid, v
return CELL_SPURS_POLICY_MODULE_ERROR_STAT;
}
auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
u32 temp = spurs->readyCount(wid).fetch_op([&](u8& val)
*old = vm::reservation_light_op(spurs->readyCount(wid), [&](atomic_t<u8>& v)
{
const s32 _new = val + value;
val = static_cast<u8>(std::clamp<s32>(_new, 0, 0xFF));
return v.fetch_op([&](u8& val)
{
const s32 _new = val + value;
val = static_cast<u8>(std::clamp<s32>(_new, 0, 255));
});
});
res.release(rtime + 128);
*old = temp;
return CELL_OK;
}
@ -3833,13 +3835,12 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
// TODO: Verify the ELF header is proper and all its load segments are at address >= 0x3000
u32 tmp_task_id;
{
auto addr = taskset.ptr(&CellSpursTaskset::enabled).addr();
auto [res, rtime] = vm::reservation_lock(addr, 16, vm::dma_lockb);
vm::reservation_light_op(vm::_ref<atomic_be_t<v128>>(taskset.ptr(&CellSpursTaskset::enabled).addr()), [&](atomic_be_t<v128>& ptr)
{
// NOTE: Realfw processes this using 4 32-bit atomic loops
// But here it's processed within a single 128-bit atomic op
vm::_ref<atomic_be_t<v128>>(addr).fetch_op([&](be_t<v128>& value)
ptr.fetch_op([&](be_t<v128>& value)
{
auto value0 = value.value();
@ -3862,9 +3863,7 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
tmp_task_id = CELL_SPURS_MAX_TASK;
return false;
});
res.release(rtime + 128);
}
});
if (tmp_task_id >= CELL_SPURS_MAX_TASK)
{
@ -3885,9 +3884,10 @@ s32 _spurs::create_task(vm::ptr<CellSpursTaskset> taskset, vm::ptr<u32> task_id,
s32 _spurs::task_start(ppu_thread& ppu, vm::ptr<CellSpursTaskset> taskset, u32 taskId)
{
auto [res, rtime] = vm::reservation_lock(taskset.ptr(&CellSpursTaskset::pending_ready).addr(), 16, vm::dma_lockb);
taskset->pending_ready.values[taskId / 32] |= (1u << 31) >> (taskId % 32);
res.release(rtime + 128);
vm::reservation_light_op(taskset->pending_ready, [&](CellSpursTaskset::atomic_tasks_bitset& v)
{
v.values[taskId / 32] |= (1u << 31) >> (taskId % 32);
});
auto spurs = +taskset->spurs;
ppu_execute<&cellSpursSendWorkloadSignal>(ppu, spurs, +taskset->wid);
@ -4706,25 +4706,23 @@ s32 cellSpursJobGuardNotify(ppu_thread& ppu, vm::ptr<CellSpursJobGuard> jobGuard
if (!jobGuard.aligned())
return CELL_SPURS_JOB_ERROR_ALIGN;
auto [res, rtime] = vm::reservation_lock(jobGuard.addr(), 128, vm::dma_lockb);
u32 allow_jobchain_run = 0; // Affects cellSpursJobChainRun execution
u32 old = 0;
auto [old, ok] = jobGuard->ncount0.fetch_op([&](be_t<u32>& value)
const bool ok = vm::reservation_op(vm::unsafe_ptr_cast<CellSpursJobGuard_x00>(jobGuard), [&](CellSpursJobGuard_x00& jg)
{
allow_jobchain_run = jobGuard->zero;
allow_jobchain_run = jg.zero;
old = jg.ncount0;
if (!value)
if (!jg.ncount0)
{
return false;
}
--value;
jg.ncount0--;
return true;
});
res.release(rtime + (ok ? 128 : 0));
if (!ok)
{
return CELL_SPURS_CORE_ERROR_STAT;
@ -4759,9 +4757,11 @@ s32 cellSpursJobGuardReset(vm::ptr<CellSpursJobGuard> jobGuard)
if (!jobGuard.aligned())
return CELL_SPURS_JOB_ERROR_ALIGN;
auto [res, rtime] = vm::reservation_lock(jobGuard.addr(), 128, vm::dma_lockb);
jobGuard->ncount0 = jobGuard->ncount1;
res.release(rtime + 128);
vm::reservation_light_op(jobGuard->ncount0, [&](atomic_be_t<u32>& ncount0)
{
ncount0 = jobGuard->ncount1;
});
return CELL_OK;
}
@ -4844,9 +4844,11 @@ s32 cellSpursJobSetMaxGrab(vm::ptr<CellSpursJobChain> jobChain, u32 maxGrabbedJo
if ((spurs->wklEnabled & (0x80000000u >> wid)) == 0u)
return CELL_SPURS_JOB_ERROR_STAT;
auto [res, rtime] = vm::reservation_lock(jobChain.addr(), 128, vm::dma_lockb);
jobChain->maxGrabbedJob.release(static_cast<u16>(maxGrabbedJob));
res.store(rtime + 128);
vm::reservation_light_op(jobChain->maxGrabbedJob, [&](atomic_be_t<u16>& v)
{
v.release(static_cast<u16>(maxGrabbedJob));
});
return CELL_OK;
}

View file

@ -561,6 +561,22 @@ struct alignas(128) CellSpursJobGuard
CHECK_SIZE_ALIGN(CellSpursJobGuard, 128, 128);
struct alignas(128) CellSpursJobGuard_x00
{
be_t<u32> ncount0; // 0x00
be_t<u32> ncount1; // 0x04
vm::bptr<CellSpursJobChain> jobChain; // 0x08
be_t<u32> unk0; // 0x0C
be_t<u32> requestSpuCount; // 0x10
be_t<u32> unk1[3];
be_t<u32> autoReset; // 0x20
be_t<u32> unk2[3];
be_t<u32> zero; // 0x30
u8 unk3[0x80 - 0x34];
};
CHECK_SIZE_ALIGN(CellSpursJobGuard_x00, 128, 128);
// Core CellSpurs structures
struct alignas(128) CellSpurs
{

View file

@ -1431,7 +1431,7 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
// Find the number of tasks that have become ready since the last iteration
{
auto newlyReadyTasks = v128::andnot(ready, signalled | pready);
// TODO: Optimize this shit with std::popcount when it's known to be fixed
for (auto i = 0; i < 128; i++)
{
@ -1597,14 +1597,14 @@ s32 spursTasksetProcessRequest(spu_thread& spu, s32 request, u32* taskId, u32* i
{
auto spurs = kernelCtxt->spurs;
auto [res, rtime] = vm::reservation_lock(spurs.addr(), 128, vm::dma_lockb);
spurs->readyCount(kernelCtxt->wklCurrentId).fetch_op([&](u8& val)
vm::reservation_light_op(spurs->readyCount(kernelCtxt->wklCurrentId), [&](atomic_t<u8>& val)
{
const s32 _new = val + numNewlyReadyTasks;
val = static_cast<u8>(std::clamp<s32>(_new, 0, 0xFF));
val.fetch_op([&](u8& val)
{
const s32 _new = val + numNewlyReadyTasks;
val = static_cast<u8>(std::clamp<s32>(_new, 0, 0xFF));
});
});
res.release(rtime + 128);
}
return rc;

View file

@ -946,7 +946,7 @@ void ppu_thread::fast_call(u32 addr, u32 rtoc)
if (_this->current_function && vm::read32(cia) != ppu_instructions::SC(0))
{
return fmt::format("PPU[0x%x] Thread (%s) [HLE:0x%08x, LR:0x%08x]", _this->id, *name_cache.get(), cia, _this->lr);
return fmt::format("PPU[0x%x] Thread (%s) [HLE:0x%08x, LR:0x%08x]", _this->id, *name_cache.get(), cia, _this->lr);
}
return fmt::format("PPU[0x%x] Thread (%s) [0x%08x]", _this->id, *name_cache.get(), cia);
@ -1103,7 +1103,6 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
const u64 data_off = (addr & 7) * 8;
ppu.raddr = addr;
const u64 mask_res = g_use_rtm ? (-128 | vm::dma_lockb) : -1;
if (const s32 max = g_cfg.core.ppu_128_reservations_loop_max_length)
{
@ -1160,7 +1159,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
for (u64 count = 0;; [&]()
{
if (ppu.state)
{
{
ppu.check_state();
}
else if (++count < 20) [[likely]]
@ -1175,7 +1174,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
}
}())
{
ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & mask_res;
ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
if (ppu.rtime & 127)
{
@ -1189,7 +1188,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
mov_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128));
}
if ((vm::reservation_acquire(addr, sizeof(T)) & mask_res) == ppu.rtime) [[likely]]
if (vm::reservation_acquire(addr, sizeof(T)) == ppu.rtime) [[likely]]
{
if (count >= 15) [[unlikely]]
{
@ -1218,6 +1217,7 @@ const auto ppu_stcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rda
Label fall = c.newLabel();
Label fail = c.newLabel();
Label fail2 = c.newLabel();
// Prepare registers
c.mov(x86::r10, imm_ptr(+vm::g_reservations));
@ -1234,7 +1234,9 @@ const auto ppu_stcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rda
// Begin transaction
build_transaction_enter(c, fall, args[0], 16);
c.mov(x86::rax, x86::qword_ptr(x86::r10));
c.and_(x86::rax, -128 | vm::dma_lockb);
c.test(x86::eax, vm::rsrv_unique_lock);
c.jnz(fail2);
c.and_(x86::rax, -128);
c.cmp(x86::rax, args[1]);
c.jne(fail);
c.cmp(x86::qword_ptr(x86::r11), args[2]);
@ -1249,6 +1251,7 @@ const auto ppu_stcx_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, u64 rda
c.bind(fall);
c.sar(x86::eax, 24);
c.js(fail);
c.bind(fail2);
c.mov(x86::eax, 2);
c.ret();
@ -1324,11 +1327,11 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
// Begin transaction
build_transaction_enter(c, fall, x86::r12, 4);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.test(x86::eax, vm::rsrv_unique_lock);
c.jnz(skip);
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail);
c.test(x86::qword_ptr(x86::rbx), 127);
c.jnz(skip);
if (s_tsx_avx)
{
@ -1394,15 +1397,19 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
Label fall2 = c.newLabel();
Label fail2 = c.newLabel();
Label fail3 = c.newLabel();
// Lightened transaction: only compare and swap data
c.bind(next);
// Try to "lock" reservation
c.mov(x86::rax, x86::r13);
c.add(x86::r13, 1);
c.lock().cmpxchg(x86::qword_ptr(x86::rbx), x86::r13);
c.jne(fail);
c.mov(x86::eax, 1);
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
c.test(x86::eax, vm::rsrv_unique_lock);
c.jnz(fail3);
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail2);
build_transaction_enter(c, fall2, x86::r12, 666);
@ -1453,6 +1460,7 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime
c.bind(fall2);
c.sar(x86::eax, 24);
c.js(fail2);
c.bind(fail3);
c.mov(x86::eax, 2);
c.jmp(_ret);
@ -1579,20 +1587,51 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
cpu_thread::suspend_all cpu_lock(&ppu);
// Give up if PUTLLUC happened
if (res == (rtime | 1) && cmp_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128)))
// Obtain unique lock
while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
{
busy_wait(100);
// Give up if reservation has been updated
if ((res & -128) != rtime)
{
res -= 1;
return false;
}
}
if ((res & -128) == rtime && cmp_rdata(ppu.rdata, vm::_ref<spu_rdata_t>(addr & -128)))
{
data.release(reg_value);
res.release(rtime + 128);
res += 63;
return true;
}
res.release(rtime);
res -= (vm::rsrv_unique_lock + 1);
return false;
}
if (!vm::reservation_trylock(res, rtime))
while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
{
// Give up if reservation has been updated
if ((res & -128) != rtime)
{
return false;
}
if (ppu.state && ppu.check_state())
{
return false;
}
else
{
busy_wait(100);
}
}
if ((res & -128) != rtime)
{
res -= vm::rsrv_unique_lock;
return false;
}
@ -1654,24 +1693,64 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
default: break;
}
if (res == rtime && vm::reservation_trylock(res, rtime))
if (res.fetch_add(1) & vm::rsrv_unique_lock)
{
const bool ret = data.compare_and_swap_test(old_data, reg_value);
res.release(rtime + 128);
return ret;
res -= 1;
return false;
}
if (data.compare_and_swap_test(old_data, reg_value))
{
res += 127;
return true;
}
res -= 1;
return false;
}
if (!vm::reservation_trylock(res, rtime))
while (true)
{
return false;
auto [_old, _ok] = res.fetch_op([&](u64& r)
{
if ((r & -128) != rtime || (r & vm::rsrv_unique_lock))
{
return false;
}
r += 1;
return true;
});
// Give up if reservation has been updated
if ((_old & -128) != rtime)
{
return false;
}
if (_ok)
{
break;
}
if (ppu.state && ppu.check_state())
{
return false;
}
else
{
busy_wait(100);
}
}
const bool ret = data.compare_and_swap_test(old_data, reg_value);
res.release(rtime + 128);
return ret;
if (data.compare_and_swap_test(old_data, reg_value))
{
res += 127;
return true;
}
res -= 1;
return false;
}())
{
res.notify_all();

View file

@ -378,11 +378,11 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
// Begin transaction
build_transaction_enter(c, fall, x86::r12, 4);
c.mov(x86::rax, x86::qword_ptr(x86::rbx));
c.test(x86::eax, vm::rsrv_unique_lock);
c.jnz(skip);
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail);
c.test(x86::qword_ptr(x86::rbx), 127);
c.jnz(skip);
if (s_tsx_avx)
{
@ -461,15 +461,19 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
Label fall2 = c.newLabel();
Label fail2 = c.newLabel();
Label fail3 = c.newLabel();
// Lightened transaction: only compare and swap data
c.bind(next);
// Try to "lock" reservation
c.mov(x86::rax, x86::r13);
c.add(x86::r13, 1);
c.lock().cmpxchg(x86::qword_ptr(x86::rbx), x86::r13);
c.jne(fail);
c.mov(x86::eax, 1);
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
c.test(x86::eax, vm::rsrv_unique_lock);
c.jnz(fail3);
c.and_(x86::rax, -128);
c.cmp(x86::rax, x86::r13);
c.jne(fail2);
build_transaction_enter(c, fall2, x86::r12, 666);
@ -533,6 +537,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
c.bind(fall2);
c.sar(x86::eax, 24);
c.js(fail2);
c.bind(fail3);
c.mov(x86::eax, 2);
c.jmp(_ret);
@ -644,7 +649,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
// Begin transaction
build_transaction_enter(c, fall, x86::r12, 8);
c.test(x86::dword_ptr(x86::rbx), 127);
c.test(x86::dword_ptr(x86::rbx), vm::rsrv_unique_lock);
c.jnz(skip);
if (s_tsx_avx)
@ -688,14 +693,15 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
c.xor_(x86::rbp, 0xf80);
Label fall2 = c.newLabel();
Label fail2 = c.newLabel();
// Lightened transaction
c.bind(next);
// Try to acquire "PUTLLUC lock"
c.lock().bts(x86::qword_ptr(x86::rbx), std::countr_zero<u32>(vm::putlluc_lockb));
c.jc(fail2);
// Lock reservation
c.mov(x86::eax, 1);
c.lock().xadd(x86::qword_ptr(x86::rbx), x86::rax);
c.test(x86::eax, vm::rsrv_unique_lock);
c.jnz(fall2);
build_transaction_enter(c, fall2, x86::r12, 666);
@ -719,16 +725,12 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
}
c.xend();
c.lock().add(x86::qword_ptr(x86::rbx), 64);
c.lock().add(x86::qword_ptr(x86::rbx), 127);
c.mov(x86::eax, 1);
c.jmp(_ret);
c.bind(fail2);
c.xor_(x86::eax, x86::eax);
c.jmp(_ret);
c.bind(fall2);
c.mov(x86::eax, 2);
c.xor_(x86::eax, x86::eax);
//c.jmp(_ret);
c.bind(_ret);
@ -1415,30 +1417,30 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{
case 1:
{
auto [res, time0] = vm::reservation_lock(eal, 1, vm::dma_lockb);
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
res.release(time0 + 128);
res += 64;
break;
}
case 2:
{
auto [res, time0] = vm::reservation_lock(eal, 2, vm::dma_lockb);
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
res.release(time0 + 128);
res += 64;
break;
}
case 4:
{
auto [res, time0] = vm::reservation_lock(eal, 4, vm::dma_lockb);
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
res.release(time0 + 128);
res += 64;
break;
}
case 8:
{
auto [res, time0] = vm::reservation_lock(eal, 8, vm::dma_lockb);
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
res.release(time0 + 128);
res += 64;
break;
}
default:
@ -1463,7 +1465,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
// Lock each cache line exclusively
auto [res, time0] = vm::reservation_lock(eal, size0, vm::dma_lockb);
auto [res, time0] = vm::reservation_lock(eal);
switch (size0)
{
@ -1491,7 +1493,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
}
res.release(time0 + 128);
res += 64;
if (size == size0)
{
@ -1505,7 +1507,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
if (((eal & 127) + size) <= 128)
{
// Lock one cache line
auto [res, time0] = vm::reservation_lock(eal, 128);
auto [res, time0] = vm::reservation_lock(eal);
while (size)
{
@ -1516,7 +1518,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size -= 16;
}
res.release(time0);
res += 64;
break;
}
@ -1848,21 +1850,34 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
cpu_thread::suspend_all cpu_lock(this);
// Give up if PUTLLUC happened
if (res == (rtime | 1))
// Obtain unique lock
while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
{
busy_wait(100);
// Give up if reservation has been updated
if ((res & -128) != rtime)
{
res -= 1;
if (render) render->unpause();
return false;
}
}
if ((res & -128) == rtime)
{
auto& data = vm::_ref<spu_rdata_t>(addr);
if (cmp_rdata(rdata, data))
{
mov_rdata(data, to_write);
res += 127;
res += 63;
if (render) render->unpause();
return true;
}
}
res -= 1;
res -= (vm::rsrv_unique_lock | 1);
if (render) render->unpause();
return false;
}
@ -1872,8 +1887,27 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
}
}
if (!vm::reservation_trylock(res, rtime))
while (res.bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
{
// Give up if reservation has been updated
if ((res & -128) != rtime)
{
return false;
}
if (state && check_state())
{
return false;
}
else
{
busy_wait(100);
}
}
if ((res & -128) != rtime)
{
res -= vm::rsrv_unique_lock;
return false;
}
@ -1914,7 +1948,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
if (raddr)
{
// Last check for event before we clear the reservation
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) || !cmp_rdata(rdata, vm::_ref<spu_rdata_t>(raddr)))
if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & -128) || !cmp_rdata(rdata, vm::_ref<spu_rdata_t>(raddr)))
{
set_events(SPU_EVENT_LR);
}
@ -1937,38 +1971,17 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
if (render) render->pause();
if (result == 2)
if (result == 0)
{
cpu_thread::suspend_all cpu_lock(cpu);
if (vm::reservation_acquire(addr, 128) & 64)
{
// Wait for PUTLLC to complete
while (vm::reservation_acquire(addr, 128) & 63)
{
busy_wait(100);
}
mov_rdata(vm::_ref<spu_rdata_t>(addr), *static_cast<const spu_rdata_t*>(to_write));
vm::reservation_acquire(addr, 128) += 64;
}
}
else if (result == 0)
{
cpu_thread::suspend_all cpu_lock(cpu);
while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
{
busy_wait(100);
}
while (vm::reservation_acquire(addr, 128) & 63)
while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
{
busy_wait(100);
}
mov_rdata(vm::_ref<spu_rdata_t>(addr), *static_cast<const spu_rdata_t*>(to_write));
vm::reservation_acquire(addr, 128) += 64;
vm::reservation_acquire(addr, 128) += 63;
}
if (render) render->unpause();
@ -1977,7 +1990,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
else
{
auto& data = vm::_ref<spu_rdata_t>(addr);
auto [res, time0] = vm::reservation_lock(addr, 128);
auto [res, time0] = vm::reservation_lock(addr);
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
@ -2247,7 +2260,7 @@ bool spu_thread::process_mfc_cmd()
if (raddr && raddr != addr)
{
// Last check for event before we replace the reservation with a new one
if ((vm::reservation_acquire(raddr, 128) & (-128 | vm::dma_lockb)) != rtime || !cmp_rdata(temp, vm::_ref<spu_rdata_t>(raddr)))
if ((vm::reservation_acquire(raddr, 128) & -128) != rtime || !cmp_rdata(temp, vm::_ref<spu_rdata_t>(raddr)))
{
set_events(SPU_EVENT_LR);
}
@ -3583,7 +3596,7 @@ bool spu_thread::capture_local_storage() const
if (name.empty())
{
// TODO: Maybe add thread group name here
fmt::append(name, "SPU.%u", lv2_id);
fmt::append(name, "SPU.%u", lv2_id);
}
}
else

View file

@ -441,11 +441,11 @@ namespace vm
g_mutex.unlock();
}
u64 reservation_lock_internal(u32 addr, atomic_t<u64>& res, u64 lock_bits)
u64 reservation_lock_internal(u32 addr, atomic_t<u64>& res)
{
for (u64 i = 0;; i++)
{
if (u64 rtime = res; !(rtime & 127) && reservation_trylock(res, rtime, lock_bits)) [[likely]]
if (u64 rtime = res; !(rtime & 127) && reservation_try_lock(res, rtime)) [[likely]]
{
return rtime;
}
@ -471,6 +471,30 @@ namespace vm
}
}
void reservation_shared_lock_internal(atomic_t<u64>& res)
{
for (u64 i = 0;; i++)
{
if (!(res & rsrv_unique_lock)) [[likely]]
{
return;
}
if (auto cpu = get_current_cpu_thread(); cpu && cpu->state)
{
cpu->check_state();
}
else if (i < 15)
{
busy_wait(500);
}
else
{
std::this_thread::yield();
}
}
}
void reservation_op_internal(u32 addr, std::function<bool()> func)
{
const auto _cpu = get_current_cpu_thread();
@ -481,15 +505,15 @@ namespace vm
{
cpu_thread::suspend_all cpu_lock(_cpu);
// Wait to acquire PUTLLUC lock
while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
// Wait to acquire unique lock
while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::rsrv_unique_lock)))
{
busy_wait(100);
}
if (func())
{
// Success, release PUTLLUC and PUTLLC locks if necessary
// Success, release all locks if necessary
vm::reservation_acquire(addr, 128) += 63;
}
else

View file

@ -10,11 +10,11 @@ extern bool g_use_rtm;
namespace vm
{
enum reservation_lock_bit : u64
enum : u64
{
stcx_lockb = 1 << 0, // Exclusive conditional reservation lock
dma_lockb = 1 << 5, // Exclusive unconditional reservation lock
putlluc_lockb = 1 << 6, // Exclusive unconditional reservation lock
rsrv_lock_mask = 127,
rsrv_unique_lock = 64,
rsrv_shared_mask = 63,
};
// Get reservation status for further atomic update: last update timestamp
@ -42,11 +42,13 @@ namespace vm
return *reinterpret_cast<atomic_t<u64>*>(g_reservations + (addr & 0xff80) / 2);
}
u64 reservation_lock_internal(u32, atomic_t<u64>&, u64);
u64 reservation_lock_internal(u32, atomic_t<u64>&);
inline bool reservation_trylock(atomic_t<u64>& res, u64 rtime, u64 lock_bits = stcx_lockb)
void reservation_shared_lock_internal(atomic_t<u64>&);
inline bool reservation_try_lock(atomic_t<u64>& res, u64 rtime)
{
if (res.compare_and_swap_test(rtime, rtime + lock_bits)) [[likely]]
if (res.compare_and_swap_test(rtime, rtime | rsrv_unique_lock)) [[likely]]
{
return true;
}
@ -54,16 +56,16 @@ namespace vm
return false;
}
inline std::pair<atomic_t<u64>&, u64> reservation_lock(u32 addr, u32 size, u64 lock_bits = stcx_lockb)
inline std::pair<atomic_t<u64>&, u64> reservation_lock(u32 addr)
{
auto res = &vm::reservation_acquire(addr, size);
auto res = &vm::reservation_acquire(addr, 1);
auto rtime = res->load();
if (rtime & 127 || !reservation_trylock(*res, rtime, lock_bits)) [[unlikely]]
if (rtime & 127 || !reservation_try_lock(*res, rtime)) [[unlikely]]
{
static atomic_t<u64> no_lock{};
rtime = reservation_lock_internal(addr, *res, lock_bits);
rtime = reservation_lock_internal(addr, *res);
if (rtime == umax)
{
@ -95,6 +97,7 @@ namespace vm
// Stage 1: single optimistic transaction attempt
unsigned status = _XBEGIN_STARTED;
u64 _old = 0;
#ifndef _MSC_VER
__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
@ -103,6 +106,15 @@ namespace vm
if (status == _XBEGIN_STARTED)
#endif
{
if (res & rsrv_unique_lock)
{
#ifndef _MSC_VER
__asm__ volatile ("xabort $0;" ::: "memory");
#else
_xabort(0);
#endif
}
if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
{
res += 128;
@ -161,10 +173,10 @@ namespace vm
}
// Stage 2: try to lock reservation first
res += stcx_lockb;
_old = res.fetch_add(1);
// Start lightened transaction (TODO: tweaking)
while (true)
while (!(_old & rsrv_unique_lock))
{
#ifndef _MSC_VER
__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
@ -263,11 +275,8 @@ namespace vm
}
}
// Perform under heavyweight lock
auto& res = vm::reservation_acquire(addr, 128);
res += stcx_lockb;
// Perform heavyweight lock
auto [res, rtime] = vm::reservation_lock(addr);
// Write directly if the op cannot fail
if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
@ -294,12 +303,12 @@ namespace vm
{
// If operation succeeds, write the data back
*sptr = buf;
res += 127;
res.release(rtime + 128);
}
else
{
// Operation failed, no memory has been modified
res -= 1;
res.release(rtime);
return std::invoke_result_t<F, T&>();
}
}
@ -363,4 +372,45 @@ namespace vm
}
}
}
template <bool Ack = false, typename T, typename F>
SAFE_BUFFERS inline auto reservation_light_op(T& data, F op)
{
// Optimized real ptr -> vm ptr conversion, simply UB if out of range
const u32 addr = static_cast<u32>(reinterpret_cast<const u8*>(&data) - g_base_addr);
// Use "super" pointer to prevent access violation handling during atomic op
const auto sptr = vm::get_super_ptr<T>(addr);
// "Lock" reservation
auto& res = vm::reservation_acquire(addr, 128);
if (res.fetch_add(1) & vm::rsrv_unique_lock) [[unlikely]]
{
vm::reservation_shared_lock_internal(res);
}
if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
{
std::invoke(op, *sptr);
res += 127;
if constexpr (Ack)
{
res.notify_all();
}
}
else
{
auto result = std::invoke(op, *sptr);
res += 127;
if constexpr (Ack)
{
res.notify_all();
}
return result;
}
}
} // namespace vm
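
For reference, a hedged example of how a caller might use the new vm::reservation_light_op helper; the guest address and counter semantics are invented for illustration, but the pattern mirrors the cellSpurs call sites above.

// Hypothetical call site: atomically update one guest byte under the shared
// reservation lock so reservation holders observe a timestamp bump.
atomic_t<u8>& flags = vm::_ref<atomic_t<u8>>(guest_addr); // guest_addr is assumed

const u8 old = vm::reservation_light_op(flags, [](atomic_t<u8>& v)
{
    return v.fetch_or(1); // the lambda's return value is forwarded to the caller
});

// With Ack = true the reservation is also notified, waking any waiters:
// vm::reservation_light_op<true>(flags, [](atomic_t<u8>& v) { v += 1; });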

View file

@ -151,14 +151,14 @@ namespace rsx
// TODO: Check if possible to write on reservations
if (!g_use_rtm && rsx->label_addr >> 28 != addr >> 28) [[likely]]
{
res = &vm::reservation_lock(addr, 4).first;
res = &vm::reservation_lock(addr).first;
}
vm::_ref<RsxSemaphore>(addr).val = arg;
if (res)
{
res->release(*res + 127);
*res += 127;
}
vm::reservation_notifier(addr, 4).notify_all();
@ -818,7 +818,7 @@ namespace rsx
case CELL_GCM_FUNC_ADD_SIGNED:
case CELL_GCM_FUNC_REVERSE_ADD_SIGNED:
break;
default:
{
// Ignore invalid values as a whole
@ -1513,7 +1513,7 @@ namespace rsx
const auto data_length = in_pitch * (line_count - 1) + line_length;
rsx->invalidate_fragment_program(dst_dma, dst_offset, data_length);
if (const auto result = rsx->read_barrier(read_address, data_length, !is_block_transfer);
result == rsx::result_zcull_intr)
{