vm: improve range_lock and shareable cache (Non-TSX)

Allocate "personal" range lock variable for each spu_thread.
Switch from reservation_lock to range lock for all stores.
Detect actual memory mirrors in shareable cache setup logic.
Nekotekina 2020-10-26 04:05:17 +03:00
parent 6806e3d5c7
commit 4966f6de73
5 changed files with 343 additions and 212 deletions
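
The pattern the commit message describes, sketched as a small self-contained C++ example: each spu_thread owns one range-lock slot for its whole lifetime and brackets every plain store to main memory with it. The names vm_alloc_range_lock / vm_range_lock / vm_free_range_lock mirror the vm::alloc_range_lock / vm::range_lock / vm::free_range_lock calls visible in the hunks below; the 64-slot pool, the ownership bitmap, and the <addr, size> encoding are assumptions made only to keep the sketch compilable, not the actual vm.cpp implementation.

#include <atomic>
#include <bit>
#include <cstdint>
#include <cstring>

// Stand-ins for the vm:: range-lock API used in the hunks below. Only the
// calling convention is modelled: one slot per thread, publish <addr, size>
// while a store is in flight, write 0 to release.
using range_lock_slot = std::atomic<std::uint64_t>;

static range_lock_slot g_pool[64] {};
static std::atomic<std::uint64_t> g_pool_bits{0};        // one ownership bit per slot (assumption)

range_lock_slot* vm_alloc_range_lock()                   // models vm::alloc_range_lock()
{
    for (std::uint64_t bits = g_pool_bits.load();;)
    {
        const int i = std::countr_one(bits);             // index of the first free slot
        if (i == 64)
            return nullptr;                              // pool exhausted (sketch only)
        if (g_pool_bits.compare_exchange_weak(bits, bits | (1ull << i)))
            return &g_pool[i];
    }
}

void vm_range_lock(range_lock_slot* lock, std::uint32_t addr, std::uint32_t size)
{
    // Publish the locked range; the real vm::range_lock additionally waits
    // until no conflicting access overlaps the published range.
    lock->store(std::uint64_t{addr} | (std::uint64_t{size} << 32));
}

void vm_free_range_lock(range_lock_slot* lock)           // models vm::free_range_lock()
{
    lock->store(0, std::memory_order_release);
    g_pool_bits &= ~(1ull << (lock - g_pool));
}

// Per-thread usage, mirroring the spu_thread hunks and the new small-store path.
struct spu_thread_sketch
{
    range_lock_slot* range_lock = vm_alloc_range_lock();     // constructor: range_lock = vm::alloc_range_lock();
    ~spu_thread_sketch() { vm_free_range_lock(range_lock); } // destructor:  vm::free_range_lock(range_lock);

    void put_u32(void* dst, const void* src, std::uint32_t eal)
    {
        vm_range_lock(range_lock, eal, 4);                    // vm::range_lock(range_lock, eal, 4);
        std::memcpy(dst, src, 4);                             // *reinterpret_cast<u32*>(dst) = ...;
        range_lock->store(0, std::memory_order_release);      // range_lock->release(0);
    }
};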

@@ -1368,6 +1368,9 @@ spu_thread::~spu_thread()
g_raw_spu_id[index] = 0;
g_raw_spu_ctr--;
}
// Free range lock
vm::free_range_lock(range_lock);
}
spu_thread::spu_thread(vm::addr_t _ls, lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated, u32 option)
@@ -1418,6 +1421,8 @@ spu_thread::spu_thread(vm::addr_t _ls, lv2_spu_group* group, u32 index, std::str
{
cpu_init();
}
range_lock = vm::alloc_range_lock();
}
void spu_thread::push_snr(u32 number, u32 value)
@@ -1704,101 +1709,121 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
}
switch (u32 size = args.size)
if (g_cfg.core.spu_accurate_dma) [[unlikely]]
{
case 1:
{
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
res += 64;
break;
}
case 2:
{
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
res += 64;
break;
}
case 4:
{
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
res += 64;
break;
}
case 8:
{
auto [res, time0] = vm::reservation_lock(eal);
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
res += 64;
break;
}
default:
{
if (g_cfg.core.spu_accurate_dma)
for (u32 size0, size = args.size;; size -= size0, dst += size0, src += size0, eal += size0)
{
for (u32 size0;; size -= size0, dst += size0, src += size0, eal += size0)
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
if (size0 == 128u && g_cfg.core.accurate_cache_line_stores)
{
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
if (size0 == 128u && g_cfg.core.accurate_cache_line_stores)
{
// As atomic as PUTLLUC
do_cell_atomic_128_store(eal, src);
if (size == size0)
{
break;
}
continue;
}
// Lock each cache line exclusively
auto [res, time0] = vm::reservation_lock(eal);
switch (size0)
{
case 128:
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
break;
}
default:
{
auto _dst = dst;
auto _src = src;
auto _size = size0;
while (_size)
{
*reinterpret_cast<v128*>(_dst) = *reinterpret_cast<const v128*>(_src);
_dst += 16;
_src += 16;
_size -= 16;
}
break;
}
}
res += 64;
// As atomic as PUTLLUC
do_cell_atomic_128_store(eal, src);
if (size == size0)
{
break;
}
continue;
}
break;
// Lock each cache line exclusively
auto [res, time0] = vm::reservation_lock(eal);
switch (size0)
{
case 1:
{
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
case 128:
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
break;
}
default:
{
auto _dst = dst;
auto _src = src;
auto _size = size0;
while (_size)
{
*reinterpret_cast<v128*>(_dst) = *reinterpret_cast<const v128*>(_src);
_dst += 16;
_src += 16;
_size -= 16;
}
break;
}
}
res += 64;
if (size == size0)
{
break;
}
}
std::atomic_thread_fence(std::memory_order_seq_cst);
return;
}
switch (u32 size = args.size)
{
case 1:
{
vm::range_lock(range_lock, eal, 1);
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
range_lock->release(0);
break;
}
case 2:
{
vm::range_lock(range_lock, eal, 2);
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
range_lock->release(0);
break;
}
case 4:
{
vm::range_lock(range_lock, eal, 4);
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
range_lock->release(0);
break;
}
case 8:
{
vm::range_lock(range_lock, eal, 8);
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
range_lock->release(0);
break;
}
default:
{
if (((eal & 127) + size) <= 128)
{
// Lock one cache line
auto [res, time0] = vm::reservation_lock(eal);
vm::range_lock(range_lock, eal, size);
while (size)
{
@@ -1809,14 +1834,14 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size -= 16;
}
res += 64;
range_lock->release(0);
break;
}
u32 range_addr = eal & -128;
u32 range_end = ::align(eal + size, 128);
// Handle the case of crossing 64K page borders
// Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?)
if (range_addr >> 16 != (range_end - 1) >> 16)
{
u32 nexta = range_end & -65536;
@@ -1824,7 +1849,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size -= size0;
// Split locking + transfer in two parts (before 64K border, and after it)
const auto lock = vm::range_lock(range_addr, nexta);
vm::range_lock(range_lock, range_addr, size0);
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
@@ -1854,11 +1879,11 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size0 -= 16;
}
lock->release(0);
range_lock->release(0);
range_addr = nexta;
}
const auto lock = vm::range_lock(range_addr, range_end);
vm::range_lock(range_lock, range_addr, range_end - range_addr);
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
@@ -1888,16 +1913,11 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
size -= 16;
}
lock->release(0);
range_lock->release(0);
break;
}
}
if (g_cfg.core.spu_accurate_dma)
{
std::atomic_thread_fence(std::memory_order_seq_cst);
}
return;
}
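
For reference, a condensed sketch of the 64K-page-border handling in the new default path shown above: lock and copy the part of the transfer before the border, then the remainder. The helper name, the plain memcpy bodies, and the size0 = nexta - eal derivation are illustrative assumptions (the line deriving size0 falls outside the hunk above); the real code moves data in 16- and 128-byte chunks through the per-thread range_lock slot.

#include <atomic>
#include <cstdint>
#include <cstring>

// Stand-ins shared with the sketch near the top of this page.
using range_lock_slot = std::atomic<std::uint64_t>;
void vm_range_lock(range_lock_slot* lock, std::uint32_t addr, std::uint32_t size);

// Hypothetical helper: a transfer whose destination crosses a 64K page border
// is split into two separately range-locked parts, as in the diff above.
void put_split_at_64k(range_lock_slot* range_lock,
                      std::uint8_t* dst, const std::uint8_t* src,
                      std::uint32_t eal, std::uint32_t size)
{
    std::uint32_t range_addr = eal & -128;                       // cache-line-aligned start
    const std::uint32_t range_end = (eal + size + 127) & -128;   // cache-line-aligned end (::align(eal + size, 128))

    if (range_addr >> 16 != (range_end - 1) >> 16)               // transfer touches two 64K pages
    {
        const std::uint32_t nexta = range_end & -65536;          // start of the 64K page holding the tail
        const std::uint32_t size0 = nexta - eal;                 // assumption: bytes to move before the border

        vm_range_lock(range_lock, range_addr, size0);            // vm::range_lock(range_lock, range_addr, size0);
        std::memcpy(dst, src, size0);                            // real code: 16/128-byte vector moves
        range_lock->store(0, std::memory_order_release);         // range_lock->release(0);

        dst += size0;
        src += size0;
        size -= size0;
        range_addr = nexta;                                      // second part starts at the border
    }

    vm_range_lock(range_lock, range_addr, range_end - range_addr);
    std::memcpy(dst, src, size);
    range_lock->store(0, std::memory_order_release);
}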