SPU/PPU atomics performance and LR event fixes (#5435)

* Fix SPU LR event setting in atomic commands according to hardware tests
* MFC: increment timestamp for PUT cmd in non-TSX path
* MFC: fix reservation-lost test on non-TSX path with regard to the lock bit
* Reservation notification moved out of writer_lock scope to reduce its lifetime
* Use passive_lock/unlock in PPU atomic instructions to reduce redundancy
* Lock only once for DMA transfers (non-TSX)
* Don't use RDTSC in reservation update logic
* Remove MFC cmd args passing to process_mfc_cmd
* Reorder check_state cpu_flag::memory check for faster unlocking
* Specialization for 128-byte data copy in SPU DMA transfers
* Implement memory range locks and isolate PPU and SPU passive lock logic
elad 2019-01-15 17:31:21 +02:00 committed by Ivan
parent f19fd23227
commit fc92ae4085
9 changed files with 344 additions and 235 deletions
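Note on the reservation encoding, since most hunks below depend on it: the 64-bit reservation word now doubles as a tiny spinlock, with bit 0 as the lock flag and the remaining bits as a version counter. A minimal standalone sketch of that scheme (assumed types and names, not the emulator's real interfaces): committed updates bump the counter by 2 (or by 1 from the locked, odd state), readers mask bit 0 off before comparing timestamps, and `res &= ~1ull` releases the lock without publishing a new version.

```cpp
#include <atomic>
#include <cstdint>

// Hypothetical model of the reservation word used throughout this commit:
// bit 0 = lock flag, bits 1..63 = version counter.
struct reservation_model
{
	std::atomic<std::uint64_t> res{0};

	// Readers mask out the lock bit, so a locked-but-unchanged line
	// still compares equal to the timestamp taken at GETLLAR time.
	std::uint64_t acquire() const
	{
		return res.load() & ~1ull;
	}

	// Non-TSX writers take the lock by setting bit 0 (making the value odd).
	bool try_lock(std::uint64_t expected_time)
	{
		std::uint64_t v = res.load();
		return !(v & 1) && v == expected_time && res.compare_exchange_strong(v, v | 1);
	}

	// "res++" in the diffs: from the locked (odd) state, +1 lands on the
	// next even value, unlocking and publishing a new version in one step.
	void commit() { res.fetch_add(1); }

	// "res &= ~1ull": unlock without bumping the version (failed store).
	void release() { res.fetch_and(~1ull); }

	// TSX paths add 2 inside the transaction: new even version, never locked.
	void commit_tx() { res.fetch_add(2); }
};
```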

View file

@@ -119,15 +119,16 @@ bool cpu_thread::check_state()
 	while (true)
 	{
-		if (state & cpu_flag::memory && state.test_and_reset(cpu_flag::memory))
+		if (state & cpu_flag::memory)
 		{
-			cpu_flag_memory = true;
 			if (auto& ptr = vm::g_tls_locked)
 			{
 				ptr->compare_and_swap(this, nullptr);
 				ptr = nullptr;
 			}
+			cpu_flag_memory = true;
+			state -= cpu_flag::memory;
 		}
 		if (state & cpu_flag::exit + cpu_flag::dbg_global_stop)
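The reordering above is aimed at writer latency: the writer publishes cpu_flag::memory to every registered reader and then spins until each g_locks slot is vacated, so the reader should vacate its slot first and only afterwards retire its own flag. A simplified model of the reader-side acknowledgement (assumed names, not the real API):

```cpp
#include <atomic>

struct cpu_thread;

// Sketch of the handshake this hunk reorders. The writer spins on the
// g_locks slot, not on the flag, so clearing the slot first releases the
// writer as early as possible; the flag is bookkeeping local to the reader
// and can be retired afterwards.
inline void acknowledge_memory_flag(std::atomic<cpu_thread*>& slot, cpu_thread* self,
                                    std::atomic<unsigned>& state, unsigned memory_flag)
{
	cpu_thread* expected = self;
	slot.compare_exchange_strong(expected, nullptr); // unblock the writer now
	state.fetch_and(~memory_flag);                   // then clear our own flag
}
```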

View file

@@ -977,7 +977,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 		}
 	}
-	vm::temporary_unlock(ppu);
+	vm::passive_unlock(ppu);
 	for (u64 i = 0;; i++)
 	{
@@ -1003,8 +1003,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 		}
 	}
-	ppu.cpu_mem();
+	vm::passive_lock(ppu);
 	return static_cast<T>(ppu.rdata << data_off >> size_off);
 }
@@ -1044,7 +1043,7 @@ const auto ppu_stwcx_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, u64 r
 	c.cmp(x86::dword_ptr(x86::r11), args[2].r32());
 	c.jne(fail);
 	c.mov(x86::dword_ptr(x86::r11), args[3].r32());
-	c.add(x86::qword_ptr(x86::r10), 1);
+	c.add(x86::qword_ptr(x86::r10), 2);
 	c.xend();
 	c.mov(x86::eax, 1);
 	c.ret();
@@ -1070,7 +1069,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
 	auto& data = vm::_ref<atomic_be_t<u32>>(addr & -4);
 	const u32 old_data = static_cast<u32>(ppu.rdata << ((addr & 7) * 8) >> 32);
-	if (ppu.raddr != addr || addr & 3 || old_data != data.load() || ppu.rtime != vm::reservation_acquire(addr, sizeof(u32)))
+	if (ppu.raddr != addr || addr & 3 || old_data != data.load() || ppu.rtime != (vm::reservation_acquire(addr, sizeof(u32)) & ~1ull))
 	{
 		ppu.raddr = 0;
 		return false;
@@ -1090,7 +1089,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
 			return false;
 		}
-		vm::temporary_unlock(ppu);
+		vm::passive_unlock(ppu);
 		auto& res = vm::reservation_lock(addr, sizeof(u32));
@@ -1098,7 +1097,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
 		if (result)
 		{
-			vm::reservation_update(addr, sizeof(u32));
+			res++;
 			vm::reservation_notifier(addr, sizeof(u32)).notify_all();
 		}
 		else
@@ -1106,7 +1105,7 @@ extern bool ppu_stwcx(ppu_thread& ppu, u32 addr, u32 reg_value)
 			res &= ~1ull;
 		}
-		ppu.cpu_mem();
+		vm::passive_lock(ppu);
 		ppu.raddr = 0;
 		return result;
 	}
@@ -1137,7 +1136,7 @@ const auto ppu_stdcx_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, u64 r
 	c.cmp(x86::qword_ptr(x86::r11), args[2]);
 	c.jne(fail);
 	c.mov(x86::qword_ptr(x86::r11), args[3]);
-	c.add(x86::qword_ptr(x86::r10), 1);
+	c.add(x86::qword_ptr(x86::r10), 2);
 	c.xend();
 	c.mov(x86::eax, 1);
 	c.ret();
@@ -1163,7 +1162,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
 	auto& data = vm::_ref<atomic_be_t<u64>>(addr & -8);
 	const u64 old_data = ppu.rdata << ((addr & 7) * 8);
-	if (ppu.raddr != addr || addr & 7 || old_data != data.load() || ppu.rtime != vm::reservation_acquire(addr, sizeof(u64)))
+	if (ppu.raddr != addr || addr & 7 || old_data != data.load() || ppu.rtime != (vm::reservation_acquire(addr, sizeof(u64)) & ~1ull))
 	{
 		ppu.raddr = 0;
 		return false;
@@ -1183,7 +1182,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
 			return false;
 		}
-		vm::temporary_unlock(ppu);
+		vm::passive_unlock(ppu);
 		auto& res = vm::reservation_lock(addr, sizeof(u64));
@@ -1191,7 +1190,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
 		if (result)
 		{
-			vm::reservation_update(addr, sizeof(u64));
+			res++;
 			vm::reservation_notifier(addr, sizeof(u64)).notify_all();
 		}
 		else
@@ -1199,7 +1198,7 @@ extern bool ppu_stdcx(ppu_thread& ppu, u32 addr, u64 reg_value)
 			res &= ~1ull;
 		}
-		ppu.cpu_mem();
+		vm::passive_lock(ppu);
 		ppu.raddr = 0;
 		return result;
 	}

View file

@@ -1436,6 +1436,7 @@ void spu_recompiler::get_events()
 	c->mov(*qw0, imm_ptr(vm::g_reservations));
 	c->shr(qw1->r32(), 4);
 	c->mov(*qw0, x86::qword_ptr(*qw0, *qw1));
+	c->and_(qw0->r64(), (u64)(~1ull));
 	c->cmp(*qw0, SPU_OFF_64(rtime));
 	c->jne(fail);
 	c->mov(*qw0, imm_ptr(vm::g_base_addr));
@@ -2596,7 +2597,7 @@ static void spu_wrch(spu_thread* _spu, u32 ch, u32 value, spu_function_t _ret)
 static void spu_wrch_mfc(spu_thread* _spu, spu_function_t _ret)
 {
-	if (!_spu->process_mfc_cmd(_spu->ch_mfc_cmd))
+	if (!_spu->process_mfc_cmd())
 	{
 		_ret = &spu_wrch_ret;
 	}

View file

@@ -3362,7 +3362,7 @@ public:
 	static bool exec_mfc_cmd(spu_thread* _spu)
 	{
-		return _spu->process_mfc_cmd(_spu->ch_mfc_cmd);
+		return _spu->process_mfc_cmd();
 	}
 	void WRCH(spu_opcode_t op) //
@@ -3541,9 +3541,9 @@ public:
 			csize = ci->getZExtValue();
 		}
-		if (cmd >= MFC_SNDSIG_CMD)
+		if (cmd >= MFC_SNDSIG_CMD && csize != 4)
 		{
-			csize = 4;
+			csize = -1;
 		}
 		llvm::Value* src = m_ir->CreateGEP(m_lsptr, zext<u64>(lsa).value);

View file

@@ -40,6 +40,34 @@ bool operator ==(const u128& lhs, const u128& rhs)
 }
 #endif
+static FORCE_INLINE void mov_rdata(u128* const dst, const u128* const src)
+{
+	{
+		const u128 data0 = src[0];
+		const u128 data1 = src[1];
+		const u128 data2 = src[2];
+		dst[0] = data0;
+		dst[1] = data1;
+		dst[2] = data2;
+	}
+
+	{
+		const u128 data0 = src[3];
+		const u128 data1 = src[4];
+		const u128 data2 = src[5];
+		dst[3] = data0;
+		dst[4] = data1;
+		dst[5] = data2;
+	}
+
+	{
+		const u128 data0 = src[6];
+		const u128 data1 = src[7];
+		dst[6] = data0;
+		dst[7] = data1;
+	}
+};
 extern u64 get_timebased_time();
 extern u64 get_system_time();
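mov_rdata copies one 128-byte reservation granule (eight u128 words) with the loads hoisted ahead of the stores in small groups, presumably so the compiler can keep each batch in vector registers without spilling. An equivalent sketch with explicit SSE2 intrinsics (assuming u128 maps to a 16-byte vector; this is just the shape of the function above, not the function itself):

```cpp
#include <emmintrin.h>

// Copy a 128-byte, 16-byte-aligned block as 8 x 16-byte vector moves,
// loads grouped before stores, mirroring the mov_rdata pattern above.
static inline void copy_granule_sse2(__m128i* dst, const __m128i* src)
{
	for (int i = 0; i < 8; i += 4)
	{
		const __m128i a = _mm_load_si128(src + i + 0);
		const __m128i b = _mm_load_si128(src + i + 1);
		const __m128i c = _mm_load_si128(src + i + 2);
		const __m128i d = _mm_load_si128(src + i + 3);
		_mm_store_si128(dst + i + 0, a);
		_mm_store_si128(dst + i + 1, b);
		_mm_store_si128(dst + i + 2, c);
		_mm_store_si128(dst + i + 3, d);
	}
}
```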
@@ -158,12 +186,13 @@ namespace spu
 	}
 }
-const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
+const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const void* _old, const void* _new)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 	Label fall = c.newLabel();
 	Label fail = c.newLabel();
+	Label retry = c.newLabel();
 	// Prepare registers
 	c.mov(x86::rax, imm_ptr(&vm::g_reservations));
@@ -216,7 +245,7 @@ const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, cons
 	c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm8);
 	c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm9);
 #endif
-	c.add(x86::qword_ptr(x86::r10), 1);
+	c.add(x86::qword_ptr(x86::r10), 2);
 	c.xend();
 	c.vzeroupper();
 	c.mov(x86::eax, 1);
@@ -224,10 +253,10 @@ const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, cons
 	// Touch memory after transaction failure
 	c.bind(fall);
-	c.sub(args[0].r32(), 1);
-	c.jz(fail);
 	c.sar(x86::eax, 24);
 	c.js(fail);
+	c.sub(args[0].r32(), 1);
+	c.jz(retry);
 	c.lock().add(x86::qword_ptr(x86::r11), 0);
 	c.lock().add(x86::qword_ptr(x86::r10), 0);
 #ifdef _WIN32
@@ -240,9 +269,12 @@ const auto spu_putllc_tx = build_function_asm<bool(*)(u32 raddr, u64 rtime, cons
 	build_transaction_abort(c, 0xff);
 	c.xor_(x86::eax, x86::eax);
 	c.ret();
+	c.bind(retry);
+	c.mov(x86::eax, 2);
+	c.ret();
 });
-const auto spu_getll_tx = build_function_asm<bool(*)(u32 raddr, void* rdata, u64* out_rtime)>([](asmjit::X86Assembler& c, auto& args)
+const auto spu_getll_tx = build_function_asm<u64(*)(u32 raddr, void* rdata)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
@@ -271,8 +303,6 @@ const auto spu_getll_tx = build_function_asm<bool(*)(u32 raddr, void* rdata, u64
 	c.vmovups(x86::yword_ptr(args[1], 64), x86::ymm2);
 	c.vmovups(x86::yword_ptr(args[1], 96), x86::ymm3);
 	c.vzeroupper();
-	c.mov(x86::qword_ptr(args[2]), x86::rax);
-	c.mov(x86::eax, 1);
 	c.ret();
 	// Touch memory after transaction failure
@@ -282,7 +312,7 @@ const auto spu_getll_tx = build_function_asm<bool(*)(u32 raddr, void* rdata, u64
 	c.mov(x86::rax, x86::qword_ptr(x86::r10));
 	c.sub(args[0], 1);
 	c.jnz(begin);
-	c.xor_(x86::eax, x86::eax);
+	c.mov(x86::eax, 1);
 	c.ret();
 });
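The two TSX helpers above also change their return conventions. As far as the new call sites indicate, spu_putllc_tx now reports three states instead of a bool, and spu_getll_tx returns the captured timestamp directly, reusing bit 0 (never set in a committed stamp) as its failure sentinel. A sketch of the assumed contract:

```cpp
#include <cstdint>

// Assumed contract of the reworked helpers (inferred from the call sites below).
enum spu_putllc_status : std::uint32_t
{
	putllc_failure = 0, // data/timestamp mismatch: reservation is gone
	putllc_success = 1, // transaction committed, store performed
	putllc_retry   = 2, // abort budget exhausted: caller yields and retries
};

// getll: an even return value is the captured reservation timestamp (bit 0
// is the lock flag, never set in a committed stamp); an odd value means the
// transaction never committed within the retry budget.
inline bool spu_getll_failed(std::uint64_t ret)
{
	return (ret & 1) != 0;
}
```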
@@ -314,7 +344,7 @@ const auto spu_putlluc_tx = build_function_asm<bool(*)(u32 raddr, const void* rd
 	c.vmovaps(x86::yword_ptr(x86::r11, 32), x86::ymm1);
 	c.vmovaps(x86::yword_ptr(x86::r11, 64), x86::ymm2);
 	c.vmovaps(x86::yword_ptr(x86::r11, 96), x86::ymm3);
-	c.add(x86::qword_ptr(x86::r10), 1);
+	c.add(x86::qword_ptr(x86::r10), 2);
 	c.xend();
 	c.vzeroupper();
 	c.mov(x86::eax, 1);
@@ -767,8 +797,8 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 		}
 	}
-	void* dst = vm::base(eal);
-	void* src = vm::base(offset + lsa);
+	u8* dst = (u8*)vm::base(eal);
+	u8* src = (u8*)vm::base(offset + lsa);
 	if (UNLIKELY(!is_get && !g_use_rtm))
 	{
@@ -777,65 +807,72 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 		case 1:
 		{
 			auto& res = vm::reservation_lock(eal, 1);
-			*static_cast<u8*>(dst) = *static_cast<const u8*>(src);
-			res &= ~1ull;
+			*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
+			res++;
 			break;
 		}
 		case 2:
 		{
 			auto& res = vm::reservation_lock(eal, 2);
-			*static_cast<u16*>(dst) = *static_cast<const u16*>(src);
-			res &= ~1ull;
+			*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
+			res++;
 			break;
 		}
 		case 4:
 		{
 			auto& res = vm::reservation_lock(eal, 4);
-			*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
-			res &= ~1ull;
+			*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
+			res++;
 			break;
 		}
 		case 8:
 		{
 			auto& res = vm::reservation_lock(eal, 8);
-			*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
-			res &= ~1ull;
-			break;
-		}
-		case 16:
-		{
-			auto& res = vm::reservation_lock(eal, 16);
-			_mm_store_si128(static_cast<__m128i*>(dst), _mm_load_si128(static_cast<const __m128i*>(src)));
-			res &= ~1ull;
+			*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
+			res++;
 			break;
 		}
 		default:
 		{
-			auto* res = &vm::reservation_lock(eal, 16);
-			auto vdst = static_cast<__m128i*>(dst);
-			auto vsrc = static_cast<const __m128i*>(src);
-			for (u32 addr = eal, end = eal + size;; vdst++, vsrc++)
+			if (((eal & 127) + size) <= 128)
 			{
-				_mm_store_si128(vdst, _mm_load_si128(vsrc));
-				addr += 16;
-				if (addr == end)
+				// Lock one cache line
+				auto& res = vm::reservation_lock(eal, 128);
+				while (size)
 				{
-					break;
+					*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
+					dst += 16;
+					src += 16;
+					size -= 16;
 				}
-				if (addr % 128)
-				{
-					continue;
-				}
-				res->fetch_and(~1ull);
-				res = &vm::reservation_lock(addr, 16);
+				res++;
+				break;
 			}
-			res->fetch_and(~1ull);
+			auto lock = vm::passive_lock(eal & -128u, ::align(eal + size, 128));
+			while (size >= 128)
+			{
+				mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
+				dst += 128;
+				src += 128;
+				size -= 128;
+			}
+			while (size)
+			{
+				*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
+				dst += 16;
+				src += 16;
+				size -= 16;
+			}
+			*lock = 0;
 			break;
 		}
 		}
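The rewritten default case above splits non-TSX PUT transfers into two regimes: a transfer that stays inside one 128-byte reservation granule is protected by that granule's own lock bit, while anything larger takes a single range lock for the whole copy instead of the old per-16-byte relock loop. The boundary test, restated standalone (the helper name is mine; the condition is from the diff):

```cpp
#include <cstdint>

// True when [eal, eal + size) fits inside a single 128-byte granule:
// the offset within the first line plus the length must not spill over.
inline bool fits_single_granule(std::uint32_t eal, std::uint32_t size)
{
	return ((eal & 127) + size) <= 128;
}

// Otherwise the copy runs under one range lock spanning all touched lines,
// [eal & -128u, align(eal + size, 128)), released by "*lock = 0".
```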
@@ -852,67 +889,44 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 	{
 	case 1:
 	{
-		*static_cast<u8*>(dst) = *static_cast<const u8*>(src);
+		*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
 		break;
 	}
 	case 2:
 	{
-		*static_cast<u16*>(dst) = *static_cast<const u16*>(src);
+		*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
 		break;
 	}
 	case 4:
 	{
-		*static_cast<u32*>(dst) = *static_cast<const u32*>(src);
+		*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
 		break;
 	}
 	case 8:
 	{
-		*static_cast<u64*>(dst) = *static_cast<const u64*>(src);
-		break;
-	}
-	case 16:
-	{
-		_mm_store_si128(static_cast<__m128i*>(dst), _mm_load_si128(static_cast<const __m128i*>(src)));
+		*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
 		break;
 	}
 	default:
 	{
-		auto vdst = static_cast<__m128i*>(dst);
-		auto vsrc = static_cast<const __m128i*>(src);
-		auto vcnt = size / sizeof(__m128i);
-		while (vcnt >= 8)
+		while (size >= 128)
 		{
-			const __m128i data[]
-			{
-				_mm_load_si128(vsrc + 0),
-				_mm_load_si128(vsrc + 1),
-				_mm_load_si128(vsrc + 2),
-				_mm_load_si128(vsrc + 3),
-				_mm_load_si128(vsrc + 4),
-				_mm_load_si128(vsrc + 5),
-				_mm_load_si128(vsrc + 6),
-				_mm_load_si128(vsrc + 7),
-			};
-			_mm_store_si128(vdst + 0, data[0]);
-			_mm_store_si128(vdst + 1, data[1]);
-			_mm_store_si128(vdst + 2, data[2]);
-			_mm_store_si128(vdst + 3, data[3]);
-			_mm_store_si128(vdst + 4, data[4]);
-			_mm_store_si128(vdst + 5, data[5]);
-			_mm_store_si128(vdst + 6, data[6]);
-			_mm_store_si128(vdst + 7, data[7]);
-			vcnt -= 8;
-			vsrc += 8;
-			vdst += 8;
+			mov_rdata(reinterpret_cast<u128*>(dst), reinterpret_cast<const u128*>(src));
+			dst += 128;
+			src += 128;
+			size -= 128;
 		}
-		while (vcnt--)
+		while (size)
 		{
-			_mm_store_si128(vdst++, _mm_load_si128(vsrc++));
+			*reinterpret_cast<u128*>(dst) = *reinterpret_cast<const u128*>(src);
+			dst += 16;
+			src += 16;
+			size -= 16;
 		}
 		break;
 	}
 	}
@@ -1030,7 +1044,12 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
 	if (raddr && addr == raddr)
 	{
-		ch_event_stat |= SPU_EVENT_LR;
+		// Last check for event before we clear the reservation
+		if ((vm::reservation_acquire(addr, 128) & ~1ull) != rtime || rdata != vm::_ref<decltype(rdata)>(addr))
+		{
+			ch_event_stat |= SPU_EVENT_LR;
+		}
 		raddr = 0;
 	}
@@ -1057,20 +1076,20 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
 		auto& data = vm::_ref<decltype(rdata)>(addr);
 		auto& res = vm::reservation_lock(addr, 128);
-		vm::_ref<atomic_t<u32>>(addr) += 0;
+		*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
 		if (g_cfg.core.spu_accurate_putlluc)
 		{
 			// Full lock (heavyweight)
 			// TODO: vm::check_addr
-			vm::writer_lock lock(1);
-			data = to_write;
-			vm::reservation_update(addr, 128);
+			vm::writer_lock lock(addr);
+			mov_rdata(data.data(), to_write.data());
+			res++;
 		}
 		else
 		{
-			data = to_write;
-			vm::reservation_update(addr, 128);
+			mov_rdata(data.data(), to_write.data());
+			res++;
 		}
 	}
@@ -1140,11 +1159,7 @@ void spu_thread::do_mfc(bool wait)
 			return false;
 		}
-		if (args.size)
-		{
-			do_dma_transfer(args);
-		}
-		else if (args.cmd == MFC_PUTQLLUC_CMD)
+		if (args.cmd == MFC_PUTQLLUC_CMD)
 		{
 			if (fence & mask)
 			{
@@ -1153,6 +1168,10 @@
 			do_putlluc(args);
 		}
+		else if (args.size)
+		{
+			do_dma_transfer(args);
+		}
 		removed++;
 		return true;
@@ -1184,7 +1203,7 @@ u32 spu_thread::get_mfc_completed()
 	return ch_tag_mask & ~mfc_fence;
 }
-bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
+bool spu_thread::process_mfc_cmd()
 {
 	// Stall infinitely if MFC queue is full
 	while (UNLIKELY(mfc_size >= 16))
@@ -1198,29 +1217,24 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 		}
 	}
 	spu::scheduler::concurrent_execution_watchdog watchdog(*this);
-	LOG_TRACE(SPU, "DMAC: cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x", args.cmd, args.lsa, args.eal, args.tag, args.size);
-	switch (args.cmd)
+	LOG_TRACE(SPU, "DMAC: cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x", ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
+	switch (ch_mfc_cmd.cmd)
 	{
 	case MFC_GETLLAR_CMD:
 	{
-		const u32 addr = args.eal & -128u;
+		const u32 addr = ch_mfc_cmd.eal & -128u;
 		auto& data = vm::_ref<decltype(rdata)>(addr);
+		auto& dst = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
+		u64 ntime;
-		if (raddr && raddr != addr)
-		{
-			ch_event_stat |= SPU_EVENT_LR;
-		}
-		raddr = addr;
 		const bool is_polling = false; // TODO
 		if (is_polling)
 		{
-			rtime = vm::reservation_acquire(raddr, 128);
-			while (rdata == data && vm::reservation_acquire(raddr, 128) == rtime)
+			rtime = vm::reservation_acquire(addr, 128);
+			while (rdata == data && vm::reservation_acquire(addr, 128) == rtime)
 			{
 				if (is_stopped())
 				{
@@ -1235,57 +1249,78 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 		{
 			u64 count = 1;
-			while (g_cfg.core.spu_accurate_getllar && !spu_getll_tx(raddr, rdata.data(), &rtime))
+			if (g_cfg.core.spu_accurate_getllar)
 			{
-				std::this_thread::yield();
-				count += 2;
+				while ((ntime = spu_getll_tx(addr, dst.data())) & 1)
+				{
+					std::this_thread::yield();
+					count += 2;
+				}
 			}
-			if (!g_cfg.core.spu_accurate_getllar)
+			else
 			{
 				for (;; count++, busy_wait(300))
 				{
-					rtime = vm::reservation_acquire(raddr, 128);
-					rdata = data;
-					if (LIKELY(vm::reservation_acquire(raddr, 128) == rtime))
+					ntime = vm::reservation_acquire(addr, 128);
+					dst = data;
+					if (LIKELY(vm::reservation_acquire(addr, 128) == ntime))
 					{
 						break;
 					}
 				}
 			}
-			if (count > 9)
+			if (count > 15)
 			{
-				LOG_ERROR(SPU, "%s took too long: %u", args.cmd, count);
+				LOG_ERROR(SPU, "%s took too long: %u", ch_mfc_cmd.cmd, count);
 			}
 		}
 		else
 		{
-			auto& res = vm::reservation_lock(raddr, 128);
+			auto& res = vm::reservation_lock(addr, 128);
 			if (g_cfg.core.spu_accurate_getllar)
 			{
-				vm::_ref<atomic_t<u32>>(raddr) += 0;
+				*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
 				// Full lock (heavyweight)
 				// TODO: vm::check_addr
-				vm::writer_lock lock(1);
-				rtime = res & ~1ull;
-				rdata = data;
+				vm::writer_lock lock(addr);
+				ntime = res & ~1ull;
+				mov_rdata(dst.data(), data.data());
 				res &= ~1ull;
 			}
 			else
 			{
-				rtime = res & ~1ull;
-				rdata = data;
+				ntime = res & ~1ull;
+				mov_rdata(dst.data(), data.data());
 				res &= ~1ull;
 			}
 		}
-		// Copy to LS
-		_ref<decltype(rdata)>(args.lsa & 0x3ff80) = rdata;
+		if (const u32 _addr = raddr)
+		{
+			// Last check for event before we replace the reservation with a new one
+			if ((vm::reservation_acquire(_addr, 128) & ~1ull) != rtime || rdata != vm::_ref<decltype(rdata)>(_addr))
+			{
+				ch_event_stat |= SPU_EVENT_LR;
+				if (_addr == addr)
+				{
+					// Lost current reservation
+					raddr = 0;
+					ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
+					return true;
+				}
+			}
+		}
+		raddr = addr;
+		rtime = ntime;
+		mov_rdata(rdata.data(), dst.data());
 		ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
 		return true;
 	}
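The new GETLLAR tail encodes the hardware behaviour named in the commit message: the LR event is raised only if the previous reservation was genuinely lost (its timestamp moved, lock bit masked off, or its data changed), and when the lost reservation is the very address being re-acquired, the command completes without installing the fresh copy. The shared predicate, restated as a standalone sketch (the signature is mine):

```cpp
#include <cstdint>
#include <cstring>

// "Reservation lost" test used by the GETLLAR/PUTLLC/PUTLLUC paths in this
// file: compare the current reservation word (lock bit masked) against the
// saved rtime, and the saved 128-byte rdata snapshot against current memory.
inline bool reservation_lost(std::uint64_t res_now, std::uint64_t rtime,
                             const void* rdata, const void* mem)
{
	return (res_now & ~1ull) != rtime || std::memcmp(rdata, mem, 128) != 0;
}
```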
@@ -1293,40 +1328,50 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 	case MFC_PUTLLC_CMD:
 	{
 		// Store conditionally
-		const u32 addr = args.eal & -128u;
-		bool result = false;
-		if (raddr == addr && rtime == vm::reservation_acquire(raddr, 128))
+		const u32 addr = ch_mfc_cmd.eal & -128u;
+		u32 result = 0;
+		if (raddr == addr && rtime == (vm::reservation_acquire(raddr, 128) & ~1ull))
 		{
-			const auto& to_write = _ref<decltype(rdata)>(args.lsa & 0x3ff80);
+			const auto& to_write = _ref<decltype(rdata)>(ch_mfc_cmd.lsa & 0x3ff80);
 			if (LIKELY(g_use_rtm))
 			{
-				if (spu_putllc_tx(raddr, rtime, rdata.data(), to_write.data()))
+				while (true)
 				{
-					vm::reservation_notifier(raddr, 128).notify_all();
-					result = true;
-				}
-				// Don't fallback to heavyweight lock, just give up
+					result = spu_putllc_tx(addr, rtime, rdata.data(), to_write.data());
+					if (result < 2)
+					{
+						break;
+					}
+					// Retry
+					std::this_thread::yield();
+				}
 			}
 			else if (auto& data = vm::_ref<decltype(rdata)>(addr); rdata == data)
 			{
 				auto& res = vm::reservation_lock(raddr, 128);
-				vm::_ref<atomic_t<u32>>(raddr) += 0;
-				// Full lock (heavyweight)
-				// TODO: vm::check_addr
-				vm::writer_lock lock(1);
-				if (rtime == (res & ~1ull) && rdata == data)
+				if (rtime == (res & ~1ull))
 				{
-					data = to_write;
-					vm::reservation_update(raddr, 128);
-					vm::reservation_notifier(raddr, 128).notify_all();
-					result = true;
+					*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
+					// Full lock (heavyweight)
+					// TODO: vm::check_addr
+					vm::writer_lock lock(addr);
+					if (rdata == data)
+					{
+						mov_rdata(data.data(), to_write.data());
+						res++;
+						result = 1;
+					}
+					else
+					{
+						res &= ~1ull;
+					}
 				}
 				else
 				{
@@ -1337,16 +1382,21 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 		if (result)
 		{
+			vm::reservation_notifier(addr, 128).notify_all();
 			ch_atomic_stat.set_value(MFC_PUTLLC_SUCCESS);
 		}
 		else
 		{
-			ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
-		}
-		if (raddr && !result)
-		{
-			ch_event_stat |= SPU_EVENT_LR;
+			if (raddr)
+			{
+				// Last check for event before we clear the reservation
+				if (raddr == addr || rtime != (vm::reservation_acquire(raddr, 128) & ~1ull) || rdata != vm::_ref<decltype(rdata)>(raddr))
+				{
+					ch_event_stat |= SPU_EVENT_LR;
+				}
+			}
+			ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
 		}
 		raddr = 0;
@@ -1354,23 +1404,22 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 	}
 	case MFC_PUTLLUC_CMD:
 	{
-		do_putlluc(args);
+		do_putlluc(ch_mfc_cmd);
 		ch_atomic_stat.set_value(MFC_PUTLLUC_SUCCESS);
 		return true;
 	}
 	case MFC_PUTQLLUC_CMD:
 	{
-		const u32 mask = utils::rol32(1, args.tag);
+		const u32 mask = utils::rol32(1, ch_mfc_cmd.tag);
 		if (UNLIKELY((mfc_barrier | mfc_fence) & mask))
 		{
-			args.size = 0;
-			mfc_queue[mfc_size++] = args;
+			mfc_queue[mfc_size++] = ch_mfc_cmd;
 			mfc_fence |= mask;
 		}
 		else
 		{
-			do_putlluc(args);
+			do_putlluc(ch_mfc_cmd);
 		}
 		return true;
@@ -1379,7 +1428,11 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 	case MFC_SNDSIGB_CMD:
 	case MFC_SNDSIGF_CMD:
 	{
-		args.size = 4;
+		if (ch_mfc_cmd.size != 4)
+		{
+			break;
+		}
 		// Fallthrough
 	}
 	case MFC_PUT_CMD:
@@ -1392,24 +1445,24 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 	case MFC_GETB_CMD:
 	case MFC_GETF_CMD:
 	{
-		if (LIKELY(args.size <= 0x4000))
+		if (LIKELY(ch_mfc_cmd.size <= 0x4000))
 		{
-			if (LIKELY(do_dma_check(args)))
+			if (LIKELY(do_dma_check(ch_mfc_cmd)))
 			{
-				if (LIKELY(args.size))
+				if (ch_mfc_cmd.size)
 				{
-					do_dma_transfer(args);
+					do_dma_transfer(ch_mfc_cmd);
 				}
 				return true;
 			}
-			mfc_queue[mfc_size++] = args;
-			mfc_fence |= utils::rol32(1, args.tag);
-			if (args.cmd & MFC_BARRIER_MASK)
+			mfc_queue[mfc_size++] = ch_mfc_cmd;
+			mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag);
+			if (ch_mfc_cmd.cmd & MFC_BARRIER_MASK)
 			{
-				mfc_barrier |= utils::rol32(1, args.tag);
+				mfc_barrier |= utils::rol32(1, ch_mfc_cmd.tag);
 			}
 			return true;
@@ -1427,22 +1480,25 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 	case MFC_GETLB_CMD:
 	case MFC_GETLF_CMD:
 	{
-		if (LIKELY(args.size <= 0x4000))
+		if (LIKELY(ch_mfc_cmd.size <= 0x4000))
 		{
-			if (LIKELY(do_dma_check(args)))
+			auto& cmd = mfc_queue[mfc_size];
+			cmd = ch_mfc_cmd;
+			if (LIKELY(do_dma_check(cmd)))
 			{
-				if (LIKELY(do_list_transfer(args)))
+				if (LIKELY(do_list_transfer(cmd)))
 				{
 					return true;
 				}
 			}
-			mfc_queue[mfc_size++] = args;
-			mfc_fence |= utils::rol32(1, args.tag);
-			if (args.cmd & MFC_BARRIER_MASK)
+			mfc_size++;
+			mfc_fence |= utils::rol32(1, cmd.tag);
+			if (cmd.cmd & MFC_BARRIER_MASK)
 			{
-				mfc_barrier |= utils::rol32(1, args.tag);
+				mfc_barrier |= utils::rol32(1, cmd.tag);
 			}
 			return true;
@@ -1460,7 +1516,7 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 	}
 	else
 	{
-		mfc_queue[mfc_size++] = args;
+		mfc_queue[mfc_size++] = ch_mfc_cmd;
 		mfc_barrier |= -1;
 	}
@@ -1473,7 +1529,7 @@ bool spu_thread::process_mfc_cmd(spu_mfc_cmd args)
 	}
 	fmt::throw_exception("Unknown command (cmd=%s, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)" HERE,
-		args.cmd, args.lsa, args.eal, args.tag, args.size);
+		ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
 }
 u32 spu_thread::get_events(bool waiting)
@@ -1486,7 +1542,7 @@ u32 spu_thread::get_events(bool waiting)
 	}
 	// Check reservation status and set SPU_EVENT_LR if lost
-	if (raddr && (vm::reservation_acquire(raddr, sizeof(rdata)) != rtime || rdata != vm::_ref<decltype(rdata)>(raddr)))
+	if (raddr && ((vm::reservation_acquire(raddr, sizeof(rdata)) & ~1ull) != rtime || rdata != vm::_ref<decltype(rdata)>(raddr)))
 	{
 		ch_event_stat |= SPU_EVENT_LR;
 		raddr = 0;
@@ -2026,7 +2082,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value)
 	case MFC_Cmd:
 	{
 		ch_mfc_cmd.cmd = MFC(value & 0xff);
-		return process_mfc_cmd(ch_mfc_cmd);
+		return process_mfc_cmd();
 	}
 	case MFC_WrListStallAck:

View file

@@ -1,8 +1,9 @@
 #pragma once
 #include "Emu/Cell/Common.h"
 #include "Emu/CPU/CPUThread.h"
 #include "Emu/Cell/SPUInterpreter.h"
+#include "Emu/Memory/vm.h"
 #include "MFC.h"
 #include <map>
@@ -595,7 +596,7 @@ public:
 	void do_mfc(bool wait = true);
 	u32 get_mfc_completed();
-	bool process_mfc_cmd(spu_mfc_cmd args);
+	bool process_mfc_cmd();
 	u32 get_events(bool waiting = false);
 	void set_events(u32 mask);
 	void set_interrupt_status(bool enable);

View file

@@ -57,8 +57,12 @@ namespace vm
	// Memory mutex acknowledgement
 	thread_local atomic_t<cpu_thread*>* g_tls_locked = nullptr;
+	// Currently locked address
+	atomic_t<u32> g_addr_lock = 0;
 	// Memory mutex: passive locks
-	std::array<atomic_t<cpu_thread*>, 32> g_locks;
+	std::array<atomic_t<cpu_thread*>, 4> g_locks{};
+	std::array<atomic_t<u64>, 6> g_range_locks{};
 	static void _register_lock(cpu_thread* _cpu)
 	{
@@ -72,11 +76,25 @@ namespace vm
 		}
 	}
-	bool passive_lock(cpu_thread& cpu, bool wait)
+	static atomic_t<u64>* _register_range_lock(const u64 lock_info)
+	{
+		while (true)
+		{
+			for (auto& lock : g_range_locks)
+			{
+				if (!lock && lock.compare_and_swap_test(0, lock_info))
+				{
+					return &lock;
+				}
+			}
+		}
+	}
+
+	void passive_lock(cpu_thread& cpu)
 	{
 		if (UNLIKELY(g_tls_locked && *g_tls_locked == &cpu))
 		{
-			return true;
+			return;
 		}
 		if (LIKELY(g_mutex.is_lockable()))
@@ -84,31 +102,46 @@ namespace vm
 			// Optimistic path (hope that mutex is not exclusively locked)
 			_register_lock(&cpu);
-			if (UNLIKELY(!g_mutex.is_lockable()))
+			if (LIKELY(g_mutex.is_lockable()))
 			{
-				passive_unlock(cpu);
-				if (!wait)
-				{
-					return false;
-				}
-				::reader_lock lock(g_mutex);
-				_register_lock(&cpu);
+				return;
 			}
+			passive_unlock(cpu);
 		}
-		else
+		::reader_lock lock(g_mutex);
+		_register_lock(&cpu);
+	}
+
+	atomic_t<u64>* passive_lock(const u32 addr, const u32 end)
+	{
+		static const auto test_addr = [](const u32 target, const u32 addr, const u32 end)
 		{
-			if (!wait)
+			return addr > target || end <= target;
+		};
+
+		atomic_t<u64>* _ret;
+
+		if (LIKELY(test_addr(g_addr_lock.load(), addr, end)))
+		{
+			// Optimistic path (hope that address range is not locked)
+			_ret = _register_range_lock((u64)end << 32 | addr);
+
+			if (LIKELY(test_addr(g_addr_lock.load(), addr, end)))
 			{
-				return false;
+				return _ret;
 			}
-			::reader_lock lock(g_mutex);
-			_register_lock(&cpu);
+			*_ret = 0;
 		}
-		return true;
+		{
+			::reader_lock lock(g_mutex);
+			_ret = _register_range_lock((u64)end << 32 | addr);
+		}
+
+		return _ret;
 	}
 	void passive_unlock(cpu_thread& cpu)
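A range lock packs its whole claim into one u64 slot, end in the upper half and begin in the lower, so a writer can test overlap against a single atomic load. A standalone restatement of the encoding and of test_addr's non-overlap predicate (helper names are mine):

```cpp
#include <cstdint>

// One range-lock slot: upper 32 bits = end, lower 32 bits = begin.
inline std::uint64_t encode_range_lock(std::uint32_t begin, std::uint32_t end)
{
	return (std::uint64_t{end} << 32) | begin;
}

// test_addr above: the writer's locked address does not conflict with the
// range when it lies before begin or at/after end, i.e. outside [begin, end).
inline bool range_is_clear(std::uint32_t target, std::uint32_t begin, std::uint32_t end)
{
	return begin > target || end <= target;
}
```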
@@ -194,8 +227,7 @@ namespace vm
 		m_upgraded = true;
 	}
-	writer_lock::writer_lock(int full)
-		: locked(true)
+	writer_lock::writer_lock(u32 addr)
 	{
 		auto cpu = get_current_cpu_thread();
@@ -206,7 +238,7 @@ namespace vm
 		g_mutex.lock();
-		if (full)
+		if (addr)
 		{
 			for (auto& lock : g_locks)
 			{
@@ -216,6 +248,30 @@ namespace vm
 			}
 		}
+		g_addr_lock = addr;
+
+		for (auto& lock : g_range_locks)
+		{
+			while (true)
+			{
+				const u64 value = lock;
+
+				// Test beginning address
+				if (static_cast<u32>(value) > addr)
+				{
+					break;
+				}
+
+				// Test end address
+				if (static_cast<u32>(value >> 32) <= addr)
+				{
+					break;
+				}
+
+				_mm_pause();
+			}
+		}
+
 		for (auto& lock : g_locks)
 		{
 			while (cpu_thread* ptr = lock)
@@ -225,7 +281,7 @@ namespace vm
 					break;
 				}
-				busy_wait();
+				_mm_pause();
 			}
 		}
 	}
@@ -239,10 +295,8 @@ namespace vm
 	writer_lock::~writer_lock()
 	{
-		if (locked)
-		{
-			g_mutex.unlock();
-		}
+		g_addr_lock.raw() = 0;
+		g_mutex.unlock();
 	}
 	void reservation_lock_internal(atomic_t<u64>& res)

View file

@@ -53,7 +53,8 @@ namespace vm
 	extern thread_local atomic_t<cpu_thread*>* g_tls_locked;
 	// Register reader
-	bool passive_lock(cpu_thread& cpu, bool wait = true);
+	void passive_lock(cpu_thread& cpu);
+	atomic_t<u64>* passive_lock(const u32 begin, const u32 end);
 	// Unregister reader
 	void passive_unlock(cpu_thread& cpu);
@@ -80,14 +81,10 @@ namespace vm
 	struct writer_lock final
 	{
-		const bool locked;
-
 		writer_lock(const writer_lock&) = delete;
 		writer_lock& operator=(const writer_lock&) = delete;
-		writer_lock(int full);
+		writer_lock(u32 addr = 0);
 		~writer_lock();
-
-		explicit operator bool() const { return locked; }
 	};
 	// Get reservation status for further atomic update: last update timestamp
@@ -101,7 +98,7 @@ namespace vm
 	inline void reservation_update(u32 addr, u32 size, bool lsb = false)
 	{
 		// Update reservation info with new timestamp
-		reservation_acquire(addr, size) = (__rdtsc() << 1) | u64{lsb};
+		reservation_acquire(addr, size) += 2;
 	}
 	// Get reservation sync variable
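Replacing the RDTSC stamp with += 2 (the commit's "Don't use RDTSC in reservation update logic") keeps bit 0 free for the lock flag and makes the version strictly increasing; TSC values are not guaranteed monotonic across cores, so a later update could otherwise publish an older stamp and let a stale reservation compare equal again. The whole update reduces to one uncontended RMW, sketched under the same assumed encoding as above:

```cpp
#include <atomic>
#include <cstdint>

// New-style update: bump the even version counter, leaving bit 0 (the lock
// flag) clear. Monotonic by construction, no timer read on the hot path.
inline void reservation_bump(std::atomic<std::uint64_t>& res)
{
	res.fetch_add(2, std::memory_order_release);
}
```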

View file

@@ -354,7 +354,7 @@ struct cfg_root : cfg::node
 	node_core(cfg::node* _this) : cfg::node(_this, "Core") {}
 	cfg::_enum<ppu_decoder_type> ppu_decoder{this, "PPU Decoder", ppu_decoder_type::llvm};
-	cfg::_int<1, 16> ppu_threads{this, "PPU Threads", 2}; // Amount of PPU threads running simultaneously (must be 2)
+	cfg::_int<1, 4> ppu_threads{this, "PPU Threads", 2}; // Amount of PPU threads running simultaneously (must be 2)
 	cfg::_bool ppu_debug{this, "PPU Debug"};
 	cfg::_bool llvm_logs{this, "Save LLVM logs"};
 	cfg::string llvm_cpu{this, "Use LLVM CPU"};