SPU LLVM: LS Memory Mirrors (Optimize loads/stores)

This commit is contained in:
Eladash 2020-07-15 21:57:39 +03:00 committed by Ivan
parent c1a80b8146
commit af1ceb1151
5 changed files with 104 additions and 31 deletions

View file

@ -173,7 +173,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
{ {
case MFC_LSA_offs: case MFC_LSA_offs:
{ {
if (value >= 0x40000) if (value >= SPU_LS_SIZE)
{ {
break; break;
} }
@ -321,7 +321,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
void spu_load_exec(const spu_exec_object& elf) void spu_load_exec(const spu_exec_object& elf)
{ {
auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x80000, vm::spu)); auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, SPU_LS_SIZE, vm::spu));
auto spu = idm::make_ptr<named_thread<spu_thread>>("TEST_SPU", ls0, nullptr, 0, "", 0); auto spu = idm::make_ptr<named_thread<spu_thread>>("TEST_SPU", ls0, nullptr, 0, "", 0);
spu_thread::g_raw_spu_ctr++; spu_thread::g_raw_spu_ctr++;
@ -331,7 +331,7 @@ void spu_load_exec(const spu_exec_object& elf)
{ {
if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz) if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz)
{ {
std::memcpy(vm::base(spu->offset + prog.p_vaddr), prog.bin.data(), prog.p_filesz); std::memcpy(spu->_ptr<void>(prog.p_vaddr), prog.bin.data(), prog.p_filesz);
} }
} }

View file

@ -272,8 +272,7 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, sp
// Tail call, GHC CC (second arg) // Tail call, GHC CC (second arg)
c.mov(x86::r13, args[0]); c.mov(x86::r13, args[0]);
c.mov(x86::ebp, x86::dword_ptr(args[0], ::offset32(&spu_thread::offset))); c.mov(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::ls)));
c.add(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::memory_base_addr)));
c.mov(x86::r12, args[2]); c.mov(x86::r12, args[2]);
c.xor_(x86::ebx, x86::ebx); c.xor_(x86::ebx, x86::ebx);
c.jmp(args[1]); c.jmp(args[1]);
@ -1138,7 +1137,7 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
} }
// Find function // Find function
const auto func = spu.jit->get_runtime().find(static_cast<u32*>(vm::base(spu.offset)), spu.pc); const auto func = spu.jit->get_runtime().find(static_cast<u32*>(spu._ptr<void>(0)), spu.pc);
if (!func) if (!func)
{ {
@ -7902,13 +7901,51 @@ public:
// STQX: stores register rt (as u8[16]) through make_store_ls at an address
// built from element 3 of registers ra and rb.
void STQX(spu_opcode_t op) void STQX(spu_opcode_t op)
{ {
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3)) & 0x3fff0)); const auto a = get_vr(op.ra);
const auto b = get_vr(op.rb);
// Constant-operand path: try (a, b) first, then (b, a), looking for an operand
// whose value is an llvm::Constant.
for (auto pair : std::initializer_list<std::pair<value_t<u32[4]>, value_t<u32[4]>>>{{a, b}, {b, a}})
{
if (auto cv = llvm::dyn_cast<llvm::Constant>(pair.first.value))
{
// NOTE(review): the meaning of the trailing 10000 argument is not visible here.
v128 data = get_const_vector(cv, m_pos, 10000);
// Wrap element 3 of the constant into the LS size.
data._u32[3] %= SPU_LS_SIZE;
// Only take this path when the constant part is a multiple of 16.
if (data._u32[3] % 0x10 == 0)
{
// The constant is added as a 64-bit splat and only the other operand is
// masked with 0x3fff0; the sum itself is not masked again.
// NOTE(review): presumably relies on LS being mirrored past its end so the
// un-wrapped sum still lands on valid LS data — confirm.
value_t<u64> addr = eval(splat<u64>(data._u32[3]) + zext<u64>(extract(pair.second, 3) & 0x3fff0));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
return;
}
}
}
// General path: add element 3 of both operands, mask the 32-bit sum with
// 0x3fff0, then zero-extend to 64 bits.
value_t<u64> addr = eval(zext<u64>((extract(a, 3) + extract(b, 3)) & 0x3fff0));
make_store_ls(addr, get_vr<u8[16]>(op.rt)); make_store_ls(addr, get_vr<u8[16]>(op.rt));
} }
// LQX: sets register rt to the result of make_load_ls at an address built
// from element 3 of registers ra and rb.
void LQX(spu_opcode_t op) void LQX(spu_opcode_t op)
{ {
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3)) & 0x3fff0)); const auto a = get_vr(op.ra);
const auto b = get_vr(op.rb);
// Constant-operand path: try (a, b) first, then (b, a), looking for an operand
// whose value is an llvm::Constant.
for (auto pair : std::initializer_list<std::pair<value_t<u32[4]>, value_t<u32[4]>>>{{a, b}, {b, a}})
{
if (auto cv = llvm::dyn_cast<llvm::Constant>(pair.first.value))
{
// NOTE(review): the meaning of the trailing 10000 argument is not visible here.
v128 data = get_const_vector(cv, m_pos, 10000);
// Wrap element 3 of the constant into the LS size.
data._u32[3] %= SPU_LS_SIZE;
// Only take this path when the constant part is a multiple of 16.
if (data._u32[3] % 0x10 == 0)
{
// The constant is added as a 64-bit splat and only the other operand is
// masked with 0x3fff0; the sum itself is not masked again.
// NOTE(review): presumably relies on LS being mirrored past its end so the
// un-wrapped sum still lands on valid LS data — confirm.
value_t<u64> addr = eval(splat<u64>(data._u32[3]) + zext<u64>(extract(pair.second, 3) & 0x3fff0));
set_vr(op.rt, make_load_ls(addr));
return;
}
}
}
// General path: add element 3 of both operands, mask the 32-bit sum with
// 0x3fff0, then zero-extend to 64 bits.
value_t<u64> addr = eval(zext<u64>((extract(a, 3) + extract(b, 3)) & 0x3fff0));
set_vr(op.rt, make_load_ls(addr)); set_vr(op.rt, make_load_ls(addr));
} }
@ -7928,7 +7965,7 @@ public:
{ {
value_t<u64> addr; value_t<u64> addr;
addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type<u64>()); addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type<u64>());
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & 0x3fff0); addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & (m_interp_magn ? 0x3fff0 : ~0xf));
make_store_ls(addr, get_vr<u8[16]>(op.rt)); make_store_ls(addr, get_vr<u8[16]>(op.rt));
} }
@ -7936,7 +7973,7 @@ public:
{ {
value_t<u64> addr; value_t<u64> addr;
addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type<u64>()); addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type<u64>());
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & 0x3fff0); addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & (m_interp_magn ? 0x3fff0 : ~0xf));
set_vr(op.rt, make_load_ls(addr)); set_vr(op.rt, make_load_ls(addr));
} }
@ -7953,13 +7990,13 @@ public:
} }
} }
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + (get_imm<u32>(op.si10) << 4)) & 0x3fff0)); value_t<u64> addr = eval(zext<u64>(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
make_store_ls(addr, get_vr<u8[16]>(op.rt)); make_store_ls(addr, get_vr<u8[16]>(op.rt));
} }
void LQD(spu_opcode_t op) void LQD(spu_opcode_t op)
{ {
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + (get_imm<u32>(op.si10) << 4)) & 0x3fff0)); value_t<u64> addr = eval(zext<u64>(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
set_vr(op.rt, make_load_ls(addr)); set_vr(op.rt, make_load_ls(addr));
} }

View file

@ -2,6 +2,7 @@
#include "Utilities/JIT.h" #include "Utilities/JIT.h"
#include "Utilities/asm.h" #include "Utilities/asm.h"
#include "Utilities/sysinfo.h" #include "Utilities/sysinfo.h"
#include "Emu/Memory/vm.h"
#include "Emu/Memory/vm_ptr.h" #include "Emu/Memory/vm_ptr.h"
#include "Emu/Memory/vm_reservation.h" #include "Emu/Memory/vm_reservation.h"
@ -1111,7 +1112,7 @@ void spu_thread::cpu_task()
continue; continue;
} }
spu_runtime::g_gateway(*this, vm::_ptr<u8>(offset), nullptr); spu_runtime::g_gateway(*this, _ptr<u8>(0), nullptr);
} }
// Print some stats // Print some stats
@ -1129,7 +1130,7 @@ void spu_thread::cpu_task()
break; break;
} }
spu_runtime::g_interpreter(*this, vm::_ptr<u8>(offset), nullptr); spu_runtime::g_interpreter(*this, _ptr<u8>(0), nullptr);
} }
} }
@ -1148,8 +1149,21 @@ void spu_thread::cpu_unmem()
spu_thread::~spu_thread() spu_thread::~spu_thread()
{ {
// Deallocate Local Storage {
vm::dealloc_verbose_nothrow(offset); const auto [_, shm] = vm::get(vm::any, offset)->get(offset);
for (s32 i = -1; i < 2; i++)
{
// Unmap LS mirrors
shm->unmap_critical(ls + (i * SPU_LS_SIZE));
}
// Deallocate Local Storage
vm::dealloc_verbose_nothrow(offset);
}
// Release LS mirrors area
utils::memory_release(ls - SPU_LS_SIZE, SPU_LS_SIZE * 3);
// Deallocate RawSPU ID // Deallocate RawSPU ID
if (!group && offset >= RAW_SPU_BASE_ADDR) if (!group && offset >= RAW_SPU_BASE_ADDR)
@ -1159,11 +1173,26 @@ spu_thread::~spu_thread()
} }
} }
spu_thread::spu_thread(vm::addr_t ls, lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated) spu_thread::spu_thread(vm::addr_t _ls, lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated)
: cpu_thread(idm::last_id()) : cpu_thread(idm::last_id())
, is_isolated(is_isolated) , is_isolated(is_isolated)
, index(index) , index(index)
, offset(ls) , offset(_ls)
, ls([&]()
{
const auto [_, shm] = vm::get(vm::any, _ls)->get(_ls);
const auto addr = static_cast<u8*>(utils::memory_reserve(SPU_LS_SIZE * 3));
for (u32 i = 0; i < 3; i++)
{
// Map LS mirrors
const auto ptr = addr + (i * SPU_LS_SIZE);
verify(HERE), shm->map_critical(ptr) == ptr;
}
// Use the middle mirror
return addr + SPU_LS_SIZE;
}())
, group(group) , group(group)
, lv2_id(lv2_id) , lv2_id(lv2_id)
, spu_tname(stx::shared_cptr<std::string>::make(name)) , spu_tname(stx::shared_cptr<std::string>::make(name))
@ -1233,7 +1262,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
} }
u32 value; u32 value;
if ((eal - RAW_SPU_BASE_ADDR) % RAW_SPU_OFFSET + args.size - 1 < 0x40000) // LS access if ((eal - RAW_SPU_BASE_ADDR) % RAW_SPU_OFFSET + args.size - 1 < SPU_LS_SIZE) // LS access
{ {
} }
else if (args.size == 4 && is_get && thread->read_reg(eal, value)) else if (args.size == 4 && is_get && thread->read_reg(eal, value))
@ -1258,7 +1287,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{ {
auto& spu = static_cast<spu_thread&>(*group->threads[group->threads_map[index]]); auto& spu = static_cast<spu_thread&>(*group->threads[group->threads_map[index]]);
if (offset + args.size - 1 < 0x40000) // LS access if (offset + args.size - 1 < SPU_LS_SIZE) // LS access
{ {
eal = spu.offset + offset; // redirect access eal = spu.offset + offset; // redirect access
} }
@ -1282,7 +1311,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
auto [dst, src] = [&]() -> std::pair<u8*, const u8*> auto [dst, src] = [&]() -> std::pair<u8*, const u8*>
{ {
u8* dst = vm::_ptr<u8>(eal); u8* dst = vm::_ptr<u8>(eal);
u8* src = vm::_ptr<u8>(offset + lsa); u8* src = _ptr<u8>(lsa);
if (is_get) if (is_get)
{ {
@ -1638,6 +1667,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
transfer.cmd = MFC(args.cmd & ~MFC_LIST_MASK); transfer.cmd = MFC(args.cmd & ~MFC_LIST_MASK);
args.lsa &= 0x3fff0; args.lsa &= 0x3fff0;
args.eal &= 0x3fff8;
u32 index = fetch_size; u32 index = fetch_size;
@ -1650,7 +1680,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
// Reset to elements array head // Reset to elements array head
index = 0; index = 0;
const auto src = _ptr<const void>(args.eal & 0x3fff8); const auto src = _ptr<const void>(args.eal);
const v128 data0 = v128::loadu(src, 0); const v128 data0 = v128::loadu(src, 0);
const v128 data1 = v128::loadu(src, 1); const v128 data1 = v128::loadu(src, 1);
const v128 data2 = v128::loadu(src, 2); const v128 data2 = v128::loadu(src, 2);
@ -2947,7 +2977,7 @@ bool spu_thread::stop_and_signal(u32 code)
spu_log.warning("STOP 0x0"); spu_log.warning("STOP 0x0");
// HACK: find an ILA instruction // HACK: find an ILA instruction
for (u32 addr = pc; addr < 0x40000; addr += 4) for (u32 addr = pc; addr < SPU_LS_SIZE; addr += 4)
{ {
const u32 instr = _ref<u32>(addr); const u32 instr = _ref<u32>(addr);

View file

@ -118,6 +118,11 @@ enum : u32
SPU_STATUS_IS_ISOLATED = 0x80, SPU_STATUS_IS_ISOLATED = 0x80,
}; };
// Size of one SPU Local Storage (LS) area in bytes: 0x40000 = 256 KiB.
// NOTE(review): the underlying type is signed, presumably so that expressions
// such as `i * SPU_LS_SIZE` with a negative `i` yield a negative offset — confirm.
enum : s32
{
SPU_LS_SIZE = 0x40000,
};
enum : u32 enum : u32
{ {
SYS_SPU_THREAD_BASE_LOW = 0xf0000000, SYS_SPU_THREAD_BASE_LOW = 0xf0000000,
@ -636,6 +641,7 @@ public:
const u32 index; // SPU index const u32 index; // SPU index
const u32 offset; // SPU LS offset const u32 offset; // SPU LS offset
const std::add_pointer_t<u8> ls; // SPU LS pointer
private: private:
lv2_spu_group* const group; // SPU Thread Group (only safe to access in the spu thread itself) lv2_spu_group* const group; // SPU Thread Group (only safe to access in the spu thread itself)
public: public:
@ -682,7 +688,7 @@ public:
template<typename T> template<typename T>
inline to_be_t<T>* _ptr(u32 lsa) inline to_be_t<T>* _ptr(u32 lsa)
{ {
return static_cast<to_be_t<T>*>(vm::base(offset + lsa)); return reinterpret_cast<to_be_t<T>*>(ls + lsa);
} }
// Convert specified SPU LS address to a reference of specified (possibly converted to BE) type // Convert specified SPU LS address to a reference of specified (possibly converted to BE) type

View file

@ -397,7 +397,7 @@ error_code sys_spu_thread_initialize(ppu_thread& ppu, vm::ptr<u32> thread, u32 g
sys_spu.warning("Unimplemented SPU Thread options (0x%x)", option); sys_spu.warning("Unimplemented SPU Thread options (0x%x)", option);
} }
const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x80000, vm::main))}; const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(SPU_LS_SIZE, vm::main))};
const u32 inited = group->init; const u32 inited = group->init;
@ -579,7 +579,7 @@ error_code sys_spu_thread_group_create(ppu_thread& ppu, vm::ptr<u32> id, u32 num
if (type & SYS_SPU_THREAD_GROUP_TYPE_COOPERATE_WITH_SYSTEM) if (type & SYS_SPU_THREAD_GROUP_TYPE_COOPERATE_WITH_SYSTEM)
{ {
// Constant size, unknown what it means but it's definitely not for each spu thread alone // Constant size, unknown what it means but it's definitely not for each spu thread alone
mem_size = 0x40000; mem_size = SPU_LS_SIZE;
use_scheduler = false; use_scheduler = false;
} }
else if (type & SYS_SPU_THREAD_GROUP_TYPE_NON_CONTEXT) else if (type & SYS_SPU_THREAD_GROUP_TYPE_NON_CONTEXT)
@ -591,7 +591,7 @@ error_code sys_spu_thread_group_create(ppu_thread& ppu, vm::ptr<u32> id, u32 num
else else
{ {
// 256kb for each spu thread, probably for saving and restoring SPU LS (used by scheduler?) // 256kb for each spu thread, probably for saving and restoring SPU LS (used by scheduler?)
mem_size = 0x40000 * num; mem_size = SPU_LS_SIZE * num;
} }
if (num < min_threads || num > max_threads || if (num < min_threads || num > max_threads ||
@ -1225,7 +1225,7 @@ error_code sys_spu_thread_write_ls(ppu_thread& ppu, u32 id, u32 lsa, u64 value,
sys_spu.trace("sys_spu_thread_write_ls(id=0x%x, lsa=0x%05x, value=0x%llx, type=%d)", id, lsa, value, type); sys_spu.trace("sys_spu_thread_write_ls(id=0x%x, lsa=0x%05x, value=0x%llx, type=%d)", id, lsa, value, type);
if (lsa >= 0x40000 || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment if (lsa >= SPU_LS_SIZE || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
{ {
return CELL_EINVAL; return CELL_EINVAL;
} }
@ -1268,7 +1268,7 @@ error_code sys_spu_thread_read_ls(ppu_thread& ppu, u32 id, u32 lsa, vm::ptr<u64>
sys_spu.trace("sys_spu_thread_read_ls(id=0x%x, lsa=0x%05x, value=*0x%x, type=%d)", id, lsa, value, type); sys_spu.trace("sys_spu_thread_read_ls(id=0x%x, lsa=0x%05x, value=*0x%x, type=%d)", id, lsa, value, type);
if (lsa >= 0x40000 || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment if (lsa >= SPU_LS_SIZE || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
{ {
return CELL_EINVAL; return CELL_EINVAL;
} }
@ -1831,7 +1831,7 @@ error_code sys_raw_spu_create(ppu_thread& ppu, vm::ptr<u32> id, vm::ptr<void> at
index = 0; index = 0;
} }
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, 0x40000, vm::spu))}; const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, SPU_LS_SIZE, vm::spu))};
const u32 tid = idm::make<named_thread<spu_thread>>(fmt::format("RawSPU[0x%x] ", index), ls_addr, nullptr, index, "", index); const u32 tid = idm::make<named_thread<spu_thread>>(fmt::format("RawSPU[0x%x] ", index), ls_addr, nullptr, index, "", index);
@ -1879,7 +1879,7 @@ error_code sys_isolated_spu_create(ppu_thread& ppu, vm::ptr<u32> id, vm::ptr<voi
index = 0; index = 0;
} }
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, 0x40000, vm::spu))}; const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, SPU_LS_SIZE, vm::spu))};
const auto thread = idm::make_ptr<named_thread<spu_thread>>(fmt::format("IsoSPU[0x%x] ", index), ls_addr, nullptr, index, "", index, true); const auto thread = idm::make_ptr<named_thread<spu_thread>>(fmt::format("IsoSPU[0x%x] ", index), ls_addr, nullptr, index, "", index, true);