SPU Re: more precise jt generation

Improve analyser, set v1
Fix branch indirect conditional
This commit is contained in:
Nekotekina 2018-05-10 19:38:07 +03:00
parent 737db90058
commit be5c18cc85
4 changed files with 190 additions and 68 deletions

View file

@ -1156,10 +1156,18 @@ void spu_recompiler::branch_fixed(u32 target)
c->jmp(x86::rax); c->jmp(x86::rax);
} }
void spu_recompiler::branch_indirect(spu_opcode_t op) void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt)
{ {
using namespace asmjit; using namespace asmjit;
if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !jt)
{
// Simply external call (return or indirect call)
c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
c->xor_(qw0->r32(), qw0->r32());
}
else
{
if (!instr_table.isValid()) if (!instr_table.isValid())
{ {
// Request instruction table // Request instruction table
@ -1177,6 +1185,7 @@ void spu_recompiler::branch_indirect(spu_opcode_t op)
c->cmovae(qw1->r32(), qw0->r32()); c->cmovae(qw1->r32(), qw0->r32());
c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0)); c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher))); c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
}
if (op.d) if (op.d)
{ {
@ -2741,7 +2750,7 @@ void spu_recompiler::BI(spu_opcode_t op)
{ {
c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3)); c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
c->and_(*addr, 0x3fffc); c->and_(*addr, 0x3fffc);
branch_indirect(op); branch_indirect(op, verify(HERE, m_targets[m_pos].size()) > 2);
m_pos = -1; m_pos = -1;
} }

View file

@ -107,7 +107,7 @@ private:
asmjit::X86Mem XmmConst(__m128i data); asmjit::X86Mem XmmConst(__m128i data);
void branch_fixed(u32 target); void branch_fixed(u32 target);
void branch_indirect(spu_opcode_t op); void branch_indirect(spu_opcode_t op, bool jt = false);
void fall(spu_opcode_t op); void fall(spu_opcode_t op);
void save_rcx(); void save_rcx();
void load_rcx(); void load_rcx();

View file

@ -89,7 +89,7 @@ void spu_cache::initialize()
} }
// SPU cache file (version + block size type) // SPU cache file (version + block size type)
const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v0.dat"; const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1.dat";
auto cache = std::make_shared<spu_cache>(loc); auto cache = std::make_shared<spu_cache>(loc);
@ -272,14 +272,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
result.push_back(lsa); result.push_back(lsa);
// Initialize block entries // Initialize block entries
std::bitset<0x10000>& blocks = m_block_info; m_block_info.reset();
blocks.reset(); m_block_info.set(lsa / 4);
blocks.set(lsa / 4);
// Simple block entry workload list // Simple block entry workload list
std::vector<u32> wl; std::vector<u32> wl;
wl.push_back(lsa); wl.push_back(lsa);
m_regmod.fill(0xff);
m_targets.clear();
// Value flags (TODO) // Value flags (TODO)
enum class vf : u32 enum class vf : u32
{ {
@ -310,9 +312,9 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (target > lsa) if (target > lsa)
{ {
// Check for redundancy // Check for redundancy
if (!blocks[target / 4]) if (!m_block_info[target / 4])
{ {
blocks[target / 4] = true; m_block_info[target / 4] = true;
wl.push_back(target); wl.push_back(target);
return; return;
} }
@ -325,6 +327,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
wl[wi] += 4; wl[wi] += 4;
m_targets.erase(pos);
// Analyse instruction // Analyse instruction
switch (const auto type = s_spu_itype.decode(data)) switch (const auto type = s_spu_itype.decode(data))
{ {
@ -336,7 +340,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
case spu_itype::DFTSV: case spu_itype::DFTSV:
{ {
// Stop before invalid instructions (TODO) // Stop before invalid instructions (TODO)
blocks[pos / 4] = true; m_targets[pos].push_back(-1);
m_block_info[pos / 4] = true;
next_block(); next_block();
continue; continue;
} }
@ -349,7 +354,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (data == 0 || data == 3) if (data == 0 || data == 3)
{ {
// Stop before null data // Stop before null data
blocks[pos / 4] = true; m_targets[pos].push_back(-1);
m_block_info[pos / 4] = true;
next_block(); next_block();
continue; continue;
} }
@ -357,6 +363,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (g_cfg.core.spu_block_size != spu_block_size_type::giga) if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{ {
// Stop on special instructions (TODO) // Stop on special instructions (TODO)
m_targets[pos].push_back(-1);
next_block(); next_block();
break; break;
} }
@ -366,6 +373,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
case spu_itype::IRET: case spu_itype::IRET:
{ {
m_targets[pos].push_back(-1);
next_block(); next_block();
break; break;
} }
@ -382,6 +390,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (type == spu_itype::BISL) if (type == spu_itype::BISL)
{ {
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const; vflags[op.rt] = +vf::is_const;
values[op.rt] = pos + 4; values[op.rt] = pos + 4;
} }
@ -389,23 +398,24 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (test(af, vf::is_const)) if (test(af, vf::is_const))
{ {
const u32 target = spu_branch_target(av); const u32 target = spu_branch_target(av);
LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);
if (target == pos + 4) if (target == pos + 4)
{ {
// Nop (unless BISL) // Nop (unless BISL)
break; LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to next!", lsa, pos);
} }
m_targets[pos].push_back(target);
if (type != spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::giga) if (type != spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::giga)
{ {
LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);
add_block(target); add_block(target);
} }
if (type == spu_itype::BISL && target < lsa) if (type == spu_itype::BISL && target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
{ {
next_block(); add_block(pos + 4);
break;
} }
} }
else if (type == spu_itype::BI && !op.d && !op.e) else if (type == spu_itype::BI && !op.d && !op.e)
@ -488,6 +498,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
add_block(jt_abs[i]); add_block(jt_abs[i]);
result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_abs[i]); result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_abs[i]);
} }
m_targets.emplace(pos, std::move(jt_abs));
} }
if (jt_rel.size() >= jt_abs.size()) if (jt_rel.size() >= jt_abs.size())
@ -504,19 +516,33 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
add_block(jt_rel[i]); add_block(jt_rel[i]);
result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_rel[i] - start); result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_rel[i] - start);
} }
m_targets.emplace(pos, std::move(jt_rel));
} }
} }
} }
if (type == spu_itype::BI || type == spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::safe) if (type == spu_itype::BI || type == spu_itype::BISL)
{ {
if (type == spu_itype::BI || g_cfg.core.spu_block_size != spu_block_size_type::giga) if (type == spu_itype::BI || g_cfg.core.spu_block_size != spu_block_size_type::giga)
{ {
next_block(); if (m_targets[pos].empty())
break; {
m_targets[pos].push_back(-1);
} }
} }
else
{
add_block(pos + 4);
}
}
else
{
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
next_block();
break; break;
} }
@ -525,6 +551,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
{ {
const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : pos, op.i16); const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : pos, op.i16);
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const; vflags[op.rt] = +vf::is_const;
values[op.rt] = pos + 4; values[op.rt] = pos + 4;
@ -534,11 +561,11 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
break; break;
} }
if (target < lsa || g_cfg.core.spu_block_size != spu_block_size_type::giga) m_targets[pos].push_back(target);
if (target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
{ {
// Stop on direct calls add_block(pos + 4);
next_block();
break;
} }
if (g_cfg.core.spu_block_size == spu_block_size_type::giga) if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
@ -546,6 +573,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
add_block(target); add_block(target);
} }
next_block();
break; break;
} }
@ -564,15 +592,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
break; break;
} }
m_targets[pos].push_back(target);
add_block(target); add_block(target);
if (type == spu_itype::BR || type == spu_itype::BRA) if (type != spu_itype::BR && type != spu_itype::BRA)
{ {
// Stop on direct branches m_targets[pos].push_back(pos + 4);
next_block(); add_block(pos + 4);
break;
} }
next_block();
break; break;
} }
@ -601,61 +630,131 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
case spu_itype::IL: case spu_itype::IL:
{ {
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const; vflags[op.rt] = +vf::is_const;
values[op.rt] = op.si16; values[op.rt] = op.si16;
break; break;
} }
case spu_itype::ILA: case spu_itype::ILA:
{ {
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const; vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i18; values[op.rt] = op.i18;
break; break;
} }
case spu_itype::ILH: case spu_itype::ILH:
{ {
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const; vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i16 << 16 | op.i16; values[op.rt] = op.i16 << 16 | op.i16;
break; break;
} }
case spu_itype::ILHU: case spu_itype::ILHU:
{ {
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const; vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i16 << 16; values[op.rt] = op.i16 << 16;
break; break;
} }
case spu_itype::IOHL: case spu_itype::IOHL:
{ {
m_regmod[pos / 4] = op.rt;
values[op.rt] = values[op.rt] | op.i16; values[op.rt] = values[op.rt] | op.i16;
break; break;
} }
case spu_itype::ORI: case spu_itype::ORI:
{ {
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const; vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] | op.si10; values[op.rt] = values[op.ra] | op.si10;
break; break;
} }
case spu_itype::OR: case spu_itype::OR:
{ {
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const; vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] | values[op.rb]; values[op.rt] = values[op.ra] | values[op.rb];
break; break;
} }
case spu_itype::ANDI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] & op.si10;
break;
}
case spu_itype::AND:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] & values[op.rb];
break;
}
case spu_itype::AI: case spu_itype::AI:
{ {
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const; vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] + op.si10; values[op.rt] = values[op.ra] + op.si10;
break; break;
} }
case spu_itype::A: case spu_itype::A:
{ {
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const; vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] + values[op.rb]; values[op.rt] = values[op.ra] + values[op.rb];
break; break;
} }
case spu_itype::SFI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = op.si10 - values[op.ra];
break;
}
case spu_itype::SF:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.rb] - values[op.ra];
break;
}
case spu_itype::ROTMI:
{
m_regmod[pos / 4] = op.rt;
if (-op.i7 & 0x20)
{
vflags[op.rt] = +vf::is_const;
values[op.rt] = 0;
break;
}
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] >> (-op.i7 & 0x1f);
break;
}
case spu_itype::SHLI:
{
m_regmod[pos / 4] = op.rt;
if (op.i7 & 0x20)
{
vflags[op.rt] = +vf::is_const;
values[op.rt] = 0;
break;
}
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] << (op.i7 & 0x1f);
break;
}
default: default:
{ {
// Unconst // Unconst
vflags[type & spu_itype::_quadrop ? +op.rt4 : +op.rt] = {}; const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt;
m_regmod[pos / 4] = op_rt;
vflags[op_rt] = {};
break; break;
} }
} }
@ -783,7 +882,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
llvm::Value* m_lsptr; llvm::Value* m_lsptr;
llvm::BasicBlock* m_stop; llvm::BasicBlock* m_stop;
llvm::GlobalVariable* m_jt;
std::array<std::pair<llvm::Value*, llvm::Value*>, 128> m_gpr; std::array<std::pair<llvm::Value*, llvm::Value*>, 128> m_gpr;
std::array<llvm::Instruction*, 128> m_flush_gpr; std::array<llvm::Instruction*, 128> m_flush_gpr;
@ -1047,27 +1145,15 @@ public:
m_stop = BasicBlock::Create(m_context, "", m_function); m_stop = BasicBlock::Create(m_context, "", m_function);
const auto jtt = ArrayType::get(GetType<u8*>(), m_size / 4);
std::vector<llvm::Constant*> jt;
jt.reserve(m_size / 4);
// Create instruction blocks // Create instruction blocks
for (u32 i = 1, pos = start; i < func.size(); i++, pos += 4) for (u32 i = 1, pos = start; i < func.size(); i++, pos += 4)
{ {
if (func[i] && m_block_info[pos / 4]) if (func[i] && m_block_info[pos / 4])
{ {
const auto b = BasicBlock::Create(m_context, "", m_function); m_instr_map.emplace(pos, BasicBlock::Create(m_context, "", m_function));
jt.push_back(llvm::BlockAddress::get(b));
m_instr_map.emplace(pos, b);
}
else
{
jt.push_back(llvm::BlockAddress::get(m_stop));
} }
} }
m_jt = new GlobalVariable(*module, jtt, true, GlobalValue::PrivateLinkage, llvm::ConstantArray::get(jtt, jt), "jt");
update_pc(); update_pc();
const auto label_test = BasicBlock::Create(m_context, "", m_function); const auto label_test = BasicBlock::Create(m_context, "", m_function);
@ -2764,24 +2850,43 @@ public:
addr.value = call(&exec_check_interrupts, m_thread, addr.value); addr.value = call(&exec_check_interrupts, m_thread, addr.value);
} }
if (llvm::isa<llvm::ConstantInt>(addr.value)) if (const auto _int = llvm::dyn_cast<llvm::ConstantInt>(addr.value))
{ {
return branch_fixed(llvm::cast<llvm::ConstantInt>(addr.value)->getZExtValue()); LOG_WARNING(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, _int->getZExtValue());
return branch_fixed(_int->getZExtValue());
} }
m_ir->CreateStore(addr.value, spu_ptr<u32>(&SPUThread::pc)); m_ir->CreateStore(addr.value, spu_ptr<u32>(&SPUThread::pc));
const auto tfound = m_targets.find(m_pos);
if (tfound != m_targets.end() && tfound->second.size() >= 3)
{
const u32 start = m_instr_map.begin()->first; const u32 start = m_instr_map.begin()->first;
const auto local = llvm::BasicBlock::Create(m_context, "", m_function);
const std::set<u32> targets(tfound->second.begin(), tfound->second.end());
const auto exter = llvm::BasicBlock::Create(m_context, "", m_function); const auto exter = llvm::BasicBlock::Create(m_context, "", m_function);
const auto off = m_ir->CreateSub(addr.value, m_ir->getInt32(start));
m_ir->CreateCondBr(m_ir->CreateICmpULT(off, m_ir->getInt32(m_size)), local, exter); const auto sw = m_ir->CreateSwitch(m_ir->CreateLShr(addr.value, 2, "", true), exter, m_size / 4);
m_ir->SetInsertPoint(local);
const auto table = m_ir->CreateIndirectBr(m_ir->CreateLoad(m_ir->CreateGEP(m_jt, {(llvm::Value*)m_ir->getInt32(0), m_ir->CreateLShr(off, 2)})), m_instr_map.size() + 1); for (u32 pos = start; pos < start + m_size; pos += 4)
for (const auto& pair : m_instr_map) {
table->addDestination(pair.second); const auto found = m_instr_map.find(pos);
table->addDestination(m_stop);
if (found != m_instr_map.end() && targets.count(pos))
{
sw->addCase(m_ir->getInt32(pos / 4), found->second);
}
else
{
sw->addCase(m_ir->getInt32(pos / 4), m_stop);
}
}
m_ir->SetInsertPoint(exter); m_ir->SetInsertPoint(exter);
}
const auto disp = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher))); const auto disp = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher)));
const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u64>(), get_type<u64>(), get_type<u32>()}, false)->getPointerTo()->getPointerTo(); const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u64>(), get_type<u64>(), get_type<u32>()}, false)->getPointerTo()->getPointerTo();
tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext<u64>(addr << 1).value), type))); tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext<u64>(addr << 1).value), type)));

View file

@ -5,6 +5,7 @@
#include <vector> #include <vector>
#include <bitset> #include <bitset>
#include <memory> #include <memory>
#include <string>
// Helper class // Helper class
class spu_cache class spu_cache
@ -35,8 +36,15 @@ protected:
u32 m_pos; u32 m_pos;
u32 m_size; u32 m_size;
// Bit indicating start of the block
std::bitset<0x10000> m_block_info; std::bitset<0x10000> m_block_info;
// GPR modified by the instruction (-1 = not set)
std::array<u8, 0x10000> m_regmod;
// List of possible targets for the instruction ({} = next instruction, {-1} = no targets)
std::unordered_map<u32, std::basic_string<u32>, value_hash<u32, 2>> m_targets;
std::shared_ptr<spu_cache> m_cache; std::shared_ptr<spu_cache> m_cache;
public: public: