SPU Re: more precise jt generation

Improve analyser, set v1. Fix branch indirect conditional.
2025-07-11 09:18:40 +12:00 · 2018-05-10 19:38:07 +03:00 · 2018-05-10 19:38:07 +03:00 · be5c18cc85
commit be5c18cc85
parent 737db90058
4 changed files with 190 additions and 68 deletions
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -1156,10 +1156,18 @@ void spu_recompiler::branch_fixed(u32 target)
 	c->jmp(x86::rax);
 }
-void spu_recompiler::branch_indirect(spu_opcode_t op)
+void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt)
 {
 	using namespace asmjit;
 	if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !jt)
 	{
 		// Simply external call (return or indirect call)
 		c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
 		c->xor_(qw0->r32(), qw0->r32());
 	}
 	else
 	{
 		if (!instr_table.isValid())
 		{
 			// Request instruction table
@ -1177,6 +1185,7 @@ void spu_recompiler::branch_indirect(spu_opcode_t op)
 		c->cmovae(qw1->r32(), qw0->r32());
 		c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
 		c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
 	}
 	if (op.d)
 	{
@ -2741,7 +2750,7 @@ void spu_recompiler::BI(spu_opcode_t op)
 {
 	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
 	c->and_(*addr, 0x3fffc);
-	branch_indirect(op);
+	branch_indirect(op, verify(HERE, m_targets[m_pos].size()) > 2);
 	m_pos = -1;
 }
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.h
@ -107,7 +107,7 @@ private:
 	asmjit::X86Mem XmmConst(__m128i data);
 	void branch_fixed(u32 target);
-	void branch_indirect(spu_opcode_t op);
+	void branch_indirect(spu_opcode_t op, bool jt = false);
 	void fall(spu_opcode_t op);
 	void save_rcx();
 	void load_rcx();
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@ -89,7 +89,7 @@ void spu_cache::initialize()
 	}
 	// SPU cache file (version + block size type)
-	const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v0.dat";
+	const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1.dat";
 	auto cache = std::make_shared<spu_cache>(loc);
@ -272,14 +272,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 	result.push_back(lsa);
 	// Initialize block entries
-	std::bitset<0x10000>& blocks = m_block_info;
+	m_block_info.reset();
-	blocks.reset();
+	m_block_info.set(lsa / 4);
 	blocks.set(lsa / 4);
 	// Simple block entry workload list
 	std::vector<u32> wl;
 	wl.push_back(lsa);
 	m_regmod.fill(0xff);
 	m_targets.clear();
 	// Value flags (TODO)
 	enum class vf : u32
 	{
@ -310,9 +312,9 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (target > lsa)
 			{
 				// Check for redundancy
-				if (!blocks[target / 4])
+				if (!m_block_info[target / 4])
 				{
-					blocks[target / 4] = true;
+					m_block_info[target / 4] = true;
 					wl.push_back(target);
 					return;
 				}
@ -325,6 +327,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		wl[wi] += 4;
 		m_targets.erase(pos);
 		// Analyse instruction
 		switch (const auto type = s_spu_itype.decode(data))
 		{
@ -336,7 +340,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		case spu_itype::DFTSV:
 		{
 			// Stop before invalid instructions (TODO)
-			blocks[pos / 4] = true;
+			m_targets[pos].push_back(-1);
 			m_block_info[pos / 4] = true;
 			next_block();
 			continue;
 		}
@ -349,7 +354,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (data == 0 || data == 3)
 			{
 				// Stop before null data
-				blocks[pos / 4] = true;
+				m_targets[pos].push_back(-1);
 				m_block_info[pos / 4] = true;
 				next_block();
 				continue;
 			}
@ -357,6 +363,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
 			{
 				// Stop on special instructions (TODO)
 				m_targets[pos].push_back(-1);
 				next_block();
 				break;
 			}
@ -366,6 +373,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		case spu_itype::IRET:
 		{
 			m_targets[pos].push_back(-1);
 			next_block();
 			break;
 		}
@ -382,6 +390,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (type == spu_itype::BISL)
 			{
 				m_regmod[pos / 4] = op.rt;
 				vflags[op.rt] = +vf::is_const;
 				values[op.rt] = pos + 4;
 			}
@ -389,23 +398,24 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 			if (test(af, vf::is_const))
 			{
 				const u32 target = spu_branch_target(av);
 				LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);
 				if (target == pos + 4)
 				{
 					// Nop (unless BISL)
-					break;
+					LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to next!", lsa, pos);
 				}
 				m_targets[pos].push_back(target);
 				if (type != spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::giga)
 				{
 					LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);
 					add_block(target);
 				}
-				if (type == spu_itype::BISL && target < lsa)
+				if (type == spu_itype::BISL && target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
 				{
-					next_block();
+					add_block(pos + 4);
 					break;
 				}
 			}
 			else if (type == spu_itype::BI && !op.d && !op.e)
@ -488,6 +498,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 							add_block(jt_abs[i]);
 							result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_abs[i]);
 						}
 						m_targets.emplace(pos, std::move(jt_abs));
 					}
 					if (jt_rel.size() >= jt_abs.size())
@ -504,19 +516,33 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 							add_block(jt_rel[i]);
 							result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_rel[i] - start);
 						}
 						m_targets.emplace(pos, std::move(jt_rel));
 					}
 				}
 			}
-			if (type == spu_itype::BI || type == spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::safe)
+			if (type == spu_itype::BI || type == spu_itype::BISL)
 			{
 				if (type == spu_itype::BI || g_cfg.core.spu_block_size != spu_block_size_type::giga)
 				{
-					next_block();
+					if (m_targets[pos].empty())
-					break;
+					{
 						m_targets[pos].push_back(-1);
 					}
 				}
 				else
 				{
 					add_block(pos + 4);
 				}
 			}
 			else
 			{
 				m_targets[pos].push_back(pos + 4);
 				add_block(pos + 4);
 			}
 			next_block();
 			break;
 		}
@ -525,6 +551,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		{
 			const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : pos, op.i16);
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = pos + 4;
@ -534,11 +561,11 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 				break;
 			}
-			if (target < lsa || g_cfg.core.spu_block_size != spu_block_size_type::giga)
+			m_targets[pos].push_back(target);
 			if (target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
 			{
-				// Stop on direct calls
+				add_block(pos + 4);
 				next_block();
 				break;
 			}
 			if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
@ -546,6 +573,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 				add_block(target);
 			}
 			next_block();
 			break;
 		}
@ -564,15 +592,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 				break;
 			}
 			m_targets[pos].push_back(target);
 			add_block(target);
-			if (type == spu_itype::BR || type == spu_itype::BRA)
+			if (type != spu_itype::BR && type != spu_itype::BRA)
 			{
-				// Stop on direct branches
+				m_targets[pos].push_back(pos + 4);
-				next_block();
+				add_block(pos + 4);
 				break;
 			}
 			next_block();
 			break;
 		}
@ -601,61 +630,131 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
 		case spu_itype::IL:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.si16;
 			break;
 		}
 		case spu_itype::ILA:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.i18;
 			break;
 		}
 		case spu_itype::ILH:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.i16 << 16 | op.i16;
 			break;
 		}
 		case spu_itype::ILHU:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = +vf::is_const;
 			values[op.rt] = op.i16 << 16;
 			break;
 		}
 		case spu_itype::IOHL:
 		{
 			m_regmod[pos / 4] = op.rt;
 			values[op.rt] = values[op.rt] | op.i16;
 			break;
 		}
 		case spu_itype::ORI:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = values[op.ra] | op.si10;
 			break;
 		}
 		case spu_itype::OR:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
 			values[op.rt] = values[op.ra] | values[op.rb];
 			break;
 		}
 		case spu_itype::ANDI:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = values[op.ra] & op.si10;
 			break;
 		}
 		case spu_itype::AND:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
 			values[op.rt] = values[op.ra] & values[op.rb];
 			break;
 		}
 		case spu_itype::AI:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = values[op.ra] + op.si10;
 			break;
 		}
 		case spu_itype::A:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
 			values[op.rt] = values[op.ra] + values[op.rb];
 			break;
 		}
 		case spu_itype::SFI:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = op.si10 - values[op.ra];
 			break;
 		}
 		case spu_itype::SF:
 		{
 			m_regmod[pos / 4] = op.rt;
 			vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
 			values[op.rt] = values[op.rb] - values[op.ra];
 			break;
 		}
 		case spu_itype::ROTMI:
 		{
 			m_regmod[pos / 4] = op.rt;
 			if (-op.i7 & 0x20)
 			{
 				vflags[op.rt] = +vf::is_const;
 				values[op.rt] = 0;
 				break;
 			}
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = values[op.ra] >> (-op.i7 & 0x1f);
 			break;
 		}
 		case spu_itype::SHLI:
 		{
 			m_regmod[pos / 4] = op.rt;
 			if (op.i7 & 0x20)
 			{
 				vflags[op.rt] = +vf::is_const;
 				values[op.rt] = 0;
 				break;
 			}
 			vflags[op.rt] = vflags[op.ra] & vf::is_const;
 			values[op.rt] = values[op.ra] << (op.i7 & 0x1f);
 			break;
 		}
 		default:
 		{
 			// Unconst
-			vflags[type & spu_itype::_quadrop ? +op.rt4 : +op.rt] = {};
+			const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt;
 			m_regmod[pos / 4] = op_rt;
 			vflags[op_rt] = {};
 			break;
 		}
 		}
@ -783,7 +882,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 	llvm::Value* m_lsptr;
 	llvm::BasicBlock* m_stop;
 	llvm::GlobalVariable* m_jt;
 	std::array<std::pair<llvm::Value*, llvm::Value*>, 128> m_gpr;
 	std::array<llvm::Instruction*, 128> m_flush_gpr;
@ -1047,27 +1145,15 @@ public:
 		m_stop = BasicBlock::Create(m_context, "", m_function);
 		const auto jtt = ArrayType::get(GetType<u8*>(), m_size / 4);
 		std::vector<llvm::Constant*> jt;
 		jt.reserve(m_size / 4);
 		// Create instruction blocks
 		for (u32 i = 1, pos = start; i < func.size(); i++, pos += 4)
 		{
 			if (func[i] && m_block_info[pos / 4])
 			{
-				const auto b = BasicBlock::Create(m_context, "", m_function);
+				m_instr_map.emplace(pos, BasicBlock::Create(m_context, "", m_function));
 				jt.push_back(llvm::BlockAddress::get(b));
 				m_instr_map.emplace(pos, b);
 			}
 			else
 			{
 				jt.push_back(llvm::BlockAddress::get(m_stop));
 			}
 		}
 		m_jt = new GlobalVariable(*module, jtt, true, GlobalValue::PrivateLinkage, llvm::ConstantArray::get(jtt, jt), "jt");
 		update_pc();
 		const auto label_test = BasicBlock::Create(m_context, "", m_function);
@ -2764,24 +2850,43 @@ public:
 			addr.value = call(&exec_check_interrupts, m_thread, addr.value);
 		}
-		if (llvm::isa<llvm::ConstantInt>(addr.value))
+		if (const auto _int = llvm::dyn_cast<llvm::ConstantInt>(addr.value))
 		{
-			return branch_fixed(llvm::cast<llvm::ConstantInt>(addr.value)->getZExtValue());
+			LOG_WARNING(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, _int->getZExtValue());
 			return branch_fixed(_int->getZExtValue());
 		}
 		m_ir->CreateStore(addr.value, spu_ptr<u32>(&SPUThread::pc));
 		const auto tfound = m_targets.find(m_pos);
 		if (tfound != m_targets.end() && tfound->second.size() >= 3)
 		{
 			const u32 start = m_instr_map.begin()->first;
-		const auto local = llvm::BasicBlock::Create(m_context, "", m_function);
+
 			const std::set<u32> targets(tfound->second.begin(), tfound->second.end());
 			const auto exter = llvm::BasicBlock::Create(m_context, "", m_function);
-		const auto off = m_ir->CreateSub(addr.value, m_ir->getInt32(start));
+
-		m_ir->CreateCondBr(m_ir->CreateICmpULT(off, m_ir->getInt32(m_size)), local, exter);
+			const auto sw = m_ir->CreateSwitch(m_ir->CreateLShr(addr.value, 2, "", true), exter, m_size / 4);
-		m_ir->SetInsertPoint(local);
+
-		const auto table = m_ir->CreateIndirectBr(m_ir->CreateLoad(m_ir->CreateGEP(m_jt, {(llvm::Value*)m_ir->getInt32(0), m_ir->CreateLShr(off, 2)})), m_instr_map.size() + 1);
+			for (u32 pos = start; pos < start + m_size; pos += 4)
-		for (const auto& pair : m_instr_map)
+			{
-			table->addDestination(pair.second);
+				const auto found = m_instr_map.find(pos);
-		table->addDestination(m_stop);
+
 				if (found != m_instr_map.end() && targets.count(pos))
 				{
 					sw->addCase(m_ir->getInt32(pos / 4), found->second);
 				}
 				else
 				{
 					sw->addCase(m_ir->getInt32(pos / 4), m_stop);
 				}
 			}
 			m_ir->SetInsertPoint(exter);
 		}
 		const auto disp = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher)));
 		const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u64>(), get_type<u64>(), get_type<u32>()}, false)->getPointerTo()->getPointerTo();
 		tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext<u64>(addr << 1).value), type)));
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@ -5,6 +5,7 @@
 #include <vector>
 #include <bitset>
 #include <memory>
 #include <string>
 // Helper class
 class spu_cache
@ -35,8 +36,15 @@ protected:
 	u32 m_pos;
 	u32 m_size;
 	// Bit indicating start of the block
 	std::bitset<0x10000> m_block_info;
 	// GPR modified by the instruction (-1 = not set)
 	std::array<u8, 0x10000> m_regmod;
 	// List of possible targets for the instruction ({} = next instruction, {-1} = no targets)
 	std::unordered_map<u32, std::basic_string<u32>, value_hash<u32, 2>> m_targets;
 	std::shared_ptr<spu_cache> m_cache;
 public: