SPU: fix Giga mode (kinda)

Don't scan before the entry point. Disable stack mirror in SPU LLVM. Improve analyser logic for holes.
2025-07-13 10:18:40 +12:00 · 2019-05-14 18:55:10 +03:00 · 2019-05-14 18:55:10 +03:00 · 3753d27aba
commit 3753d27aba
parent c481472faf
3 changed files with 58 additions and 41 deletions
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -144,7 +144,7 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector<u
 	m_pos = func[0];
 	m_base = func[0];
 	m_size = ::size32(func) * 4 - 4;
-	const u32 start = m_pos * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
+	const u32 start = m_pos;
 	const u32 end = start + m_size;

 	// Create block labels
@ -226,7 +226,7 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector<u
 			c->vzeroupper();
 		}
 	}
-	else if (m_size == 8 && (g_cfg.core.spu_block_size != spu_block_size_type::giga || func[0] != 4))
+	else if (m_size == 8)
 	{
 		c->mov(x86::rax, static_cast<u64>(func[2]) << 32 | func[1]);
 		c->cmp(x86::rax, x86::qword_ptr(*ls, *pc0));
@ -237,9 +237,9 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector<u
 			c->vzeroupper();
 		}
 	}
-	else if (m_size == 8 || m_size == 4)
+	else if (m_size == 4)
 	{
-		c->cmp(x86::dword_ptr(*ls, *pc0), +func.back());
+		c->cmp(x86::dword_ptr(*ls, *pc0), func[1]);
 		c->jnz(label_diff);

 		if (utils::has_avx())
@ -724,7 +724,7 @@ spu_function_t spu_recompiler::compile(u64 last_reset_count, const std::vector<u
 	// Acknowledge success and add statistics
 	c->add(SPU_OFF_64(block_counter), ::size32(words) / (words_align / 4));

-	if (g_cfg.core.spu_block_size == spu_block_size_type::giga && m_pos != start)
+	if (m_pos != start)
 	{
 		// Jump to the entry point if necessary
 		c->jmp(instr_labels[m_pos]);
@ -971,7 +971,7 @@ void spu_recompiler::branch_fixed(u32 target)
 		return;
 	}

-	const auto ppptr = g_cfg.core.spu_block_size == spu_block_size_type::giga || !g_cfg.core.spu_verification ? nullptr : m_spurt->make_branch_patchpoint();
+	const auto ppptr = !g_cfg.core.spu_verification ? nullptr : m_spurt->make_branch_patchpoint();

 	c->lea(addr->r64(), get_pc(target));
 	c->mov(SPU_OFF_32(pc), *addr);
@ -1088,7 +1088,7 @@ void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt, bool ret)
 	}

 	// Simply external call (return or indirect call)
-	const auto ppptr = g_cfg.core.spu_block_size == spu_block_size_type::giga || !g_cfg.core.spu_verification ? nullptr : m_spurt->make_branch_patchpoint();
+	const auto ppptr = !g_cfg.core.spu_verification ? nullptr : m_spurt->make_branch_patchpoint();

 	if (ppptr)
 	{
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@ -226,6 +226,12 @@ std::deque<std::vector<u32>> spu_cache::get()
 			break;
 		}

+		if (!size || !func[1])
+		{
+			// Skip old format Giga entries
+			continue;
+		}
+
 		result.emplace_front(std::move(func));
 	}

@ -349,7 +355,7 @@ void spu_cache::initialize()
 			}

 			// Get data start
-			const u32 start = func[0] * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
+			const u32 start = func[0];
 			const u32 size0 = ::size32(func);

 			// Initialize LS with function data only
@ -448,7 +454,7 @@ bool spu_runtime::func_compare::operator()(const std::vector<u32>& lhs, const st
 	else if (rhs_data.empty())
 		return false;

-	if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
+	if (false)
 	{
 		// In Giga mode, compare instructions starting from the entry point first
 		lhs_data.remove_prefix(lhs_addr / 4);
@ -507,7 +513,7 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
 	const std::vector<u32>& func = where.first;

 	//
-	const u32 _off = 1 + (func[0] / 4) * (g_cfg.core.spu_block_size == spu_block_size_type::giga);
+	const u32 _off = 1 + (func[0] / 4) * (false);

 	// Set pointer to the compiled function
 	where.second = compiled;
@ -515,23 +521,27 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
 	// Register function in PIC map
 	m_pic_map[{func.data() + _off, func.size() - _off}] = compiled;

+	// Prepare sorted list
+	m_flat_list.clear();
+	m_flat_list.assign(m_pic_map.cbegin(), m_pic_map.cend());
+
 	struct work
 	{
 		u32 size;
 		u16 from;
 		u16 level;
 		u8* rel32;
-		decltype(m_pic_map)::iterator beg;
-		decltype(m_pic_map)::iterator end;
+		decltype(m_flat_list)::iterator beg;
+		decltype(m_flat_list)::iterator end;
 	};

 	// Scratch vector
 	static thread_local std::vector<work> workload;

 	// Generate a dispatcher (übertrampoline)
-	const auto beg = m_pic_map.begin();
-	const auto _end = m_pic_map.end();
-	const u32 size0 = ::size32(m_pic_map);
+	const auto beg = m_flat_list.begin();
+	const auto _end = m_flat_list.end();
+	const u32 size0 = ::size32(m_flat_list);

 	if (size0 == 1)
 	{
@ -630,6 +640,19 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
 				{
 					// Cannot split: some functions contain holes at this level
 					w.level++;
+
+					// Resort subrange starting from the new level
+					std::stable_sort(w.beg, w.end, [&](const auto& a, const auto& b)
+					{
+						std::basic_string_view<u32> lhs = a.first;
+						std::basic_string_view<u32> rhs = b.first;
+
+						lhs.remove_prefix(w.level);
+						rhs.remove_prefix(w.level);
+
+						return lhs < rhs;
+					});
+
 					continue;
 				}

@ -662,7 +685,7 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
 			if (w.level >= w.beg->first.size() || w.level >= it->first.size())
 			{
 				// If functions cannot be compared, assume smallest function
-				LOG_FATAL(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
+				LOG_ERROR(SPU, "Trampoline simplified at 0x%x (level=%u)", func[0], w.level);
 				make_jump(0xe9, w.beg->second); // jmp rel32
 				continue;
 			}
@ -671,13 +694,13 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
 			const u32 x = it->first.at(w.level);

 			// Adjust ranges (backward)
-			while (it != m_pic_map.begin())
+			while (it != m_flat_list.begin())
 			{
 				it--;

 				if (w.level >= it->first.size())
 				{
-					it = m_pic_map.end();
+					it = m_flat_list.end();
 					break;
 				}

@ -692,9 +715,9 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
 				size2++;
 			}

-			if (it == m_pic_map.end())
+			if (it == m_flat_list.end())
 			{
-				LOG_FATAL(SPU, "Trampoline simplified (II) at 0x%x (level=%u)", func[0], w.level);
+				LOG_ERROR(SPU, "Trampoline simplified (II) at 0x%x (level=%u)", func[0], w.level);
 				make_jump(0xe9, w.beg->second); // jmp rel32
 				continue;
 			}
@ -824,7 +847,7 @@ void* spu_runtime::find(u64 last_reset_count, const std::vector<u32>& func)
 	}

 	//
-	const u32 _off = 1 + (func[0] / 4) * (g_cfg.core.spu_block_size == spu_block_size_type::giga);
+	const u32 _off = 1 + (func[0] / 4) * (false);

 	// Try to find PIC first
 	const auto found = m_pic_map.find({func.data() + _off, func.size() - _off});
@ -1154,8 +1177,6 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en

 	if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
 	{
-		// In Giga mode, all data starts from the address 0
-		lsa = 0;
 	}

 	for (u32 wi = 0, wa = workload[0]; wi < workload.size();)
@ -1842,7 +1863,7 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 		}
 	}

-	while (g_cfg.core.spu_block_size != spu_block_size_type::giga || limit < 0x40000)
+	while (lsa > 0 || limit < 0x40000)
 	{
 		const u32 initial_size = result.size();

@ -2032,14 +2053,6 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en
 			continue;
 		}

-		// Erase unreachable targets
-		const auto new_end = std::remove_if(it->second.begin(), it->second.end(), [&](u32 addr)
-		{
-			return addr < lsa || addr >= limit;
-		});
-
-		it->second.erase(new_end, it->second.end());
-
 		it++;
 	}

@ -3013,7 +3026,7 @@ const std::vector<u32>& spu_recompiler_base::analyse(const be_t<u32>* ls, u32 en

 					if (f.second.good)
 					{
-						LOG_ERROR(SPU, "Function 0x%05x: calls bad function (0x%05x)", f.first, ffound->first);
+						LOG_ERROR(SPU, "Function 0x%05x: calls bad function (0x%05x)", f.first, call);
 						f.second.good = false;
 					}
 				}
@ -3040,7 +3053,7 @@ void spu_recompiler_base::dump(std::string& out)
 	SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
 	dis_asm.offset = reinterpret_cast<const u8*>(result.data() + 1);

-	if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
+	if (true)
 	{
 		dis_asm.offset -= result[0];
 	}
@ -3299,15 +3312,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 	// Create tail call to the function chunk (non-tail calls are just out of question)
 	void tail_chunk(llvm::Value* chunk, llvm::Value* base_pc = nullptr)
 	{
-		if (!chunk && (g_cfg.core.spu_block_size == spu_block_size_type::giga || !g_cfg.core.spu_verification))
+		if (!chunk && !g_cfg.core.spu_verification)
 		{
-			// Disable patchpoints in some cases
+			// Disable patchpoints if verification is disabled
 			chunk = m_dispatch;
 		}
 		else if (!chunk)
 		{
 			// Create branch patchpoint if chunk == nullptr
-			verify(HERE), m_finfo, !m_finfo->fn;
+			verify(HERE), m_finfo, !m_finfo->fn || m_function == m_finfo->chunk;

 			// Register under a unique linkable name
 			const std::string ppname = fmt::format("%s-pp-0x%05x", m_hash, m_pos);
@ -4111,7 +4124,7 @@ public:
 		m_pos = func[0];
 		m_base = func[0];
 		m_size = (func.size() - 1) * 4;
-		const u32 start = m_pos * (g_cfg.core.spu_block_size != spu_block_size_type::giga);
+		const u32 start = m_pos;
 		const u32 end = start + m_size;

 		if (g_cfg.core.spu_debug)
@ -4169,7 +4182,7 @@ public:
 			const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu32), m_ir->getInt32(func[1]));
 			m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
 		}
-		else if (func.size() - 1 == 2 && g_cfg.core.spu_block_size != spu_block_size_type::giga)
+		else if (func.size() - 1 == 2)
 		{
 			const auto pu64 = m_ir->CreateBitCast(m_ir->CreateGEP(m_lsptr, m_base_pc), get_type<u64*>());
 			const auto cond = m_ir->CreateICmpNE(m_ir->CreateLoad(pu64), m_ir->getInt64(static_cast<u64>(func[2]) << 32 | func[1]));
@ -5617,6 +5630,7 @@ public:
 					m_ir->CreateUnreachable();
 					m_ir->SetInsertPoint(next);
 					m_ir->CreateStore(ci, spu_ptr<u8>(&spu_thread::ch_mfc_cmd, &spu_mfc_cmd::cmd));
+					update_pc();
 					call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread);
 					return;
 				}
@ -7698,7 +7712,7 @@ public:
 		m_ir->CreateStore(addr.value, spu_ptr<u32>(&spu_thread::pc));
 		const auto type = m_finfo->chunk->getFunctionType()->getPointerTo()->getPointerTo();

-		if (ret && g_cfg.core.spu_block_size != spu_block_size_type::safe)
+		if (ret && g_cfg.core.spu_block_size == spu_block_size_type::mega)
 		{
 			// Compare address stored in stack mirror with addr
 			const auto stack0 = eval(zext<u64>(sp) + ::offset32(&spu_thread::stack_mirror));
@ -8089,7 +8103,7 @@ public:
 			return;
 		}

-		if (g_cfg.core.spu_block_size != spu_block_size_type::safe && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1])
+		if (g_cfg.core.spu_block_size == spu_block_size_type::mega && m_block_info[m_pos / 4 + 1] && m_entry_info[m_pos / 4 + 1])
 		{
 			// Store the return function chunk address at the stack mirror
 			const auto pfunc = add_function(m_pos + 4);
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@ -59,6 +59,9 @@ class spu_runtime
 	// Debug module output location
 	std::string m_cache_path;

+	// Scratch vector
+	std::vector<std::pair<std::basic_string_view<u32>, spu_function_t>> m_flat_list;
+
 public:

 	// Trampoline to spu_recompiler_base::dispatch