PPUThread refactoring

- `CallbackManager` removed, added `_gcm_intr_thread` for cellGcmSys
- `PPUThread` renamed to `ppu_thread`, inheritance allowed
- Added lightweight command queue for `ppu_thread`
- Implemented call stack dump for PPU
- `get_current_thread_mutex` removed
- `thread_ctrl::spawn`: minor initialization fix
- `thread_ctrl::wait_for` added
- `named_thread`: some methods added
- `cpu_thread::run` added
- Some bug fixes, including SPU channels
2025-07-09 08:21:29 +12:00 · 2016-07-28 00:43:22 +03:00 · 2016-07-28 00:43:22 +03:00 · f8719c1230
commit f8719c1230
parent 33c59fa51b
99 changed files with 4480 additions and 4592 deletions
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@ -57,6 +57,9 @@ cfg::map_entry<ppu_decoder_type> g_cfg_ppu_decoder(cfg::root.core, "PPU Decoder"
 const ppu_decoder<ppu_interpreter_precise> s_ppu_interpreter_precise;
 const ppu_decoder<ppu_interpreter_fast> s_ppu_interpreter_fast;

+extern void ppu_execute_syscall(ppu_thread& ppu, u64 code);
+extern void ppu_execute_function(ppu_thread& ppu, u32 index);
+
 const auto s_ppu_compiled = static_cast<u32*>(memory_helper::reserve_memory(0x100000000));

 extern void ppu_register_function_at(u32 addr, ppu_function_t ptr)
@ -68,82 +71,117 @@ extern void ppu_register_function_at(u32 addr, ppu_function_t ptr)
 	}
 }

-std::string PPUThread::get_name() const
+std::string ppu_thread::get_name() const
 {
 	return fmt::format("PPU[0x%x] Thread (%s)", id, m_name);
 }

-std::string PPUThread::dump() const
+std::string ppu_thread::dump() const
 {
-	std::string ret = "Registers:\n=========\n";
+	std::string ret;

-	for (uint i = 0; i<32; ++i) ret += fmt::format("GPR[%d] = 0x%llx\n", i, GPR[i]);
-	for (uint i = 0; i<32; ++i) ret += fmt::format("FPR[%d] = %.6G\n", i, FPR[i]);
-	for (uint i = 0; i<32; ++i) ret += fmt::format("VR[%d] = 0x%s [%s]\n", i, VR[i].to_hex().c_str(), VR[i].to_xyzw().c_str());
-	ret += fmt::format("CR = 0x%08x\n", GetCR());
-	ret += fmt::format("LR = 0x%llx\n", LR);
-	ret += fmt::format("CTR = 0x%llx\n", CTR);
-	ret += fmt::format("XER = [CA=%u | OV=%u | SO=%u | CNT=%u]\n", u32{ CA }, u32{ OV }, u32{ SO }, u32{ XCNT });
-	//ret += fmt::format("FPSCR = 0x%x "
-	//	"[RN=%d | NI=%d | XE=%d | ZE=%d | UE=%d | OE=%d | VE=%d | "
-	//	"VXCVI=%d | VXSQRT=%d | VXSOFT=%d | FPRF=%d | "
-	//	"FI=%d | FR=%d | VXVC=%d | VXIMZ=%d | "
-	//	"VXZDZ=%d | VXIDI=%d | VXISI=%d | VXSNAN=%d | "
-	//	"XX=%d | ZX=%d | UX=%d | OX=%d | VX=%d | FEX=%d | FX=%d]\n",
-	//	FPSCR.FPSCR,
-	//	u32{ FPSCR.RN },
-	//	u32{ FPSCR.NI }, u32{ FPSCR.XE }, u32{ FPSCR.ZE }, u32{ FPSCR.UE }, u32{ FPSCR.OE }, u32{ FPSCR.VE },
-	//	u32{ FPSCR.VXCVI }, u32{ FPSCR.VXSQRT }, u32{ FPSCR.VXSOFT }, u32{ FPSCR.FPRF },
-	//	u32{ FPSCR.FI }, u32{ FPSCR.FR }, u32{ FPSCR.VXVC }, u32{ FPSCR.VXIMZ },
-	//	u32{ FPSCR.VXZDZ }, u32{ FPSCR.VXIDI }, u32{ FPSCR.VXISI }, u32{ FPSCR.VXSNAN },
-	//	u32{ FPSCR.XX }, u32{ FPSCR.ZX }, u32{ FPSCR.UX }, u32{ FPSCR.OX }, u32{ FPSCR.VX }, u32{ FPSCR.FEX }, u32{ FPSCR.FX });
+	ret += fmt::format("State: 0x%08x\n", state.load());
+	ret += fmt::format("Priority: %d\n", prio);
+	
+	ret += "\nRegisters:\n=========\n";
+	for (uint i = 0; i < 32; ++i) ret += fmt::format("GPR[%d] = 0x%llx\n", i, gpr[i]);
+	for (uint i = 0; i < 32; ++i) ret += fmt::format("FPR[%d] = %.6G\n", i, fpr[i]);
+	for (uint i = 0; i < 32; ++i) ret += fmt::format("VR[%d] = 0x%s [%s]\n", i, vr[i].to_hex().c_str(), vr[i].to_xyzw().c_str());
+
+	if (g_cfg_ppu_decoder.get() != ppu_decoder_type::llvm)
+	{
+		ret += fmt::format("CR = 0x%08x\n", cr_pack());
+		ret += fmt::format("LR = 0x%llx\n", lr);
+		ret += fmt::format("CTR = 0x%llx\n", ctr);
+		ret += fmt::format("VRSAVE = 0x%08x\n", vrsave);
+		ret += fmt::format("XER = [CA=%u | OV=%u | SO=%u | CNT=%u]\n", xer.ca, xer.ov, xer.so, xer.cnt);
+		ret += fmt::format("VSCR = [SAT=%u | NJ=%u]\n", sat, nj);
+		ret += fmt::format("FPSCR = [FL=%u | FG=%u | FE=%u | FU=%u]\n", fpscr.fl, fpscr.fg, fpscr.fe, fpscr.fu);
+
+		ret += "\nCall stack:\n=========\n";
+		ret += fmt::format("0x%08x (0x0) called\n", cia);
+		const u32 stack_max = ::align(stack_addr + stack_size, 0x200) - 0x200;
+		for (u64 sp = vm::read64(static_cast<u32>(gpr[1])); sp >= stack_addr && sp < stack_max; sp = vm::read64(static_cast<u32>(sp)))
+		{
+			// TODO: print also function addresses
+			ret += fmt::format("> from 0x%08llx (0x0)\n", vm::read64(static_cast<u32>(sp + 16)));
+		}
+	}

 	return ret;
 }

-void PPUThread::cpu_init()
-{
-	if (!stack_addr)
-	{
-		if (!stack_size)
-		{
-			throw EXCEPTION("Invalid stack size");
-		}
-
-		stack_addr = vm::alloc(stack_size, vm::stack);
-
-		if (!stack_addr)
-		{
-			throw EXCEPTION("Out of stack memory");
-		}
-	}
-
-	GPR[1] = align(stack_addr + stack_size, 0x200) - 0x200;
-}
-
 extern thread_local std::string(*g_tls_log_prefix)();

-void PPUThread::cpu_task()
+void ppu_thread::cpu_task()
 {
 	//SetHostRoundingMode(FPSCR_RN_NEAR);

-	return custom_task ? custom_task(*this) : fast_call(pc, static_cast<u32>(GPR[2]));
+	// Execute cmd_queue
+	while (ppu_cmd cmd = cmd_wait())
+	{
+		const u32 pos = cmd_queue.peek() + 1; // Additional arguments start from [pos]
+		const u32 arg = cmd.arg2<u32>(); // 32-bit arg extracted
+
+		switch (u32 type = cmd.arg1<u32>())
+		{
+		case ppu_cmd::opcode:
+		{
+			cmd_pop(), s_ppu_interpreter_fast.decode(arg)(*this, {arg});
+			break;
+		}
+		case ppu_cmd::set_gpr:
+		{
+			if (arg >= 32)
+			{
+				throw fmt::exception("Invalid ppu_cmd::set_gpr arg (0x%x)" HERE, arg);
+			}
+
+			gpr[arg % 32] = cmd_queue[pos].load().as<u64>();
+			cmd_pop(1);
+			break;
+		}
+		case ppu_cmd::set_args:
+		{
+			if (arg > 8)
+			{
+				throw fmt::exception("Unsupported ppu_cmd::set_args size (0x%x)" HERE, arg);
+			}
+
+			for (u32 i = 0; i < arg; i++)
+			{
+				gpr[i + 3] = cmd_queue[pos + i].load().as<u64>();
+			}
+
+			cmd_pop(arg);
+			break;
+		}
+		case ppu_cmd::lle_call:
+		{
+			const vm::ptr<u32> opd(arg < 32 ? vm::cast(gpr[arg]) : vm::cast(arg));
+			cmd_pop(), fast_call(opd[0], opd[1]);
+			break;
+		}
+		case ppu_cmd::hle_call:
+		{
+			cmd_pop(), ppu_execute_function(*this, arg);
+			break;
+		}
+		default:
+		{
+			throw fmt::exception("Unknown ppu_cmd(0x%x)" HERE, type);
+		}
+		}
+	}
 }

-void PPUThread::cpu_task_main()
+void ppu_thread::exec_task()
 {
 	if (g_cfg_ppu_decoder.get() == ppu_decoder_type::llvm)
 	{
-		return reinterpret_cast<ppu_function_t>((std::uintptr_t)s_ppu_compiled[pc / 4])(*this);
+		return reinterpret_cast<ppu_function_t>((std::uintptr_t)s_ppu_compiled[cia / 4])(*this);
 	}

-	g_tls_log_prefix = []
-	{
-		const auto cpu = static_cast<PPUThread*>(get_current_cpu_thread());
-
-		return fmt::format("%s [0x%08x]", cpu->get_name(), cpu->pc);
-	};
-
 	const auto base = vm::_ptr<const u8>(0);

 	// Select opcode table
@ -159,12 +197,12 @@ void PPUThread::cpu_task_main()
 	{
 		if (UNLIKELY(state.load()))
 		{
-			if (check_status()) return;
+			if (check_state()) return;
 		}

 		// Reinitialize
 		{
-			const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + pc)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3));
+			const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + cia)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3));
 			_op.vi = _ops;
 			const v128 _i = v128::fromV(_mm_and_si128(_mm_or_si128(_mm_slli_epi32(_op.vi, 6), _mm_srli_epi32(_op.vi, 26)), _mm_set1_epi32(0x1ffff)));
 			func0 = table[_i._u32[0]];
@ -175,14 +213,14 @@ void PPUThread::cpu_task_main()

 		while (LIKELY(func0(*this, { _op._u32[0] })))
 		{
-			if (pc += 4, LIKELY(func1(*this, { _op._u32[1] })))
+			if (cia += 4, LIKELY(func1(*this, { _op._u32[1] })))
 			{
-				if (pc += 4, LIKELY(func2(*this, { _op._u32[2] })))
+				if (cia += 4, LIKELY(func2(*this, { _op._u32[2] })))
 				{
-					pc += 4;
+					cia += 4;
 					func0 = func3;

-					const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + pc + 4)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3));
+					const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + cia + 4)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3));
 					_op.vi = _mm_alignr_epi8(_ops, _op.vi, 12);
 					const v128 _i = v128::fromV(_mm_and_si128(_mm_or_si128(_mm_slli_epi32(_op.vi, 6), _mm_srli_epi32(_op.vi, 26)), _mm_set1_epi32(0x1ffff)));
 					func1 = table[_i._u32[1]];
@ -204,87 +242,7 @@ void PPUThread::cpu_task_main()

 constexpr auto stop_state = make_bitset(cpu_state::stop, cpu_state::exit, cpu_state::suspend);

-atomic_t<u32> g_ppu_core[2]{};
-
-bool PPUThread::handle_interrupt()
-{
-	// Reschedule and wake up a new thread, possibly this one as well.
-	return false;
-
-	// Check virtual core allocation
-	if (g_ppu_core[0] != id && g_ppu_core[1] != id)
-	{
-		auto cpu0 = idm::get<PPUThread>(g_ppu_core[0]);
-		auto cpu1 = idm::get<PPUThread>(g_ppu_core[1]);
-
-		if (cpu0 && cpu1)
-		{
-			if (cpu1->prio > cpu0->prio)
-			{
-				cpu0 = std::move(cpu1);
-			}
-
-			// Preempt thread with the lowest priority
-			if (prio < cpu0->prio)
-			{
-				cpu0->state += cpu_state::interrupt;
-			}
-		}
-		else
-		{
-			// Try to obtain a virtual core in optimistic way
-			if (g_ppu_core[0].compare_and_swap_test(0, id) || g_ppu_core[1].compare_and_swap_test(0, id))
-			{
-				state -= cpu_state::interrupt;
-				return true;
-			}
-		}
-
-		return false;
-	}
-
-	// Select appropriate thread
-	u32 top_prio = -1;
-	u32 selected = -1;
-
-	idm::select<PPUThread>([&](u32 id, PPUThread& ppu)
-	{
-		// Exclude suspended and low-priority threads
-		if (!ppu.state.test(stop_state) && ppu.prio < top_prio /*&& (!ppu.is_sleep() || ppu.state & cpu_state::signal)*/)
-		{
-			top_prio = ppu.prio;
-			selected = id;
-		}
-	});
-
-	// If current thread selected
-	if (selected == id)
-	{
-		state -= cpu_state::interrupt;
-		VERIFY(g_ppu_core[0] == id || g_ppu_core[1] == id);
-		return true;
-	}
-
-	// If another thread selected
-	const auto thread = idm::get<PPUThread>(selected);
-
-	// Lend virtual core to another thread
-	if (thread && thread->state.test_and_reset(cpu_state::interrupt))
-	{
-		g_ppu_core[0].compare_and_swap(id, thread->id);
-		g_ppu_core[1].compare_and_swap(id, thread->id);
-		(*thread)->lock_notify();
-	}
-	else
-	{
-		g_ppu_core[0].compare_and_swap(id, 0);
-		g_ppu_core[1].compare_and_swap(id, 0);
-	}
-
-	return false;
-}
-
-PPUThread::~PPUThread()
+ppu_thread::~ppu_thread()
 {
 	if (stack_addr)
 	{
@ -292,40 +250,126 @@ PPUThread::~PPUThread()
 	}
 }

-PPUThread::PPUThread(const std::string& name)
+ppu_thread::ppu_thread(const std::string& name, u32 prio, u32 stack)
 	: cpu_thread(cpu_type::ppu)
+	, prio(prio)
+	, stack_size(std::max<u32>(stack, 0x4000))
+	, stack_addr(vm::alloc(stack_size, vm::stack))
 	, m_name(name)
 {
+	if (!stack_addr)
+	{
+		throw fmt::exception("Out of stack memory (size=0x%x)" HERE, stack_size);
+	}
+
+	gpr[1] = ::align(stack_addr + stack_size, 0x200) - 0x200;
 }

-be_t<u64>* PPUThread::get_stack_arg(s32 i, u64 align)
+void ppu_thread::cmd_push(ppu_cmd cmd)
+{
+	// Reserve queue space
+	const u32 pos = cmd_queue.push_begin();
+
+	// Write single command
+	cmd_queue[pos] = cmd;
+}
+
+void ppu_thread::cmd_list(std::initializer_list<ppu_cmd> list)
+{
+	// Reserve queue space
+	const u32 pos = cmd_queue.push_begin(static_cast<u32>(list.size()));
+
+	// Write command tail in relaxed manner
+	for (u32 i = 1; i < list.size(); i++)
+	{
+		cmd_queue[pos + i].raw() = list.begin()[i];
+	}
+
+	// Write command head after all
+	cmd_queue[pos] = *list.begin();
+}
+
+void ppu_thread::cmd_pop(u32 count)
+{
+	// Get current position
+	const u32 pos = cmd_queue.peek();
+
+	// Clean command buffer for command tail
+	for (u32 i = 1; i <= count; i++)
+	{
+		cmd_queue[pos + i].raw() = ppu_cmd{};
+	}
+
+	// Free
+	cmd_queue.pop_end(count + 1);
+}
+
+ppu_cmd ppu_thread::cmd_wait()
+{
+	std::unique_lock<named_thread> lock(*this, std::defer_lock);
+
+	while (true)
+	{
+		if (UNLIKELY(state.load()))
+		{
+			if (lock) lock.unlock();
+
+			if (check_state()) // check_state() requires unlocked mutex
+			{
+				return ppu_cmd{};
+			}
+		}
+
+		// Lightweight queue doesn't care about mutex state
+		if (ppu_cmd result = cmd_queue[cmd_queue.peek()].exchange(ppu_cmd{}))
+		{
+			return result;
+		}
+
+		if (!lock)
+		{
+			lock.lock();
+			continue;
+		}
+
+		thread_ctrl::wait(); // Waiting requires locked mutex
+	}
+}
+
+be_t<u64>* ppu_thread::get_stack_arg(s32 i, u64 align)
 {
 	if (align != 1 && align != 2 && align != 4 && align != 8 && align != 16) throw fmt::exception("Unsupported alignment: 0x%llx" HERE, align);
-	return vm::_ptr<u64>(vm::cast((GPR[1] + 0x30 + 0x8 * (i - 1)) & (0 - align), HERE));
+	return vm::_ptr<u64>(vm::cast((gpr[1] + 0x30 + 0x8 * (i - 1)) & (0 - align), HERE));
 }

-void PPUThread::fast_call(u32 addr, u32 rtoc)
+void ppu_thread::fast_call(u32 addr, u32 rtoc)
 {
-	const auto old_PC = pc;
-	const auto old_stack = GPR[1];
-	const auto old_rtoc = GPR[2];
-	const auto old_LR = LR;
-	const auto old_task = std::move(custom_task);
+	const auto old_pc = cia;
+	const auto old_stack = gpr[1];
+	const auto old_rtoc = gpr[2];
+	const auto old_lr = lr;
 	const auto old_func = last_function;
+	const auto old_fmt = g_tls_log_prefix;

-	pc = addr;
-	GPR[2] = rtoc;
-	LR = Emu.GetCPUThreadStop();
-	custom_task = nullptr;
+	cia = addr;
+	gpr[2] = rtoc;
+	lr = Emu.GetCPUThreadStop();
 	last_function = nullptr;

+	g_tls_log_prefix = []
+	{
+		const auto ppu = static_cast<ppu_thread*>(get_current_cpu_thread());
+
+		return fmt::format("%s [0x%08x]", ppu->get_name(), ppu->cia);
+	};
+
 	try
 	{
-		cpu_task_main();
+		exec_task();

-		if (GPR[1] != old_stack && !state.test(cpu_state::ret) && !state.test(cpu_state::exit)) // GPR[1] shouldn't change
+		if (gpr[1] != old_stack && !state.test(cpu_state::ret) && !state.test(cpu_state::exit)) // gpr[1] shouldn't change
 		{
-			throw fmt::exception("Stack inconsistency (addr=0x%x, rtoc=0x%x, SP=0x%llx, old=0x%llx)", addr, rtoc, GPR[1], old_stack);
+			throw fmt::exception("Stack inconsistency (addr=0x%x, rtoc=0x%x, SP=0x%llx, old=0x%llx)", addr, rtoc, gpr[1], old_stack);
 		}
 	}
 	catch (cpu_state _s)
@ -348,25 +392,17 @@ void PPUThread::fast_call(u32 addr, u32 rtoc)

 	state -= cpu_state::ret;

-	pc = old_PC;
-	GPR[1] = old_stack;
-	GPR[2] = old_rtoc;
-	LR = old_LR;
-	custom_task = std::move(old_task);
+	cia = old_pc;
+	gpr[1] = old_stack;
+	gpr[2] = old_rtoc;
+	lr = old_lr;
 	last_function = old_func;
-
-	//if (custom_task)
-	//{
-	//	state += cpu_state::interrupt;
-	//	handle_interrupt();
-	//}
+	g_tls_log_prefix = old_fmt;
 }

 const ppu_decoder<ppu_itype> s_ppu_itype;

 extern u64 get_timebased_time();
-extern void ppu_execute_syscall(PPUThread& ppu, u64 code);
-extern void ppu_execute_function(PPUThread& ppu, u32 index);
 extern ppu_function_t ppu_get_syscall(u64 code);
 extern std::string ppu_get_syscall_name(u64 code);
 extern ppu_function_t ppu_get_function(u32 index);
@ -434,7 +470,7 @@ static bool adde_carry(u64 a, u64 b, bool c)
 #endif
 }

-extern void ppu_initialize(const std::string& name, const std::vector<ppu_function>& funcs, u32 entry)
+extern void ppu_initialize(const std::string& name, const std::vector<ppu_function>& funcs)
 {
 	if (g_cfg_ppu_decoder.get() != ppu_decoder_type::llvm || funcs.empty())
 	{
@ -485,7 +521,7 @@ extern void ppu_initialize(const std::string& name, const std::vector<ppu_functi
 	module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
 	
 	// Initialize translator
-	std::unique_ptr<PPUTranslator> translator = std::make_unique<PPUTranslator>(g_llvm_ctx, module.get(), 0, entry);
+	std::unique_ptr<PPUTranslator> translator = std::make_unique<PPUTranslator>(g_llvm_ctx, module.get(), 0);

 	// Define some types
 	const auto _void = Type::getVoidTy(g_llvm_ctx);