SPU LLVM: Optimize branch following ORX

- test the input of ORX directly for zeroes, instead of the result
2025-07-04 05:51:27 +12:00 · 2021-10-26 04:56:47 -04:00 · 2021-10-26 04:56:47 -04:00 · 3fde455932
commit 3fde455932
parent ba727e13ae
1 changed files with 83 additions and 8 deletions
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -6889,12 +6889,23 @@ public:
 		set_vr(op.rt, pshufb(a, sh));
 	}
 	template <typename T>
 	static llvm_calli<u32[4], T> orx(T&& a)
 	{
 		return {"spu_orx", {std::forward<T>(a)}};
 	}
 	void ORX(spu_opcode_t op)
 	{
-		const auto a = get_vr(op.ra);
+		register_intrinsic("spu_orx", [&](llvm::CallInst* ci)
 		{
 			const auto a = value<u32[4]>(ci->getOperand(0));
 			const auto x = zshuffle(a, 2, 3, 0, 1) | a;
 			const auto y = zshuffle(x, 1, 0, 3, 2) | x;
-		set_vr(op.rt, zshuffle(y, 4, 4, 4, 3));
+			return zshuffle(y, 4, 4, 4, 3);
 		});
 		set_vr(op.rt, orx(get_vr(op.ra)));
 	}
 	void CBD(spu_opcode_t op)
@@ -9234,7 +9245,7 @@ public:
 		const auto rt = get_vr<u8[16]>(op.rt);
-		// Checking for zero doeesn't care about the order of the bytes,
+		// Checking for zero doesn't care about the order of the bytes,
 		// so load the data before it's byteswapped
 		if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
 		{
@@ -9246,6 +9257,22 @@ public:
 			return;
 		}
 		const auto ox = get_vr<u32[4]>(op.rt);
 		// Instead of extracting the value generated by orx, just test the input to orx with ptest
 		if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
 		{
 			m_block->block_end = m_ir->GetInsertBlock();
 			const auto a = extract(bitcast<u64[2]>(as), 0);
 			const auto b = extract(bitcast<u64[2]>(as), 1);
 			const auto cond = eval((a | b) == 0);
 			const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
 			const auto target = add_block_indirect(op, addr);
 			m_ir->CreateCondBr(cond.value, target, add_block_next());
 			return;
 		}
 		// Check sign bit instead (optimization)
 		if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
 		{
@@ -9279,7 +9306,7 @@ public:
 		const auto rt = get_vr<u8[16]>(op.rt);
-		// Checking for zero doeesn't care about the order of the bytes,
+		// Checking for zero doesn't care about the order of the bytes,
 		// so load the data before it's byteswapped
 		if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
 		{
@@ -9291,6 +9318,21 @@ public:
 			return;
 		}
 		const auto ox = get_vr<u32[4]>(op.rt);
 		// Instead of extracting the value generated by orx, just test the input to orx with ptest
 		if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
 		{
 			m_block->block_end = m_ir->GetInsertBlock();
 			const auto a = extract(bitcast<u64[2]>(as), 0);
 			const auto b = extract(bitcast<u64[2]>(as), 1);
 			const auto cond = eval((a | b) != 0);
 			const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
 			const auto target = add_block_indirect(op, addr);
 			m_ir->CreateCondBr(cond.value, target, add_block_next());
 			return;
 		}
 		// Check sign bit instead (optimization)
 		if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
@@ -9514,7 +9556,7 @@ public:
 		const auto rt = get_vr<u8[16]>(op.rt);
-		// Checking for zero doeesn't care about the order of the bytes,
+		// Checking for zero doesn't care about the order of the bytes,
 		// so load the data before it's byteswapped
 		if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
 		{
@@ -9527,6 +9569,23 @@ public:
 			}
 		}
 		const auto ox = get_vr<u32[4]>(op.rt);
 		// Instead of extracting the value generated by orx, just test the input to orx with ptest
 		if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
 		{
 			if (target != m_pos + 4)
 			{
 				m_block->block_end = m_ir->GetInsertBlock();
 				const auto a = extract(bitcast<u64[2]>(as), 0);
 				const auto b = extract(bitcast<u64[2]>(as), 1);
 				const auto cond = eval((a | b) == 0);
 				m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
 				return;
 			}
 		}
 		// Check sign bit instead (optimization)
 		if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
 		{
@@ -9573,7 +9632,7 @@ public:
 		const auto rt = get_vr<u8[16]>(op.rt);
-		// Checking for zero doeesn't care about the order of the bytes,
+		// Checking for zero doesn't care about the order of the bytes,
 		// so load the data before it's byteswapped
 		if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
 		{
@@ -9586,6 +9645,22 @@ public:
 			}
 		}
 		const auto ox = get_vr<u32[4]>(op.rt);
 		// Instead of extracting the value generated by orx, just test the input to orx with ptest
 		if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
 		{
 			if (target != m_pos + 4)
 			{
 				m_block->block_end = m_ir->GetInsertBlock();
 				const auto a = extract(bitcast<u64[2]>(as), 0);
 				const auto b = extract(bitcast<u64[2]>(as), 1);
 				const auto cond = eval((a | b) != 0);
 				m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
 				return;
 			}
 		}
 		// Check sign bit instead (optimization)
 		if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
 		{