diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index d622fcdf21..848eda53f8 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -386,6 +386,12 @@ struct llvm_match_t
 		return value != nullptr;
 	}
 
+	template <typename... Args> // Args: objects exposing a comparable `value` member
+	bool eq(const Args&... args) const
+	{
+		return value != nullptr && (... && (args.value == value)); // set, and identical to every argument
+	}
+
 	llvm::Value* eval(llvm::IRBuilder<>* ir) const
 	{
 		return value;
@@ -405,6 +411,8 @@ struct llvm_match_t
 template <typename T, typename U = llvm_common_t<llvm_value_t<T>>>
 struct llvm_placeholder_t
 {
+	// TODO: add a placeholder that extracts actual constant values (u64, f64, vector, etc.)
+
 	using type = T;
 
 	llvm::Value* eval(llvm::IRBuilder<>* ir) const
@@ -416,7 +424,7 @@ struct llvm_placeholder_t
 	{
 		if (value && value->getType() == llvm_value_t<T>::get_type(value->getContext()))
 		{
-			return {value};
+			return {{value}};
 		}
 
 		value = nullptr;
@@ -1339,7 +1347,7 @@ struct llvm_cmp
 		llvm::Value* v1 = {};
 		llvm::Value* v2 = {};
 
-		if (auto i = llvm::dyn_cast_or_null<llvm::ICmpInst>(value); i && i->getOpcode() == pred)
+		if (auto i = llvm::dyn_cast_or_null<llvm::ICmpInst>(value); i && i->getPredicate() == pred)
 		{
 			v1 = i->getOperand(0);
 			v2 = i->getOperand(1);
@@ -1399,7 +1407,7 @@ struct llvm_ord
 		llvm::Value* v1 = {};
 		llvm::Value* v2 = {};
 
-		if (auto i = llvm::dyn_cast_or_null<llvm::FCmpInst>(value); i && i->getOpcode() == pred)
+		if (auto i = llvm::dyn_cast_or_null<llvm::FCmpInst>(value); i && i->getPredicate() == pred)
 		{
 			v1 = i->getOperand(0);
 			v2 = i->getOperand(1);
@@ -1452,7 +1460,7 @@ struct llvm_uno
 		llvm::Value* v1 = {};
 		llvm::Value* v2 = {};
 
-		if (auto i = llvm::dyn_cast_or_null<llvm::FCmpInst>(value); i && i->getOpcode() == pred)
+		if (auto i = llvm::dyn_cast_or_null<llvm::FCmpInst>(value); i && i->getPredicate() == pred)
 		{
 			v1 = i->getOperand(0);
 			v2 = i->getOperand(1);
@@ -1591,6 +1599,7 @@ struct llvm_bitcast
 	using type = U;
 
 	llvm_expr_t<A1> a1;
+	llvm::Module* module;
 
 	static constexpr uint bitsize0 = llvm_value_t<T>::is_vector ? llvm_value_t<T>::is_vector * llvm_value_t<T>::esize : llvm_value_t<T>::esize;
 	static constexpr uint bitsize1 = llvm_value_t<U>::is_vector ? llvm_value_t<U>::is_vector * llvm_value_t<U>::esize : llvm_value_t<U>::esize;
@@ -1598,8 +1607,6 @@ struct llvm_bitcast
 	static_assert(bitsize0 == bitsize1, "llvm_bitcast<>: invalid type (size mismatch)");
 	static_assert(llvm_value_t<T>::is_int || llvm_value_t<T>::is_float, "llvm_bitcast<>: invalid type");
 	static_assert(llvm_value_t<U>::is_int || llvm_value_t<U>::is_float, "llvm_bitcast<>: invalid result type");
-	static_assert(llvm_value_t<T>::is_int != llvm_value_t<U>::is_int || llvm_value_t<T>::is_vector != llvm_value_t<U>::is_vector,
-		"llvm_bitcast<>: no-op cast (use noncast)");
 
 	static constexpr bool is_ok =
 		bitsize0 && bitsize0 == bitsize1 &&
@@ -1611,9 +1618,13 @@ struct llvm_bitcast
 		const auto v1 = a1.eval(ir);
 		const auto rt = llvm_value_t<U>::get_type(ir->getContext());
 
+		if constexpr (llvm_value_t<T>::is_int == llvm_value_t<U>::is_int && llvm_value_t<T>::is_vector == llvm_value_t<U>::is_vector)
+		{
+			return v1;
+		}
+
 		if (const auto c1 = llvm::dyn_cast<llvm::Constant>(v1))
 		{
-			const auto module = ir->GetInsertBlock()->getParent()->getParent();
 			const auto result = llvm::ConstantFoldCastOperand(llvm::Instruction::BitCast, c1, rt, module->getDataLayout());
 
 			if (result)
@@ -1627,6 +1638,19 @@ struct llvm_bitcast
 
 	llvm_match_tuple<A1> match(llvm::Value*& value) const
 	{
+		if constexpr (llvm_value_t<T>::is_int == llvm_value_t<U>::is_int && llvm_value_t<T>::is_vector == llvm_value_t<U>::is_vector)
+		{
+			if (value)
+			{
+				if (auto r1 = a1.match(value); value)
+				{
+					return r1;
+				}
+			}
+
+			return {};
+		}
+
 		llvm::Value* v1 = {};
 
 		if (auto i = llvm::dyn_cast_or_null<llvm::CastInst>(value); i && i->getOpcode() == llvm::Instruction::BitCast)
@@ -1644,17 +1668,16 @@ struct llvm_bitcast
 
 		if (auto c = llvm::dyn_cast_or_null<llvm::Constant>(value))
 		{
-			// TODO
-			// const auto target = llvm_value_t<T>::get_type(c->getContext());
+			const auto target = llvm_value_t<T>::get_type(c->getContext());
 
-			// // Reverse bitcast on a constant
-			// if (llvm::Value* cv = llvm::ConstantFoldCastOperand(llvm::Instruction::BitCast, c, target, module->getDataLayout()))
-			// {
-			// 	if (auto r1 = a1.match(cv); cv)
-			// 	{
-			// 		return r1;
-			// 	}
-			// }
+			// Reverse bitcast on a constant
+			if (llvm::Value* cv = llvm::ConstantFoldCastOperand(llvm::Instruction::BitCast, c, target, module->getDataLayout()))
+			{
+				if (auto r1 = a1.match(cv); cv)
+				{
+					return r1;
+				}
+			}
 		}
 
 		value = nullptr;
@@ -1859,24 +1882,40 @@ struct llvm_min
 
 	static constexpr bool is_ok = llvm_value_t<T>::is_sint || llvm_value_t<T>::is_uint;
 
+	static constexpr auto pred = llvm_value_t<T>::is_sint ? llvm::ICmpInst::ICMP_SLT : llvm::ICmpInst::ICMP_ULT;
+
 	llvm::Value* eval(llvm::IRBuilder<>* ir) const
 	{
 		const auto v1 = a1.eval(ir);
 		const auto v2 = a2.eval(ir);
-
-		if constexpr (llvm_value_t<T>::is_sint)
-		{
-			return ir->CreateSelect(ir->CreateICmpSLT(v1, v2), v1, v2);
-		}
-
-		if constexpr (llvm_value_t<T>::is_uint)
-		{
-			return ir->CreateSelect(ir->CreateICmpULT(v1, v2), v1, v2);
-		}
+		return ir->CreateSelect(ir->CreateICmp(pred, v1, v2), v1, v2); // (v1 < v2) ? v1 : v2, signedness chosen by pred
 	}
 
 	llvm_match_tuple<A1, A2> match(llvm::Value*& value) const
 	{
+		// Recognize only the form eval() emits: select(icmp pred a, b), a, b
+		const auto sel = llvm::dyn_cast_or_null<llvm::SelectInst>(value);
+		const auto cmp = sel ? llvm::dyn_cast<llvm::ICmpInst>(sel->getCondition()) : nullptr;
+
+		if (cmp && cmp->getPredicate() == pred)
+		{
+			// For min the true arm is the first operand and the false arm the second
+			llvm::Value* lhs = sel->getTrueValue();
+			llvm::Value* rhs = sel->getFalseValue();
+
+			// Both arms must be exactly the two compared operands, in the same order
+			if (lhs == cmp->getOperand(0) && rhs == cmp->getOperand(1))
+			{
+				if (auto r1 = a1.match(lhs); lhs)
+				{
+					if (auto r2 = a2.match(rhs); rhs)
+					{
+						return std::tuple_cat(r1, r2);
+					}
+				}
+			}
+		}
+
 		value = nullptr;
 		return {};
 	}
@@ -1892,24 +1931,40 @@ struct llvm_max
 
 	static_assert(llvm_value_t<T>::is_sint || llvm_value_t<T>::is_uint, "llvm_max<>: invalid type");
 
+	static constexpr auto pred = llvm_value_t<T>::is_sint ? llvm::ICmpInst::ICMP_SLT : llvm::ICmpInst::ICMP_ULT;
+
 	llvm::Value* eval(llvm::IRBuilder<>* ir) const
 	{
 		const auto v1 = a1.eval(ir);
 		const auto v2 = a2.eval(ir);
-
-		if constexpr (llvm_value_t<T>::is_sint)
-		{
-			return ir->CreateSelect(ir->CreateICmpSLT(v1, v2), v2, v1);
-		}
-
-		if constexpr (llvm_value_t<T>::is_uint)
-		{
-			return ir->CreateSelect(ir->CreateICmpULT(v1, v2), v2, v1);
-		}
+		return ir->CreateSelect(ir->CreateICmp(pred, v1, v2), v2, v1); // (v1 < v2) ? v2 : v1, signedness chosen by pred
 	}
 
 	llvm_match_tuple<A1, A2> match(llvm::Value*& value) const
 	{
+		// Recognize only the form eval() emits: select(icmp pred a, b), b, a
+		const auto sel = llvm::dyn_cast_or_null<llvm::SelectInst>(value);
+		const auto cmp = sel ? llvm::dyn_cast<llvm::ICmpInst>(sel->getCondition()) : nullptr;
+
+		if (cmp && cmp->getPredicate() == pred)
+		{
+			// For max the arms are swapped: the false arm is the first operand
+			llvm::Value* lhs = sel->getFalseValue();
+			llvm::Value* rhs = sel->getTrueValue();
+
+			// Both arms must be exactly the two compared operands, in the same order
+			if (lhs == cmp->getOperand(0) && rhs == cmp->getOperand(1))
+			{
+				if (auto r1 = a1.match(lhs); lhs)
+				{
+					if (auto r2 = a2.match(rhs); rhs)
+					{
+						return std::tuple_cat(r1, r2);
+					}
+				}
+			}
+		}
+
 		value = nullptr;
 		return {};
 	}
@@ -2136,8 +2191,8 @@ struct llvm_insert
 		if (auto i = llvm::dyn_cast_or_null<llvm::InsertElementInst>(value))
 		{
 			v1 = i->getOperand(0);
-			v2 = i->getOperand(1);
-			v3 = i->getOperand(2);
+			v2 = i->getOperand(2);
+			v3 = i->getOperand(1);
 
 			if (auto r1 = a1.match(v1); v1)
 			{
@@ -2181,6 +2236,27 @@ struct llvm_splat
 
 	llvm_match_tuple<A1> match(llvm::Value*& value) const
 	{
+		// Recognize a splat: shufflevector(insertelement(_, x, 0), _, zeroinitializer)
+		if (auto i = llvm::dyn_cast_or_null<llvm::ShuffleVectorInst>(value))
+		{
+			if (llvm::isa<llvm::ConstantAggregateZero>(i->getOperand(2)))
+			{
+				if (auto j = llvm::dyn_cast<llvm::InsertElementInst>(i->getOperand(0)))
+				{
+					// The insertion index may be a non-constant value, so use a checked dyn_cast rather than an asserting cast
+					if (auto k = llvm::dyn_cast<llvm::ConstantInt>(j->getOperand(2)); k && k->isZero())
+					{
+						llvm::Value* v1 = j->getOperand(1);
+
+						if (auto r1 = a1.match(v1); v1)
+						{
+							return r1;
+						}
+					}
+				}
+			}
+		}
+
 		value = nullptr;
 		return {};
 	}
@@ -2207,6 +2283,24 @@ struct llvm_zshuffle
 
 	llvm_match_tuple<A1> match(llvm::Value*& value) const
 	{
+		// Pattern: shufflevector(x, zeroinitializer, index_array)
+		if (auto shuf = llvm::dyn_cast_or_null<llvm::ShuffleVectorInst>(value))
+		{
+			llvm::Value* src = shuf->getOperand(0);
+			// Second input must be an all-zero constant with the type of the first input
+			const auto zero = llvm::dyn_cast<llvm::ConstantAggregateZero>(shuf->getOperand(1));
+			const bool zero_ok = zero && zero->getType() == src->getType();
+
+			// The mask constant is only built once the zero operand has been validated
+			if (zero_ok && llvm::ConstantDataVector::get(value->getContext(), index_array) == shuf->getOperand(2))
+			{
+				if (auto r1 = a1.match(src); src)
+				{
+					return r1;
+				}
+			}
+		}
+
 		value = nullptr;
 		return {};
 	}
@@ -2235,6 +2329,29 @@ struct llvm_shuffle2
 
 	llvm_match_tuple<A1, A2> match(llvm::Value*& value) const
 	{
+		// Pattern: shufflevector(a, b, index_array) with both inputs of type T
+		if (auto shuf = llvm::dyn_cast_or_null<llvm::ShuffleVectorInst>(value))
+		{
+			llvm::Value* lhs = shuf->getOperand(0);
+			llvm::Value* rhs = shuf->getOperand(1);
+
+			// Type checks come first so the mask constant is only built when they pass
+			const auto ty = lhs->getType();
+			const bool typed = ty == rhs->getType() && ty == llvm_value_t<T>::get_type(lhs->getContext());
+
+			if (typed && llvm::ConstantDataVector::get(value->getContext(), index_array) == shuf->getOperand(2))
+			{
+				// Sub-patterns are tried in order; each one is checked through the value it was handed
+				if (auto r1 = a1.match(lhs); lhs)
+				{
+					if (auto r2 = a2.match(rhs); rhs)
+					{
+						return std::tuple_cat(r1, r2);
+					}
+				}
+			}
+		}
+
 		value = nullptr;
 		return {};
 	}
@@ -2304,6 +2421,27 @@ public:
 		return result;
 	}
 
+	template <typename T> // T: value type the placeholder is declared for
+	static llvm_placeholder_t<T> match()
+	{
+		return llvm_placeholder_t<T>{}; // fresh placeholder for use inside a pattern expression
+	}
+
+	template <typename T, typename U, typename = llvm_common_t<T, U>>
+	auto match_expr(T&& arg, U&& expr) -> decltype(std::tuple_cat(std::make_tuple(false), expr.match(std::declval<llvm::Value*&>())))
+	{
+		llvm::Value* root = arg.eval(m_ir); // value to test against the pattern `expr`
+		auto captures = expr.match(root); // must run before the null test below: match() receives root by reference
+		return std::tuple_cat(std::make_tuple(root != nullptr), captures);
+	}
+
+	template <typename... Types, typename F>
+	bool match_for(F&& pred)
+	{
+		// Try pred with a placeholder of each listed type, in order; stop at the first true result
+		return (... || pred(llvm_placeholder_t<Types>{}));
+	}
+
 	template <typename T, typename = std::enable_if_t<is_llvm_cmp<std::decay_t<T>>::value>>
 	static auto fcmp_ord(T&& cmp_expr)
 	{
@@ -2323,9 +2461,9 @@ public:
 	}
 
 	template <typename U, typename T, typename = std::enable_if_t<llvm_bitcast<U, T>::is_ok>>
-	static auto bitcast(T&& expr)
+	auto bitcast(T&& expr) // non-static: the returned expression stores m_module
 	{
-		return llvm_bitcast<U, T>{std::forward<T>(expr)};
+		return llvm_bitcast<U, T>{std::forward<T>(expr), m_module}; // llvm_bitcast reads the module's data layout when folding constants
 	}
 
 	template <typename U, typename T, typename = std::enable_if_t<llvm_trunc<U, T>::is_ok>>
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index b7d2cd5e59..56a4f7c288 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -2478,27 +2478,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 	{
 		verify("double_to_xfloat" HERE), val, val->getType() == get_type<f64[4]>();
 
-		// Detect xfloat_to_double to avoid unnecessary ops and prevent zeroed denormals
-		if (auto _bitcast = llvm::dyn_cast<llvm::CastInst>(val))
-		{
-			if (_bitcast->getOpcode() == llvm::Instruction::BitCast)
-			{
-				if (auto _select = llvm::dyn_cast<llvm::SelectInst>(_bitcast->getOperand(0)))
-				{
-					if (auto _icmp = llvm::dyn_cast<llvm::ICmpInst>(_select->getOperand(0)))
-					{
-						if (auto _and = llvm::dyn_cast<llvm::BinaryOperator>(_icmp->getOperand(0)))
-						{
-							if (auto _zext = llvm::dyn_cast<llvm::CastInst>(_and->getOperand(0)))
-							{
-								// TODO: check all details and return xfloat_to_double() arg
-							}
-						}
-					}
-				}
-			}
-		}
-
 		const auto d = double_as_uint64(val);
 		const auto s = m_ir->CreateAnd(m_ir->CreateLShr(d, 32), 0x80000000);
 		const auto m = m_ir->CreateXor(m_ir->CreateLShr(d, 29), 0x40000000);
@@ -2680,6 +2659,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 		return r;
 	}
 
+	template <typename U, uint I>
+	auto get_vr_as(U&&, const bf_t<u32, I, 7>& index) // first argument is unused; only its type U matters
+	{
+		return get_vr<typename llvm_expr_t<U>::type>(index); // get_vr with the value type of expression U
+	}
+
 	template <typename T = u32[4], typename... Args>
 	std::tuple<std::conditional_t<false, Args, value_t<T>>...> get_vrs(const Args&... args)
 	{
@@ -2705,12 +2690,51 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 		return r;
 	}
 
+	template <typename U, uint I>
+	auto match_vr_as(U&&, const bf_t<u32, I, 7>& index) // first argument is unused; only its type U matters
+	{
+		return match_vr<typename llvm_expr_t<U>::type>(index); // match_vr with the value type of expression U
+	}
+
+	template <typename... Types, uint I, typename F>
+	bool match_vr(const bf_t<u32, I, 7>& index, F&& pred) // tries each type in Types, left to right, until pred returns true
+	{
+		return ((match_vr<Types>(index) && pred(match_vr<Types>(index), match<Types>())) || ...); // NOTE(review): match_vr<Types>(index) is evaluated twice per type (guard, then argument)
+	}
+
 	template <typename T = u32[4], typename... Args>
 	std::tuple<std::conditional_t<false, Args, llvm_match_t<T>>...> match_vrs(const Args&... args)
 	{
 		return {match_vr<T>(args)...};
 	}
 
+	// Extract scalar value from the preferred slot.
+	// Preferred-slot element index by type:
+	//   1-byte elements: 12, 2-byte elements: 6,
+	//   4-byte elements and f64[4]: 3,
+	//   every other type accepted by the static_assert: 1.
+	template <typename T>
+	auto get_scalar(T&& value)
+	{
+		// Value type of the expression and its element type
+		using vec_t = typename llvm_expr_t<T>::type;
+		using elem_t = std::remove_extent_t<vec_t>;
+
+		// Accept 16-byte vectors and f64[4] only
+		static_assert(std::is_same_v<f64[4], vec_t> || sizeof(vec_t) == 16, "Unknown vector type");
+
+		// Resolved at compile time, first matching row wins
+		constexpr int slot =
+			sizeof(elem_t) == 1 ? 12 :
+			sizeof(elem_t) == 2 ? 6 :
+			sizeof(elem_t) == 4 ? 3 :
+			sizeof(vec_t) == 32 ? 3 :
+			1;
+
+		// Pass the index as a prvalue int, exactly like an integer literal
+		return extract(std::forward<T>(value), int{slot});
+	}
+
 	void set_reg_fixed(u32 index, llvm::Value* value, bool fixup = true)
 	{
 		llvm::StoreInst* dummy{};
@@ -4987,15 +5011,21 @@ public:
 
 	void AND(spu_opcode_t op)
 	{
-		if (const auto [a, b] = match_vrs<u8[16]>(op.ra, op.rb); a && b)
+		if (match_vr<u8[16], u16[8], u64[2]>(op.ra, [&](auto a, auto MP1)
 		{
-			set_vr(op.rt, a & b);
-			return;
-		}
+			if (auto b = match_vr_as(a, op.rb))
+			{
+				set_vr(op.rt, a & b);
+				return true;
+			}
 
-		if (const auto [a, b] = match_vrs<u16[8]>(op.ra, op.rb); a && b)
+			return match_vr<u8[16], u16[8], u64[2]>(op.rb, [&](auto b, auto MP2)
+			{
+				set_vr(op.rt, a & get_vr_as(a, op.rb));
+				return true;
+			});
+		}))
 		{
-			set_vr(op.rt, a & b);
 			return;
 		}
 
@@ -5086,36 +5116,37 @@ public:
 		set_vr(op.rt, pshufb(get_vr<u8[16]>(op.ra), sh));
 	}
 
+	template <typename RT, typename T>
+	auto spu_get_insertion_shuffle_mask(T&& index)
+	{
+		using elem_t = std::remove_extent_t<RT>; // element type of the requested mask view RT
+		const auto base = bitcast<RT>(build<u8[16]>(0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10));
+		const auto fill = splat<elem_t>(static_cast<elem_t>(sizeof(elem_t) == 8 ? 0x01020304050607ull : 0x010203ull)); // the cast narrows the constant to elem_t
+		return insert(base, std::forward<T>(index), fill); // overwrite element `index` of the base mask
+	}
+
 	void CBX(spu_opcode_t op)
 	{
-		const auto s = eval(extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3));
-		const auto i = eval(~s & 0xf);
-		auto r = build<u8[16]>(0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10);
-		set_vr(op.rt, insert(r, i, splat<u8>(0x03)));
+		const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); // sum of the preferred-slot scalars of ra and rb
+		set_vr(op.rt, spu_get_insertion_shuffle_mask<u8[16]>(~s & 0xf)); // element index: (~s) & 0xf
 	}
 
 	void CHX(spu_opcode_t op)
 	{
-		const auto s = eval(extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3));
-		const auto i = eval(~s >> 1 & 0x7);
-		auto r = build<u16[8]>(0x1e1f, 0x1c1d, 0x1a1b, 0x1819, 0x1617, 0x1415, 0x1213, 0x1011);
-		set_vr(op.rt, insert(r, i, splat<u16>(0x0203)));
+		const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); // sum of the preferred-slot scalars of ra and rb
+		set_vr(op.rt, spu_get_insertion_shuffle_mask<u16[8]>(~s >> 1 & 0x7)); // element index: ((~s) >> 1) & 0x7
 	}
 
 	void CWX(spu_opcode_t op)
 	{
-		const auto s = eval(extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3));
-		const auto i = eval(~s >> 2 & 0x3);
-		auto r = build<u32[4]>(0x1c1d1e1f, 0x18191a1b, 0x14151617, 0x10111213);
-		set_vr(op.rt, insert(r, i, splat<u32>(0x010203)));
+		const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); // sum of the preferred-slot scalars of ra and rb
+		set_vr(op.rt, spu_get_insertion_shuffle_mask<u32[4]>(~s >> 2 & 0x3)); // element index: ((~s) >> 2) & 0x3
 	}
 
 	void CDX(spu_opcode_t op)
 	{
-		const auto s = eval(extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3));
-		const auto i = eval(~s >> 3 & 0x1);
-		auto r = build<u64[2]>(0x18191a1b1c1d1e1f, 0x1011121314151617);
-		set_vr(op.rt, insert(r, i, splat<u64>(0x01020304050607)));
+		const auto s = get_scalar(get_vr(op.ra)) + get_scalar(get_vr(op.rb)); // sum of the preferred-slot scalars of ra and rb
+		set_vr(op.rt, spu_get_insertion_shuffle_mask<u64[2]>(~s >> 3 & 0x1)); // element index: ((~s) >> 3) & 0x1
 	}
 
 	void ROTQBI(spu_opcode_t op)
@@ -5176,34 +5207,26 @@ public:
 
 	void CBD(spu_opcode_t op)
 	{
-		const auto a = eval(extract(get_vr(op.ra), 3) + get_imm<u32>(op.i7));
-		const auto i = eval(~a & 0xf);
-		auto r = build<u8[16]>(0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10);
-		set_vr(op.rt, insert(r, i, splat<u8>(0x03)));
+		const auto a = get_scalar(get_vr(op.ra)) + get_imm<u32>(op.i7); // preferred-slot scalar of ra plus the immediate i7
+		set_vr(op.rt, spu_get_insertion_shuffle_mask<u8[16]>(~a & 0xf)); // element index: (~a) & 0xf
 	}
 
 	void CHD(spu_opcode_t op)
 	{
-		const auto a = eval(extract(get_vr(op.ra), 3) + get_imm<u32>(op.i7));
-		const auto i = eval(~a >> 1 & 0x7);
-		auto r = build<u16[8]>(0x1e1f, 0x1c1d, 0x1a1b, 0x1819, 0x1617, 0x1415, 0x1213, 0x1011);
-		set_vr(op.rt, insert(r, i, splat<u16>(0x0203)));
+		const auto a = get_scalar(get_vr(op.ra)) + get_imm<u32>(op.i7); // preferred-slot scalar of ra plus the immediate i7
+		set_vr(op.rt, spu_get_insertion_shuffle_mask<u16[8]>(~a >> 1 & 0x7)); // element index: ((~a) >> 1) & 0x7
 	}
 
 	void CWD(spu_opcode_t op)
 	{
-		const auto a = eval(extract(get_vr(op.ra), 3) + get_imm<u32>(op.i7));
-		const auto i = eval(~a >> 2 & 0x3);
-		auto r = build<u32[4]>(0x1c1d1e1f, 0x18191a1b, 0x14151617, 0x10111213);
-		set_vr(op.rt, insert(r, i, splat<u32>(0x010203)));
+		const auto a = get_scalar(get_vr(op.ra)) + get_imm<u32>(op.i7); // preferred-slot scalar of ra plus the immediate i7
+		set_vr(op.rt, spu_get_insertion_shuffle_mask<u32[4]>(~a >> 2 & 0x3)); // element index: ((~a) >> 2) & 0x3
 	}
 
 	void CDD(spu_opcode_t op)
 	{
-		const auto a = eval(extract(get_vr(op.ra), 3) + get_imm<u32>(op.i7));
-		const auto i = eval(~a >> 3 & 0x1);
-		auto r = build<u64[2]>(0x18191a1b1c1d1e1f, 0x1011121314151617);
-		set_vr(op.rt, insert(r, i, splat<u64>(0x01020304050607)));
+		const auto a = get_scalar(get_vr(op.ra)) + get_imm<u32>(op.i7); // preferred-slot scalar of ra plus the immediate i7
+		set_vr(op.rt, spu_get_insertion_shuffle_mask<u64[2]>(~a >> 3 & 0x1)); // element index: ((~a) >> 3) & 0x1
 	}
 
 	void ROTQBII(spu_opcode_t op)
@@ -5656,48 +5679,21 @@ public:
 
 	void SHUFB(spu_opcode_t op) //
 	{
-		if (auto ii = llvm::dyn_cast_or_null<llvm::InsertElementInst>(get_reg_raw(op.rc)))
+		if (match_vr<u8[16], u16[8], u32[4], u64[2]>(op.rc, [&](auto c, auto MP)
 		{
-			// Detect if the mask comes from a CWD-like constant generation instruction
-			auto c0 = llvm::dyn_cast<llvm::Constant>(ii->getOperand(0));
+			using VT = typename decltype(MP)::type;
 
-			if (c0 && get_const_vector(c0, m_pos, op.rc) != v128::from64(0x18191a1b1c1d1e1f, 0x1011121314151617))
+			// If the mask comes from a constant generation instruction, replace SHUFB with insert
+			if (auto [ok, i] = match_expr(c, spu_get_insertion_shuffle_mask<VT>(match<u32>())); ok)
 			{
-				c0 = nullptr;
+				set_vr(op.rt4, insert(get_vr_as(c, op.rb), i, get_scalar(get_vr_as(c, op.ra))));
+				return true;
 			}
 
-			auto c1 = llvm::dyn_cast<llvm::ConstantInt>(ii->getOperand(1));
-
-			llvm::Type* vtype = nullptr;
-			llvm::Value* _new = nullptr;
-
-			// Optimization: emit SHUFB as simple vector insert
-			if (c0 && c1 && c1->getType() == get_type<u64>() && c1->getZExtValue() == 0x01020304050607)
-			{
-				vtype = get_type<u64[2]>();
-				_new  = extract(get_vr<u64[2]>(op.ra), 1).eval(m_ir);
-			}
-			else if (c0 && c1 && c1->getType() == get_type<u32>() && c1->getZExtValue() == 0x010203)
-			{
-				vtype = get_type<u32[4]>();
-				_new  = extract(get_vr<u32[4]>(op.ra), 3).eval(m_ir);
-			}
-			else if (c0 && c1 && c1->getType() == get_type<u16>() && c1->getZExtValue() == 0x0203)
-			{
-				vtype = get_type<u16[8]>();
-				_new  = extract(get_vr<u16[8]>(op.ra), 6).eval(m_ir);
-			}
-			else if (c0 && c1 && c1->getType() == get_type<u8>() && c1->getZExtValue() == 0x03)
-			{
-				vtype = get_type<u8[16]>();
-				_new  = extract(get_vr<u8[16]>(op.ra), 12).eval(m_ir);
-			}
-
-			if (vtype && _new)
-			{
-				set_reg_fixed(op.rt4, m_ir->CreateInsertElement(get_reg_fixed(op.rb, vtype), _new, ii->getOperand(2)));
-				return;
-			}
+			return false;
+		}))
+		{
+			return;
 		}
 
 		const auto c = get_vr<u8[16]>(op.rc);