SPU LLVM: Use 512bit xorsum for SPU verification

- Provides a 2-3% uplift in SPU-limited titles - Removes the full_width_avx512 option - Adds a precise SPU verification option, for debugging (config file only)
2025-07-03 05:21:25 +12:00 · 2025-01-30 16:00:20 -05:00 · 2025-01-30 16:00:20 -05:00 · 506d92107c
commit 506d92107c
parent 665bb83297
2 changed files with 153 additions and 70 deletions
--- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@ -1652,7 +1652,7 @@ public:
 			u32 elements;
 			u32 dwords;
-			if (m_use_avx512 && g_cfg.core.full_width_avx512)
+			if (m_use_avx512)
 			{
 				stride = 64;
 				elements = 16;
@ -1677,94 +1677,177 @@ public:
 			llvm::Value* acc = nullptr;
-			for (u32 j = starta; j < end; j += stride)
+			// Use 512bit xorsum to verify integrity if size is at least 512b * 3 (192 bytes)
 			// This code uses a 512bit vector for all hardware to ensure behavior matches.
 			// The xorsum path is still faster even on narrow hardware.
 			if ((end - starta) >= 192 && !g_cfg.core.precise_spu_verification)
 			{
-				int indices[16];
+				for (u32 j = starta; j < end; j += 64)
 				bool holes = false;
 				bool data = false;
 				for (u32 i = 0; i < elements; i++)
 				{
-					const u32 k = j + i * 4;
+					int indices[16];
 					bool holes = false;
 					bool data = false;
-					if (k < start || k >= end || !func.data[(k - start) / 4])
+					for (u32 i = 0; i < 16; i++)
 					{
-						indices[i] = elements;
+						const u32 k = j + i * 4;
-						holes      = true;
+
 						if (k < start || k >= end || !func.data[(k - start) / 4])
 						{
 							indices[i] = 16;
 							holes      = true;
 						}
 						else
 						{
 							indices[i] = i;
 							data       = true;
 						}
 					}
-					else
+
 					if (!data)
 					{
-						indices[i] = i;
+						// Skip full-sized holes
-						data       = true;
+						continue;
 					}
 				}
-				if (!data)
+					llvm::Value* vls = nullptr;
 				{
 					// Skip full-sized holes
 					continue;
 				}
-				llvm::Value* vls = nullptr;
+					// Load unaligned code block from LS
 				// Load unaligned code block from LS
 				if (m_use_avx512 && g_cfg.core.full_width_avx512)
 				{
 					vls = m_ir->CreateAlignedLoad(get_type<u32[16]>(), _ptr<u32[16]>(data_addr, j - starta), llvm::MaybeAlign{4});
 					// Mask if necessary
 					if (holes)
 					{
 						vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, 16));
 					}
 					acc = acc ? m_ir->CreateXor(acc, vls) : vls;
 					check_iterations++;
 				}
-				else if (m_use_avx)
+
 				// Create the Xorsum
 				u32 xorsum[16] = {0};
 				for (u32 j = 0; j < func.data.size(); j += 16) // Process 16 elements per iteration
 				{
-					vls = m_ir->CreateAlignedLoad(get_type<u32[8]>(), _ptr<u32[8]>(data_addr, j - starta), llvm::MaybeAlign{4});
+					for (u32 i = 0; i < 16; i++)
-				}
+					{
-				else
+						if (j + i < func.data.size())
-				{
+						{
-					vls = m_ir->CreateAlignedLoad(get_type<u32[4]>(), _ptr<u32[4]>(data_addr, j - starta), llvm::MaybeAlign{4});
+							xorsum[i] ^= func.data[j + i];
 						}
 					}
 				}
-				// Mask if necessary
+				auto* const_vector = ConstantDataVector::get(m_context, llvm::ArrayRef(xorsum, 16));
-				if (holes)
+				acc = m_ir->CreateXor(acc, const_vector);
 				{
 					vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements));
 				}
-				// Perform bitwise comparison and accumulate
+				// Pattern for PTEST
 				u32 words[16];
 				for (u32 i = 0; i < elements; i++)
 				{
 					const u32 k = j + i * 4;
 					words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
 				}
 				vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements)));
 				acc = acc ? m_ir->CreateOr(acc, vls) : vls;
 				check_iterations++;
 			}
 			// Pattern for PTEST
 			if (m_use_avx512 && g_cfg.core.full_width_avx512)
 			{
 				acc = m_ir->CreateBitCast(acc, get_type<u64[8]>());
-			}
+
-			else if (m_use_avx)
+				llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
-			{
+
-				acc = m_ir->CreateBitCast(acc, get_type<u64[4]>());
+				for (u32 i = 1; i < 8; i++)
 				{
 					elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
 				}
 				spu_log.error("end");
 				// Compare result with zero
 				const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
 				m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
 			}
 			else
 			{
-				acc = m_ir->CreateBitCast(acc, get_type<u64[2]>());
+				for (u32 j = starta; j < end; j += stride)
 				{
 					int indices[16];
 					bool holes = false;
 					bool data = false;
 					for (u32 i = 0; i < elements; i++)
 					{
 						const u32 k = j + i * 4;
 						if (k < start || k >= end || !func.data[(k - start) / 4])
 						{
 							indices[i] = elements;
 							holes      = true;
 						}
 						else
 						{
 							indices[i] = i;
 							data       = true;
 						}
 					}
 					if (!data)
 					{
 						// Skip full-sized holes
 						continue;
 					}
 					llvm::Value* vls = nullptr;
 					// Load unaligned code block from LS
 					if (m_use_avx512)
 					{
 						vls = m_ir->CreateAlignedLoad(get_type<u32[16]>(), _ptr<u32[16]>(data_addr, j - starta), llvm::MaybeAlign{4});
 					}
 					else if (m_use_avx)
 					{
 						vls = m_ir->CreateAlignedLoad(get_type<u32[8]>(), _ptr<u32[8]>(data_addr, j - starta), llvm::MaybeAlign{4});
 					}
 					else
 					{
 						vls = m_ir->CreateAlignedLoad(get_type<u32[4]>(), _ptr<u32[4]>(data_addr, j - starta), llvm::MaybeAlign{4});
 					}
 					// Mask if necessary
 					if (holes)
 					{
 						vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements));
 					}
 					// Perform bitwise comparison and accumulate
 					u32 words[16];
 					for (u32 i = 0; i < elements; i++)
 					{
 						const u32 k = j + i * 4;
 						words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
 					}
 					vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements)));
 					acc = acc ? m_ir->CreateOr(acc, vls) : vls;
 					check_iterations++;
 				}
 				// Pattern for PTEST
 				if (m_use_avx512)
 				{
 					acc = m_ir->CreateBitCast(acc, get_type<u64[8]>());
 				}
 				else if (m_use_avx)
 				{
 					acc = m_ir->CreateBitCast(acc, get_type<u64[4]>());
 				}
 				else
 				{
 					acc = m_ir->CreateBitCast(acc, get_type<u64[2]>());
 				}
 				llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
 				for (u32 i = 1; i < dwords; i++)
 				{
 					elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
 				}
 				// Compare result with zero
 				const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
 				m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
 			}
 			llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
 			for (u32 i = 1; i < dwords; i++)
 			{
 				elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
 			}
 			// Compare result with zero
 			const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
 			m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
 		}
 		// Increase block counter with statistics
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@ -68,7 +68,7 @@ struct cfg_root : cfg::node
 		cfg::_enum<xfloat_accuracy> spu_xfloat_accuracy{ this, "XFloat Accuracy", xfloat_accuracy::approximate, false };
 		cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
 		cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
-		cfg::_bool full_width_avx512{ this, "Full Width AVX-512", true };
+		cfg::_bool precise_spu_verification{ this, "Precise SPU Verification", false }; // Disables use of xorsum based spu verification if enabled.
 		cfg::_bool ppu_llvm_nj_fixup{ this, "PPU LLVM Java Mode Handling", true }; // Partially respect current Java Mode for alti-vec ops by PPU LLVM
 		cfg::_bool use_accurate_dfma{ this, "Use Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
 		cfg::_bool ppu_set_sat_bit{ this, "PPU Set Saturation Bit", false }; // Accuracy. If unset, completely disable saturation flag handling.