mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-03 05:21:25 +12:00
SPU LLVM: Use 512bit xorsum for SPU verification
- Provides a 2-3% uplift in SPU-limited titles - Removes the full_width_avx512 option - Adds a precise SPU verification option for debugging (config file only)
This commit is contained in:
parent
665bb83297
commit
506d92107c
2 changed files with 153 additions and 70 deletions
|
@ -1652,7 +1652,7 @@ public:
|
||||||
u32 elements;
|
u32 elements;
|
||||||
u32 dwords;
|
u32 dwords;
|
||||||
|
|
||||||
if (m_use_avx512 && g_cfg.core.full_width_avx512)
|
if (m_use_avx512)
|
||||||
{
|
{
|
||||||
stride = 64;
|
stride = 64;
|
||||||
elements = 16;
|
elements = 16;
|
||||||
|
@ -1677,94 +1677,177 @@ public:
|
||||||
|
|
||||||
llvm::Value* acc = nullptr;
|
llvm::Value* acc = nullptr;
|
||||||
|
|
||||||
for (u32 j = starta; j < end; j += stride)
|
// Use 512-bit xorsum to verify integrity if size is at least 3 * 512 bits (192 bytes)
|
||||||
|
// This code uses a 512bit vector for all hardware to ensure behavior matches.
|
||||||
|
// The xorsum path is still faster even on narrow hardware.
|
||||||
|
if ((end - starta) >= 192 && !g_cfg.core.precise_spu_verification)
|
||||||
{
|
{
|
||||||
int indices[16];
|
for (u32 j = starta; j < end; j += 64)
|
||||||
bool holes = false;
|
|
||||||
bool data = false;
|
|
||||||
|
|
||||||
for (u32 i = 0; i < elements; i++)
|
|
||||||
{
|
{
|
||||||
const u32 k = j + i * 4;
|
int indices[16];
|
||||||
|
bool holes = false;
|
||||||
|
bool data = false;
|
||||||
|
|
||||||
if (k < start || k >= end || !func.data[(k - start) / 4])
|
for (u32 i = 0; i < 16; i++)
|
||||||
{
|
{
|
||||||
indices[i] = elements;
|
const u32 k = j + i * 4;
|
||||||
holes = true;
|
|
||||||
|
if (k < start || k >= end || !func.data[(k - start) / 4])
|
||||||
|
{
|
||||||
|
indices[i] = 16;
|
||||||
|
holes = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
indices[i] = i;
|
||||||
|
data = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
|
||||||
|
if (!data)
|
||||||
{
|
{
|
||||||
indices[i] = i;
|
// Skip full-sized holes
|
||||||
data = true;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (!data)
|
llvm::Value* vls = nullptr;
|
||||||
{
|
|
||||||
// Skip full-sized holes
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
llvm::Value* vls = nullptr;
|
// Load unaligned code block from LS
|
||||||
|
|
||||||
// Load unaligned code block from LS
|
|
||||||
if (m_use_avx512 && g_cfg.core.full_width_avx512)
|
|
||||||
{
|
|
||||||
vls = m_ir->CreateAlignedLoad(get_type<u32[16]>(), _ptr<u32[16]>(data_addr, j - starta), llvm::MaybeAlign{4});
|
vls = m_ir->CreateAlignedLoad(get_type<u32[16]>(), _ptr<u32[16]>(data_addr, j - starta), llvm::MaybeAlign{4});
|
||||||
|
|
||||||
|
// Mask if necessary
|
||||||
|
if (holes)
|
||||||
|
{
|
||||||
|
vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
acc = acc ? m_ir->CreateXor(acc, vls) : vls;
|
||||||
|
check_iterations++;
|
||||||
}
|
}
|
||||||
else if (m_use_avx)
|
|
||||||
|
// Create the Xorsum
|
||||||
|
u32 xorsum[16] = {0};
|
||||||
|
|
||||||
|
for (u32 j = 0; j < func.data.size(); j += 16) // Process 16 elements per iteration
|
||||||
{
|
{
|
||||||
vls = m_ir->CreateAlignedLoad(get_type<u32[8]>(), _ptr<u32[8]>(data_addr, j - starta), llvm::MaybeAlign{4});
|
for (u32 i = 0; i < 16; i++)
|
||||||
}
|
{
|
||||||
else
|
if (j + i < func.data.size())
|
||||||
{
|
{
|
||||||
vls = m_ir->CreateAlignedLoad(get_type<u32[4]>(), _ptr<u32[4]>(data_addr, j - starta), llvm::MaybeAlign{4});
|
xorsum[i] ^= func.data[j + i];
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mask if necessary
|
auto* const_vector = ConstantDataVector::get(m_context, llvm::ArrayRef(xorsum, 16));
|
||||||
if (holes)
|
acc = m_ir->CreateXor(acc, const_vector);
|
||||||
{
|
|
||||||
vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Perform bitwise comparison and accumulate
|
// Pattern for PTEST
|
||||||
u32 words[16];
|
|
||||||
|
|
||||||
for (u32 i = 0; i < elements; i++)
|
|
||||||
{
|
|
||||||
const u32 k = j + i * 4;
|
|
||||||
words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements)));
|
|
||||||
acc = acc ? m_ir->CreateOr(acc, vls) : vls;
|
|
||||||
check_iterations++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Pattern for PTEST
|
|
||||||
if (m_use_avx512 && g_cfg.core.full_width_avx512)
|
|
||||||
{
|
|
||||||
acc = m_ir->CreateBitCast(acc, get_type<u64[8]>());
|
acc = m_ir->CreateBitCast(acc, get_type<u64[8]>());
|
||||||
}
|
|
||||||
else if (m_use_avx)
|
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
|
||||||
{
|
|
||||||
acc = m_ir->CreateBitCast(acc, get_type<u64[4]>());
|
for (u32 i = 1; i < 8; i++)
|
||||||
|
{
|
||||||
|
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
|
||||||
|
}
|
||||||
|
|
||||||
|
spu_log.error("end");
|
||||||
|
|
||||||
|
// Compare result with zero
|
||||||
|
const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
|
||||||
|
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
acc = m_ir->CreateBitCast(acc, get_type<u64[2]>());
|
for (u32 j = starta; j < end; j += stride)
|
||||||
|
{
|
||||||
|
int indices[16];
|
||||||
|
bool holes = false;
|
||||||
|
bool data = false;
|
||||||
|
|
||||||
|
for (u32 i = 0; i < elements; i++)
|
||||||
|
{
|
||||||
|
const u32 k = j + i * 4;
|
||||||
|
|
||||||
|
if (k < start || k >= end || !func.data[(k - start) / 4])
|
||||||
|
{
|
||||||
|
indices[i] = elements;
|
||||||
|
holes = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
indices[i] = i;
|
||||||
|
data = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!data)
|
||||||
|
{
|
||||||
|
// Skip full-sized holes
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::Value* vls = nullptr;
|
||||||
|
|
||||||
|
// Load unaligned code block from LS
|
||||||
|
if (m_use_avx512)
|
||||||
|
{
|
||||||
|
vls = m_ir->CreateAlignedLoad(get_type<u32[16]>(), _ptr<u32[16]>(data_addr, j - starta), llvm::MaybeAlign{4});
|
||||||
|
}
|
||||||
|
else if (m_use_avx)
|
||||||
|
{
|
||||||
|
vls = m_ir->CreateAlignedLoad(get_type<u32[8]>(), _ptr<u32[8]>(data_addr, j - starta), llvm::MaybeAlign{4});
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
vls = m_ir->CreateAlignedLoad(get_type<u32[4]>(), _ptr<u32[4]>(data_addr, j - starta), llvm::MaybeAlign{4});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mask if necessary
|
||||||
|
if (holes)
|
||||||
|
{
|
||||||
|
vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), llvm::ArrayRef(indices, elements));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perform bitwise comparison and accumulate
|
||||||
|
u32 words[16];
|
||||||
|
|
||||||
|
for (u32 i = 0; i < elements; i++)
|
||||||
|
{
|
||||||
|
const u32 k = j + i * 4;
|
||||||
|
words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::ArrayRef(words, elements)));
|
||||||
|
acc = acc ? m_ir->CreateOr(acc, vls) : vls;
|
||||||
|
check_iterations++;
|
||||||
|
}
|
||||||
|
// Pattern for PTEST
|
||||||
|
if (m_use_avx512)
|
||||||
|
{
|
||||||
|
acc = m_ir->CreateBitCast(acc, get_type<u64[8]>());
|
||||||
|
}
|
||||||
|
else if (m_use_avx)
|
||||||
|
{
|
||||||
|
acc = m_ir->CreateBitCast(acc, get_type<u64[4]>());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
acc = m_ir->CreateBitCast(acc, get_type<u64[2]>());
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
|
||||||
|
|
||||||
|
for (u32 i = 1; i < dwords; i++)
|
||||||
|
{
|
||||||
|
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare result with zero
|
||||||
|
const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
|
||||||
|
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
|
||||||
}
|
}
|
||||||
|
|
||||||
llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0});
|
|
||||||
|
|
||||||
for (u32 i = 1; i < dwords; i++)
|
|
||||||
{
|
|
||||||
elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compare result with zero
|
|
||||||
const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0));
|
|
||||||
m_ir->CreateCondBr(cond, label_diff, label_body, m_md_unlikely);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Increase block counter with statistics
|
// Increase block counter with statistics
|
||||||
|
|
|
@ -68,7 +68,7 @@ struct cfg_root : cfg::node
|
||||||
cfg::_enum<xfloat_accuracy> spu_xfloat_accuracy{ this, "XFloat Accuracy", xfloat_accuracy::approximate, false };
|
cfg::_enum<xfloat_accuracy> spu_xfloat_accuracy{ this, "XFloat Accuracy", xfloat_accuracy::approximate, false };
|
||||||
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
|
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
|
||||||
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
|
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
|
||||||
cfg::_bool full_width_avx512{ this, "Full Width AVX-512", true };
|
cfg::_bool precise_spu_verification{ this, "Precise SPU Verification", false }; // Disables use of xorsum based spu verification if enabled.
|
||||||
cfg::_bool ppu_llvm_nj_fixup{ this, "PPU LLVM Java Mode Handling", true }; // Partially respect current Java Mode for alti-vec ops by PPU LLVM
|
cfg::_bool ppu_llvm_nj_fixup{ this, "PPU LLVM Java Mode Handling", true }; // Partially respect current Java Mode for alti-vec ops by PPU LLVM
|
||||||
cfg::_bool use_accurate_dfma{ this, "Use Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
|
cfg::_bool use_accurate_dfma{ this, "Use Accurate DFMA", true }; // Enable accurate double-precision FMA for CPUs which do not support it natively
|
||||||
cfg::_bool ppu_set_sat_bit{ this, "PPU Set Saturation Bit", false }; // Accuracy. If unset, completely disable saturation flag handling.
|
cfg::_bool ppu_set_sat_bit{ this, "PPU Set Saturation Bit", false }; // Accuracy. If unset, completely disable saturation flag handling.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue