SPU LLVM: Optimize branch following ORX

- test the input of ORX directly for zeroes, instead of the result
This commit is contained in:
Malcolm Jestadt 2021-10-26 04:56:47 -04:00 committed by Ivan
parent ba727e13ae
commit 3fde455932

View file

@ -6889,12 +6889,23 @@ public:
set_vr(op.rt, pshufb(a, sh)); set_vr(op.rt, pshufb(a, sh));
} }
// Wrap the expression `a` in an llvm_calli node named "spu_orx" whose result
// type is u32[4]; `a` is perfect-forwarded in as the node's only operand.
// ORX() registers the intrinsic handler for the same "spu_orx" name, and other
// code in this file recovers the original operand by matching a register's
// expression against orx(match<u32[4]>()).
template <typename T>
static llvm_calli<u32[4], T> orx(T&& a)
{
return {"spu_orx", {std::forward<T>(a)}};
}
void ORX(spu_opcode_t op) void ORX(spu_opcode_t op)
{ {
const auto a = get_vr(op.ra); register_intrinsic("spu_orx", [&](llvm::CallInst* ci)
{
const auto a = value<u32[4]>(ci->getOperand(0));
const auto x = zshuffle(a, 2, 3, 0, 1) | a; const auto x = zshuffle(a, 2, 3, 0, 1) | a;
const auto y = zshuffle(x, 1, 0, 3, 2) | x; const auto y = zshuffle(x, 1, 0, 3, 2) | x;
set_vr(op.rt, zshuffle(y, 4, 4, 4, 3)); return zshuffle(y, 4, 4, 4, 3);
});
set_vr(op.rt, orx(get_vr(op.ra)));
} }
void CBD(spu_opcode_t op) void CBD(spu_opcode_t op)
@ -9234,7 +9245,7 @@ public:
const auto rt = get_vr<u8[16]>(op.rt); const auto rt = get_vr<u8[16]>(op.rt);
// Checking for zero doeesn't care about the order of the bytes, // Checking for zero doesn't care about the order of the bytes,
// so load the data before it's byteswapped // so load the data before it's byteswapped
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok) if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
{ {
@ -9246,6 +9257,22 @@ public:
return; return;
} }
const auto ox = get_vr<u32[4]>(op.rt);
// Instead of extracting the value generated by orx, just test the input to orx with ptest
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
{
m_block->block_end = m_ir->GetInsertBlock();
const auto a = extract(bitcast<u64[2]>(as), 0);
const auto b = extract(bitcast<u64[2]>(as), 1);
const auto cond = eval((a | b) == 0);
const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
const auto target = add_block_indirect(op, addr);
m_ir->CreateCondBr(cond.value, target, add_block_next());
return;
}
// Check sign bit instead (optimization) // Check sign bit instead (optimization)
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP) if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
{ {
@ -9279,7 +9306,7 @@ public:
const auto rt = get_vr<u8[16]>(op.rt); const auto rt = get_vr<u8[16]>(op.rt);
// Checking for zero doeesn't care about the order of the bytes, // Checking for zero doesn't care about the order of the bytes,
// so load the data before it's byteswapped // so load the data before it's byteswapped
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok) if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
{ {
@ -9291,6 +9318,21 @@ public:
return; return;
} }
const auto ox = get_vr<u32[4]>(op.rt);
// Instead of extracting the value generated by orx, just test the input to orx with ptest
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
{
m_block->block_end = m_ir->GetInsertBlock();
const auto a = extract(bitcast<u64[2]>(as), 0);
const auto b = extract(bitcast<u64[2]>(as), 1);
const auto cond = eval((a | b) != 0);
const auto addr = eval(extract(get_vr(op.ra), 3) & 0x3fffc);
const auto target = add_block_indirect(op, addr);
m_ir->CreateCondBr(cond.value, target, add_block_next());
return;
}
// Check sign bit instead (optimization) // Check sign bit instead (optimization)
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP) if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
@ -9514,7 +9556,7 @@ public:
const auto rt = get_vr<u8[16]>(op.rt); const auto rt = get_vr<u8[16]>(op.rt);
// Checking for zero doeesn't care about the order of the bytes, // Checking for zero doesn't care about the order of the bytes,
// so load the data before it's byteswapped // so load the data before it's byteswapped
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok) if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
{ {
@ -9527,6 +9569,23 @@ public:
} }
} }
const auto ox = get_vr<u32[4]>(op.rt);
// Instead of extracting the value generated by orx, just test the input to orx with ptest
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
{
if (target != m_pos + 4)
{
m_block->block_end = m_ir->GetInsertBlock();
const auto a = extract(bitcast<u64[2]>(as), 0);
const auto b = extract(bitcast<u64[2]>(as), 1);
const auto cond = eval((a | b) == 0);
m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
return;
}
}
// Check sign bit instead (optimization) // Check sign bit instead (optimization)
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP) if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
{ {
@ -9573,7 +9632,7 @@ public:
const auto rt = get_vr<u8[16]>(op.rt); const auto rt = get_vr<u8[16]>(op.rt);
// Checking for zero doeesn't care about the order of the bytes, // Checking for zero doesn't care about the order of the bytes,
// so load the data before it's byteswapped // so load the data before it's byteswapped
if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok) if (auto [ok, as] = match_expr(rt, byteswap(match<u8[16]>())); ok)
{ {
@ -9586,6 +9645,22 @@ public:
} }
} }
const auto ox = get_vr<u32[4]>(op.rt);
// Instead of extracting the value generated by orx, just test the input to orx with ptest
if (auto [ok, as] = match_expr(ox, orx(match<u32[4]>())); ok)
{
if (target != m_pos + 4)
{
m_block->block_end = m_ir->GetInsertBlock();
const auto a = extract(bitcast<u64[2]>(as), 0);
const auto b = extract(bitcast<u64[2]>(as), 1);
const auto cond = eval((a | b) != 0);
m_ir->CreateCondBr(cond.value, add_block(target), add_block(m_pos + 4));
return;
}
}
// Check sign bit instead (optimization) // Check sign bit instead (optimization)
if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP) if (match_vr<s32[4], s64[2]>(op.rt, [&](auto c, auto MP)
{ {