Implement simd_builder for x86

ASMJIT-based tool for building vectorized loops (such as ones in BufferUtils.cpp)
This commit is contained in:
Nekotekina 2022-08-24 19:36:37 +03:00 committed by Ivan
parent 698c3415ea
commit e28707055b
5 changed files with 740 additions and 387 deletions

View file

@ -51,6 +51,8 @@ using native_asm = asmjit::a64::Assembler;
using native_args = std::array<asmjit::a64::Gp, 4>;
#endif
union v128;
void jit_announce(uptr func, usz size, std::string_view name);
void jit_announce(auto* func, usz size, std::string_view name)
@ -211,40 +213,132 @@ namespace asmjit
}
#if defined(ARCH_X64)
template <uint Size>
struct native_vec;
template <>
struct native_vec<16> { using type = x86::Xmm; };
template <>
struct native_vec<32> { using type = x86::Ymm; };
template <>
struct native_vec<64> { using type = x86::Zmm; };
template <uint Size>
using native_vec_t = typename native_vec<Size>::type;
// if (count > step) { for (; ctr < (count - step); ctr += step) {...} count -= ctr; }
inline void build_incomplete_loop(native_asm& c, auto ctr, auto count, u32 step, auto&& build)
struct simd_builder : native_asm
{
asmjit::Label body = c.newLabel();
asmjit::Label exit = c.newLabel();
Operand v0, v1, v2, v3, v4, v5;
ensure((step & (step - 1)) == 0);
c.cmp(count, step);
c.jbe(exit);
c.sub(count, step);
c.align(asmjit::AlignMode::kCode, 16);
c.bind(body);
build();
c.add(ctr, step);
c.sub(count, step);
c.ja(body);
c.add(count, step);
c.bind(exit);
}
uint vsize = 16;
uint vmask = 0;
simd_builder(CodeHolder* ch) noexcept;
void _init(bool full);
void vec_cleanup_ret();
void vec_set_all_zeros(const Operand& v);
void vec_set_all_ones(const Operand& v);
void vec_set_const(const Operand& v, const v128& value);
void vec_clobbering_test(u32 esize, const Operand& v, const Operand& rhs);
// return x86::ptr(base, ctr, X, 0) where X is set for esize accordingly
x86::Mem ptr_scale_for_vec(u32 esize, const x86::Gp& base, const x86::Gp& index);
void vec_load_unaligned(u32 esize, const Operand& v, const x86::Mem& src);
void vec_store_unaligned(u32 esize, const Operand& v, const x86::Mem& dst);
void vec_partial_move(u32 esize, const Operand& dst, const Operand& src);
void _vec_binary_op(x86::Inst::Id sse_op, x86::Inst::Id vex_op, x86::Inst::Id evex_op, const Operand& dst, const Operand& lhs, const Operand& rhs);
void vec_shuffle_xi8(const Operand& dst, const Operand& lhs, const Operand& rhs)
{
using enum x86::Inst::Id;
_vec_binary_op(kIdPshufb, kIdVpshufb, kIdVpshufb, dst, lhs, rhs);
}
void vec_xor(u32, const Operand& dst, const Operand& lhs, const Operand& rhs)
{
using enum x86::Inst::Id;
_vec_binary_op(kIdPxor, kIdVpxor, kIdVpxord, dst, lhs, rhs);
}
void vec_or(u32, const Operand& dst, const Operand& lhs, const Operand& rhs)
{
using enum x86::Inst::Id;
_vec_binary_op(kIdPor, kIdVpor, kIdVpord, dst, lhs, rhs);
}
void vec_umin(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs);
void vec_umax(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs);
void vec_umin_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp);
void vec_umax_horizontal_i128(u32 esize, const x86::Gp& dst, const Operand& src, const Operand& tmp);
simd_builder& keep_if_not_masked()
{
if (vmask && vmask < 8)
{
this->k(x86::KReg(vmask));
}
return *this;
}
simd_builder& zero_if_not_masked()
{
if (vmask && vmask < 8)
{
this->k(x86::KReg(vmask));
this->z();
}
return *this;
}
void build_loop(u32 esize, auto reg_ctr, auto reg_cnt, auto&& build, auto&& reduce)
{
ensure((esize & (esize - 1)) == 0);
ensure(esize <= vsize);
Label body = this->newLabel();
Label next = this->newLabel();
Label exit = this->newLabel();
const u32 step = vsize / esize;
this->xor_(reg_ctr.r32(), reg_ctr.r32()); // Reset counter reg
this->sub(reg_cnt, step);
this->jb(next); // If count < step, skip main loop body
this->align(AlignMode::kCode, 16);
this->bind(body);
build();
this->add(reg_ctr, step);
this->sub(reg_cnt, step);
this->ja(body);
this->bind(next);
if (!vmask)
reduce();
this->add(reg_cnt, step);
this->jz(exit);
if (vmask)
{
// Build single last iteration (masked)
static constexpr u64 all_ones = -1;
this->bzhi(reg_cnt, x86::Mem(uptr(&all_ones)), reg_cnt);
this->kmovq(x86::k7, reg_cnt);
vmask = 7;
build();
vmask = -1;
reduce();
}
else
{
// Build tail loop (reduced vector width)
Label body = this->newLabel();
this->align(AlignMode::kCode, 16);
this->bind(body);
const uint vsz = vsize / step;
this->_init(false);
vsize = vsz;
build();
this->_init(true);
this->inc(reg_ctr);
this->sub(reg_cnt, 1);
this->ja(body);
}
this->bind(exit);
}
};
// for (; count > 0; ctr++, count--)
inline void build_loop(native_asm& c, auto ctr, auto count, auto&& build)
@ -262,6 +356,27 @@ namespace asmjit
c.ja(body);
c.bind(exit);
}
inline void maybe_flush_lbr(native_asm& c, uint count = 2)
{
// Workaround for bad LBR callstacks which happen in some situations (mainly TSX) - execute additional RETs
Label next = c.newLabel();
c.lea(x86::rcx, x86::qword_ptr(next));
for (u32 i = 0; i < count; i++)
{
c.push(x86::rcx);
c.sub(x86::rcx, 16);
}
for (u32 i = 0; i < count; i++)
{
c.ret();
c.align(asmjit::AlignMode::kCode, 16);
}
c.bind(next);
}
#endif
}