spu/arm64: clean up assembly code generation

Clean up asmjit usage so we no longer allocate memory unnecessarily
for SPURecompiler functions.
Authored by sguo35 on 2022-09-04 20:25:53 -04:00, committed by Ivan
parent 5e2424da58
commit a0d48c588a


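Note on reading the hunks below: the new code emits fixed AArch64 instruction words byte by byte, in little-endian order. As a minimal sketch (an illustrative helper, not part of the commit), the stub's first word, ldr x9, #16, follows the LDR (literal, 64-bit) encoding 0x58000000 | imm19 << 5 | Rt, where imm19 is the word offset to the literal; that is where the 89 00 00 58 byte sequence comes from.

#include <cstdint>
#include <cstdio>

// Sketch: LDR (literal, 64-bit) is 0x58000000 | imm19 << 5 | Rt.
// byte_offset must be a multiple of 4 and within +/-1 MiB of the instruction.
constexpr uint32_t ldr_x_literal(uint32_t rt, int32_t byte_offset)
{
	return 0x58000000u | ((static_cast<uint32_t>(byte_offset / 4) & 0x7FFFFu) << 5) | rt;
}

static_assert(ldr_x_literal(9, 16) == 0x58000089, "ldr x9, #16");

int main()
{
	// Little-endian byte order matches the *raw++ stores in the diff: 89 00 00 58
	const uint32_t insn = ldr_x_literal(9, 16);
	std::printf("%02X %02X %02X %02X\n", insn & 0xFF, (insn >> 8) & 0xFF, (insn >> 16) & 0xFF, insn >> 24);
}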
@@ -1038,43 +1038,73 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 	auto make_jump = [&](asmjit::arm::CondCode op, auto target)
 	{
+		// 36 bytes
 		// Fallback to dispatch if no target
 		const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
 
-		// Build with asmjit to make things more readable
-		// Might cost some RAM, but meh whatever
-		auto temp = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
+		// ldr x9, #16 -> ldr x9, taddr
+		*raw++ = 0x89;
+		*raw++ = 0x00;
+		*raw++ = 0x00;
+		*raw++ = 0x58;
+
+		if (op == asmjit::arm::CondCode::kAlways)
 		{
-			using namespace asmjit;
+			// br x9
+			*raw++ = 0x20;
+			*raw++ = 0x01;
+			*raw++ = 0x1F;
+			*raw++ = 0xD6;
 
-			c.movk(a64::x9, Imm(static_cast<u16>(taddr >> 48)), Imm(48));
-			c.movk(a64::x9, Imm(static_cast<u16>(taddr >> 32)), Imm(32));
-			c.movk(a64::x9, Imm(static_cast<u16>(taddr >> 16)), Imm(16));
-			c.movk(a64::x9, Imm(static_cast<u16>(taddr)), Imm(0));
+			// nop
+			*raw++ = 0x1F;
+			*raw++ = 0x20;
+			*raw++ = 0x03;
+			*raw++ = 0xD5;
 
-			if (op == arm::CondCode::kAlways)
-			{
-				c.br(a64::x9);
-				// Constant length per jmp for easier stub patching
-				c.nop();
-				c.nop();
-			}
-			else
-			{
-				Label do_branch = c.newLabel();
-				Label cont = c.newLabel();
-				c.b(op, do_branch);
-				c.b(cont);
+			// nop
+			*raw++ = 0x1F;
+			*raw++ = 0x20;
+			*raw++ = 0x03;
+			*raw++ = 0xD5;
+		}
+		else
+		{
+			// b.COND #8 -> b.COND do_branch
+			switch (op)
+			{
+			case asmjit::arm::CondCode::kUnsignedLT:
+				*raw++ = 0x43;
+				break;
+			case asmjit::arm::CondCode::kUnsignedGT:
+				*raw++ = 0x48;
+				break;
+			default:
+				asm("brk 0x42");
+			}
 
-				c.bind(do_branch);
-				c.br(a64::x9);
+			*raw++ = 0x00;
+			*raw++ = 0x00;
+			*raw++ = 0x54;
 
-				c.bind(cont);
-			}
-		});
+			// b #16 -> b cont
+			*raw++ = 0x04;
+			*raw++ = 0x00;
+			*raw++ = 0x00;
+			*raw++ = 0x14;
 
-		u8 mem_used = 7 * 4;
-		memcpy(raw, reinterpret_cast<u8*>(temp), mem_used);
-		raw += mem_used;
+			// do_branch: br x9
+			*raw++ = 0x20;
+			*raw++ = 0x01;
+			*raw++ = 0x1f;
+			*raw++ = 0xD6;
+		}
+
+		// taddr
+		std::memcpy(raw, &taddr, 8);
+		raw += 8;
+
+		// cont: next instruction
 	};
 #elif defined(ARCH_X64)
 	// Allocate some writable executable memory
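The switch over op in the hunk above only needs to vary one byte because B.cond packs the condition in bits 3:0 and the word offset in bits 23:5. As a sketch under the standard A64 encoding (the helper name is illustrative, not commit code), the condition codes LO (kUnsignedLT) = 0b0011 and HI (kUnsignedGT) = 0b1000 produce exactly the 0x43 and 0x48 low bytes written by the switch:

#include <cstdint>

// Sketch: B.cond is 0x54000000 | imm19 << 5 | cond (imm19 = word offset).
constexpr uint32_t b_cond(uint32_t cond, int32_t byte_offset)
{
	return 0x54000000u | ((static_cast<uint32_t>(byte_offset / 4) & 0x7FFFFu) << 5) | cond;
}

// Condition codes: LO (unsigned lower) = 0b0011, HI (unsigned higher) = 0b1000
static_assert(b_cond(0b0011, 8) == 0x54000043, "b.lo #8 -> low byte 0x43");
static_assert(b_cond(0b1000, 8) == 0x54000048, "b.hi #8 -> low byte 0x48");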
@@ -1203,17 +1233,7 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 	// Rewrite jump address
 	{
 		u64 raw64 = reinterpret_cast<u64>(raw);
-		auto temp = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-		{
-			using namespace asmjit;
-
-			c.movk(a64::x9, Imm(static_cast<u16>(raw64 >> 48)), Imm(48));
-			c.movk(a64::x9, Imm(static_cast<u16>(raw64 >> 32)), Imm(32));
-			c.movk(a64::x9, Imm(static_cast<u16>(raw64 >> 16)), Imm(16));
-			c.movk(a64::x9, Imm(static_cast<u16>(raw64)), Imm(0));
-		});
-
-		memcpy(w.rel32 - (4 * 7), reinterpret_cast<u8*>(temp), 4 * 4);
+		memcpy(w.rel32 - 8, &raw64, 8);
 	}
 #else
 #error "Unimplemented"
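This hunk collapses so dramatically because every ARM64 stub now ends with its 8-byte target literal: retargeting a jump is a plain store into data rather than rewriting movk instructions, which is what memcpy(w.rel32 - 8, &raw64, 8) does. A minimal sketch of the idea, assuming stub_end points one past the literal the way w.rel32 - 8 implies (the function and parameter names are illustrative):

#include <cstdint>
#include <cstring>

// Sketch: rewrite the 64-bit literal that a stub's "ldr x9, <literal>" loads.
// stub_end is assumed to point one past the literal, mirroring w.rel32 - 8.
inline void patch_literal(uint8_t* stub_end, uint64_t new_target)
{
	std::memcpy(stub_end - 8, &new_target, 8); // overwrites data, not instructions
}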
@@ -1302,19 +1322,27 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 		raw += 4;
 	}
 #elif defined(ARCH_ARM64)
-	{
-		auto temp = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-		{
-			using namespace asmjit;
-
-			c.movz(a64::w9, Imm(static_cast<u16>(cmp_lsa >> 16)), Imm(16));
-			c.movk(a64::w9, Imm(static_cast<u16>(cmp_lsa)), Imm(0));
-			c.ldr(a64::w1, arm::Mem(a64::x7, a64::x9));
-		});
-
-		memcpy(raw, reinterpret_cast<u8*>(temp), 3 * 4);
-		raw += 3 * 4;
-	}
+	// ldr w9, #8
+	*raw++ = 0x49;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x18;
+
+	// b #8
+	*raw++ = 0x02;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x14;
+
+	// cmp_lsa
+	std::memcpy(raw, &cmp_lsa, 4);
+	raw += 4;
+
+	// ldr w1, [x7, x9]
+	*raw++ = 0xE1;
+	*raw++ = 0x68;
+	*raw++ = 0x69;
+	*raw++ = 0xB8;
 #else
 #error "Unimplemented"
 #endif
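The emitted sequence is the load-literal-and-skip idiom: ldr w9, #8 (at offset 0) reads the 32-bit constant placed at offset 8, b #8 (at offset 4) hops over that constant to the next instruction at offset 12. For reference, a sketch of the 32-bit LDR (literal) encoding behind the 49 00 00 18 bytes (illustrative helper, not commit code):

#include <cstdint>

// Sketch: LDR (literal, 32-bit) is 0x18000000 | imm19 << 5 | Rt.
constexpr uint32_t ldr_w_literal(uint32_t rt, int32_t byte_offset)
{
	return 0x18000000u | ((static_cast<uint32_t>(byte_offset / 4) & 0x7FFFFu) << 5) | rt;
}

// "ldr w9, #8" -> 0x18000049, stored little-endian as 49 00 00 18.
static_assert(ldr_w_literal(9, 8) == 0x18000049, "ldr w9, #8");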
@@ -1326,19 +1354,27 @@ spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
 	std::memcpy(raw, &x, 4);
 	raw += 4;
 #elif defined(ARCH_ARM64)
-	{
-		auto temp = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-		{
-			using namespace asmjit;
-
-			c.movz(a64::w9, Imm(static_cast<u16>(x >> 16)), Imm(16));
-			c.movk(a64::w9, Imm(static_cast<u16>(x)), Imm(0));
-			c.cmp(a64::w1, a64::w9);
-		});
-
-		memcpy(raw, reinterpret_cast<u8*>(temp), 3 * 4);
-		raw += 3 * 4;
-	}
+	// ldr w9, #8
+	*raw++ = 0x49;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x18;
+
+	// b #8
+	*raw++ = 0x02;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x14;
+
+	// x
+	std::memcpy(raw, &x, 4);
+	raw += 4;
+
+	// cmp w1, w9
+	*raw++ = 0x3f;
+	*raw++ = 0x00;
+	*raw++ = 0x09;
+	*raw++ = 0x6B;
 #else
 #error "Unimplemented"
 #endif
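The trailing word here is cmp w1, w9, which is the alias of subs wzr, w1, w9. A sketch of that encoding, confirming the 3f 00 09 6B bytes above (illustrative helper, standard A64 encoding):

#include <cstdint>

// Sketch: cmp wN, wM is subs wzr, wN, wM: 0x6B00001F | Rm << 16 | Rn << 5.
constexpr uint32_t cmp_w(uint32_t rn, uint32_t rm)
{
	return 0x6B00001Fu | (rm << 16) | (rn << 5);
}

// "cmp w1, w9" -> 0x6B09003F, stored little-endian as 3F 00 09 6B.
static_assert(cmp_w(1, 9) == 0x6B09003F, "cmp w1, w9");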
@@ -1573,34 +1609,56 @@ spu_function_t spu_runtime::make_branch_patchpoint(u16 data) const
 	return reinterpret_cast<spu_function_t>(raw);
 #elif defined(ARCH_ARM64)
-	spu_function_t func = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-	{
-		using namespace asmjit;
-
-		// Save the jmp addr to GHC CC 3rd arg -> REG_Hp
-		Label replace_addr = c.newLabel();
-		c.adr(a64::x21, replace_addr);
-
-		// 16 byte alignment for the jump replacement
-		c.nop();
-		c.nop();
-		c.nop();
-
-		Label branch_target = c.newLabel();
-		c.bind(replace_addr);
-		c.ldr(a64::x9, arm::Mem(branch_target));
-		c.br(a64::x9);
-
-		c.bind(branch_target);
-		c.embedUInt64(reinterpret_cast<u64>(tr_branch));
-
-		c.embedUInt8(data >> 8);
-		c.embedUInt8(data & 0xff);
-
-		c.embed("branch_patchpoint", 17);
-	});
-
-	return func;
+#if defined(__APPLE__)
+	pthread_jit_write_protect_np(false);
+#endif
+
+	u8* const patch_fn = ensure(jit_runtime::alloc(36, 16));
+	u8* raw = patch_fn;
+
+	// adr x21, #16
+	*raw++ = 0x95;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x10;
+
+	// nop x3
+	for (int i = 0; i < 3; i++)
+	{
+		*raw++ = 0x1F;
+		*raw++ = 0x20;
+		*raw++ = 0x03;
+		*raw++ = 0xD5;
+	}
+
+	// ldr x9, #8
+	*raw++ = 0x49;
+	*raw++ = 0x00;
+	*raw++ = 0x00;
+	*raw++ = 0x58;
+
+	// br x9
+	*raw++ = 0x20;
+	*raw++ = 0x01;
+	*raw++ = 0x1F;
+	*raw++ = 0xD6;
+
+	u64 branch_target = reinterpret_cast<u64>(tr_branch);
+	std::memcpy(raw, &branch_target, 8);
+	raw += 8;
+
+	*raw++ = static_cast<u8>(data >> 8);
+	*raw++ = static_cast<u8>(data & 0xff);
+
+#if defined(__APPLE__)
+	pthread_jit_write_protect_np(true);
+#endif
+
+	// Flush all cache lines after potentially writing executable code
+	asm("ISB");
+	asm("DSB ISH");
+
+	return reinterpret_cast<spu_function_t>(patch_fn);
 #else
 #error "Unimplemented"
 #endif
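Putting the patchpoint together: adr x21, #16 hands the address of the replaceable ldr/br pair to the branch handler (the removed code's comment called this the GHC CC 3rd arg, REG_Hp), and the three nops keep that pair 16-byte aligned so it can be swapped atomically. A sketch of the resulting layout, derived from the bytes in the hunk above:

// Sketch of the patchpoint layout (offsets relative to patch_fn):
//   +0   adr x21, #16     ; x21 = patch_fn + 16, the patchable jump
//   +4   nop              ; padding so +16 is 16-byte aligned
//   +8   nop
//   +12  nop
//   +16  ldr x9, #8       ; load tr_branch from the literal at +24
//   +20  br  x9
//   +24  u64 tr_branch    ; branch target literal
//   +32  u16 data         ; stored as (data >> 8), (data & 0xff)
// 34 bytes used; jit_runtime::alloc(36, 16) provides 36 bytes, 16-byte aligned.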
@@ -1636,18 +1694,26 @@ void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
 	atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip - 8), result);
 #elif defined(ARCH_ARM64)
-	auto jump_instrs = build_function_asm<spu_function_t>("", [](native_asm& c, auto& args)
-	{
-		using namespace asmjit;
-
-		Label branch_target = c.newLabel();
-		c.ldr(a64::x9, arm::Mem(branch_target)); // PC rel load
-		c.br(a64::x9);
-
-		c.bind(branch_target);
-		c.embedUInt64(reinterpret_cast<u64>(spu_runtime::tr_all));
-	});
-	u128 result = *reinterpret_cast<u128*>(jump_instrs);
+	union
+	{
+		u8 bytes[16];
+		u128 result;
+	};
+
+	// ldr x9, #8
+	bytes[0] = 0x49;
+	bytes[1] = 0x00;
+	bytes[2] = 0x00;
+	bytes[3] = 0x58;
+
+	// br x9
+	bytes[4] = 0x20;
+	bytes[5] = 0x01;
+	bytes[6] = 0x1F;
+	bytes[7] = 0xD6;
+
+	const u64 target = reinterpret_cast<u64>(spu_runtime::tr_all);
+	std::memcpy(bytes + 8, &target, 8);
 #if defined(__APPLE__)
 	pthread_jit_write_protect_np(false);
 #endif
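dispatch() (and branch() in the next hunk) assemble the 16-byte stub, two instructions plus the 8-byte target literal, in an anonymous union so it can be committed to the patch site through the aliased u128 in a single store, mirroring the u64 release store on the x86 path above. A self-contained sketch of the assembly step with standard types (the project's u128 and atomic_storage helpers are not reproduced here):

#include <cstdint>
#include <cstring>

// Sketch: build "ldr x9, #8; br x9; .quad target" as one 16-byte blob that a
// caller can then write to the patch site in a single atomic 128-bit store.
inline void make_jump_blob(uint8_t (&bytes)[16], uint64_t target)
{
	const uint32_t ldr = 0x58000049; // ldr x9, #8 (literal 8 bytes ahead)
	const uint32_t br  = 0xD61F0120; // br x9
	std::memcpy(bytes + 0, &ldr, 4);
	std::memcpy(bytes + 4, &br, 4);
	std::memcpy(bytes + 8, &target, 8);
}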
@@ -1768,18 +1834,26 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
 	atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), result);
 #elif defined(ARCH_ARM64)
-	auto jmp_instrs = build_function_asm<spu_function_t>("", [&](native_asm& c, auto& args)
-	{
-		using namespace asmjit;
-
-		Label branch_target = c.newLabel();
-		c.ldr(a64::x9, arm::Mem(branch_target)); // PC rel load
-		c.br(a64::x9);
-
-		c.bind(branch_target);
-		c.embedUInt64(reinterpret_cast<u64>(func));
-	});
-	u128 result = *reinterpret_cast<u128*>(jmp_instrs);
+	union
+	{
+		u8 bytes[16];
+		u128 result;
+	};
+
+	// ldr x9, #8
+	bytes[0] = 0x49;
+	bytes[1] = 0x00;
+	bytes[2] = 0x00;
+	bytes[3] = 0x58;
+
+	// br x9
+	bytes[4] = 0x20;
+	bytes[5] = 0x01;
+	bytes[6] = 0x1F;
+	bytes[7] = 0xD6;
+
+	const u64 target = reinterpret_cast<u64>(func);
+	std::memcpy(bytes + 8, &target, 8);
 #if defined(__APPLE__)
 	pthread_jit_write_protect_np(false);
 #endif
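One closing caveat: after instructions are written (under pthread_jit_write_protect_np on Apple silicon), the instruction stream has to be synchronized before the patched code runs, which is why make_branch_patchpoint issues ISB and DSB ISH. As a hedged aside, GCC and Clang also expose a portable builtin for this; a sketch:

// Sketch: portable instruction-cache flush after writing code to [begin, end).
// __builtin___clear_cache is a GCC/Clang builtin; the commit instead issues
// ISB / DSB ISH inline asm directly.
inline void flush_icache(char* begin, char* end)
{
	__builtin___clear_cache(begin, end);
}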