Header optimizations (#1684)

Shouldn't break anything. I hope.
This commit is contained in:
Ivan 2016-04-27 01:27:24 +03:00
parent da7472fe81
commit aafcf44581
89 changed files with 2370 additions and 2348 deletions

View file

@ -3,6 +3,7 @@
#include "PPUThread.h"
#include "PPUInterpreter.h"
// TODO: fix rol8 and rol16 for __GNUG__ (probably with __asm__)
inline u8 rol8(const u8 x, const u8 n) { return x << n | x >> (8 - n); }
inline u16 rol16(const u16 x, const u16 n) { return x << n | x >> (16 - n); }
inline u32 rol32(const u32 x, const u32 n) { return x << n | x >> (32 - n); }
@ -10,16 +11,16 @@ inline u64 rol64(const u64 x, const u64 n) { return x << n | x >> (64 - n); }
inline u64 dup32(const u32 x) { return x | static_cast<u64>(x) << 32; }
#if defined(__GNUG__)
inline std::uint64_t UMULH64(std::uint64_t a, std::uint64_t b)
inline u64 UMULH64(u64 a, u64 b)
{
std::uint64_t result;
u64 result;
__asm__("mulq %[b]" : "=d" (result) : [a] "a" (a), [b] "rm" (b));
return result;
}
inline std::int64_t MULH64(std::int64_t a, std::int64_t b)
inline s64 MULH64(s64 a, s64 b)
{
std::int64_t result;
s64 result;
__asm__("imulq %[b]" : "=d" (result) : [a] "a" (a), [b] "rm" (b));
return result;
}
@ -30,6 +31,95 @@ inline std::int64_t MULH64(std::int64_t a, std::int64_t b)
#define MULH64 __mulh
#endif
// Compare 16 packed unsigned bytes (greater than)
inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
{
// (A xor 0x80) > (B xor 0x80)
const auto sign = _mm_set1_epi32(0x80808080);
return _mm_cmpgt_epi8(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}
inline __m128i sse_cmpgt_epu16(__m128i A, __m128i B)
{
const auto sign = _mm_set1_epi32(0x80008000);
return _mm_cmpgt_epi16(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}
inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B)
{
const auto sign = _mm_set1_epi32(0x80000000);
return _mm_cmpgt_epi32(_mm_xor_si128(A, sign), _mm_xor_si128(B, sign));
}
inline __m128 sse_exp2_ps(__m128 A)
{
const auto x0 = _mm_max_ps(_mm_min_ps(A, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f));
const auto x1 = _mm_add_ps(x0, _mm_set1_ps(0.5f));
const auto x2 = _mm_sub_epi32(_mm_cvtps_epi32(x1), _mm_and_si128(_mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), x1)), _mm_set1_epi32(1)));
const auto x3 = _mm_sub_ps(x0, _mm_cvtepi32_ps(x2));
const auto x4 = _mm_mul_ps(x3, x3);
const auto x5 = _mm_mul_ps(x3, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(x4, _mm_set1_ps(0.023093347705f)), _mm_set1_ps(20.20206567f)), x4), _mm_set1_ps(1513.906801f)));
const auto x6 = _mm_mul_ps(x5, _mm_rcp_ps(_mm_sub_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), x4), _mm_set1_ps(4368.211667f)), x5)));
return _mm_mul_ps(_mm_add_ps(_mm_add_ps(x6, x6), _mm_set1_ps(1.0f)), _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(x2, _mm_set1_epi32(127)), 23)));
}
inline __m128 sse_log2_ps(__m128 A)
{
const auto _1 = _mm_set1_ps(1.0f);
const auto _c = _mm_set1_ps(1.442695040f);
const auto x0 = _mm_max_ps(A, _mm_castsi128_ps(_mm_set1_epi32(0x00800000)));
const auto x1 = _mm_or_ps(_mm_and_ps(x0, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff))), _1);
const auto x2 = _mm_rcp_ps(_mm_add_ps(x1, _1));
const auto x3 = _mm_mul_ps(_mm_sub_ps(x1, _1), x2);
const auto x4 = _mm_add_ps(x3, x3);
const auto x5 = _mm_mul_ps(x4, x4);
const auto x6 = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.7895802789f), x5), _mm_set1_ps(16.38666457f)), x5), _mm_set1_ps(-64.1409953f));
const auto x7 = _mm_rcp_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(-35.67227983f), x5), _mm_set1_ps(312.0937664f)), x5), _mm_set1_ps(-769.6919436f)));
const auto x8 = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(x0), 23), _mm_set1_epi32(127)));
return _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(_mm_mul_ps(x5, x6), x7), x4), _c), _mm_add_ps(_mm_mul_ps(x4, _c), x8));
}
template<typename T>
struct add_flags_result_t
{
T result;
bool carry;
bool zero;
bool sign;
add_flags_result_t() = default;
// Straighforward ADD with flags
add_flags_result_t(T a, T b)
: result(a + b)
, carry(result < a)
, zero(result == 0)
, sign(result >> (sizeof(T) * 8 - 1) != 0)
{
}
// Straighforward ADC with flags
add_flags_result_t(T a, T b, bool c)
: add_flags_result_t(a, b)
{
add_flags_result_t r(result, c);
result = r.result;
carry |= r.carry;
zero = r.zero;
sign = r.sign;
}
};
static add_flags_result_t<u64> add64_flags(u64 a, u64 b)
{
return{ a, b };
}
static add_flags_result_t<u64> add64_flags(u64 a, u64 b, bool c)
{
return{ a, b, c };
}
extern u64 get_timebased_time();
extern void ppu_execute_syscall(PPUThread& ppu, u64 code);
extern void ppu_execute_function(PPUThread& ppu, u32 index);