From 3b8e7d0967627eb48f3cda77c7c9c6109cf04a6f Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Fri, 5 Jun 2020 17:51:34 +0300
Subject: [PATCH] Implement v128::fma32f

---
Reviewer notes (this section sits between the "---" separator and the
diffstat, so `git am` does not record it in the commit message):

* v128::fma32f(a, b, c) yields a * b + c for each of the four f32 lanes.
  - Built with __FMA__: always uses _mm_fmadd_ps.
  - Built without __FMA__: checks v128::use_fma at run time; when set it
    uses _mm_fmadd_ps (MSVC) or an inline-asm vfmadd213ps (other
    compilers), otherwise it falls back to a std::fmaf loop over _f[0..3].
* v128::use_fma starts as false and is assigned in main() from
  utils::has_fma3().
* ppu_interpreter_fast::VMADDFP: the b/c locals are swapped so that b reads
  op.vb and c reads op.vc, matching the precise variant; the value computed
  is va * vc + vb both before and after the change.
* ppu_interpreter_precise::VMADDFP: the per-lane loop that evaluated
  a * c + b in f64 and narrowed to f32 is replaced by v128::fma32f(a, c, b).
  NOTE(review): a fused multiply-add rounds once, so results may differ in
  the last bit from the old path - presumably intended, confirm.
* NOTE(review): the precise variant stores through op.rd while the fast
  variant stores through op.vd; presumably both name the same opcode
  field - confirm against ppu_opcode_t.

Reconstruction caveats (the patch text had lost its line breaks,
indentation, and every <...> token):
* Indentation is restored with tabs - TODO confirm against the tree.
* `#include <cmath>` is inferred from the new std::fmaf call, and
  `__has_include(<bit>)` / `#include <bit>` from the shape of the guard;
  the first context include is assumed to be <cstring> - TODO confirm all
  three before applying.
* The author e-mail on the From: header is absent from the recovered text
  and is left out rather than guessed.

 Utilities/BEType.h                | 31 +++++++++++++++++++++++++++++++
 rpcs3/Emu/Cell/PPUInterpreter.cpp | 16 ++++------------
 rpcs3/main.cpp                    |  2 ++
 3 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/Utilities/BEType.h b/Utilities/BEType.h
index 966a44f550..3913262171 100644
--- a/Utilities/BEType.h
+++ b/Utilities/BEType.h
@@ -3,6 +3,7 @@
 #include "types.h"
 #include "util/endian.hpp"
 #include <cstring>
+#include <cmath>
 
 #if __has_include(<bit>)
 #include <bit>
@@ -322,6 +323,36 @@ union alignas(16) v128
 		return fromD(_mm_cmpeq_pd(left.vd, right.vd));
 	}
 
+	static inline bool use_fma = false;
+
+	static inline v128 fma32f(v128 a, const v128& b, const v128& c)
+	{
+#ifndef __FMA__
+		if (use_fma) [[likely]]
+		{
+#ifdef _MSC_VER
+			a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
+			return a;
+#else
+			__asm__("vfmadd213ps %[c], %[b], %[a]"
+				: [a] "+x" (a.vf)
+				: [b] "x" (b.vf)
+				, [c] "x" (c.vf));
+			return a;
+#endif
+		}
+
+		for (int i = 0; i < 4; i++)
+		{
+			a._f[i] = std::fmaf(a._f[i], b._f[i], c._f[i]);
+		}
+		return a;
+#else
+		a.vf = _mm_fmadd_ps(a.vf, b.vf, c.vf);
+		return a;
+#endif
+	}
+
 	bool operator==(const v128& right) const
 	{
 		return _u64[0] == right._u64[0] && _u64[1] == right._u64[1];
diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp
index 7b14110f9b..9330830d63 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -959,9 +959,9 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)
 bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
 {
 	const auto a = ppu.vr[op.va].vf;
-	const auto b = ppu.vr[op.vc].vf;
-	const auto c = ppu.vr[op.vb].vf;
-	const auto result = _mm_add_ps(_mm_mul_ps(a, b), c);
+	const auto b = ppu.vr[op.vb].vf;
+	const auto c = ppu.vr[op.vc].vf;
+	const auto result = _mm_add_ps(_mm_mul_ps(a, c), b);
 	ppu.vr[op.vd] = vec_handle_nan(result);
 	return true;
 }
@@ -971,15 +971,7 @@ bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
 	const auto a = ppu.vr[op.va];
 	const auto b = ppu.vr[op.vb];
 	const auto c = ppu.vr[op.vc];
-	v128 d;
-
-	// TODO: Optimize
-	for (u32 i = 0; i < 4; i++)
-	{
-		d._f[i] = f32(f64{a._f[i]} * f64{c._f[i]} + f64{b._f[i]});
-	}
-
-	ppu.vr[op.rd] = vec_handle_nan(d, a, b, c);
+	ppu.vr[op.rd] = vec_handle_nan(v128::fma32f(a, c, b), a, b, c);
 	return true;
 }
 
diff --git a/rpcs3/main.cpp b/rpcs3/main.cpp
index e45efac0fa..9cdfe3300a 100644
--- a/rpcs3/main.cpp
+++ b/rpcs3/main.cpp
@@ -268,6 +268,8 @@ int main(int argc, char** argv)
 	const u64 intro_time = (intro_stats.ru_utime.tv_sec + intro_stats.ru_stime.tv_sec) * 1000000000ull + (intro_stats.ru_utime.tv_usec + intro_stats.ru_stime.tv_usec) * 1000ull;
 #endif
 
+	v128::use_fma = utils::has_fma3();
+
 	s_argv0 = argv[0]; // Save for report_fatal_error
 
 	// Only run RPCS3 to display an error