mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-14 02:38:37 +12:00
spu: Reimplement FCGT and FCMGT to handle corner cases (such as comparisons against infinities and denormals)
- Also optimize FMA/FMS/FNMS for recompiler
This commit is contained in:
parent
9e7a42d057
commit
41d921808b
2 changed files with 161 additions and 39 deletions
|
@ -719,7 +719,40 @@ void spu_interpreter::ANDC(SPUThread& spu, spu_opcode_t op)
|
|||
|
||||
// FCGT: writes to rt a per-lane mask that is all-ones where ra > rb and
// all-zeros otherwise, after adjusting both operands as described below.
void spu_interpreter_fast::FCGT(SPUThread& spu, spu_opcode_t op)
{
	// IMPL NOTES:
	// if (v has a NaN encoding) v = v with the lowest exponent bit cleared,
	//     i.e. a large finite value with sign and mantissa bits left intact
	// if (v is denormalized)    v = 0 (flush denormals)
	// return v1 > v2
	// Branching is simulated with bitwise and / and_not / or selects.

	// Snapshot both sources first. rt is written exactly once, at the very end,
	// so the result stays correct when rt is the same register as ra or rb.
	const auto a = spu.gpr[op.ra].vf;
	const auto b = spu.gpr[op.rb].vf;

	const auto zero = _mm_set1_ps(0.f);

	// All-ones in lanes holding a NaN encoding (exponent all ones, mantissa != 0).
	// The infinity encoding is NOT flagged here; x86 already orders it above
	// every finite value, so it is passed through unchanged.
	const auto nan_check_a = _mm_cmpunord_ps(a, zero);
	const auto nan_check_b = _mm_cmpunord_ps(b, zero);

	// Candidate "lowered" operands: clearing the lowest exponent bit turns an
	// all-ones exponent (0xFF) into 0xFE. Sign and mantissa are untouched.
	const auto last_exp_bit = _mm_castsi128_ps(_mm_set1_epi32(0x00800000));
	const auto lowered_a = _mm_andnot_ps(last_exp_bit, a);
	const auto lowered_b = _mm_andnot_ps(last_exp_bit, b);

	// All-ones in lanes whose exponent field is zero (zero or denormal input).
	const auto all_exp_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000));
	const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, a));
	const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, b));

	// Select the lowered value in NaN-encoded lanes, the original value elsewhere.
	// NOTE(review): a lowered operand (exponent 0xFE) can compare below a genuine
	// finite operand with exponent 0xFE and a larger mantissa - confirm against
	// the SPU extended-range rules whether that ordering is acceptable.
	const auto a_selected = _mm_or_ps(_mm_and_ps(nan_check_a, lowered_a), _mm_andnot_ps(nan_check_a, a));
	const auto b_selected = _mm_or_ps(_mm_and_ps(nan_check_b, lowered_b), _mm_andnot_ps(nan_check_b, b));

	// Flush denormals to zero (this also clears the sign bit of those lanes).
	const auto final_a = _mm_andnot_ps(denorm_check_a, a_selected);
	const auto final_b = _mm_andnot_ps(denorm_check_b, b_selected);

	// a > b expressed as b < a.
	spu.gpr[op.rt].vf = _mm_cmplt_ps(final_b, final_a);
}
|
||||
|
||||
void spu_interpreter::DFCGT(SPUThread& spu, spu_opcode_t op)
|
||||
|
@ -754,8 +787,28 @@ void spu_interpreter::ORC(SPUThread& spu, spu_opcode_t op)
|
|||
|
||||
// FCMGT: writes to rt a per-lane mask that is all-ones where |ra| > |rb| and
// all-zeros otherwise. Denormal inputs are flushed to zero before comparing,
// and NaN-encoded inputs are handled by an explicit mask (see below).
void spu_interpreter_fast::FCMGT(SPUThread& spu, spu_opcode_t op)
{
	// Snapshot both sources first. rt is written exactly once, at the very end,
	// so the result stays correct when rt is the same register as ra or rb.
	const auto a = spu.gpr[op.ra].vf;
	const auto b = spu.gpr[op.rb].vf;

	const auto zero = _mm_set1_ps(0.f);

	// All-ones in lanes holding a NaN encoding (exponent all ones, mantissa != 0).
	// The infinity encoding is not flagged by an unordered compare.
	const auto nan_check_a = _mm_cmpunord_ps(a, zero);
	const auto nan_check_b = _mm_cmpunord_ps(b, zero);

	// All-ones in lanes whose exponent field is zero (zero or denormal input).
	const auto all_exp_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000));
	const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, a));
	const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, b));

	// Flush denormals to zero.
	const auto final_a = _mm_andnot_ps(denorm_check_a, a);
	const auto final_b = _mm_andnot_ps(denorm_check_b, b);

	// Force "a > b" in lanes where a is NaN-encoded but b is not. This is needed
	// on x86: the ordered compare below yields false whenever either operand is
	// NaN, so without this mask such lanes would always report "not greater".
	// NOTE(review): lanes where both operands are NaN-encoded always yield false,
	// regardless of their bit patterns - confirm this matches the intended
	// extended-range ordering.
	const auto nan_mask = _mm_andnot_ps(nan_check_b, _mm_xor_ps(nan_check_a, nan_check_b));

	// Clear the sign bit of both operands so magnitudes are compared;
	// |a| > |b| is expressed as |b| < |a|.
	const auto abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
	const auto comparison = _mm_cmplt_ps(_mm_and_ps(final_b, abs_mask), _mm_and_ps(final_a, abs_mask));

	spu.gpr[op.rt].vf = _mm_or_ps(comparison, nan_mask);
}
|
||||
|
||||
void spu_interpreter::DFCMGT(SPUThread& spu, spu_opcode_t op)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue