rsx: Fragment shader decompiler cleanup

TODO: Investigate the _s input modifier behaviour further, in case it can avoid generating zeroes from a MAD instruction.
x = MAD(+ve, -ve, -ve) with _s input modifier in BFBC expects result to be Non-zero
This commit is contained in:
kd-11 2019-04-23 17:53:39 +03:00 committed by kd-11
parent 4cd1c25729
commit 3cbccdd760
2 changed files with 150 additions and 90 deletions

View file

@ -61,20 +61,12 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
if (dst.fp16 && device_props.has_native_half_support && !(flags & OPFLAGS::skip_type_cast)) if (dst.fp16 && device_props.has_native_half_support && !(flags & OPFLAGS::skip_type_cast))
{ {
// Cast to native data type // Cast to native data type
if (dst.opcode == RSX_FP_OPCODE_NRM) code = ClampValue(code, 1);
{
// Returns a 3-component vector as the result
code = ClampValue(code + ".xyzz", 1, true);
}
else
{
code = ClampValue(code, 1, true);
}
} }
if (dst.saturate) if (dst.saturate)
{ {
code = ClampValue(code, 4, !!dst.fp16); code = ClampValue(code, 4);
} }
else if (dst.prec) else if (dst.prec)
{ {
@ -108,7 +100,7 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
break; break;
// clamp value to allowed range // clamp value to allowed range
code = ClampValue(code, dst.prec, !!dst.fp16); code = ClampValue(code, dst.prec);
break; break;
} }
} }
@ -254,6 +246,7 @@ std::string FragmentProgramDecompiler::AddTex()
break; break;
} }
opflags |= OPFLAGS::texture_ref;
return m_parr.AddParam(PF_PARAM_UNIFORM, sampler, std::string("tex") + std::to_string(dst.tex_num)); return m_parr.AddParam(PF_PARAM_UNIFORM, sampler, std::string("tex") + std::to_string(dst.tex_num));
} }
@ -267,48 +260,37 @@ std::string FragmentProgramDecompiler::AddX2d()
return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "x2d", getFloatTypeName(4) + "(0., 0., 0., 0.)"); return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "x2d", getFloatTypeName(4) + "(0., 0., 0., 0.)");
} }
std::string FragmentProgramDecompiler::ClampValue(const std::string& code, u32 precision, bool is_half_type) std::string FragmentProgramDecompiler::ClampValue(const std::string& code, u32 precision)
{ {
// FP16 is expected to overflow a lot easier at 0+-65504 // FP16 is expected to overflow a lot easier at 0+-65504
// FP32 can still work up to 0+-3.4E38 // FP32 can still work up to 0+-3.4E38
// See http://http.download.nvidia.com/developer/Papers/2005/FP_Specials/FP_Specials.pdf // See http://http.download.nvidia.com/developer/Papers/2005/FP_Specials/FP_Specials.pdf
if (LIKELY(precision <= 1)) if (precision > 1 && precision < 5)
{ {
if (precision == 1) // Define precision_clamp
{ properties.has_clamp = true;
return "clamp16(" + code + ")";
}
else
{
return code;
}
} }
if (!is_half_type || !device_props.has_native_half_support) switch (precision)
{ {
switch (precision) case 0:
{ // Full 32-bit precision
case 2: break;
return "clamp(" + code + ", -2., 2.)"; case 1:
case 3: return "clamp16(" + code + ")";
return "clamp(" + code + ", -1., 1.)"; case 2:
case 4: return "precision_clamp(" + code + ", -2., 2.)";
return "clamp(" + code + ", 0., 1.)"; case 3:
} return "precision_clamp(" + code + ", -1., 1.)";
} case 4:
else return "precision_clamp(" + code + ", 0., 1.)";
{ case 5:
const auto type = getHalfTypeName(1); // Doesn't seem to do anything to the input from hw tests, same as 0
switch (precision) break;
{ default:
case 2: LOG_ERROR(RSX, "Unexpected precision modifier (%d)\n", precision);
return "clamp(" + code + ", " + type + "(-2.), " + type + "(2.))"; break;
case 3:
return "clamp(" + code + ", " + type + "(-1.), " + type + "(1.))";
case 4:
return "clamp(" + code + ", " + type + "(0.), " + type + "(1.))";
}
} }
return code; return code;
@ -349,6 +331,11 @@ std::string FragmentProgramDecompiler::Format(const std::string& code, bool igno
{ "$float4", [this]() -> std::string { return getFloatTypeName(4); } }, { "$float4", [this]() -> std::string { return getFloatTypeName(4); } },
{ "$float3", [this]() -> std::string { return getFloatTypeName(3); } }, { "$float3", [this]() -> std::string { return getFloatTypeName(3); } },
{ "$float2", [this]() -> std::string { return getFloatTypeName(2); } }, { "$float2", [this]() -> std::string { return getFloatTypeName(2); } },
{ "$float_t", [this]() -> std::string { return getFloatTypeName(1); } },
{ "$half4", [this]() -> std::string { return getHalfTypeName(4); } },
{ "$half3", [this]() -> std::string { return getHalfTypeName(3); } },
{ "$half2", [this]() -> std::string { return getHalfTypeName(2); } },
{ "$half_t", [this]() -> std::string { return getHalfTypeName(1); } },
{ "$Ty", [this]() -> std::string { return (!device_props.has_native_half_support || !dst.fp16)? getFloatTypeName(4) : getHalfTypeName(4); } } { "$Ty", [this]() -> std::string { return (!device_props.has_native_half_support || !dst.fp16)? getFloatTypeName(4) : getHalfTypeName(4); } }
}; };
@ -428,16 +415,59 @@ void FragmentProgramDecompiler::AddCodeCond(const std::string& dst, const std::s
return; return;
} }
std::string src_prefix;
if (device_props.has_native_half_support && !this->dst.fp16)
{
// Target is not fp16 but src might be
// Usually vecX a = f16vecX b is fine, but causes operator overload issues when used in a mix/lerp function
// mix(f32, f16, bvec) causes compiler issues
// NOTE: If dst is fp16 the src will already have been cast to match so this is not a problem in that case
bool src_is_fp16 = false;
if ((opflags & (OPFLAGS::texture_ref | OPFLAGS::src_cast_f32)) == 0 &&
src.find("$0") != std::string::npos)
{
// Texture sample operations are full-width and are exempt
src_is_fp16 = (src0.fp16 && src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP);
if (src_is_fp16 && src.find("$1") != std::string::npos)
{
// References operand 1
src_is_fp16 = (src1.fp16 && src1.reg_type == RSX_FP_REGISTER_TYPE_TEMP);
if (src_is_fp16 && src.find("$2") != std::string::npos)
{
// References operand 2
src_is_fp16 = (src2.fp16 && src2.reg_type == RSX_FP_REGISTER_TYPE_TEMP);
}
}
}
if (src_is_fp16)
{
// LHS argument is of native half type, need to cast to proper type!
if (src[0] != '(')
{
// Upcast inputs to processing function instead
opflags |= OPFLAGS::src_cast_f32;
}
else
{
// No need to add explicit casts all over the place, just cast the result once
src_prefix = "$Ty";
}
}
}
// NOTE: dst = _select(dst, src, cond) is equivalent to dst = cond? src : dst; // NOTE: dst = _select(dst, src, cond) is equivalent to dst = cond? src : dst;
const auto cond = ShaderVariable(dst).match_size(GetRawCond()); const auto cond = ShaderVariable(dst).match_size(GetRawCond());
AddCode(dst + " = _select(" + dst + ", " + src + ", " + cond + ");"); AddCode(dst + " = _select(" + dst + ", " + src_prefix + src + ", " + cond + ");");
} }
template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src) template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src)
{ {
std::string ret; std::string ret;
bool apply_precision_modifier = !!src1.input_prec_mod; bool apply_precision_modifier = !!src1.input_prec_mod;
bool is_half_type = false;
switch (src.reg_type) switch (src.reg_type)
{ {
@ -461,14 +491,10 @@ template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src)
} }
} }
} }
else
{
is_half_type = true;
}
ret += AddReg(src.tmp_reg_index, is_half_type); ret += AddReg(src.tmp_reg_index, src.fp16);
if (opflags & OPFLAGS::src_cast_f32 && is_half_type && device_props.has_native_half_support) if (opflags & OPFLAGS::src_cast_f32 && src.fp16 && device_props.has_native_half_support)
{ {
// Upconvert if there is a chance for ambiguity // Upconvert if there is a chance for ambiguity
ret = getFloatTypeName(4) + "(" + ret + ")"; ret = getFloatTypeName(4) + "(" + ret + ")";
@ -542,7 +568,7 @@ template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src)
// Warning: Modifier order matters. e.g neg should be applied after precision clamping (tested with Naruto UNS) // Warning: Modifier order matters. e.g neg should be applied after precision clamping (tested with Naruto UNS)
if (src.abs) ret = "abs(" + ret + ")"; if (src.abs) ret = "abs(" + ret + ")";
if (apply_precision_modifier) ret = ClampValue(ret, src1.input_prec_mod, is_half_type); if (apply_precision_modifier) ret = ClampValue(ret, src1.input_prec_mod);
if (src.neg) ret = "-" + ret; if (src.neg) ret = "-" + ret;
return ret; return ret;
@ -620,6 +646,32 @@ std::string FragmentProgramDecompiler::BuildCode()
std::string float4 = getFloatTypeName(4); std::string float4 = getFloatTypeName(4);
const bool glsl = float4 == "vec4"; const bool glsl = float4 == "vec4";
if (properties.has_clamp)
{
std::string precision_func =
"$float4 precision_clamp($float4 x, float _min, float _max)\n"
"{\n"
" // Treat NaNs as 0\n"
" bvec4 nans = isnan(x);\n"
" x = _select(x, $float4(0., 0., 0., 0.), nans);\n"
" return clamp(x, _min, _max);\n"
"}\n\n";
if (device_props.has_native_half_support)
{
precision_func +=
"$half4 precision_clamp($half4 x, float _min, float _max)\n"
"{\n"
" // Treat NaNs as 0\n"
" bvec4 nans = isnan(x);\n"
" x = _select(x, $half4(0., 0., 0., 0.), nans);\n"
" return clamp(x, $half_t(_min), $half_t(_max));\n"
"}\n\n";
}
OS << Format(precision_func);
}
if (!device_props.has_native_half_support) if (!device_props.has_native_half_support)
{ {
// Accurate float to half clamping (preserves IEEE-754 NaN) // Accurate float to half clamping (preserves IEEE-754 NaN)
@ -659,44 +711,45 @@ std::string FragmentProgramDecompiler::BuildCode()
} }
OS << OS <<
"#define _builtin_min min\n"
"#define _builtin_max max\n"
"#define _builtin_lit lit_legacy\n" "#define _builtin_lit lit_legacy\n"
"#define _builtin_distance distance\n" "#define _builtin_log2 log2\n"
"#define _builtin_log2(x) log2(abs(x))\n" "#define _builtin_normalize(x) (length(x) > 0? normalize(x) : x)\n" // HACK!! Workaround for some games that generate NaNs unless texture filtering exactly matches PS3 (BFBC)
"#define _builtin_sqrt(x) sqrt(abs(x))\n" "#define _builtin_sqrt(x) sqrt(abs(x))\n"
"#define _builtin_rcp(x) (1. / x)\n" "#define _builtin_rcp(x) (1. / x)\n"
"#define _builtin_rsq(x) (1. / _builtin_sqrt(x))\n" "#define _builtin_rsq(x) (1. / _builtin_sqrt(x))\n"
"#define _builtin_div(x, y) (x / y)\n\n"; "#define _builtin_div(x, y) (x / y)\n\n";
// Define RSX-compliant DIVSQ if (properties.has_divsq)
// If the numerator is 0, the result is always 0 even if the denominator is 0
// NOTE: This operation is component-wise and cannot be accelerated with lerp/mix because these always return NaN if any of the choices is NaN
std::string divsq_func =
"$float4 _builtin_divsq($float4 a, float b)\n"
"{\n"
" $float4 tmp = a / _builtin_sqrt(b);\n"
" $float4 choice = abs(a);\n";
if (glsl)
{ {
divsq_func += // Define RSX-compliant DIVSQ
" return _select(a, tmp, greaterThan(choice, vec4(0.)));\n"; // If the numerator is 0, the result is always 0 even if the denominator is 0
} // NOTE: This operation is component-wise and cannot be accelerated with lerp/mix because these always return NaN if any of the choices is NaN
else std::string divsq_func =
{ "$float4 _builtin_divsq($float4 a, float b)\n"
divsq_func += "{\n"
" if (choice.x > 0.) a.x = tmp.x;\n" " $float4 tmp = a / _builtin_sqrt(b);\n"
" if (choice.y > 0.) a.y = tmp.y;\n" " $float4 choice = abs(a);\n";
" if (choice.z > 0.) a.z = tmp.z;\n"
" if (choice.w > 0.) a.w = tmp.w;\n"
" return a;\n";
}
divsq_func += if (glsl)
"}\n\n"; {
divsq_func +=
" return _select(a, tmp, greaterThan(choice, vec4(0.)));\n";
}
else
{
divsq_func +=
" if (choice.x > 0.) a.x = tmp.x;\n"
" if (choice.y > 0.) a.y = tmp.y;\n"
" if (choice.z > 0.) a.z = tmp.z;\n"
" if (choice.w > 0.) a.w = tmp.w;\n"
" return a;\n";
}
OS << Format(divsq_func); divsq_func +=
"}\n\n";
OS << Format(divsq_func);
}
// Declare register gather/merge if needed // Declare register gather/merge if needed
if (properties.has_gather_op) if (properties.has_gather_op)
@ -730,24 +783,28 @@ std::string FragmentProgramDecompiler::BuildCode()
bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode) bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode)
{ {
// Compliance notes based on HW tests: // Compliance notes based on HW tests:
// DIV is IEEE compliant as is MUL, LG2, EX2 with exception to the fact that they operate on absolute values (Needs more testing) // DIV is IEEE compliant as is MUL, LG2, EX2. LG2 with negative input returns NaN as expected.
// DIVSQ is not compliant. Result is 0 if numerator is 0 regardless of denominator // DIVSQ is not compliant. Result is 0 if numerator is 0 regardless of denominator
// RSQ(0) and RCP(0) return INF as expected // RSQ(0) and RCP(0) return INF as expected
// RSQ and LG2 ignore the sign of the inputs (Metro Last Light, GTA4) // RSQ ignores the sign of the inputs (Metro Last Light, GTA4)
// SAT modifier flushes NaNs to 0
// Some games that rely on broken DIVSQ behaviour include Dark Souls II and Super Puzzle Fighter II Turbo HD Remix // Some games that rely on broken DIVSQ behaviour include Dark Souls II and Super Puzzle Fighter II Turbo HD Remix
switch (opcode) switch (opcode)
{ {
case RSX_FP_OPCODE_ADD: SetDst("($0 + $1)"); return true; case RSX_FP_OPCODE_ADD: SetDst("($0 + $1)"); return true;
case RSX_FP_OPCODE_DIV: SetDst("_builtin_div($0, $1.x)"); return true; case RSX_FP_OPCODE_DIV: SetDst("_builtin_div($0, $1.x)"); return true;
case RSX_FP_OPCODE_DIVSQ: SetDst("_builtin_divsq($0, $1.x)"); return true; case RSX_FP_OPCODE_DIVSQ:
SetDst("_builtin_divsq($0, $1.x)");
properties.has_divsq = true;
return true;
case RSX_FP_OPCODE_DP2: SetDst(getFunction(FUNCTION::FUNCTION_DP2), OPFLAGS::op_extern); return true; case RSX_FP_OPCODE_DP2: SetDst(getFunction(FUNCTION::FUNCTION_DP2), OPFLAGS::op_extern); return true;
case RSX_FP_OPCODE_DP3: SetDst(getFunction(FUNCTION::FUNCTION_DP3), OPFLAGS::op_extern); return true; case RSX_FP_OPCODE_DP3: SetDst(getFunction(FUNCTION::FUNCTION_DP3), OPFLAGS::op_extern); return true;
case RSX_FP_OPCODE_DP4: SetDst(getFunction(FUNCTION::FUNCTION_DP4), OPFLAGS::op_extern); return true; case RSX_FP_OPCODE_DP4: SetDst(getFunction(FUNCTION::FUNCTION_DP4), OPFLAGS::op_extern); return true;
case RSX_FP_OPCODE_DP2A: SetDst(getFunction(FUNCTION::FUNCTION_DP2A), OPFLAGS::op_extern); return true; case RSX_FP_OPCODE_DP2A: SetDst(getFunction(FUNCTION::FUNCTION_DP2A), OPFLAGS::op_extern); return true;
case RSX_FP_OPCODE_MAD: SetDst("($0 * $1 + $2)"); return true; case RSX_FP_OPCODE_MAD: SetDst("($0 * $1 + $2)"); return true;
case RSX_FP_OPCODE_MAX: SetDst("_builtin_max($0, $1)", OPFLAGS::src_cast_f32); return true; case RSX_FP_OPCODE_MAX: SetDst("max($0, $1)", OPFLAGS::src_cast_f32); return true;
case RSX_FP_OPCODE_MIN: SetDst("_builtin_min($0, $1)", OPFLAGS::src_cast_f32); return true; case RSX_FP_OPCODE_MIN: SetDst("min($0, $1)", OPFLAGS::src_cast_f32); return true;
case RSX_FP_OPCODE_MOV: SetDst("$0"); return true; case RSX_FP_OPCODE_MOV: SetDst("$0"); return true;
case RSX_FP_OPCODE_MUL: SetDst("($0 * $1)"); return true; case RSX_FP_OPCODE_MUL: SetDst("($0 * $1)"); return true;
case RSX_FP_OPCODE_RCP: SetDst("_builtin_rcp($0.x).xxxx"); return true; case RSX_FP_OPCODE_RCP: SetDst("_builtin_rcp($0.x).xxxx"); return true;
@ -763,7 +820,7 @@ bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode)
// SCB-only ops // SCB-only ops
case RSX_FP_OPCODE_COS: SetDst("cos($0.xxxx)"); return true; case RSX_FP_OPCODE_COS: SetDst("cos($0.xxxx)"); return true;
case RSX_FP_OPCODE_DST: SetDst("_builtin_distance($0, $1).xxxx", OPFLAGS::src_cast_f32); return true; case RSX_FP_OPCODE_DST: SetDst("distance($0, $1).xxxx", OPFLAGS::src_cast_f32); return true;
case RSX_FP_OPCODE_REFL: SetDst(getFunction(FUNCTION::FUNCTION_REFL), OPFLAGS::op_extern); return true; case RSX_FP_OPCODE_REFL: SetDst(getFunction(FUNCTION::FUNCTION_REFL), OPFLAGS::op_extern); return true;
case RSX_FP_OPCODE_EX2: SetDst("exp2($0.xxxx)"); return true; case RSX_FP_OPCODE_EX2: SetDst("exp2($0.xxxx)"); return true;
case RSX_FP_OPCODE_FLR: SetDst("floor($0)"); return true; case RSX_FP_OPCODE_FLR: SetDst("floor($0)"); return true;
@ -793,7 +850,7 @@ bool FragmentProgramDecompiler::handle_tex_srb(u32 opcode)
{ {
case RSX_FP_OPCODE_DDX: SetDst(getFunction(FUNCTION::FUNCTION_DFDX)); return true; case RSX_FP_OPCODE_DDX: SetDst(getFunction(FUNCTION::FUNCTION_DFDX)); return true;
case RSX_FP_OPCODE_DDY: SetDst(getFunction(FUNCTION::FUNCTION_DFDY)); return true; case RSX_FP_OPCODE_DDY: SetDst(getFunction(FUNCTION::FUNCTION_DFDY)); return true;
case RSX_FP_OPCODE_NRM: SetDst("normalize($0.xyz)"); return true; case RSX_FP_OPCODE_NRM: SetDst("_builtin_normalize($0.xyz).xyzz", OPFLAGS::src_cast_f32); return true;
case RSX_FP_OPCODE_BEM: SetDst("$0.xyxy + $1.xxxx * $2.xzxz + $1.yyyy * $2.ywyw"); return true; case RSX_FP_OPCODE_BEM: SetDst("$0.xyxy + $1.xxxx * $2.xzxz + $1.yyyy * $2.ywyw"); return true;
case RSX_FP_OPCODE_TEXBEM: case RSX_FP_OPCODE_TEXBEM:
//Untested, should be x2d followed by TEX //Untested, should be x2d followed by TEX

View file

@ -133,6 +133,7 @@ class FragmentProgramDecompiler
no_src_mask = 1, no_src_mask = 1,
src_cast_f32 = 2, src_cast_f32 = 2,
skip_type_cast = 4, skip_type_cast = 4,
texture_ref = 8,
op_extern = src_cast_f32 | skip_type_cast, op_extern = src_cast_f32 | skip_type_cast,
}; };
@ -175,7 +176,7 @@ class FragmentProgramDecompiler
std::string AddX2d(); std::string AddX2d();
//Prevents operations from overflowing the desired range (tested with fp_dynamic3 autotest sample, DS2 for src1.input_prec_mod) //Prevents operations from overflowing the desired range (tested with fp_dynamic3 autotest sample, DS2 for src1.input_prec_mod)
std::string ClampValue(const std::string& code, u32 precision, bool is_half_type); std::string ClampValue(const std::string& code, u32 precision);
/** /**
* Returns true if the dst set is not a vector (i.e only a single component) * Returns true if the dst set is not a vector (i.e only a single component)
@ -259,6 +260,8 @@ public:
bool has_no_output = false; bool has_no_output = false;
bool has_discard_op = false; bool has_discard_op = false;
bool has_tex_op = false; bool has_tex_op = false;
bool has_divsq = false;
bool has_clamp = false;
} }
properties; properties;