From abe7188acfac6eb54ddb2f5317ec415a7701d033 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 16 Apr 2019 21:57:52 +0300 Subject: [PATCH] rsx: Proper workaround for broken DIVSQ instruction on realhw - While mul(0, nan) = nan and 0 / 0 = nan, 0 / sqrt(0) = 0 because of hw gremlins. normalize(0) is also nan so this behaviour does not work around that particular case either which makes it even more baffling. --- .../RSX/Common/FragmentProgramDecompiler.cpp | 79 +++++++++++-------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp index 17f5e02383..b466820d82 100644 --- a/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp @@ -346,6 +346,9 @@ std::string FragmentProgramDecompiler::Format(const std::string& code, bool igno }, { "$cond", std::bind(std::mem_fn(&FragmentProgramDecompiler::GetCond), this) }, { "$_c", std::bind(std::mem_fn(&FragmentProgramDecompiler::AddConst), this) }, + { "$float4", [this]() -> std::string { return getFloatTypeName(4); } }, + { "$float3", [this]() -> std::string { return getFloatTypeName(3); } }, + { "$float2", [this]() -> std::string { return getFloatTypeName(2); } }, { "$Ty", [this]() -> std::string { return (!device_props.has_native_half_support || !dst.fp16)? getFloatTypeName(4) : getHalfTypeName(4); } } }; @@ -551,8 +554,8 @@ std::string FragmentProgramDecompiler::BuildCode() // Shader must at least write to one output for the body to be considered valid const bool fp16_out = !(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS); - const std::string vec4_type = (fp16_out && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4); - const std::string init_value = vec4_type + "(0., 0., 0., 0.)"; + const std::string $float4_type = (fp16_out && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4); + const std::string init_value = $float4_type + "(0., 0., 0., 0.)"; std::array output_register_names; std::array ouput_register_indices = { 0, 2, 3, 4 }; bool shader_is_valid = false; @@ -579,9 +582,9 @@ std::string FragmentProgramDecompiler::BuildCode() for (int n = 0; n < 4; ++n) { - if (!m_parr.HasParam(PF_PARAM_NONE, vec4_type, output_register_names[n])) + if (!m_parr.HasParam(PF_PARAM_NONE, $float4_type, output_register_names[n])) { - m_parr.AddParam(PF_PARAM_NONE, vec4_type, output_register_names[n], init_value); + m_parr.AddParam(PF_PARAM_NONE, $float4_type, output_register_names[n], init_value); continue; } @@ -614,24 +617,20 @@ std::string FragmentProgramDecompiler::BuildCode() // Insert global function definitions insertGlobalFunctions(OS); + std::string float4 = getFloatTypeName(4); + if (!device_props.has_native_half_support) { // Accurate float to half clamping (preserves IEEE-754 NaN) - OS << - "vec4 clamp16(vec4 x)\n" + std::string clamp_func = + "$float4 clamp16($float4 x)\n" "{\n" - " bvec4 sel = isnan(x);\n" - " vec4 clamped = clamp(x, -65504., +65504.);\n" - " if (!any(sel))\n" - " {\n" - " return clamped;\n" - " }\n\n" + " $float4 sel = $float4(isnan(x));\n" + " $float4 clamped = clamp(x, -65504., +65504.);\n" " return _select(clamped, x, sel);\n" - "}\n\n" + "}\n\n"; - "vec3 clamp16(vec3 x){ return clamp16(x.xyzz).xyz; }\n" - "vec2 clamp16(vec2 x){ return clamp16(x.xyxy).xy; }\n" - "float clamp16(float x){ return isnan(x)? x : clamp(x, -65504., +65504.); }\n\n"; + OS << Format(clamp_func); OS << "#define _builtin_min min\n" @@ -641,33 +640,48 @@ std::string FragmentProgramDecompiler::BuildCode() "#define _builtin_rcp(x) (1. / x)\n" "#define _builtin_rsq(x) (1. / sqrt(x))\n" "#define _builtin_log2(x) log2(abs(x))\n" - "#define _builtin_div(x, y) (x / y)\n" - "#define _builtin_divsq(x, y) (x / sqrt(y))\n\n"; + "#define _builtin_div(x, y) (x / y)\n\n"; } else { // Define raw casts from f32->f16 // Also define upcasting to avoid ambiguous function overloading in case of mixed inputs const std::string half4 = getHalfTypeName(4); - OS << - "#define clamp16(x) " << half4 << "(x)\n" - "#define _builtin_min(x, y) min(vec4(x), vec4(y))\n" - "#define _builtin_max(x, y) max(vec4(x), vec4(y))\n" + const std::string builtin_funcs = + "#define clamp16(x) " + half4 + "(x)\n" + "#define _builtin_min(x, y) min($float4(x), $float4(y))\n" + "#define _builtin_max(x, y) max($float4(x), $float4(y))\n" "#define _builtin_lit lit_legacy\n" - "#define _builtin_distance(x, y) distance(vec4(x), vec4(y))\n" + "#define _builtin_distance(x, y) distance($float4(x), $float4(y))\n" "#define _builtin_rcp(x) (1. / x)\n" "#define _builtin_rsq(x) (1. / sqrt(x))\n" "#define _builtin_log2(x) log2(abs(x))\n" - "#define _builtin_div(x, y) (x / y)\n" - "#define _builtin_divsq(x, y) (x / sqrt(y))\n\n"; + "#define _builtin_div(x, y) (x / y)\n\n"; + + OS << Format(builtin_funcs); } - + + // Define RSX-compliant DIVSQ + // If the numerator is 0, the result is always 0 even if the denominator is 0 + // NOTE: This operation is component-wise and cannot be accelerated with lerp/mix because these always return NaN if any of the choices is NaN + std::string divsq_func = + "$float4 _builtin_divsq($float4 a, float b)\n" + "{" + " $float4 tmp = a / sqrt(b);\n" + " $float4 choice = abs(a);\n" + " if (choice.x > 0.) a.x = tmp.x;\n" + " if (choice.y > 0.) a.y = tmp.y;\n" + " if (choice.z > 0.) a.z = tmp.z;\n" + " if (choice.w > 0.) a.w = tmp.w;\n" + " return a;\n" + "}\n\n"; + + OS << Format(divsq_func); // Declare register gather/merge if needed if (properties.has_gather_op) { std::string float2 = getFloatTypeName(2); - std::string float4 = getFloatTypeName(4); OS << float4 << " gather(" << float4 << " _h0, " << float4 << " _h1)\n"; OS << "{\n"; @@ -695,12 +709,16 @@ std::string FragmentProgramDecompiler::BuildCode() bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode) { + // Compliance notes based on HW tests: + // DIV is IEEE compliant as is MUL, LG2, EX2 with exception to the fact that they operate on absolute values (Needs more testing) + // DIVSQ is not compliant. Result is 0 if numerator is 0 regardless of denominator + // RSQ(0) and RCP(0) return INF as expected + // Some games that rely on broken DIVSQ behaviour include Dark Souls II and Super Puzzle Fighter II Turbo HD Remix + switch (opcode) { case RSX_FP_OPCODE_ADD: SetDst("($0 + $1)"); return true; case RSX_FP_OPCODE_DIV: SetDst("_builtin_div($0, $1.x)"); return true; - // Note: DIVSQ is not IEEE compliant. divsq(0, 0) is 0 (Super Puzzle Fighter II Turbo HD Remix). - // sqrt(x, 0) might be equal to some big value (in absolute) whose sign is sign(x) but it has to be proven. case RSX_FP_OPCODE_DIVSQ: SetDst("_builtin_divsq($0, $1.x)"); return true; case RSX_FP_OPCODE_DP2: SetDst(getFunction(FUNCTION::FUNCTION_DP2), OPFLAGS::op_extern); return true; case RSX_FP_OPCODE_DP3: SetDst(getFunction(FUNCTION::FUNCTION_DP3), OPFLAGS::op_extern); return true; @@ -711,10 +729,7 @@ bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode) case RSX_FP_OPCODE_MIN: SetDst("_builtin_min($0, $1)"); return true; case RSX_FP_OPCODE_MOV: SetDst("$0"); return true; case RSX_FP_OPCODE_MUL: SetDst("($0 * $1)"); return true; - // Note: It's highly likely that RCP is not IEEE compliant but a game that uses rcp(0) has to be found case RSX_FP_OPCODE_RCP: SetDst("_builtin_rcp($0.x).xxxx"); return true; - // Note: RSQ is not IEEE compliant. rsq(0) is some big number (Silent Hill 3 HD) - // It is not know what happens if 0 is negative. case RSX_FP_OPCODE_RSQ: SetDst("_builtin_rsq($0.x).xxxx"); return true; case RSX_FP_OPCODE_SEQ: SetDst("$Ty(" + compareFunction(COMPARE::FUNCTION_SEQ, "$0", "$1") + ")", OPFLAGS::op_extern); return true; case RSX_FP_OPCODE_SFL: SetDst(getFunction(FUNCTION::FUNCTION_SFL), OPFLAGS::skip_type_cast); return true;