glsl: Refactoring, cleanup and optimizations

- Avoid generating unused code
- Reduce GPR usage in emitted code
This commit is contained in:
kd-11 2019-06-15 16:15:44 +03:00 committed by kd-11
parent c963c51a60
commit 6be7c58fa4
6 changed files with 145 additions and 122 deletions

View file

@ -229,23 +229,23 @@ namespace glsl
" bool modulo;\n" " bool modulo;\n"
"};\n\n" "};\n\n"
"uint get_bits(uvec4 v, bool swap)\n" "uint get_bits(uint x, uint y, uint z, uint w, bool swap)\n"
"{\n" "{\n"
" if (swap) return (v.w | v.z << 8 | v.y << 16 | v.x << 24);\n" " if (swap) return (w | z << 8 | y << 16 | x << 24);\n"
" return (v.x | v.y << 8 | v.z << 16 | v.w << 24);\n" " return (x | y << 8 | z << 16 | w << 24);\n"
"}\n\n" "}\n\n"
"uint get_bits(uvec2 v, bool swap)\n" "uint get_bits(uint x, uint y, bool swap)\n"
"{\n" "{\n"
" if (swap) return (v.y | v.x << 8);\n" " if (swap) return (y | x << 8);\n"
" return (v.x | v.y << 8);\n" " return (x | y << 8);\n"
"}\n\n" "}\n\n"
"int preserve_sign_s16(uint bits)\n" "int preserve_sign_s16(uint bits)\n"
"{\n" "{\n"
" //convert raw 16 bit value into signed 32-bit integer counterpart\n" " //convert raw 16 bit value into signed 32-bit integer counterpart\n"
" uint sign = bits & 0x8000;\n" " uint sign = bits & 0x8000;\n"
" if (sign != 0) return int(bits | 0xFFFF0000);\n" " if (sign != 0) bits |= 0xFFFF0000;\n"
" return int(bits);\n" " return int(bits);\n"
"}\n\n" "}\n\n"
@ -282,7 +282,7 @@ namespace glsl
{ {
OS << OS <<
"#define mov(v, i, s) v[i] = s\n" "#define mov(v, i, s) v[i] = s\n"
"#define ref(v, i) v[i]\n"; "#define ref(v, i) v[i]\n\n";
} }
OS << OS <<
@ -290,70 +290,67 @@ namespace glsl
"{\n" "{\n"
" vec4 result = vec4(0., 0., 0., 1.);\n" " vec4 result = vec4(0., 0., 0., 1.);\n"
" vec4 scale = vec4(1.);\n" " vec4 scale = vec4(1.);\n"
" uvec4 tmp;\n"
" uint bits;\n"
" bool reverse_order = false;\n" " bool reverse_order = false;\n"
"\n" "\n"
" int first_byte = int((vertex_id * desc.stride) + desc.starting_offset);\n" " const int elem_size_table[] = { 2, 4, 2, 1, 2, 4, 1 };\n"
" for (int n = 0; n < 4; n++)\n" " const int elem_size = elem_size_table[desc.type];\n"
" uvec4 tmp;\n"
"\n"
" int n;\n"
" int i = int((vertex_id * desc.stride) + desc.starting_offset);\n"
"\n"
" for (n = 0; n < desc.attribute_size; n++)\n"
" {\n" " {\n"
" if (n == desc.attribute_size) break;\n" " tmp.x = texelFetch(input_stream, i++).x;\n"
" if (elem_size == 2)\n"
" {\n"
" tmp.y = texelFetch(input_stream, i++).x;\n"
" tmp.x = get_bits(tmp.x, tmp.y, desc.swap_bytes);\n"
" }\n"
" else if (elem_size == 4)\n"
" {\n"
" tmp.y = texelFetch(input_stream, i++).x;\n"
" tmp.z = texelFetch(input_stream, i++).x;\n"
" tmp.w = texelFetch(input_stream, i++).x;\n"
" tmp.x = get_bits(tmp.x, tmp.y, tmp.z, tmp.w, desc.swap_bytes);\n"
" }\n"
"\n" "\n"
" switch (desc.type)\n" " switch (desc.type)\n"
" {\n" " {\n"
" case 0:\n" " case 0:\n"
" //signed normalized 16-bit\n" " //signed normalized 16-bit\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n"
" mov(result, n, get_s16(tmp.xy, desc.swap_bytes));\n"
" mov(scale, n, 32767.);\n" " mov(scale, n, 32767.);\n"
" case 4:\n"
" //signed word\n"
" mov(result, n, preserve_sign_s16(tmp.x));\n"
" break;\n" " break;\n"
" case 1:\n" " case 1:\n"
" //float\n" " //float\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n" " mov(result, n, uintBitsToFloat(tmp.x));\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n"
" tmp.z = texelFetch(input_stream, first_byte++).x;\n"
" tmp.w = texelFetch(input_stream, first_byte++).x;\n"
" mov(result, n, uintBitsToFloat(get_bits(tmp, desc.swap_bytes)));\n"
" break;\n" " break;\n"
" case 2:\n" " case 2:\n"
" //half\n" " //half\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n" " mov(result, n, unpackHalf2x16(tmp.x).x);\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n"
" mov(result, n, unpackHalf2x16(uint(get_bits(tmp.xy, desc.swap_bytes))).x);\n"
" break;\n" " break;\n"
" case 3:\n" " case 3:\n"
" //unsigned byte\n" " //unsigned byte\n"
" mov(result, n, texelFetch(input_stream, first_byte++).x);\n"
" mov(scale, n, 255.);\n" " mov(scale, n, 255.);\n"
" case 6:\n"
" //ub256\n"
" mov(result, n, tmp.x);\n"
" reverse_order = desc.swap_bytes;\n" " reverse_order = desc.swap_bytes;\n"
" break;\n" " break;\n"
" case 4:\n"
" //signed word\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n"
" mov(result, n, get_s16(tmp.xy, desc.swap_bytes));\n"
" break;\n"
" case 5:\n" " case 5:\n"
" //cmp\n" " //cmp\n"
" tmp.x = texelFetch(input_stream, first_byte++).x;\n" " result.x = preserve_sign_s16((tmp.x & 0x7FF) << 5);\n"
" tmp.y = texelFetch(input_stream, first_byte++).x;\n" " result.y = preserve_sign_s16(((tmp.x >> 11) & 0x7FF) << 5);\n"
" tmp.z = texelFetch(input_stream, first_byte++).x;\n" " result.z = preserve_sign_s16(((tmp.x >> 22) & 0x3FF) << 6);\n"
" tmp.w = texelFetch(input_stream, first_byte++).x;\n"
" bits = get_bits(tmp, desc.swap_bytes);\n"
" result.x = preserve_sign_s16((bits & 0x7FF) << 5);\n"
" result.y = preserve_sign_s16(((bits >> 11) & 0x7FF) << 5);\n"
" result.z = preserve_sign_s16(((bits >> 22) & 0x3FF) << 6);\n"
" result.w = 1.;\n" " result.w = 1.;\n"
" scale = vec4(32767., 32767., 32767., 1.);\n" " scale = vec4(32767., 32767., 32767., 1.);\n"
" break;\n" " break;\n"
" case 6:\n"
" //ub256\n"
" mov(result, n, float(texelFetch(input_stream, first_byte++).x));\n"
" reverse_order = desc.swap_bytes;\n"
" break;\n"
" }\n" " }\n"
" }\n\n" " }\n"
"\n"
" result /= scale;\n" " result /= scale;\n"
" return (reverse_order)? result.wzyx: result;\n" " return (reverse_order)? result.wzyx: result;\n"
"}\n\n" "}\n\n"
@ -410,17 +407,14 @@ namespace glsl
" {\n" " {\n"
" vertex_id = 0;\n" " vertex_id = 0;\n"
" }\n" " }\n"
" else if (desc.frequency > 1)\n" " else if (desc.modulo)\n"
" {\n" " {\n"
" //if a vertex modifier is active; vertex_base must be 0 and is ignored\n" " //if a vertex modifier is active; vertex_base must be 0 and is ignored\n"
" if (desc.modulo)\n"
" {\n"
" vertex_id = (" << vertex_id_name << " + int(vertex_index_offset)) % int(desc.frequency);\n" " vertex_id = (" << vertex_id_name << " + int(vertex_index_offset)) % int(desc.frequency);\n"
" }\n" " }\n"
" else\n" " else\n"
" {\n" " {\n"
" vertex_id = vertex_id / int(desc.frequency); \n" " vertex_id /= int(desc.frequency); \n"
" }\n"
" }\n" " }\n"
"\n" "\n"
" if (desc.is_volatile)\n" " if (desc.is_volatile)\n"
@ -430,7 +424,7 @@ namespace glsl
"}\n\n"; "}\n\n";
} }
static void insert_rop(std::ostream& OS, bool _32_bit_exports, bool native_half_support) static void insert_rop(std::ostream& OS, bool _32_bit_exports, bool native_half_support, bool emulate_coverage_tests)
{ {
const std::string reg0 = _32_bit_exports ? "r0" : "h0"; const std::string reg0 = _32_bit_exports ? "r0" : "h0";
const std::string reg1 = _32_bit_exports ? "r2" : "h4"; const std::string reg1 = _32_bit_exports ? "r2" : "h4";
@ -442,17 +436,32 @@ namespace glsl
" if ((rop_control & 0xFF) != 0)\n" " if ((rop_control & 0xFF) != 0)\n"
" {\n" " {\n"
" bool alpha_test = (rop_control & 0x1) > 0;\n" " bool alpha_test = (rop_control & 0x1) > 0;\n"
" uint alpha_func = ((rop_control >> 16) & 0x7);\n" " uint alpha_func = ((rop_control >> 16) & 0x7);\n";
" bool srgb_convert = (rop_control & 0x2) > 0;\n\n"
" bool a2c_enabled = (rop_control & 0x10) > 0;\n" if (!_32_bit_exports)
{
OS << " bool srgb_convert = (rop_control & 0x2) > 0;\n\n";
}
if (emulate_coverage_tests)
{
OS << " bool a2c_enabled = (rop_control & 0x10) > 0;\n";
}
OS <<
" if (alpha_test && !comparison_passes(" << reg0 << ".a, alpha_ref, alpha_func))\n" " if (alpha_test && !comparison_passes(" << reg0 << ".a, alpha_ref, alpha_func))\n"
" {\n" " {\n"
" discard;\n" " discard;\n"
" }\n" " }\n";
if (emulate_coverage_tests)
{
OS <<
" else if (a2c_enabled && !coverage_test_passes(" << reg0 << ", rop_control >> 5))\n" " else if (a2c_enabled && !coverage_test_passes(" << reg0 << ", rop_control >> 5))\n"
" {\n" " {\n"
" discard;\n" " discard;\n"
" }\n"; " }\n";
}
if (!_32_bit_exports) if (!_32_bit_exports)
{ {
@ -535,11 +544,13 @@ namespace glsl
program_common::insert_compare_op(OS, props.low_precision_tests); program_common::insert_compare_op(OS, props.low_precision_tests);
if (props.require_texture_ops && props.emulate_shadow_compare) if (props.require_shadow_ops && props.emulate_shadow_compare)
{ {
program_common::insert_compare_op_vector(OS); program_common::insert_compare_op_vector(OS);
} }
if (props.emulate_coverage_tests)
{
// NOTES: // NOTES:
// Lowers alpha accuracy down to 2 bits, to mimic A2C banding // Lowers alpha accuracy down to 2 bits, to mimic A2C banding
// Alpha lower than the real threshold (e.g 0.25 for 4 samples) gets a randomized chance to make it to the lowest transparency state // Alpha lower than the real threshold (e.g 0.25 for 4 samples) gets a randomized chance to make it to the lowest transparency state
@ -555,21 +566,20 @@ namespace glsl
" float alpha = trunc((_sample.a + epsilon) * samples) / samples;\n" " float alpha = trunc((_sample.a + epsilon) * samples) / samples;\n"
" //_sample.a = min(_sample.a, alpha);\n" // Cannot blend A2C samples naively as they are order independent! Causes background bleeding " //_sample.a = min(_sample.a, alpha);\n" // Cannot blend A2C samples naively as they are order independent! Causes background bleeding
" return (alpha > 0.f);\n" " return (alpha > 0.f);\n"
"}\n\n" "}\n\n";
}
if (!props.fp32_outputs)
{
OS <<
"vec4 linear_to_srgb(vec4 cl)\n" "vec4 linear_to_srgb(vec4 cl)\n"
"{\n" "{\n"
" vec4 low = cl * 12.92;\n" " vec4 low = cl * 12.92;\n"
" vec4 high = 1.055 * pow(cl, vec4(1. / 2.4)) - 0.055;\n" " vec4 high = 1.055 * pow(cl, vec4(1. / 2.4)) - 0.055;\n"
" bvec4 select = lessThan(cl, vec4(0.0031308));\n" " bvec4 select = lessThan(cl, vec4(0.0031308));\n"
" return clamp(mix(high, low, select), 0., 1.);\n" " return clamp(mix(high, low, select), 0., 1.);\n"
"}\n\n"
"float srgb_to_linear(float cs)\n"
"{\n"
" if (cs <= 0.04045) return cs / 12.92;\n"
" return pow((cs + 0.055) / 1.055, 2.4);\n"
"}\n\n"; "}\n\n";
}
if (props.require_depth_conversion) if (props.require_depth_conversion)
{ {
@ -617,7 +627,7 @@ namespace glsl
if (props.require_texture_ops) if (props.require_texture_ops)
{ {
if (props.emulate_shadow_compare) if (props.require_shadow_ops && props.emulate_shadow_compare)
{ {
OS << OS <<
"vec4 shadowCompare(sampler2D tex, vec3 p, uint func)\n" "vec4 shadowCompare(sampler2D tex, vec3 p, uint func)\n"
@ -648,6 +658,12 @@ namespace glsl
" return mix(direct, indexed, choice);\n" " return mix(direct, indexed, choice);\n"
"}\n\n" "}\n\n"
#endif #endif
"vec4 srgb_to_linear(vec4 cs)\n"
"{\n"
" vec4 a = cs / 12.92;\n"
" vec4 b = pow((cs + 0.055) / 1.055, vec4(2.4));\n"
" return _select(a, b, greaterThan(cs, vec4(0.04045)));\n"
"}\n\n"
//TODO: Move all the texture read control operations here //TODO: Move all the texture read control operations here
"vec4 process_texel(vec4 rgba, uint control_bits)\n" "vec4 process_texel(vec4 rgba, uint control_bits)\n"
@ -656,23 +672,25 @@ namespace glsl
" uint remap_bits = (control_bits >> 16) & 0xFFFF;\n" " uint remap_bits = (control_bits >> 16) & 0xFFFF;\n"
" if (remap_bits != 0x8D5) rgba = remap_vector(rgba, remap_bits);\n\n" " if (remap_bits != 0x8D5) rgba = remap_vector(rgba, remap_bits);\n\n"
#endif #endif
" if ((control_bits & 0xFF) == 0) return rgba;\n\n" " if (control_bits == 0)\n"
" if ((control_bits & 0x10) > 0)\n" " {\n"
" return rgba;\n"
" }\n"
"\n"
" if ((control_bits & 0x10) != 0)\n"
" {\n" " {\n"
" // Alphakill\n" " // Alphakill\n"
" if (rgba.a < 0.0000000001)\n" " if (rgba.a < 0.000001)\n"
" {\n" " {\n"
" discard;\n" " discard;\n"
" return rgba;\n" " return rgba;\n"
" }\n" " }\n"
" }\n\n" " }\n"
"\n"
" //TODO: Verify gamma control bit ordering, looks to be 0x7 for rgb, 0xF for rgba\n" " //TODO: Verify gamma control bit ordering, looks to be 0x7 for rgb, 0xF for rgba\n"
" uint srgb_in = (control_bits & 0xF);\n" " uvec4 mask = uvec4(control_bits & 0xF) & uvec4(0x1, 0x2, 0x4, 0x8);\n"
" if ((srgb_in & 0x1) > 0) rgba.r = srgb_to_linear(rgba.r);\n" " vec4 convert = srgb_to_linear(rgba);\n"
" if ((srgb_in & 0x2) > 0) rgba.g = srgb_to_linear(rgba.g);\n" " return _select(rgba, convert, notEqual(mask, uvec4(0)));\n"
" if ((srgb_in & 0x4) > 0) rgba.b = srgb_to_linear(rgba.b);\n"
" if ((srgb_in & 0x8) > 0) rgba.a = srgb_to_linear(rgba.a);\n"
" return rgba;\n"
"}\n\n" "}\n\n"
"#define TEX_NAME(index) tex##index\n" "#define TEX_NAME(index) tex##index\n"

View file

@ -1,4 +1,4 @@
#pragma once #pragma once
namespace glsl namespace glsl
{ {
@ -22,9 +22,12 @@ namespace glsl
bool require_lit_emulation; bool require_lit_emulation;
// Only relevant for fragment programs // Only relevant for fragment programs
bool fp32_outputs;
bool require_wpos; bool require_wpos;
bool require_depth_conversion; bool require_depth_conversion;
bool require_texture_ops; bool require_texture_ops;
bool require_shadow_ops;
bool emulate_coverage_tests;
bool emulate_shadow_compare; bool emulate_shadow_compare;
bool low_precision_tests; bool low_precision_tests;
}; };

View file

@ -199,9 +199,12 @@ void GLFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS)
glsl::shader_properties properties2; glsl::shader_properties properties2;
properties2.domain = glsl::glsl_fragment_program; properties2.domain = glsl::glsl_fragment_program;
properties2.require_lit_emulation = properties.has_lit_op; properties2.require_lit_emulation = properties.has_lit_op;
properties2.fp32_outputs = !!(m_prog.ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS);
properties2.require_depth_conversion = m_prog.redirected_textures != 0; properties2.require_depth_conversion = m_prog.redirected_textures != 0;
properties2.require_wpos = properties.has_wpos_input; properties2.require_wpos = properties.has_wpos_input;
properties2.require_texture_ops = properties.has_tex_op; properties2.require_texture_ops = properties.has_tex_op;
properties2.require_shadow_ops = m_prog.shadow_textures != 0;
properties2.emulate_coverage_tests = g_cfg.video.antialiasing_level == msaa_level::none;
properties2.emulate_shadow_compare = device_props.emulate_depth_compare; properties2.emulate_shadow_compare = device_props.emulate_depth_compare;
properties2.low_precision_tests = ::gl::get_driver_caps().vendor_NVIDIA; properties2.low_precision_tests = ::gl::get_driver_caps().vendor_NVIDIA;
@ -350,7 +353,11 @@ void GLFragmentDecompilerThread::insertMainEnd(std::stringstream & OS)
OS << "\n" << " fs_main(" + parameters + ");\n\n"; OS << "\n" << " fs_main(" + parameters + ");\n\n";
glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS), device_props.has_native_half_support); glsl::insert_rop(
OS,
!!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS),
device_props.has_native_half_support,
g_cfg.video.antialiasing_level == msaa_level::none);
if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT) if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
{ {

View file

@ -157,15 +157,9 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
{ {
const auto& dev_caps = gl::get_driver_caps(); const auto& dev_caps = gl::get_driver_caps();
glsl::shader_properties properties2; glsl::shader_properties properties2{};
properties2.domain = glsl::glsl_vertex_program; properties2.domain = glsl::glsl_vertex_program;
properties2.require_lit_emulation = properties.has_lit_op; properties2.require_lit_emulation = properties.has_lit_op;
// Unused
properties2.require_depth_conversion = false;
properties2.require_wpos = false;
properties2.require_texture_ops = false;
properties2.emulate_shadow_compare = false;
properties2.low_precision_tests = false;
insert_glsl_legacy_function(OS, properties2); insert_glsl_legacy_function(OS, properties2);
glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4, dev_caps.vendor_INTEL == false); glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4, dev_caps.vendor_INTEL == false);

View file

@ -229,9 +229,12 @@ void VKFragmentDecompilerThread::insertGlobalFunctions(std::stringstream &OS)
glsl::shader_properties properties2; glsl::shader_properties properties2;
properties2.domain = glsl::glsl_fragment_program; properties2.domain = glsl::glsl_fragment_program;
properties2.require_lit_emulation = properties.has_lit_op; properties2.require_lit_emulation = properties.has_lit_op;
properties2.fp32_outputs = !!(m_prog.ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS);
properties2.require_depth_conversion = m_prog.redirected_textures != 0; properties2.require_depth_conversion = m_prog.redirected_textures != 0;
properties2.require_wpos = properties.has_wpos_input; properties2.require_wpos = properties.has_wpos_input;
properties2.require_texture_ops = properties.has_tex_op; properties2.require_texture_ops = properties.has_tex_op;
properties2.require_shadow_ops = m_prog.shadow_textures != 0;
properties2.emulate_coverage_tests = g_cfg.video.antialiasing_level == msaa_level::none;
properties2.emulate_shadow_compare = device_props.emulate_depth_compare; properties2.emulate_shadow_compare = device_props.emulate_depth_compare;
properties2.low_precision_tests = vk::get_driver_vendor() == vk::driver_vendor::NVIDIA; properties2.low_precision_tests = vk::get_driver_vendor() == vk::driver_vendor::NVIDIA;
@ -383,7 +386,11 @@ void VKFragmentDecompilerThread::insertMainEnd(std::stringstream & OS)
OS << "\n" << " fs_main(" + parameters + ");\n\n"; OS << "\n" << " fs_main(" + parameters + ");\n\n";
glsl::insert_rop(OS, !!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS), device_props.has_native_half_support); glsl::insert_rop(
OS,
!!(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS),
device_props.has_native_half_support,
g_cfg.video.antialiasing_level == msaa_level::none);
if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT) if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
{ {

View file

@ -193,15 +193,9 @@ void VKVertexDecompilerThread::insertOutputs(std::stringstream & OS, const std::
void VKVertexDecompilerThread::insertMainStart(std::stringstream & OS) void VKVertexDecompilerThread::insertMainStart(std::stringstream & OS)
{ {
glsl::shader_properties properties2; glsl::shader_properties properties2{};
properties2.domain = glsl::glsl_vertex_program; properties2.domain = glsl::glsl_vertex_program;
properties2.require_lit_emulation = properties.has_lit_op; properties2.require_lit_emulation = properties.has_lit_op;
// Unused
properties2.require_depth_conversion = false;
properties2.require_wpos = false;
properties2.require_texture_ops = false;
properties2.emulate_shadow_compare = false;
properties2.low_precision_tests = false;
glsl::insert_glsl_legacy_function(OS, properties2); glsl::insert_glsl_legacy_function(OS, properties2);
glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_spirv); glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_spirv);