diff --git a/rpcs3/Emu/RSX/GL/GLDraw.cpp b/rpcs3/Emu/RSX/GL/GLDraw.cpp index b2de00e3b7..33ef42570b 100644 --- a/rpcs3/Emu/RSX/GL/GLDraw.cpp +++ b/rpcs3/Emu/RSX/GL/GLDraw.cpp @@ -599,7 +599,11 @@ void GLGSRender::emit_geometry(u32 sub_index) if (!upload_info.index_info) { - if (draw_call.is_single_draw()) + if (draw_call.is_trivial_instanced_draw) + { + glDrawArraysInstanced(draw_mode, 0, upload_info.vertex_draw_count, draw_call.pass_count()); + } + else if (draw_call.is_single_draw()) { glDrawArrays(draw_mode, 0, upload_info.vertex_draw_count); } @@ -625,7 +629,7 @@ void GLGSRender::emit_geometry(u32 sub_index) if (driver_caps.vendor_AMD && (first + range.count) > (0x100000 >> 2)) { - //Unlikely, but added here in case the identity buffer is not large enough somehow + // Unlikely, but added here in case the identity buffer is not large enough somehow use_draw_arrays_fallback = true; break; } @@ -635,7 +639,7 @@ void GLGSRender::emit_geometry(u32 sub_index) if (use_draw_arrays_fallback) { - //MultiDrawArrays is broken on some primitive types using AMD. One known type is GL_TRIANGLE_STRIP but there could be more + // MultiDrawArrays is broken on some primitive types using AMD. One known type is GL_TRIANGLE_STRIP but there could be more for (u32 n = 0; n < draw_count; ++n) { glDrawArrays(draw_mode, firsts[n], counts[n]); @@ -643,13 +647,13 @@ void GLGSRender::emit_geometry(u32 sub_index) } else if (driver_caps.vendor_AMD) { - //Use identity index buffer to fix broken vertexID on AMD + // Use identity index buffer to fix broken vertexID on AMD m_identity_index_buffer->bind(); glMultiDrawElements(draw_mode, counts, GL_UNSIGNED_INT, offsets, static_cast(draw_count)); } else { - //Normal render + // Normal render glMultiDrawArrays(draw_mode, firsts, counts, static_cast(draw_count)); } } @@ -667,7 +671,11 @@ void GLGSRender::emit_geometry(u32 sub_index) m_index_ring_buffer->bind(); - if (draw_call.is_single_draw()) + if (draw_call.is_trivial_instanced_draw) + { + glDrawElementsInstanced(draw_mode, upload_info.vertex_draw_count, index_type, reinterpret_cast(u64{ index_offset }), draw_call.pass_count()); + } + else if (draw_call.is_single_draw()) { glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, reinterpret_cast(u64{index_offset})); } @@ -781,13 +789,20 @@ void GLGSRender::end() m_program->validate(); } - rsx::method_registers.current_draw_clause.begin(); + auto& draw_call = REGS(m_ctx)->current_draw_clause; + draw_call.begin(); u32 subdraw = 0u; do { emit_geometry(subdraw++); + + if (draw_call.is_trivial_instanced_draw) + { + // We already completed. End the draw. + draw_call.end(); + } } - while (rsx::method_registers.current_draw_clause.next()); + while (draw_call.next()); m_rtts.on_write(m_framebuffer_layout.color_write_enabled, m_framebuffer_layout.zeta_write_enabled); diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 8cefa0c794..c23f560268 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -296,6 +296,7 @@ void GLGSRender::on_init_thread() m_fragment_instructions_buffer = std::make_unique(); m_raster_env_ring_buffer = std::make_unique(); m_scratch_ring_buffer = std::make_unique(); + m_instancing_ring_buffer = std::make_unique(); } else { @@ -311,6 +312,7 @@ void GLGSRender::on_init_thread() m_fragment_instructions_buffer = std::make_unique(); m_raster_env_ring_buffer = std::make_unique(); m_scratch_ring_buffer = std::make_unique(); + m_instancing_ring_buffer = std::make_unique(); } m_attrib_ring_buffer->create(gl::buffer::target::texture, 256 * 0x100000); @@ -323,6 +325,7 @@ void GLGSRender::on_init_thread() m_vertex_layout_buffer->create(gl::buffer::target::uniform, 16 * 0x100000); m_raster_env_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000); m_scratch_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000); + m_instancing_ring_buffer->create(gl::buffer::target::ssbo, 64 * 0x100000); if (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only) { @@ -547,6 +550,11 @@ void GLGSRender::on_exit() m_scratch_ring_buffer->remove(); } + if (m_instancing_ring_buffer) + { + m_instancing_ring_buffer->remove(); + } + m_null_textures.clear(); m_gl_texture_cache.destroy(); m_ui_renderer.destroy(); @@ -866,7 +874,8 @@ void GLGSRender::load_program_env() const bool update_fragment_env = m_graphics_state & rsx::pipeline_state::fragment_state_dirty; const bool update_fragment_texture_env = m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty; const bool update_instruction_buffers = !!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program); - const bool update_raster_env = rsx::method_registers.polygon_stipple_enabled() && (m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty); + const bool update_raster_env = REGS(m_ctx)->polygon_stipple_enabled() && (m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty); + const bool update_instancing_data = REGS(m_ctx)->current_draw_clause.is_trivial_instanced_draw; if (manually_flush_ring_buffers) { @@ -876,6 +885,7 @@ void GLGSRender::load_program_env() if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256)); if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192); if (update_raster_env) m_raster_env_ring_buffer->reserve_storage_on_heap(128); + if (update_instancing_data) m_instancing_ring_buffer->reserve_storage_on_heap(8192 * REGS(m_ctx)->current_draw_clause.pass_count()); if (update_instruction_buffers) { @@ -899,6 +909,33 @@ void GLGSRender::load_program_env() m_vertex_env_buffer->bind_range(GL_VERTEX_PARAMS_BIND_SLOT, mapping.second, 144); } + if (update_instancing_data) + { + // Combines transform load + instancing lookup table + const auto alignment = m_min_ssbo_alignment; + u32 indirection_table_offset = 0; + u32 constants_data_table_offset = 0; + + rsx::io_buffer indirection_table_buf([&](usz size) -> std::pair + { + const auto mapping = m_instancing_ring_buffer->alloc_from_heap(static_cast(size), alignment); + indirection_table_offset = mapping.second; + return mapping; + }); + + rsx::io_buffer constants_array_buf([&](usz size) -> std::pair + { + const auto mapping = m_instancing_ring_buffer->alloc_from_heap(static_cast(size), alignment); + constants_data_table_offset = mapping.second; + return mapping; + }); + + m_draw_processor.fill_constants_instancing_buffer(indirection_table_buf, constants_array_buf, m_vertex_prog); + + m_instancing_ring_buffer->bind_range(GL_INSTANCING_LUT_BIND_SLOT, indirection_table_offset, ::size32(indirection_table_buf)); + m_instancing_ring_buffer->bind_range(GL_INSTANCING_XFORM_CONSTANTS_SLOT, constants_data_table_offset, ::size32(constants_array_buf)); + } + if (update_transform_constants) { // Vertex constants @@ -1011,6 +1048,7 @@ void GLGSRender::load_program_env() if (update_fragment_constants) m_fragment_constants_buffer->unmap(); if (update_transform_constants) m_transform_constants_buffer->unmap(); if (update_raster_env) m_raster_env_ring_buffer->unmap(); + if (update_instancing_data) m_instancing_ring_buffer->unmap(); if (update_instruction_buffers) { diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h index 1c84233934..8b2caaf194 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.h +++ b/rpcs3/Emu/RSX/GL/GLGSRender.h @@ -105,6 +105,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control std::unique_ptr m_vertex_instructions_buffer; std::unique_ptr m_fragment_instructions_buffer; std::unique_ptr m_raster_env_ring_buffer; + std::unique_ptr m_instancing_ring_buffer; // Identity buffer used to fix broken gl_VertexID on ATI stack std::unique_ptr m_identity_index_buffer; @@ -117,6 +118,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control GLint m_min_texbuffer_alignment = 256; GLint m_uniform_buffer_offset_align = 256; + GLint m_min_ssbo_alignment = 256; GLint m_max_texbuffer_size = 65536; bool manually_flush_ring_buffers = false; diff --git a/rpcs3/Emu/RSX/GL/GLProcTable.h b/rpcs3/Emu/RSX/GL/GLProcTable.h index 783701e4dd..8f42995a9d 100644 --- a/rpcs3/Emu/RSX/GL/GLProcTable.h +++ b/rpcs3/Emu/RSX/GL/GLProcTable.h @@ -183,6 +183,9 @@ OPENGL_PROC(PFNGLUNMAPNAMEDBUFFEREXTPROC, UnmapNamedBufferEXT); OPENGL_PROC(PFNGLMULTIDRAWELEMENTSPROC, MultiDrawElements); OPENGL_PROC(PFNGLMULTIDRAWARRAYSPROC, MultiDrawArrays); +OPENGL_PROC(PFNGLDRAWARRAYSINSTANCEDPROC, DrawArraysInstanced); +OPENGL_PROC(PFNGLDRAWELEMENTSINSTANCEDPROC, DrawElementsInstanced); + OPENGL_PROC(PFNGLGETTEXTUREIMAGEEXTPROC, GetTextureImageEXT); OPENGL_PROC(PFNGLGETTEXTUREIMAGEPROC, GetTextureImage); OPENGL_PROC(PFNGLGETTEXTURESUBIMAGEPROC, GetTextureSubImage); diff --git a/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp b/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp index 79286bd69d..dfa18d37e6 100644 --- a/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp +++ b/rpcs3/Emu/RSX/GL/GLVertexProgram.cpp @@ -28,30 +28,31 @@ std::string GLVertexDecompilerThread::compareFunction(COMPARE f, const std::stri void GLVertexDecompilerThread::insertHeader(std::stringstream &OS) { - OS << "#version 430\n"; - OS << "layout(std140, binding = " << GL_VERTEX_PARAMS_BIND_SLOT << ") uniform VertexContextBuffer\n"; - OS << "{\n"; - OS << " mat4 scale_offset_mat;\n"; - OS << " ivec4 user_clip_enabled[2];\n"; - OS << " vec4 user_clip_factor[2];\n"; - OS << " uint transform_branch_bits;\n"; - OS << " float point_size;\n"; - OS << " float z_near;\n"; - OS << " float z_far;\n"; - OS << "};\n\n"; + OS << + "#version 430\n" + "layout(std140, binding = " << GL_VERTEX_PARAMS_BIND_SLOT << ") uniform VertexContextBuffer\n" + "{\n" + " mat4 scale_offset_mat;\n" + " ivec4 user_clip_enabled[2];\n" + " vec4 user_clip_factor[2];\n" + " uint transform_branch_bits;\n" + " float point_size;\n" + " float z_near;\n" + " float z_far;\n" + "};\n\n" - OS << "layout(std140, binding = " << GL_VERTEX_LAYOUT_BIND_SLOT << ") uniform VertexLayoutBuffer\n"; - OS << "{\n"; - OS << " uint vertex_base_index;\n"; - OS << " uint vertex_index_offset;\n"; - OS << " uvec4 input_attributes_blob[16 / 2];\n"; - OS << "};\n\n"; + "layout(std140, binding = " << GL_VERTEX_LAYOUT_BIND_SLOT << ") uniform VertexLayoutBuffer\n" + "{\n" + " uint vertex_base_index;\n" + " uint vertex_index_offset;\n" + " uvec4 input_attributes_blob[16 / 2];\n" + "};\n\n"; } void GLVertexDecompilerThread::insertInputs(std::stringstream& OS, const std::vector& /*inputs*/) { - OS << "layout(location=0) uniform usamplerBuffer persistent_input_stream;\n"; //Data stream with persistent vertex data (cacheable) - OS << "layout(location=1) uniform usamplerBuffer volatile_input_stream;\n"; //Data stream with per-draw data (registers and immediate draw data) + OS << "layout(location=0) uniform usamplerBuffer persistent_input_stream;\n"; // Data stream with persistent vertex data (cacheable) + OS << "layout(location=1) uniform usamplerBuffer volatile_input_stream;\n"; // Data stream with per-draw data (registers and immediate draw data) } void GLVertexDecompilerThread::insertConstants(std::stringstream& OS, const std::vector& constants) @@ -62,10 +63,29 @@ void GLVertexDecompilerThread::insertConstants(std::stringstream& OS, const std: { if (PI.name.starts_with("vc[")) { - OS << "layout(std140, binding = " << GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT << ") uniform VertexConstantsBuffer\n"; - OS << "{\n"; - OS << " vec4 " << PI.name << ";\n"; - OS << "};\n\n"; + if (!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS)) + { + OS << + "layout(std140, binding = " << GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT << ") uniform VertexConstantsBuffer\n" + "{\n" + " vec4 " << PI.name << ";\n" + "};\n\n"; + } + else + { + OS << + "layout(std430, binding = " << GL_INSTANCING_LUT_BIND_SLOT << ") readonly buffer InstancingIndirectionLUT\n" + "{\n" + " int constants_addressing_lookup[];\n" + "};\n\n" + + "layout(std430, binding = " << GL_INSTANCING_XFORM_CONSTANTS_SLOT << ") readonly buffer InstancingVertexConstantsBlock\n" + "{\n" + " vec4 instanced_constants_array[];\n" + "};\n\n" + + "#define CONSTANTS_ARRAY_LENGTH " << (properties.has_indexed_constants ? 468 : ::size32(m_constant_ids)) << "\n\n"; + } continue; } @@ -104,12 +124,12 @@ static const vertex_reg_info reg_table[] = { "gl_Position", false, "dst_reg0", "", false }, { "diff_color", true, "dst_reg1", "", false, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FRONTDIFFUSE | CELL_GCM_ATTRIB_OUTPUT_MASK_BACKDIFFUSE }, { "spec_color", true, "dst_reg2", "", false, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FRONTSPECULAR | CELL_GCM_ATTRIB_OUTPUT_MASK_BACKSPECULAR }, - //These are only present when back variants are specified, otherwise the default diff/spec color vars are for both front and back + // These are only present when back variants are specified, otherwise the default diff/spec color vars are for both front and back { "diff_color1", true, "dst_reg3", "", false, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FRONTDIFFUSE | CELL_GCM_ATTRIB_OUTPUT_MASK_BACKDIFFUSE }, { "spec_color1", true, "dst_reg4", "", false, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FRONTSPECULAR | CELL_GCM_ATTRIB_OUTPUT_MASK_BACKSPECULAR }, - //Fog output shares a data source register with clip planes 0-2 so only declare when specified + // Fog output shares a data source register with clip planes 0-2 so only declare when specified { "fog_c", true, "dst_reg5", ".xxxx", true, "", "", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_FOG }, - //Warning: Always define all 3 clip plane groups together to avoid flickering with openGL + // Warning: Always define all 3 clip plane groups together to avoid flickering with openGL { "gl_ClipDistance[0]", false, "dst_reg5", ".y * user_clip_factor[0].x", false, "user_clip_enabled[0].x > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC0 }, { "gl_ClipDistance[1]", false, "dst_reg5", ".z * user_clip_factor[0].y", false, "user_clip_enabled[0].y > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC1 }, { "gl_ClipDistance[2]", false, "dst_reg5", ".w * user_clip_factor[0].z", false, "user_clip_enabled[0].z > 0", "0.5", "", true, CELL_GCM_ATTRIB_OUTPUT_MASK_UC2 }, @@ -152,6 +172,7 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS) properties2.emulate_depth_clip_only = dev_caps.NV_depth_buffer_float_supported; properties2.low_precision_tests = dev_caps.vendor_NVIDIA; properties2.require_explicit_invariance = dev_caps.vendor_MESA || (dev_caps.vendor_NVIDIA && g_cfg.video.shader_precision != gpu_preset_level::low); + properties2.require_instanced_render = !!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS); insert_glsl_legacy_function(OS, properties2); glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4, dev_caps.vendor_INTEL == false); @@ -188,7 +209,7 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS) OS << "void vs_main()\n"; OS << "{\n"; - //Declare temporary registers, ignoring those mapped to outputs + // Declare temporary registers, ignoring those mapped to outputs for (const ParamType &PT : m_parr.params[PF_PARAM_NONE]) { for (const ParamItem &PI : PT.items) @@ -237,7 +258,7 @@ void GLVertexDecompilerThread::insertMainEnd(std::stringstream & OS) } else { - //Insert if-else condition + // Insert if-else condition OS << " " << i.name << " = " << condition << "? " << i.src_reg << i.src_reg_mask << ": " << i.default_val << ";\n"; } @@ -261,21 +282,21 @@ void GLVertexDecompilerThread::insertMainEnd(std::stringstream & OS) OS << " gl_Position = gl_Position * scale_offset_mat;\n"; OS << " gl_Position = apply_zclip_xform(gl_Position, z_near, z_far);\n"; - //Since our clip_space is symmetrical [-1, 1] we map it to linear space using the eqn: - //ln = (clip * 2) - 1 to fully utilize the 0-1 range of the depth buffer - //RSX matrices passed already map to the [0, 1] range but mapping to classic OGL requires that we undo this step - //This can be made unnecessary using the call glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE). - //However, ClipControl only made it to opengl core in ver 4.5 though, so this is a workaround. - - //NOTE: It is completely valid for games to use very large w values, causing the post-multiplied z to be in the hundreds - //It is therefore critical that this step is done post-transform and the result re-scaled by w - //SEE Naruto: UNS - - //NOTE: On GPUs, poor fp32 precision means dividing z by w, then multiplying by w again gives slightly incorrect results - //This equation is simplified algebraically to an addition and subtraction which gives more accurate results (Fixes flickering skybox in Dark Souls 2) - //OS << " float ndc_z = gl_Position.z / gl_Position.w;\n"; - //OS << " ndc_z = (ndc_z * 2.) - 1.;\n"; - //OS << " gl_Position.z = ndc_z * gl_Position.w;\n"; + // Since our clip_space is symmetrical [-1, 1] we map it to linear space using the eqn: + // ln = (clip * 2) - 1 to fully utilize the 0-1 range of the depth buffer + // RSX matrices passed already map to the [0, 1] range but mapping to classic OGL requires that we undo this step + // This can be made unnecessary using the call glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE). + // However, ClipControl only made it to opengl core in ver 4.5 though, so this is a workaround. + + // NOTE: It is completely valid for games to use very large w values, causing the post-multiplied z to be in the hundreds + // It is therefore critical that this step is done post-transform and the result re-scaled by w + // SEE Naruto: UNS + + // NOTE: On GPUs, poor fp32 precision means dividing z by w, then multiplying by w again gives slightly incorrect results + // This equation is simplified algebraically to an addition and subtraction which gives more accurate results (Fixes flickering skybox in Dark Souls 2) + // OS << " float ndc_z = gl_Position.z / gl_Position.w;\n"; + // OS << " ndc_z = (ndc_z * 2.) - 1.;\n"; + // OS << " gl_Position.z = ndc_z * gl_Position.w;\n"; OS << " gl_Position.z = (gl_Position.z + gl_Position.z) - gl_Position.w;\n"; OS << "}\n"; } diff --git a/rpcs3/Emu/RSX/GL/glutils/common.h b/rpcs3/Emu/RSX/GL/glutils/common.h index ba8aa78cd1..be6d340ad7 100644 --- a/rpcs3/Emu/RSX/GL/glutils/common.h +++ b/rpcs3/Emu/RSX/GL/glutils/common.h @@ -20,6 +20,8 @@ #define GL_RASTERIZER_STATE_BIND_SLOT UBO_SLOT(6) #define GL_INTERPRETER_VERTEX_BLOCK SSBO_SLOT(0) #define GL_INTERPRETER_FRAGMENT_BLOCK SSBO_SLOT(1) +#define GL_INSTANCING_LUT_BIND_SLOT SSBO_SLOT(2) +#define GL_INSTANCING_XFORM_CONSTANTS_SLOT SSBO_SLOT(3) #define GL_COMPUTE_BUFFER_SLOT(index) SSBO_SLOT(2 + index) #define GL_COMPUTE_IMAGE_SLOT(index) SSBO_SLOT(index)