gl: Add support for hardware instancing

This commit is contained in:
kd-11 2025-03-15 22:52:36 +03:00 committed by kd-11
parent 3d3fc2f3cd
commit 65c0d3d425
6 changed files with 133 additions and 52 deletions

View file

@ -599,7 +599,11 @@ void GLGSRender::emit_geometry(u32 sub_index)
if (!upload_info.index_info) if (!upload_info.index_info)
{ {
if (draw_call.is_single_draw()) if (draw_call.is_trivial_instanced_draw)
{
glDrawArraysInstanced(draw_mode, 0, upload_info.vertex_draw_count, draw_call.pass_count());
}
else if (draw_call.is_single_draw())
{ {
glDrawArrays(draw_mode, 0, upload_info.vertex_draw_count); glDrawArrays(draw_mode, 0, upload_info.vertex_draw_count);
} }
@ -667,7 +671,11 @@ void GLGSRender::emit_geometry(u32 sub_index)
m_index_ring_buffer->bind(); m_index_ring_buffer->bind();
if (draw_call.is_single_draw()) if (draw_call.is_trivial_instanced_draw)
{
glDrawElementsInstanced(draw_mode, upload_info.vertex_draw_count, index_type, reinterpret_cast<GLvoid*>(u64{ index_offset }), draw_call.pass_count());
}
else if (draw_call.is_single_draw())
{ {
glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, reinterpret_cast<GLvoid*>(u64{index_offset})); glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, reinterpret_cast<GLvoid*>(u64{index_offset}));
} }
@ -781,13 +789,20 @@ void GLGSRender::end()
m_program->validate(); m_program->validate();
} }
rsx::method_registers.current_draw_clause.begin(); auto& draw_call = REGS(m_ctx)->current_draw_clause;
draw_call.begin();
u32 subdraw = 0u; u32 subdraw = 0u;
do do
{ {
emit_geometry(subdraw++); emit_geometry(subdraw++);
if (draw_call.is_trivial_instanced_draw)
{
// We already completed. End the draw.
draw_call.end();
} }
while (rsx::method_registers.current_draw_clause.next()); }
while (draw_call.next());
m_rtts.on_write(m_framebuffer_layout.color_write_enabled, m_framebuffer_layout.zeta_write_enabled); m_rtts.on_write(m_framebuffer_layout.color_write_enabled, m_framebuffer_layout.zeta_write_enabled);

View file

@ -296,6 +296,7 @@ void GLGSRender::on_init_thread()
m_fragment_instructions_buffer = std::make_unique<gl::legacy_ring_buffer>(); m_fragment_instructions_buffer = std::make_unique<gl::legacy_ring_buffer>();
m_raster_env_ring_buffer = std::make_unique<gl::legacy_ring_buffer>(); m_raster_env_ring_buffer = std::make_unique<gl::legacy_ring_buffer>();
m_scratch_ring_buffer = std::make_unique<gl::legacy_ring_buffer>(); m_scratch_ring_buffer = std::make_unique<gl::legacy_ring_buffer>();
m_instancing_ring_buffer = std::make_unique<gl::legacy_ring_buffer>();
} }
else else
{ {
@ -311,6 +312,7 @@ void GLGSRender::on_init_thread()
m_fragment_instructions_buffer = std::make_unique<gl::ring_buffer>(); m_fragment_instructions_buffer = std::make_unique<gl::ring_buffer>();
m_raster_env_ring_buffer = std::make_unique<gl::ring_buffer>(); m_raster_env_ring_buffer = std::make_unique<gl::ring_buffer>();
m_scratch_ring_buffer = std::make_unique<gl::ring_buffer>(); m_scratch_ring_buffer = std::make_unique<gl::ring_buffer>();
m_instancing_ring_buffer = std::make_unique<gl::ring_buffer>();
} }
m_attrib_ring_buffer->create(gl::buffer::target::texture, 256 * 0x100000); m_attrib_ring_buffer->create(gl::buffer::target::texture, 256 * 0x100000);
@ -323,6 +325,7 @@ void GLGSRender::on_init_thread()
m_vertex_layout_buffer->create(gl::buffer::target::uniform, 16 * 0x100000); m_vertex_layout_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_raster_env_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000); m_raster_env_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_scratch_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000); m_scratch_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_instancing_ring_buffer->create(gl::buffer::target::ssbo, 64 * 0x100000);
if (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only) if (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only)
{ {
@ -547,6 +550,11 @@ void GLGSRender::on_exit()
m_scratch_ring_buffer->remove(); m_scratch_ring_buffer->remove();
} }
if (m_instancing_ring_buffer)
{
m_instancing_ring_buffer->remove();
}
m_null_textures.clear(); m_null_textures.clear();
m_gl_texture_cache.destroy(); m_gl_texture_cache.destroy();
m_ui_renderer.destroy(); m_ui_renderer.destroy();
@ -866,7 +874,8 @@ void GLGSRender::load_program_env()
const bool update_fragment_env = m_graphics_state & rsx::pipeline_state::fragment_state_dirty; const bool update_fragment_env = m_graphics_state & rsx::pipeline_state::fragment_state_dirty;
const bool update_fragment_texture_env = m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty; const bool update_fragment_texture_env = m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty;
const bool update_instruction_buffers = !!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program); const bool update_instruction_buffers = !!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program);
const bool update_raster_env = rsx::method_registers.polygon_stipple_enabled() && (m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty); const bool update_raster_env = REGS(m_ctx)->polygon_stipple_enabled() && (m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty);
const bool update_instancing_data = REGS(m_ctx)->current_draw_clause.is_trivial_instanced_draw;
if (manually_flush_ring_buffers) if (manually_flush_ring_buffers)
{ {
@ -876,6 +885,7 @@ void GLGSRender::load_program_env()
if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256)); if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256));
if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192); if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192);
if (update_raster_env) m_raster_env_ring_buffer->reserve_storage_on_heap(128); if (update_raster_env) m_raster_env_ring_buffer->reserve_storage_on_heap(128);
if (update_instancing_data) m_instancing_ring_buffer->reserve_storage_on_heap(8192 * REGS(m_ctx)->current_draw_clause.pass_count());
if (update_instruction_buffers) if (update_instruction_buffers)
{ {
@ -899,6 +909,33 @@ void GLGSRender::load_program_env()
m_vertex_env_buffer->bind_range(GL_VERTEX_PARAMS_BIND_SLOT, mapping.second, 144); m_vertex_env_buffer->bind_range(GL_VERTEX_PARAMS_BIND_SLOT, mapping.second, 144);
} }
// Instanced path only: update_instancing_data mirrors current_draw_clause.is_trivial_instanced_draw.
// Uploads the two tables the instanced vertex shader reads and binds them to their SSBO slots.
if (update_instancing_data)
{
// Combines transform load + instancing lookup table
const auto alignment = m_min_ssbo_alignment;
// Offsets of the two allocations inside m_instancing_ring_buffer; filled in by the allocator lambdas below.
u32 indirection_table_offset = 0;
u32 constants_data_table_offset = 0;
// Allocator callback for the indirection (lookup) table: takes storage from the instancing ring buffer
// at SSBO alignment and remembers where it landed so it can be bound afterwards.
// NOTE(review): presumably rsx::io_buffer invokes this when the writer requests storage - confirm.
rsx::io_buffer indirection_table_buf([&](usz size) -> std::pair<void*, usz>
{
const auto mapping = m_instancing_ring_buffer->alloc_from_heap(static_cast<u32>(size), alignment);
indirection_table_offset = mapping.second;
return mapping;
});
// Same scheme for the per-instance constants array; a separate allocation with its own offset.
rsx::io_buffer constants_array_buf([&](usz size) -> std::pair<void*, usz>
{
const auto mapping = m_instancing_ring_buffer->alloc_from_heap(static_cast<u32>(size), alignment);
constants_data_table_offset = mapping.second;
return mapping;
});
// Writes both tables for the current vertex program (helper is defined outside this view).
m_draw_processor.fill_constants_instancing_buffer(indirection_table_buf, constants_array_buf, m_vertex_prog);
// Bind exactly the written ranges; the sizes are taken from the io_buffers after the fill.
m_instancing_ring_buffer->bind_range(GL_INSTANCING_LUT_BIND_SLOT, indirection_table_offset, ::size32(indirection_table_buf));
m_instancing_ring_buffer->bind_range(GL_INSTANCING_XFORM_CONSTANTS_SLOT, constants_data_table_offset, ::size32(constants_array_buf));
}
if (update_transform_constants) if (update_transform_constants)
{ {
// Vertex constants // Vertex constants
@ -1011,6 +1048,7 @@ void GLGSRender::load_program_env()
if (update_fragment_constants) m_fragment_constants_buffer->unmap(); if (update_fragment_constants) m_fragment_constants_buffer->unmap();
if (update_transform_constants) m_transform_constants_buffer->unmap(); if (update_transform_constants) m_transform_constants_buffer->unmap();
if (update_raster_env) m_raster_env_ring_buffer->unmap(); if (update_raster_env) m_raster_env_ring_buffer->unmap();
if (update_instancing_data) m_instancing_ring_buffer->unmap();
if (update_instruction_buffers) if (update_instruction_buffers)
{ {

View file

@ -105,6 +105,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
std::unique_ptr<gl::ring_buffer> m_vertex_instructions_buffer; std::unique_ptr<gl::ring_buffer> m_vertex_instructions_buffer;
std::unique_ptr<gl::ring_buffer> m_fragment_instructions_buffer; std::unique_ptr<gl::ring_buffer> m_fragment_instructions_buffer;
std::unique_ptr<gl::ring_buffer> m_raster_env_ring_buffer; std::unique_ptr<gl::ring_buffer> m_raster_env_ring_buffer;
std::unique_ptr<gl::ring_buffer> m_instancing_ring_buffer;
// Identity buffer used to fix broken gl_VertexID on ATI stack // Identity buffer used to fix broken gl_VertexID on ATI stack
std::unique_ptr<gl::buffer> m_identity_index_buffer; std::unique_ptr<gl::buffer> m_identity_index_buffer;
@ -117,6 +118,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
GLint m_min_texbuffer_alignment = 256; GLint m_min_texbuffer_alignment = 256;
GLint m_uniform_buffer_offset_align = 256; GLint m_uniform_buffer_offset_align = 256;
GLint m_min_ssbo_alignment = 256;
GLint m_max_texbuffer_size = 65536; GLint m_max_texbuffer_size = 65536;
bool manually_flush_ring_buffers = false; bool manually_flush_ring_buffers = false;

View file

@ -183,6 +183,9 @@ OPENGL_PROC(PFNGLUNMAPNAMEDBUFFEREXTPROC, UnmapNamedBufferEXT);
OPENGL_PROC(PFNGLMULTIDRAWELEMENTSPROC, MultiDrawElements); OPENGL_PROC(PFNGLMULTIDRAWELEMENTSPROC, MultiDrawElements);
OPENGL_PROC(PFNGLMULTIDRAWARRAYSPROC, MultiDrawArrays); OPENGL_PROC(PFNGLMULTIDRAWARRAYSPROC, MultiDrawArrays);
OPENGL_PROC(PFNGLDRAWARRAYSINSTANCEDPROC, DrawArraysInstanced);
OPENGL_PROC(PFNGLDRAWELEMENTSINSTANCEDPROC, DrawElementsInstanced);
OPENGL_PROC(PFNGLGETTEXTUREIMAGEEXTPROC, GetTextureImageEXT); OPENGL_PROC(PFNGLGETTEXTUREIMAGEEXTPROC, GetTextureImageEXT);
OPENGL_PROC(PFNGLGETTEXTUREIMAGEPROC, GetTextureImage); OPENGL_PROC(PFNGLGETTEXTUREIMAGEPROC, GetTextureImage);
OPENGL_PROC(PFNGLGETTEXTURESUBIMAGEPROC, GetTextureSubImage); OPENGL_PROC(PFNGLGETTEXTURESUBIMAGEPROC, GetTextureSubImage);

View file

@ -28,24 +28,25 @@ std::string GLVertexDecompilerThread::compareFunction(COMPARE f, const std::stri
void GLVertexDecompilerThread::insertHeader(std::stringstream &OS) void GLVertexDecompilerThread::insertHeader(std::stringstream &OS)
{ {
OS << "#version 430\n"; OS <<
OS << "layout(std140, binding = " << GL_VERTEX_PARAMS_BIND_SLOT << ") uniform VertexContextBuffer\n"; "#version 430\n"
OS << "{\n"; "layout(std140, binding = " << GL_VERTEX_PARAMS_BIND_SLOT << ") uniform VertexContextBuffer\n"
OS << " mat4 scale_offset_mat;\n"; "{\n"
OS << " ivec4 user_clip_enabled[2];\n"; " mat4 scale_offset_mat;\n"
OS << " vec4 user_clip_factor[2];\n"; " ivec4 user_clip_enabled[2];\n"
OS << " uint transform_branch_bits;\n"; " vec4 user_clip_factor[2];\n"
OS << " float point_size;\n"; " uint transform_branch_bits;\n"
OS << " float z_near;\n"; " float point_size;\n"
OS << " float z_far;\n"; " float z_near;\n"
OS << "};\n\n"; " float z_far;\n"
"};\n\n"
OS << "layout(std140, binding = " << GL_VERTEX_LAYOUT_BIND_SLOT << ") uniform VertexLayoutBuffer\n"; "layout(std140, binding = " << GL_VERTEX_LAYOUT_BIND_SLOT << ") uniform VertexLayoutBuffer\n"
OS << "{\n"; "{\n"
OS << " uint vertex_base_index;\n"; " uint vertex_base_index;\n"
OS << " uint vertex_index_offset;\n"; " uint vertex_index_offset;\n"
OS << " uvec4 input_attributes_blob[16 / 2];\n"; " uvec4 input_attributes_blob[16 / 2];\n"
OS << "};\n\n"; "};\n\n";
} }
void GLVertexDecompilerThread::insertInputs(std::stringstream& OS, const std::vector<ParamType>& /*inputs*/) void GLVertexDecompilerThread::insertInputs(std::stringstream& OS, const std::vector<ParamType>& /*inputs*/)
@ -62,10 +63,29 @@ void GLVertexDecompilerThread::insertConstants(std::stringstream& OS, const std:
{ {
if (PI.name.starts_with("vc[")) if (PI.name.starts_with("vc["))
{ {
OS << "layout(std140, binding = " << GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT << ") uniform VertexConstantsBuffer\n"; if (!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS))
OS << "{\n"; {
OS << " vec4 " << PI.name << ";\n"; OS <<
OS << "};\n\n"; "layout(std140, binding = " << GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT << ") uniform VertexConstantsBuffer\n"
"{\n"
" vec4 " << PI.name << ";\n"
"};\n\n";
}
else
{
OS <<
"layout(std430, binding = " << GL_INSTANCING_LUT_BIND_SLOT << ") readonly buffer InstancingIndirectionLUT\n"
"{\n"
" int constants_addressing_lookup[];\n"
"};\n\n"
"layout(std430, binding = " << GL_INSTANCING_XFORM_CONSTANTS_SLOT << ") readonly buffer InstancingVertexConstantsBlock\n"
"{\n"
" vec4 instanced_constants_array[];\n"
"};\n\n"
"#define CONSTANTS_ARRAY_LENGTH " << (properties.has_indexed_constants ? 468 : ::size32(m_constant_ids)) << "\n\n";
}
continue; continue;
} }
@ -152,6 +172,7 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
properties2.emulate_depth_clip_only = dev_caps.NV_depth_buffer_float_supported; properties2.emulate_depth_clip_only = dev_caps.NV_depth_buffer_float_supported;
properties2.low_precision_tests = dev_caps.vendor_NVIDIA; properties2.low_precision_tests = dev_caps.vendor_NVIDIA;
properties2.require_explicit_invariance = dev_caps.vendor_MESA || (dev_caps.vendor_NVIDIA && g_cfg.video.shader_precision != gpu_preset_level::low); properties2.require_explicit_invariance = dev_caps.vendor_MESA || (dev_caps.vendor_NVIDIA && g_cfg.video.shader_precision != gpu_preset_level::low);
properties2.require_instanced_render = !!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS);
insert_glsl_legacy_function(OS, properties2); insert_glsl_legacy_function(OS, properties2);
glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4, dev_caps.vendor_INTEL == false); glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4, dev_caps.vendor_INTEL == false);

View file

@ -20,6 +20,8 @@
#define GL_RASTERIZER_STATE_BIND_SLOT UBO_SLOT(6) #define GL_RASTERIZER_STATE_BIND_SLOT UBO_SLOT(6)
#define GL_INTERPRETER_VERTEX_BLOCK SSBO_SLOT(0) #define GL_INTERPRETER_VERTEX_BLOCK SSBO_SLOT(0)
#define GL_INTERPRETER_FRAGMENT_BLOCK SSBO_SLOT(1) #define GL_INTERPRETER_FRAGMENT_BLOCK SSBO_SLOT(1)
// SSBO bindings read by the instanced vertex shader: the constants indirection table and the constants array.
// NOTE(review): SSBO_SLOT(2) and SSBO_SLOT(3) are the same indices as GL_COMPUTE_BUFFER_SLOT(0) and
// GL_COMPUTE_BUFFER_SLOT(1) below; presumably safe because compute jobs rebind their own buffers - confirm.
#define GL_INSTANCING_LUT_BIND_SLOT SSBO_SLOT(2)
#define GL_INSTANCING_XFORM_CONSTANTS_SLOT SSBO_SLOT(3)
#define GL_COMPUTE_BUFFER_SLOT(index) SSBO_SLOT(2 + index) #define GL_COMPUTE_BUFFER_SLOT(index) SSBO_SLOT(2 + index)
#define GL_COMPUTE_IMAGE_SLOT(index) SSBO_SLOT(index) #define GL_COMPUTE_IMAGE_SLOT(index) SSBO_SLOT(index)