gl: Add support for hardware instancing

This commit is contained in:
kd-11 2025-03-15 22:52:36 +03:00 committed by kd-11
parent 3d3fc2f3cd
commit 65c0d3d425
6 changed files with 133 additions and 52 deletions

View file

@ -599,7 +599,11 @@ void GLGSRender::emit_geometry(u32 sub_index)
if (!upload_info.index_info)
{
if (draw_call.is_single_draw())
if (draw_call.is_trivial_instanced_draw)
{
glDrawArraysInstanced(draw_mode, 0, upload_info.vertex_draw_count, draw_call.pass_count());
}
else if (draw_call.is_single_draw())
{
glDrawArrays(draw_mode, 0, upload_info.vertex_draw_count);
}
@ -667,7 +671,11 @@ void GLGSRender::emit_geometry(u32 sub_index)
m_index_ring_buffer->bind();
if (draw_call.is_single_draw())
if (draw_call.is_trivial_instanced_draw)
{
glDrawElementsInstanced(draw_mode, upload_info.vertex_draw_count, index_type, reinterpret_cast<GLvoid*>(u64{ index_offset }), draw_call.pass_count());
}
else if (draw_call.is_single_draw())
{
glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, reinterpret_cast<GLvoid*>(u64{index_offset}));
}
@ -781,13 +789,20 @@ void GLGSRender::end()
m_program->validate();
}
rsx::method_registers.current_draw_clause.begin();
auto& draw_call = REGS(m_ctx)->current_draw_clause;
draw_call.begin();
u32 subdraw = 0u;
do
{
emit_geometry(subdraw++);
if (draw_call.is_trivial_instanced_draw)
{
// We already completed. End the draw.
draw_call.end();
}
while (rsx::method_registers.current_draw_clause.next());
}
while (draw_call.next());
m_rtts.on_write(m_framebuffer_layout.color_write_enabled, m_framebuffer_layout.zeta_write_enabled);

View file

@ -296,6 +296,7 @@ void GLGSRender::on_init_thread()
m_fragment_instructions_buffer = std::make_unique<gl::legacy_ring_buffer>();
m_raster_env_ring_buffer = std::make_unique<gl::legacy_ring_buffer>();
m_scratch_ring_buffer = std::make_unique<gl::legacy_ring_buffer>();
m_instancing_ring_buffer = std::make_unique<gl::legacy_ring_buffer>();
}
else
{
@ -311,6 +312,7 @@ void GLGSRender::on_init_thread()
m_fragment_instructions_buffer = std::make_unique<gl::ring_buffer>();
m_raster_env_ring_buffer = std::make_unique<gl::ring_buffer>();
m_scratch_ring_buffer = std::make_unique<gl::ring_buffer>();
m_instancing_ring_buffer = std::make_unique<gl::ring_buffer>();
}
m_attrib_ring_buffer->create(gl::buffer::target::texture, 256 * 0x100000);
@ -323,6 +325,7 @@ void GLGSRender::on_init_thread()
m_vertex_layout_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_raster_env_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_scratch_ring_buffer->create(gl::buffer::target::uniform, 16 * 0x100000);
m_instancing_ring_buffer->create(gl::buffer::target::ssbo, 64 * 0x100000);
if (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only)
{
@ -547,6 +550,11 @@ void GLGSRender::on_exit()
m_scratch_ring_buffer->remove();
}
if (m_instancing_ring_buffer)
{
m_instancing_ring_buffer->remove();
}
m_null_textures.clear();
m_gl_texture_cache.destroy();
m_ui_renderer.destroy();
@ -866,7 +874,8 @@ void GLGSRender::load_program_env()
const bool update_fragment_env = m_graphics_state & rsx::pipeline_state::fragment_state_dirty;
const bool update_fragment_texture_env = m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty;
const bool update_instruction_buffers = !!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program);
const bool update_raster_env = rsx::method_registers.polygon_stipple_enabled() && (m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty);
const bool update_raster_env = REGS(m_ctx)->polygon_stipple_enabled() && (m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty);
const bool update_instancing_data = REGS(m_ctx)->current_draw_clause.is_trivial_instanced_draw;
if (manually_flush_ring_buffers)
{
@ -876,6 +885,7 @@ void GLGSRender::load_program_env()
if (update_fragment_constants) m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256));
if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192);
if (update_raster_env) m_raster_env_ring_buffer->reserve_storage_on_heap(128);
if (update_instancing_data) m_instancing_ring_buffer->reserve_storage_on_heap(8192 * REGS(m_ctx)->current_draw_clause.pass_count());
if (update_instruction_buffers)
{
@ -899,6 +909,33 @@ void GLGSRender::load_program_env()
m_vertex_env_buffer->bind_range(GL_VERTEX_PARAMS_BIND_SLOT, mapping.second, 144);
}
if (update_instancing_data)
{
// Combines transform load + instancing lookup table
const auto alignment = m_min_ssbo_alignment;
u32 indirection_table_offset = 0;
u32 constants_data_table_offset = 0;
rsx::io_buffer indirection_table_buf([&](usz size) -> std::pair<void*, usz>
{
const auto mapping = m_instancing_ring_buffer->alloc_from_heap(static_cast<u32>(size), alignment);
indirection_table_offset = mapping.second;
return mapping;
});
rsx::io_buffer constants_array_buf([&](usz size) -> std::pair<void*, usz>
{
const auto mapping = m_instancing_ring_buffer->alloc_from_heap(static_cast<u32>(size), alignment);
constants_data_table_offset = mapping.second;
return mapping;
});
m_draw_processor.fill_constants_instancing_buffer(indirection_table_buf, constants_array_buf, m_vertex_prog);
m_instancing_ring_buffer->bind_range(GL_INSTANCING_LUT_BIND_SLOT, indirection_table_offset, ::size32(indirection_table_buf));
m_instancing_ring_buffer->bind_range(GL_INSTANCING_XFORM_CONSTANTS_SLOT, constants_data_table_offset, ::size32(constants_array_buf));
}
if (update_transform_constants)
{
// Vertex constants
@ -1011,6 +1048,7 @@ void GLGSRender::load_program_env()
if (update_fragment_constants) m_fragment_constants_buffer->unmap();
if (update_transform_constants) m_transform_constants_buffer->unmap();
if (update_raster_env) m_raster_env_ring_buffer->unmap();
if (update_instancing_data) m_instancing_ring_buffer->unmap();
if (update_instruction_buffers)
{

View file

@ -105,6 +105,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
std::unique_ptr<gl::ring_buffer> m_vertex_instructions_buffer;
std::unique_ptr<gl::ring_buffer> m_fragment_instructions_buffer;
std::unique_ptr<gl::ring_buffer> m_raster_env_ring_buffer;
std::unique_ptr<gl::ring_buffer> m_instancing_ring_buffer;
// Identity buffer used to fix broken gl_VertexID on ATI stack
std::unique_ptr<gl::buffer> m_identity_index_buffer;
@ -117,6 +118,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
GLint m_min_texbuffer_alignment = 256;
GLint m_uniform_buffer_offset_align = 256;
GLint m_min_ssbo_alignment = 256;
GLint m_max_texbuffer_size = 65536;
bool manually_flush_ring_buffers = false;

View file

@ -183,6 +183,9 @@ OPENGL_PROC(PFNGLUNMAPNAMEDBUFFEREXTPROC, UnmapNamedBufferEXT);
OPENGL_PROC(PFNGLMULTIDRAWELEMENTSPROC, MultiDrawElements);
OPENGL_PROC(PFNGLMULTIDRAWARRAYSPROC, MultiDrawArrays);
OPENGL_PROC(PFNGLDRAWARRAYSINSTANCEDPROC, DrawArraysInstanced);
OPENGL_PROC(PFNGLDRAWELEMENTSINSTANCEDPROC, DrawElementsInstanced);
OPENGL_PROC(PFNGLGETTEXTUREIMAGEEXTPROC, GetTextureImageEXT);
OPENGL_PROC(PFNGLGETTEXTUREIMAGEPROC, GetTextureImage);
OPENGL_PROC(PFNGLGETTEXTURESUBIMAGEPROC, GetTextureSubImage);

View file

@ -28,24 +28,25 @@ std::string GLVertexDecompilerThread::compareFunction(COMPARE f, const std::stri
// Writes the shader header text into OS: the "#version 430" directive followed
// by two std140 uniform block declarations, VertexContextBuffer (bound at
// GL_VERTEX_PARAMS_BIND_SLOT) and VertexLayoutBuffer (bound at
// GL_VERTEX_LAYOUT_BIND_SLOT).
// NOTE(review): the body below appears to contain the same text twice, once as
// one `OS << ...;` statement per line and once as a single chained `OS <<`
// expression built from adjacent string literals. This looks like the old and
// new sides of a change shown together - confirm which form is current.
void GLVertexDecompilerThread::insertHeader(std::stringstream &OS)
{
OS << "#version 430\n";
OS << "layout(std140, binding = " << GL_VERTEX_PARAMS_BIND_SLOT << ") uniform VertexContextBuffer\n";
OS << "{\n";
OS << " mat4 scale_offset_mat;\n";
OS << " ivec4 user_clip_enabled[2];\n";
OS << " vec4 user_clip_factor[2];\n";
OS << " uint transform_branch_bits;\n";
OS << " float point_size;\n";
OS << " float z_near;\n";
OS << " float z_far;\n";
OS << "};\n\n";
OS <<
"#version 430\n"
"layout(std140, binding = " << GL_VERTEX_PARAMS_BIND_SLOT << ") uniform VertexContextBuffer\n"
"{\n"
" mat4 scale_offset_mat;\n"
" ivec4 user_clip_enabled[2];\n"
" vec4 user_clip_factor[2];\n"
" uint transform_branch_bits;\n"
" float point_size;\n"
" float z_near;\n"
" float z_far;\n"
"};\n\n"
OS << "layout(std140, binding = " << GL_VERTEX_LAYOUT_BIND_SLOT << ") uniform VertexLayoutBuffer\n";
OS << "{\n";
OS << " uint vertex_base_index;\n";
OS << " uint vertex_index_offset;\n";
OS << " uvec4 input_attributes_blob[16 / 2];\n";
OS << "};\n\n";
"layout(std140, binding = " << GL_VERTEX_LAYOUT_BIND_SLOT << ") uniform VertexLayoutBuffer\n"
"{\n"
" uint vertex_base_index;\n"
" uint vertex_index_offset;\n"
" uvec4 input_attributes_blob[16 / 2];\n"
"};\n\n";
}
void GLVertexDecompilerThread::insertInputs(std::stringstream& OS, const std::vector<ParamType>& /*inputs*/)
@ -62,10 +63,29 @@ void GLVertexDecompilerThread::insertConstants(std::stringstream& OS, const std:
{
if (PI.name.starts_with("vc["))
{
OS << "layout(std140, binding = " << GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT << ") uniform VertexConstantsBuffer\n";
OS << "{\n";
OS << " vec4 " << PI.name << ";\n";
OS << "};\n\n";
if (!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS))
{
OS <<
"layout(std140, binding = " << GL_VERTEX_CONSTANT_BUFFERS_BIND_SLOT << ") uniform VertexConstantsBuffer\n"
"{\n"
" vec4 " << PI.name << ";\n"
"};\n\n";
}
else
{
OS <<
"layout(std430, binding = " << GL_INSTANCING_LUT_BIND_SLOT << ") readonly buffer InstancingIndirectionLUT\n"
"{\n"
" int constants_addressing_lookup[];\n"
"};\n\n"
"layout(std430, binding = " << GL_INSTANCING_XFORM_CONSTANTS_SLOT << ") readonly buffer InstancingVertexConstantsBlock\n"
"{\n"
" vec4 instanced_constants_array[];\n"
"};\n\n"
"#define CONSTANTS_ARRAY_LENGTH " << (properties.has_indexed_constants ? 468 : ::size32(m_constant_ids)) << "\n\n";
}
continue;
}
@ -152,6 +172,7 @@ void GLVertexDecompilerThread::insertMainStart(std::stringstream & OS)
properties2.emulate_depth_clip_only = dev_caps.NV_depth_buffer_float_supported;
properties2.low_precision_tests = dev_caps.vendor_NVIDIA;
properties2.require_explicit_invariance = dev_caps.vendor_MESA || (dev_caps.vendor_NVIDIA && g_cfg.video.shader_precision != gpu_preset_level::low);
properties2.require_instanced_render = !!(m_prog.ctrl & RSX_SHADER_CONTROL_INSTANCED_CONSTANTS);
insert_glsl_legacy_function(OS, properties2);
glsl::insert_vertex_input_fetch(OS, glsl::glsl_rules_opengl4, dev_caps.vendor_INTEL == false);

View file

@ -20,6 +20,8 @@
#define GL_RASTERIZER_STATE_BIND_SLOT UBO_SLOT(6)
#define GL_INTERPRETER_VERTEX_BLOCK SSBO_SLOT(0)
#define GL_INTERPRETER_FRAGMENT_BLOCK SSBO_SLOT(1)
// Binding points used by the instanced vertex path: the indirection lookup
// table and the instanced vertex constants array, respectively.
#define GL_INSTANCING_LUT_BIND_SLOT SSBO_SLOT(2)
#define GL_INSTANCING_XFORM_CONSTANTS_SLOT SSBO_SLOT(3)
// NOTE(review): GL_COMPUTE_BUFFER_SLOT(0) and GL_COMPUTE_BUFFER_SLOT(1) pass the
// same arguments to SSBO_SLOT as the two instancing slots above. Presumably
// compute and draw bindings are never live at the same time - confirm.
#define GL_COMPUTE_BUFFER_SLOT(index) SSBO_SLOT(2 + index)
#define GL_COMPUTE_IMAGE_SLOT(index) SSBO_SLOT(index)