From 677b16f5c68fbb2332bfa9f0ec2e746ebf49022e Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 1 Oct 2018 23:05:51 +0300 Subject: [PATCH] rsx: Fixups - Also fix visual corruption when using disjoint indexed draws - Refactor draw call emit again (vk) - Improve execution barrier resolve - Allow vertex/index rebase inside begin/end pair - Add ALPHA_TEST to list of excluded methods [TODO: defer raster state] - gl bringup - Simplify - using the simple_array gets back a few more fps :) --- rpcs3/Emu/RSX/Capture/rsx_capture.cpp | 14 +- rpcs3/Emu/RSX/Common/BufferUtils.cpp | 46 +- rpcs3/Emu/RSX/Common/BufferUtils.h | 4 +- rpcs3/Emu/RSX/D3D12/D3D12Buffer.cpp | 44 +- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 248 ++++--- rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp | 34 +- rpcs3/Emu/RSX/RSXFIFO.cpp | 967 ++++++++++++++++++++++---- rpcs3/Emu/RSX/RSXFIFO.h | 191 ++++- rpcs3/Emu/RSX/RSXThread.cpp | 227 ++++-- rpcs3/Emu/RSX/RSXThread.h | 8 +- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 316 ++++++--- rpcs3/Emu/RSX/VK/VKGSRender.h | 9 +- rpcs3/Emu/RSX/VK/VKHelpers.h | 36 +- rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp | 15 +- rpcs3/Emu/RSX/rsx_cache.h | 32 +- rpcs3/Emu/RSX/rsx_methods.cpp | 72 +- rpcs3/Emu/RSX/rsx_methods.h | 298 +++++++- rpcs3/Emu/RSX/rsx_utils.h | 233 +++++++ rpcs3/Emu/RSX/rsx_vertex_data.h | 13 +- 19 files changed, 2242 insertions(+), 565 deletions(-) diff --git a/rpcs3/Emu/RSX/Capture/rsx_capture.cpp b/rpcs3/Emu/RSX/Capture/rsx_capture.cpp index 6a0df50c34..2f4a53a0e9 100644 --- a/rpcs3/Emu/RSX/Capture/rsx_capture.cpp +++ b/rpcs3/Emu/RSX/Capture/rsx_capture.cpp @@ -175,20 +175,23 @@ namespace rsx const u32 vertSize = get_vertex_type_size_on_host(info.type(), info.size()); const u32 vertStride = info.stride(); - for (const auto& range : method_registers.current_draw_clause.draw_command_ranges) + method_registers.current_draw_clause.begin(); + do { - const u32 vertCount = range.count; + const auto& range = method_registers.current_draw_clause.get_range(); + const u32 vertCount = range.count; const 
size_t bufferSize = vertCount * vertStride + vertSize; frame_capture_data::memory_block block; block.ioOffset = base_address; block.location = memory_location; - block.offset = (range.first * vertStride); + block.offset = (range.first * vertStride); frame_capture_data::memory_block_data block_data; block_data.data.resize(bufferSize); std::memcpy(block_data.data.data(), vm::base(addr + block.offset), bufferSize); insert_mem_block_in_map(mem_changes, std::move(block), std::move(block_data)); } + while (method_registers.current_draw_clause.next()); } } // save index buffer if used @@ -211,8 +214,10 @@ namespace rsx const bool is_primitive_restart_enabled = method_registers.restart_index_enabled(); const u32 primitive_restart_index = method_registers.restart_index(); - for (const auto& range : method_registers.current_draw_clause.draw_command_ranges) + method_registers.current_draw_clause.begin(); + do { + const auto& range = method_registers.current_draw_clause.get_range(); const u32 idxFirst = range.first; const u32 idxCount = range.count; const u32 idxAddr = base_addr + (idxFirst * type_size); @@ -261,6 +266,7 @@ namespace rsx } } } + while (method_registers.current_draw_clause.next()); if (min_index > max_index) { diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index fdd28d9d95..2df1126f85 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -435,14 +435,11 @@ namespace } } -void write_vertex_array_data_to_buffer(gsl::span raw_dst_span, gsl::span src_ptr, const std::vector& first_count_commands, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness) +void write_vertex_array_data_to_buffer(gsl::span raw_dst_span, gsl::span src_ptr, u32 count, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness) { verify(HERE), (vector_element_count > 0); const u32 
src_read_stride = rsx::get_vertex_type_size_on_host(type, vector_element_count); - // HACK! This is a legacy routine only used by D3D12 - const u32 count = first_count_commands.front().count; - bool use_stream_no_stride = false; bool use_stream_with_stride = false; @@ -799,7 +796,7 @@ namespace template std::tuple write_index_array_data_to_buffer_impl(gsl::span dst, gsl::span> src, - rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index, const rsx::draw_range_t &range, + rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index, u32 base_index, std::function expands) { if (!expands(draw_mode)) return upload_untouched(src, dst, restart_index_enabled, restart_index, base_index); @@ -809,7 +806,8 @@ namespace case rsx::primitive_type::line_loop: { const auto &returnvalue = upload_untouched(src, dst, restart_index_enabled, restart_index, base_index); - dst[range.count] = src[0]; + const auto index_count = dst.size_bytes() / sizeof(T); + dst[index_count] = src[0]; return returnvalue; } case rsx::primitive_type::polygon: @@ -826,51 +824,23 @@ namespace std::tuple write_index_array_data_to_buffer(gsl::span dst_ptr, gsl::span src_ptr, rsx::index_array_type type, rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index, - const std::vector &first_count_arguments, u32 base_index, std::function expands) { - u32 read = 0; - u32 written = 0; - u32 min_index = -1u; - u32 max_index = 0; - - const u32 type_size = get_index_type_size(type); - - for (const auto &range : first_count_arguments) - { - auto src = src_ptr.subspan(range.command_data_offset, range.count * type_size); - auto dst = dst_ptr.subspan(written * type_size); - switch (type) { case rsx::index_array_type::u16: { - auto ret = write_index_array_data_to_buffer_impl(as_span_workaround(dst), - as_const_span>(src), draw_mode, restart_index_enabled, restart_index, range, base_index, expands); - - min_index = std::min(std::get<0>(ret), min_index); - max_index = 
std::min(std::get<1>(ret), max_index); - written += std::get<2>(ret); - break; + return write_index_array_data_to_buffer_impl(as_span_workaround(dst_ptr), + as_const_span>(src_ptr), draw_mode, restart_index_enabled, restart_index, base_index, expands); } case rsx::index_array_type::u32: { - auto ret = write_index_array_data_to_buffer_impl(as_span_workaround(dst), - as_const_span>(src), draw_mode, restart_index_enabled, restart_index, range, base_index, expands); - - min_index = std::min(std::get<0>(ret), min_index); - max_index = std::min(std::get<1>(ret), max_index); - written += std::get<2>(ret); - break; + return write_index_array_data_to_buffer_impl(as_span_workaround(dst_ptr), + as_const_span>(src_ptr), draw_mode, restart_index_enabled, restart_index, base_index, expands); } default: fmt::throw_exception("Unreachable" HERE); } - - read += range.count; - } - - return std::make_tuple(min_index, max_index, written); } void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w) diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.h b/rpcs3/Emu/RSX/Common/BufferUtils.h index b34be9bce6..5eed4f4714 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.h +++ b/rpcs3/Emu/RSX/Common/BufferUtils.h @@ -10,7 +10,7 @@ * Write count vertex attributes from src_ptr. * src_ptr array layout is deduced from the type, vector element count and src_stride arguments. */ -void write_vertex_array_data_to_buffer(gsl::span raw_dst_span, gsl::span src_ptr, const std::vector& first_count_commands, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness); +void write_vertex_array_data_to_buffer(gsl::span raw_dst_span, gsl::span src_ptr, u32 count, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness); /* * If primitive mode is not supported and need to be emulated (using an index buffer) returns false. 
@@ -33,7 +33,7 @@ u32 get_index_type_size(rsx::index_array_type type); * The function expands index buffer for non native primitive type if expands(draw_mode) return true. */ std::tuple write_index_array_data_to_buffer(gsl::span dst, gsl::span src, - rsx::index_array_type, rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index, const std::vector &first_count_arguments, + rsx::index_array_type, rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index, u32 base_index, std::function expands); /** diff --git a/rpcs3/Emu/RSX/D3D12/D3D12Buffer.cpp b/rpcs3/Emu/RSX/D3D12/D3D12Buffer.cpp index 8585b81214..c2b9d02393 100644 --- a/rpcs3/Emu/RSX/D3D12/D3D12Buffer.cpp +++ b/rpcs3/Emu/RSX/D3D12/D3D12Buffer.cpp @@ -158,7 +158,7 @@ namespace m_buffer_data.map(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size)); gsl::span mapped_buffer_span = { (gsl::byte*)mapped_buffer, gsl::narrow_cast(buffer_size)}; - write_vertex_array_data_to_buffer(mapped_buffer_span, vertex_array.data, rsx::method_registers.current_draw_clause.draw_command_ranges, + write_vertex_array_data_to_buffer(mapped_buffer_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, element_size, vertex_array.is_be); m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size)); @@ -211,12 +211,9 @@ namespace }; std::tuple generate_index_buffer_for_emulated_primitives_array( - const std::vector & vertex_ranges, d3d12_data_heap& m_buffer_data) + u32 vertex_count, d3d12_data_heap& m_buffer_data) { - size_t index_count = std::accumulate( - vertex_ranges.begin(), vertex_ranges.end(), 0ll, [](size_t acc, const auto& pair) { - return acc + get_index_count(rsx::method_registers.current_draw_clause.primitive, pair.count); - }); + size_t index_count = get_index_count(rsx::method_registers.current_draw_clause.primitive, vertex_count); // Alloc size_t buffer_size = align(index_count * sizeof(u16), 64); @@ -226,10 +223,6 @@ 
namespace void* mapped_buffer = m_buffer_data.map(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size)); - u32 vertex_count = 0; - for (const auto& pair : vertex_ranges) - vertex_count += pair.count; - write_index_array_for_non_indexed_non_native_primitive_to_buffer((char *)mapped_buffer, rsx::method_registers.current_draw_clause.primitive, vertex_count); m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size)); @@ -249,9 +242,8 @@ namespace * range, and whose second element is the number of vertex in this range. */ std::vector upload_vertex_attributes( - std::vector vertex_ranges, - std::function)> - get_vertex_buffers, + u32 vertex_count, + std::function get_vertex_buffers, ID3D12Resource* m_vertex_buffer_data, d3d12_data_heap& m_buffer_data, ID3D12GraphicsCommandList* command_list) { @@ -259,13 +251,9 @@ namespace &CD3DX12_RESOURCE_BARRIER::Transition(m_vertex_buffer_data, D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER, D3D12_RESOURCE_STATE_COPY_DEST)); - u32 vertex_count = 0; - for (const auto &range : vertex_ranges) - vertex_count += range.count; - vertex_buffer_visitor visitor( vertex_count, command_list, m_vertex_buffer_data, m_buffer_data); - const auto& vertex_buffers = get_vertex_buffers(vertex_ranges); + const auto& vertex_buffers = get_vertex_buffers(); for (const auto& vbo : vertex_buffers) std::visit(visitor, vbo); @@ -348,7 +336,7 @@ namespace { draw_command_visitor(ID3D12GraphicsCommandList* cmd_list, d3d12_data_heap& buffer_data, ID3D12Resource* vertex_buffer_data, - std::function&)> get_vertex_info_lambda) + std::function get_vertex_info_lambda) : command_list(cmd_list), m_buffer_data(buffer_data), m_vertex_buffer_data(vertex_buffer_data), get_vertex_buffers(get_vertex_info_lambda) { @@ -357,10 +345,10 @@ namespace std::tuple> operator()( const rsx::draw_array_command& command) { + const auto vertex_count = rsx::method_registers.current_draw_clause.get_elements_count(); if 
(is_primitive_native(rsx::method_registers.current_draw_clause.primitive)) { - size_t vertex_count = rsx::method_registers.current_draw_clause.get_elements_count(); return std::make_tuple(false, vertex_count, - upload_vertex_attributes(rsx::method_registers.current_draw_clause.draw_command_ranges, + upload_vertex_attributes(vertex_count, get_vertex_buffers, m_vertex_buffer_data, m_buffer_data, command_list)); } @@ -369,10 +357,10 @@ namespace size_t index_count; std::tie(index_buffer_view, index_count) = generate_index_buffer_for_emulated_primitives_array( - rsx::method_registers.current_draw_clause.draw_command_ranges, m_buffer_data); + vertex_count, m_buffer_data); command_list->IASetIndexBuffer(&index_buffer_view); return std::make_tuple(true, index_count, - upload_vertex_attributes(rsx::method_registers.current_draw_clause.draw_command_ranges, + upload_vertex_attributes(vertex_count, get_vertex_buffers, m_vertex_buffer_data, m_buffer_data, command_list)); } @@ -406,7 +394,7 @@ namespace write_index_array_data_to_buffer(dst, command.raw_index_buffer, indexed_type, rsx::method_registers.current_draw_clause.primitive, rsx::method_registers.restart_index_enabled(), - rsx::method_registers.restart_index(), rsx::method_registers.current_draw_clause.draw_command_ranges, + rsx::method_registers.restart_index(), rsx::method_registers.vertex_data_base_index(), [](auto prim) { return !is_primitive_native(prim); }); m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size)); @@ -417,7 +405,7 @@ namespace command_list->IASetIndexBuffer(&index_buffer_view); return std::make_tuple(true, index_count, - upload_vertex_attributes({ {0, max_index + 1} }, get_vertex_buffers, + upload_vertex_attributes(max_index + 1, get_vertex_buffers, m_vertex_buffer_data, m_buffer_data, command_list)); } @@ -439,7 +427,7 @@ namespace size_t index_count; std::tie(index_buffer_view, index_count) = generate_index_buffer_for_emulated_primitives_array( - {{0, (u32)vertex_count}}, 
m_buffer_data); + vertex_count, m_buffer_data); command_list->IASetIndexBuffer(&index_buffer_view); return std::make_tuple(true, index_count, vertex_buffer_view); } @@ -447,7 +435,7 @@ namespace private: ID3D12GraphicsCommandList* command_list; d3d12_data_heap& m_buffer_data; - std::function&)> get_vertex_buffers; + std::function get_vertex_buffers; ID3D12Resource* m_vertex_buffer_data; }; } // End anonymous namespace @@ -457,7 +445,7 @@ D3D12GSRender::upload_and_set_vertex_index_data(ID3D12GraphicsCommandList* comma { return std::visit( draw_command_visitor(command_list, m_buffer_data, m_vertex_buffer_data.Get(), - [this](const auto& list) { return get_vertex_buffers(rsx::method_registers, list, 0); }), + [this]() { return get_vertex_buffers(rsx::method_registers, 0); }), get_draw_command(rsx::method_registers)); } diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 4c3bfa41df..9855dea7f8 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -195,17 +195,6 @@ void GLGSRender::end() std::chrono::time_point state_check_end = steady_clock::now(); m_begin_time += (u32)std::chrono::duration_cast(state_check_end - state_check_start).count(); - if (manually_flush_ring_buffers) - { - //Use approximations to reserve space. 
This path is mostly for debug purposes anyway - u32 approx_vertex_count = rsx::method_registers.current_draw_clause.get_elements_count(); - u32 approx_working_buffer_size = approx_vertex_count * 256; - - //Allocate 256K heap if we have no approximation at this time (inlined array) - m_attrib_ring_buffer->reserve_storage_on_heap(std::max(approx_working_buffer_size, 256 * 1024U)); - m_index_ring_buffer->reserve_storage_on_heap(16 * 1024); - } - const auto do_heap_cleanup = [this]() { if (manually_flush_ring_buffers) @@ -220,17 +209,6 @@ void GLGSRender::end() } }; - //Do vertex upload before RTT prep / texture lookups to give the driver time to push data - auto upload_info = set_vertex_buffer(); - - if (upload_info.vertex_draw_count == 0) - { - // Malformed vertex setup; abort - do_heap_cleanup(); - rsx::thread::end(); - return; - } - //Check if depth buffer is bound and valid //If ds is not initialized clear it; it seems new depth textures should have depth cleared auto copy_rtt_contents = [this](gl::render_target *surface, bool is_depth) @@ -407,15 +385,11 @@ void GLGSRender::end() if (!load_program()) { // Program is not ready, skip drawing this - do_heap_cleanup(); std::this_thread::yield(); rsx::thread::end(); return; } - // Load program here since it is dependent on vertex state - load_program_env(upload_info); - std::chrono::time_point program_stop = steady_clock::now(); m_begin_time += (u32)std::chrono::duration_cast(program_stop - program_start).count(); @@ -490,117 +464,161 @@ void GLGSRender::end() std::chrono::time_point draw_start = steady_clock::now(); - do_heap_cleanup(); - if (g_cfg.video.debug_output) { m_program->validate(); } const GLenum draw_mode = gl::draw_mode(rsx::method_registers.current_draw_clause.primitive); - const bool allow_multidraw = supports_multidraw && !g_cfg.video.disable_FIFO_reordering; - const bool single_draw = (!allow_multidraw || - rsx::method_registers.current_draw_clause.draw_command_ranges.size() <= 1 || - 
rsx::method_registers.current_draw_clause.is_disjoint_primitive); - - if (upload_info.index_info) + rsx::method_registers.current_draw_clause.begin(); + int subdraw = 0; + do { - const GLenum index_type = std::get<0>(*upload_info.index_info); - const u32 index_offset = std::get<1>(*upload_info.index_info); - const bool restarts_valid = gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive) && !rsx::method_registers.current_draw_clause.is_disjoint_primitive; - - if (gl_state.enable(restarts_valid && rsx::method_registers.restart_index_enabled(), GL_PRIMITIVE_RESTART)) + if (!subdraw) { - glPrimitiveRestartIndex((index_type == GL_UNSIGNED_SHORT)? 0xffff: 0xffffffff); - } - - m_index_ring_buffer->bind(); - - if (single_draw) - { - glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, (GLvoid *)(uintptr_t)index_offset); + m_vertex_layout = analyse_inputs_interleaved(); + if (!m_vertex_layout.validate()) + { + break; + } } else { - const auto draw_count = rsx::method_registers.current_draw_clause.draw_command_ranges.size(); - const u32 type_scale = (index_type == GL_UNSIGNED_SHORT) ? 
1 : 2; - uintptr_t index_ptr = index_offset; - m_scratch_buffer.resize(draw_count * 16); - - GLsizei *counts = (GLsizei*)m_scratch_buffer.data(); - const GLvoid** offsets = (const GLvoid**)(counts + draw_count); - int dst_index = 0; - - for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges) + if (rsx::method_registers.current_draw_clause.execute_pipeline_dependencies() & rsx::vertex_base_changed) { - const auto index_size = get_index_count(rsx::method_registers.current_draw_clause.primitive, range.count); - counts[dst_index] = index_size; - offsets[dst_index++] = (const GLvoid*)index_ptr; - - index_ptr += (index_size << type_scale); - } - - glMultiDrawElements(draw_mode, counts, index_type, offsets, (GLsizei)draw_count); - } - } - else - { - if (single_draw) - { - glDrawArrays(draw_mode, 0, upload_info.vertex_draw_count); - } - else - { - const u32 base_index = rsx::method_registers.current_draw_clause.draw_command_ranges.front().first; - bool use_draw_arrays_fallback = false; - - const auto draw_count = rsx::method_registers.current_draw_clause.draw_command_ranges.size(); - const auto driver_caps = gl::get_driver_caps(); - - m_scratch_buffer.resize(draw_count * 24); - GLint* firsts = (GLint*)m_scratch_buffer.data(); - GLsizei* counts = (GLsizei*)(firsts + draw_count); - const GLvoid** offsets = (const GLvoid**)(counts + draw_count); - int dst_index = 0; - - for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges) - { - const GLint first = range.first - base_index; - const GLsizei count = range.count; - - firsts[dst_index] = first; - counts[dst_index] = count; - offsets[dst_index++] = (const GLvoid*)(first << 2); - - if (driver_caps.vendor_AMD && (first + count) > (0x100000 >> 2)) + // Rebase vertex bases instead of + for (auto &info : m_vertex_layout.interleaved_blocks) { - //Unlikely, but added here in case the identity buffer is not large enough somehow - use_draw_arrays_fallback = true; - break; + 
const auto vertex_base_offset = rsx::method_registers.vertex_data_base_offset(); + info.real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(vertex_base_offset, info.base_offset), info.memory_location); } } + } - if (use_draw_arrays_fallback) + ++subdraw; + + if (manually_flush_ring_buffers) + { + //Use approximations to reserve space. This path is mostly for debug purposes anyway + u32 approx_vertex_count = rsx::method_registers.current_draw_clause.get_elements_count(); + u32 approx_working_buffer_size = approx_vertex_count * 256; + + //Allocate 256K heap if we have no approximation at this time (inlined array) + m_attrib_ring_buffer->reserve_storage_on_heap(std::max(approx_working_buffer_size, 256 * 1024U)); + m_index_ring_buffer->reserve_storage_on_heap(16 * 1024); + } + + //Do vertex upload before RTT prep / texture lookups to give the driver time to push data + auto upload_info = set_vertex_buffer(); + do_heap_cleanup(); + + if (upload_info.vertex_draw_count == 0) + { + // Malformed vertex setup; abort + continue; + } + + load_program_env(upload_info); + + if (!upload_info.index_info) + { + if (rsx::method_registers.current_draw_clause.is_single_draw()) { - //MultiDrawArrays is broken on some primitive types using AMD. 
One known type is GL_TRIANGLE_STRIP but there could be more - for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges) - { - glDrawArrays(draw_mode, range.first - base_index, range.count); - } - } - else if (driver_caps.vendor_AMD) - { - //Use identity index buffer to fix broken vertexID on AMD - m_identity_index_buffer->bind(); - glMultiDrawElements(draw_mode, counts, GL_UNSIGNED_INT, offsets, (GLsizei)draw_count); + glDrawArrays(draw_mode, 0, upload_info.vertex_draw_count); } else { - //Normal render - glMultiDrawArrays(draw_mode, firsts, counts, (GLsizei)draw_count); + const auto subranges = rsx::method_registers.current_draw_clause.get_subranges(); + const auto draw_count = subranges.size(); + const auto driver_caps = gl::get_driver_caps(); + bool use_draw_arrays_fallback = false; + + m_scratch_buffer.resize(draw_count * 24); + GLint* firsts = (GLint*)m_scratch_buffer.data(); + GLsizei* counts = (GLsizei*)(firsts + draw_count); + const GLvoid** offsets = (const GLvoid**)(counts + draw_count); + + u32 first = 0; + u32 dst_index = 0; + for (const auto &range : subranges) + { + firsts[dst_index] = first; + counts[dst_index] = range.count; + offsets[dst_index++] = (const GLvoid*)(first << 2); + + if (driver_caps.vendor_AMD && (first + range.count) > (0x100000 >> 2)) + { + //Unlikely, but added here in case the identity buffer is not large enough somehow + use_draw_arrays_fallback = true; + break; + } + + first += range.count; + } + + if (use_draw_arrays_fallback) + { + //MultiDrawArrays is broken on some primitive types using AMD. 
One known type is GL_TRIANGLE_STRIP but there could be more + for (int n = 0; n < draw_count; ++n) + { + glDrawArrays(draw_mode, firsts[n], counts[n]); + } + } + else if (driver_caps.vendor_AMD) + { + //Use identity index buffer to fix broken vertexID on AMD + m_identity_index_buffer->bind(); + glMultiDrawElements(draw_mode, counts, GL_UNSIGNED_INT, offsets, (GLsizei)draw_count); + } + else + { + //Normal render + glMultiDrawArrays(draw_mode, firsts, counts, (GLsizei)draw_count); + } } } - } + else + { + const GLenum index_type = std::get<0>(*upload_info.index_info); + const u32 index_offset = std::get<1>(*upload_info.index_info); + const bool restarts_valid = gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive) && !rsx::method_registers.current_draw_clause.is_disjoint_primitive; + + if (gl_state.enable(restarts_valid && rsx::method_registers.restart_index_enabled(), GL_PRIMITIVE_RESTART)) + { + glPrimitiveRestartIndex((index_type == GL_UNSIGNED_SHORT) ? 0xffff : 0xffffffff); + } + + m_index_ring_buffer->bind(); + + if (rsx::method_registers.current_draw_clause.is_single_draw()) + { + glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, (GLvoid *)(uintptr_t)index_offset); + } + else + { + const auto subranges = rsx::method_registers.current_draw_clause.get_subranges(); + const auto draw_count = subranges.size(); + const u32 type_scale = (index_type == GL_UNSIGNED_SHORT) ? 
1 : 2; + uintptr_t index_ptr = index_offset; + m_scratch_buffer.resize(draw_count * 16); + + GLsizei *counts = (GLsizei*)m_scratch_buffer.data(); + const GLvoid** offsets = (const GLvoid**)(counts + draw_count); + int dst_index = 0; + + for (const auto &range : subranges) + { + const auto index_size = get_index_count(rsx::method_registers.current_draw_clause.primitive, range.count); + counts[dst_index] = index_size; + offsets[dst_index++] = (const GLvoid*)index_ptr; + + index_ptr += (index_size << type_scale); + } + + glMultiDrawElements(draw_mode, counts, index_type, offsets, (GLsizei)draw_count); + } + } + } while (rsx::method_registers.current_draw_clause.next()); m_rtts.on_write(); diff --git a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp index 6e6e31d77f..e27da61696 100644 --- a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp +++ b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp @@ -20,19 +20,12 @@ namespace namespace { // return vertex count if primitive type is not native (empty array otherwise) - std::tuple get_index_array_for_emulated_non_indexed_draw(const std::vector &first_count_commands, rsx::primitive_type primitive_mode, gl::ring_buffer &dst) + std::tuple get_index_array_for_emulated_non_indexed_draw(rsx::primitive_type primitive_mode, gl::ring_buffer &dst, u32 vertex_count) { - //This is an emulated buffer, so our indices only range from 0->original_vertex_array_length - u32 vertex_count = 0; - u32 element_count = 0; + // This is an emulated buffer, so our indices only range from 0->original_vertex_array_length + const auto element_count = get_index_count(primitive_mode, vertex_count); verify(HERE), !gl::is_primitive_native(primitive_mode); - for (const auto &range : first_count_commands) - { - element_count += (u32)get_index_count(primitive_mode, range.count); - vertex_count += range.count; - } - auto mapping = dst.alloc_from_heap(element_count * sizeof(u16), 256); char *mapped_buffer = (char *)mapping.first; @@ -40,7 +33,7 @@ namespace 
return std::make_tuple(element_count, mapping.second); } - std::tuple upload_index_buffer(gsl::span raw_index_buffer, void *ptr, rsx::index_array_type type, rsx::primitive_type draw_mode, const std::vector& first_count_commands, u32 initial_vertex_count) + std::tuple upload_index_buffer(gsl::span raw_index_buffer, void *ptr, rsx::index_array_type type, rsx::primitive_type draw_mode, u32 initial_vertex_count) { u32 min_index, max_index, vertex_draw_count = initial_vertex_count; @@ -51,7 +44,7 @@ namespace gsl::span dst{ reinterpret_cast(ptr), ::narrow(block_sz) }; std::tie(min_index, max_index, vertex_draw_count) = write_index_array_data_to_buffer(dst, raw_index_buffer, - type, draw_mode, rsx::method_registers.restart_index_enabled(), rsx::method_registers.restart_index(), first_count_commands, + type, draw_mode, rsx::method_registers.restart_index_enabled(), rsx::method_registers.restart_index(), rsx::method_registers.vertex_data_base_index(), [](auto prim) { return !gl::is_primitive_native(prim); }); return std::make_tuple(min_index, max_index, vertex_draw_count); @@ -99,8 +92,8 @@ namespace u32 index_count; u32 offset_in_index_buffer; std::tie(index_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw( - rsx::method_registers.current_draw_clause.draw_command_ranges, - rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer); + rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer, + rsx::method_registers.current_draw_clause.get_elements_count()); return{ index_count, vertex_count, min_index, 0, std::make_tuple(GL_UNSIGNED_SHORT, offset_in_index_buffer) }; } @@ -128,8 +121,7 @@ namespace u32 offset_in_index_buffer = mapping.second; std::tie(min_index, max_index, index_count) = upload_index_buffer( - command.raw_index_buffer, ptr, type, rsx::method_registers.current_draw_clause.primitive, - rsx::method_registers.current_draw_clause.draw_command_ranges, vertex_count); + command.raw_index_buffer, ptr, 
type, rsx::method_registers.current_draw_clause.primitive, vertex_count); if (min_index >= max_index) { @@ -163,8 +155,7 @@ namespace u32 offset_in_index_buffer; u32 index_count; std::tie(index_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw( - { { 0, 0, vertex_count } }, - rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer); + rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer, vertex_count); return{ index_count, vertex_count, 0, 0, std::make_tuple(GL_UNSIGNED_SHORT, offset_in_index_buffer) }; } @@ -182,11 +173,6 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer() { std::chrono::time_point then = steady_clock::now(); - m_vertex_layout = analyse_inputs_interleaved(); - - if (!m_vertex_layout.validate()) - return {}; - //Write index buffers and count verts auto result = std::visit(draw_command_visitor(*m_index_ring_buffer, m_vertex_layout), get_draw_command(rsx::method_registers)); @@ -214,6 +200,8 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer() storage_address = m_vertex_layout.interleaved_blocks[0].real_offset_address + vertex_base; if (auto cached = m_vertex_cache->find_vertex_range(storage_address, GL_R8UI, required.first)) { + verify(HERE), cached->local_address == storage_address; + in_cache = true; upload_info.persistent_mapping_offset = cached->offset_in_heap; } diff --git a/rpcs3/Emu/RSX/RSXFIFO.cpp b/rpcs3/Emu/RSX/RSXFIFO.cpp index 3c3e9f34b7..098db67102 100644 --- a/rpcs3/Emu/RSX/RSXFIFO.cpp +++ b/rpcs3/Emu/RSX/RSXFIFO.cpp @@ -5,30 +5,170 @@ #include "Capture/rsx_capture.h" extern rsx::frame_capture_data frame_capture; -//#pragma optimize("", off) +extern bool user_asked_for_frame_capture; + #define ENABLE_OPTIMIZATION_DEBUGGING 0 namespace rsx { namespace FIFO { + template + struct scoped_priority + { + scoped_priority() + { + thread_ctrl::set_native_priority(To); + } + + ~scoped_priority() + { + thread_ctrl::set_native_priority(From); + } + }; + + static inline 
void __prefetcher_sleep() { std::this_thread::sleep_for(100us); } + static inline void __prefetcher_yield() { std::this_thread::yield(); } + FIFO_control::FIFO_control(::rsx::thread* pctrl) { m_ctrl = pctrl->ctrl; + m_queue.reserve(16384); + m_prefetched_queue.reserve(16384); + + thread_ctrl::spawn(m_prefetcher_thread, "FIFO Prefetch Thread", [this]() + { + // TODO: + return; + + if (g_cfg.core.thread_scheduler_enabled) + { + thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx)); + } + + u32 internal_get; + u32 target_addr; + + while (!Emu.IsStopped()) + { + target_addr = -1u; + + if (m_prefetched_queue.empty() && m_ctrl->put != m_ctrl->get) + { + // Get address to read ahead at/to + const u64 control_tag = m_ctrl_tag; + internal_get = m_internal_get; + + if (m_memwatch_addr) + { + // Spinning + __prefetcher_sleep(); + continue; + } + else + { + // This is normal + m_prefetch_get = m_ctrl->get; + m_prefetcher_speculating = false; + } + + // Check again + if (control_tag != m_ctrl_tag) + { + // Race condition + continue; + } + + if (m_prefetch_get != -1u) + { + // Check for special conditions in the existing queue + { + std::lock_guard lock(m_queue_mutex); + if (!m_queue.empty()) + { + const auto cmd = m_queue.back().reg; + + if ((cmd >> 2) == NV406E_SEMAPHORE_ACQUIRE) + { + // Blocking command, cannot read ahead + __prefetcher_sleep(); + continue; + } + + if ((cmd & RSX_METHOD_OLD_JUMP_CMD_MASK) == RSX_METHOD_OLD_JUMP_CMD) + { + m_prefetch_get = cmd & 0x1ffffffc; + } + else if ((cmd & RSX_METHOD_NEW_JUMP_CMD_MASK) == RSX_METHOD_NEW_JUMP_CMD) + { + m_prefetch_get = cmd & 0xfffffffc; + } + else if ((cmd & RSX_METHOD_CALL_CMD_MASK) == RSX_METHOD_CALL_CMD) + { + m_prefetch_get = cmd & 0xfffffffc; + } + else if ((cmd & RSX_METHOD_RETURN_MASK) == RSX_METHOD_RETURN_CMD) + { + // Cannot determine RET address safely, cannot read ahead + __prefetcher_sleep(); + continue; + } + } + } + + scoped_priority<0, 1> priority; + if 
(m_prefetch_mutex.try_lock()) + { + if (control_tag != m_ctrl_tag) + { + // Do not stall with the prefetch mutex held! + m_prefetch_mutex.unlock(); + continue; + } + + m_prefetcher_busy.store(true); + + read_ahead(m_prefetcher_info, m_prefetched_queue, m_prefetch_get); + //optimize(m_prefetcher_info, m_prefetched_queue); + + m_prefetcher_busy.store(false); + m_prefetch_mutex.unlock(); + } + } + } + + __prefetcher_sleep(); + } + }); + } + + void FIFO_control::finalize() + { + if (m_prefetcher_thread) + { + m_prefetcher_thread->join(); + m_prefetcher_thread.reset(); + } } bool FIFO_control::is_blocking_cmd(u32 cmd) { switch (cmd) { - case NV4097_WAIT_FOR_IDLE: + // Sync + case NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE: + case NV4097_TEXTURE_READ_SEMAPHORE_RELEASE: case NV406E_SEMAPHORE_ACQUIRE: case NV406E_SEMAPHORE_RELEASE: + case NV406E_SET_REFERENCE: + + // Data xfer case NV3089_IMAGE_IN: case NV0039_BUFFER_NOTIFY: - return false; - default: + case NV308A_COLOR: return true; + default: + return false; } } @@ -38,11 +178,12 @@ namespace rsx { case NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE: case NV4097_TEXTURE_READ_SEMAPHORE_RELEASE: + case NV406E_SEMAPHORE_ACQUIRE: case NV406E_SEMAPHORE_RELEASE: case NV406E_SET_REFERENCE: return true; default: -return false; + return false; } } @@ -53,21 +194,25 @@ return false; void FIFO_control::clear_buffer() { + std::lock_guard lock(m_queue_mutex); + m_queue.clear(); m_command_index = 0; } - void FIFO_control::read_ahead() + void FIFO_control::read_ahead(fifo_buffer_info_t& info, simple_array &commands, u32& get_pointer) { - m_internal_get = m_ctrl->get; + const u32 put = m_ctrl->put; + + info.start_loc = get_pointer; + info.num_draw_calls = 0; + info.draw_call_distance_weight = 0; while (true) { - const u32 get = m_ctrl->get; - const u32 put = m_ctrl->put; - - if (get == put) + if (get_pointer == put) { + // Nothing to do break; } @@ -75,7 +220,7 @@ return false; // TODO: Who should handle graphics exceptions?? 
u32 cmd; - if (u32 addr = RSXIOMem.RealAddr(get)) + if (u32 addr = RSXIOMem.RealAddr(get_pointer)) { cmd = vm::read32(addr); } @@ -91,20 +236,19 @@ return false; (cmd & RSX_METHOD_RETURN_MASK) == RSX_METHOD_RETURN_CMD) { // Flow control, stop read ahead - m_queue.push_back({ cmd, 0, m_internal_get }); + commands.push_back({ cmd, 0, get_pointer }); break; } if ((cmd & RSX_METHOD_NOP_MASK) == RSX_METHOD_NOP_CMD) { - if (m_queue.back().reg) + if (commands.back().reg != RSX_METHOD_NOP_CMD) { // Insert one NOP only - m_queue.push_back({ cmd, 0, m_internal_get }); + commands.push_back({ RSX_METHOD_NOP_CMD, 0, get_pointer }); } - verify(HERE), m_ctrl->get == get; - m_ctrl->get = m_internal_get = get + 4; + get_pointer += 4; continue; } @@ -117,7 +261,7 @@ return false; u32 count = (cmd >> 18) & 0x7ff; //Validate the args ptr if the command attempts to read from it - auto args = vm::ptr::make(RSXIOMem.RealAddr(get + 4)); + auto args = vm::ptr::make(RSXIOMem.RealAddr(get_pointer + 4)); if (!args && count) { @@ -126,41 +270,141 @@ return false; } // Stop command execution if put will be equal to get ptr during the execution itself - if (count * 4 + 4 > put - get) + if (count * 4 + 4 > put - get_pointer) { - count = (put - get) / 4 - 1; + count = (put - get_pointer) / 4 - 1; } if (count > 1) { // Queue packet header - m_queue.push_back({ FIFO_PACKET_BEGIN, count, m_internal_get }); + commands.push_back({ FIFO_PACKET_BEGIN, count, get_pointer }); const bool no_increment = (cmd & RSX_METHOD_NON_INCREMENT_CMD_MASK) == RSX_METHOD_NON_INCREMENT_CMD; u32 reg = cmd & 0xfffc; - m_internal_get += 4; // First executed command is at data[0] + get_pointer += 4; // First executed command is at data[0] - for (u32 i = 0; i < count; i++, m_internal_get += 4) + for (u32 i = 0; i < count; i++, get_pointer += 4) { - m_queue.push_back({ reg, args[i], m_internal_get }); + commands.push_back({ reg, args[i], get_pointer }); + + if (reg == (NV4097_SET_BEGIN_END << 2)) + { + info.num_draw_calls++; + 
} if (!no_increment) reg += 4; } } else { - m_queue.push_back({ cmd & 0xfffc, args[0], m_internal_get }); - m_internal_get += 8; - } + const u32 reg = cmd & 0xfffc; + commands.push_back({ reg, args[0], get_pointer }); + get_pointer += 8; - verify(HERE), m_ctrl->get == get; - m_ctrl->get = m_internal_get; + if (reg == (NV4097_SET_BEGIN_END << 2)) + { + info.num_draw_calls++; + } + else if (reg == (NV406E_SEMAPHORE_ACQUIRE << 2)) + { + // Hard sync, stop read ahead + break; + } + } + } + + info.length = get_pointer - info.start_loc; + if (!info.num_draw_calls) + { + return; + } + + info.num_draw_calls /= 2; // Begin+End pairs + //info.draw_call_distance_weight = info.length / info.num_draw_calls; + } +#pragma optimize("", on) + void FIFO_control::report_branch_hit(u32 source, u32 target) + { + const auto range = m_branch_prediction_table.equal_range(source); + for (auto It = range.first; It != range.second; It++) + { + if (It->second.branch_target == target) + { + It->second.weight++; + return; + } + } + + fmt::throw_exception("Unreachable" HERE); + } + + void FIFO_control::report_branch_miss(u32 source, u32 target, u32 actual) + { + const auto range = m_branch_prediction_table.equal_range(source); + for (auto It = range.first; It != range.second; It++) + { + if (target < -1u && It->second.branch_target == target) + { + It->second.weight--; + target = -1u; + + if (actual == -1u) + return; + } + else if (actual < -1u && It->second.branch_target == actual) + { + It->second.weight++; + actual = -1u; + + if (target == -1u) + return; + } + } + + if (target != -1u) + { + branch_target_info_t info; + info.branch_origin = source; + info.branch_target = target; + info.checksum_16 = 0; + info.weight = 0; + + m_branch_prediction_table.emplace(source, info); + } + + if (actual != -1u) + { + branch_target_info_t info; + info.branch_origin = source; + info.branch_target = actual; + info.checksum_16 = 0; + info.weight = 1; + + m_branch_prediction_table.emplace(source, info); } } - 
void FIFO_control::optimize() + u32 FIFO_control::get_likely_target(u32 source) { - if (m_queue.empty()) + s64 weight = 0; + u32 target = -1u; + + const auto range = m_branch_prediction_table.equal_range(source); + for (auto It = range.first; It != range.second; It++) + { + if (It->second.weight > weight) + { + target = It->second.branch_target; + } + } + + return target; + } + + void FIFO_control::optimize(const fifo_buffer_info_t& info, simple_array& commands) + { + if (commands.empty() || user_asked_for_frame_capture || g_cfg.video.disable_FIFO_reordering) { // Nothing to do return; @@ -168,10 +412,45 @@ return false; for (auto &opt : m_optimization_passes) { - opt->optimize(m_queue, rsx::method_registers.registers.data()); + opt->optimize(info, commands, rsx::method_registers.registers.data()); } } + bool FIFO_control::test_prefetcher_correctness(u32 target) + { + m_fifo_busy.store(true); + + if (!m_prefetched_queue.empty()) + { + const u32 guessed_target = m_prefetcher_info.start_loc; + bool result = true; + + if (guessed_target != m_ctrl->get) + { + const u32 ctrl_get = m_ctrl->get; + LOG_ERROR(RSX, "fifo::Prefetcher was seemingly wrong!, guessed=0x%x, get=0x%x", + guessed_target, ctrl_get); +// report_branch_miss(m_internal_get, guessed_target, get); + + // Kick +// m_ctrl->get = get; + m_prefetched_queue.clear(); + result = false; + } + else + { + // Nothing to do, guessed correctly +// report_branch_hit(m_internal_get, guessed_target); + } + + m_fifo_busy.store(false); + return result; + } + + m_fifo_busy.store(false); + return false; + } + void FIFO_control::set_put(u32 put) { if (m_ctrl->put == put) @@ -182,51 +461,244 @@ return false; m_ctrl->put = put; } - void FIFO_control::set_get(u32 get) + void FIFO_control::set_get(u32 get, bool spinning) { if (m_ctrl->get == get) { + if (spinning) + { + verify(HERE), !m_queue.empty(); + + const auto& last_cmd = m_queue.back(); + m_memwatch_addr = get; + m_memwatch_cmp = last_cmd.reg; + + m_ctrl_tag++; + } + 
return; } - clear_buffer(); + // Update ctrl registers m_ctrl->get = get; + m_internal_get = get; + + // Clear memwatch spinner + m_memwatch_addr = 0; + + // Update control tag + m_ctrl_tag++; + + // NOTE: This will 'free' the prefetcher in case it was stopped by a sync command + clear_buffer(); } - register_pair FIFO_control::read() + const register_pair& FIFO_control::read_unsafe() { - if (!m_queue.empty() && m_internal_get != m_ctrl->get) + // Fast read with no processing, only safe inside a PACKET_BEGIN+count block + AUDIT(m_command_index < m_queue.size()); + return m_queue[m_command_index++]; + } + + const register_pair& FIFO_control::read() + { + bool registers_changed = false; + const auto queue_size = m_queue.size(); + + if (queue_size > 0) { - // Control register changed - clear_buffer(); + if (m_internal_get != m_ctrl->get) + { + // Control register changed + registers_changed = true; + clear_buffer(); + } + else if (m_command_index >= queue_size) + { + // Consumed whole queue previously + clear_buffer(); + } + else + { + const auto& inst = m_queue[m_command_index++]; + if (inst.reg == FIFO_DISABLED_COMMAND) + { + // Jump to the first safe command + for (u32 n = m_command_index; n < m_queue.size(); ++n) + { + const auto& _inst = m_queue[n]; + if (_inst.reg != FIFO_DISABLED_COMMAND) + { + m_command_index = ++n; + return _inst; + } + } + + // Whole remainder is just disabled commands + clear_buffer(); + } + else + { + // Command is 'ok' + return inst; + } + } } - if (m_command_index && m_command_index >= m_queue.size()) + verify(HERE), m_queue.empty(); + + if (m_ctrl->put == m_ctrl->get) { - // Whole queue consumed - verify(HERE), !m_queue.empty(); - clear_buffer(); + // Nothing to do + return empty_cmd; } - if (m_queue.empty()) + if (m_memwatch_addr) { - // Empty queue, read ahead - read_ahead(); - optimize(); + if (m_ctrl->get == m_memwatch_addr) + { + if (const auto addr = RSXIOMem.RealAddr(m_memwatch_addr)) + { + if (vm::read32(addr) == m_memwatch_cmp) 
+ { + // Still spinning in place + return empty_cmd; + } + } + } + + m_memwatch_addr = 0; + m_memwatch_cmp = 0; + m_ctrl_tag++; } + // Lock to disable the prefetcher + if (!m_prefetch_mutex.try_lock()) + { + return busy_cmd; + } + + if (UNLIKELY(registers_changed)) + { + if (!m_prefetched_queue.empty()) + { + if (m_prefetcher_info.start_loc != m_ctrl->get) + { + // Guessed wrong, discard results + m_prefetched_queue.clear(); + } + } + } + + if (!m_prefetched_queue.empty()) + { + m_ctrl->get = m_internal_get = m_prefetch_get; + m_ctrl_tag++; + + m_queue.swap(m_prefetched_queue); + } + else + { + m_internal_get = m_ctrl->get; + read_ahead(m_fifo_info, m_queue, m_internal_get); + + m_ctrl->get = m_internal_get; + m_ctrl_tag++; + } + + m_prefetch_mutex.unlock(); + if (!m_queue.empty()) { - verify(HERE), m_command_index < m_queue.size(); + // A few guarantees here.. + // First command is not really skippable even if useless + // Queue size is at least 1 return m_queue[m_command_index++]; } - return { FIFO_EMPTY, 0 }; + return empty_cmd; } // Optimization passes - void flattening_pass::optimize(std::vector& commands, const u32* registers) const + flattening_pass::flattening_pass() { + const std::pair skippable_ranges[] = + { + // Texture configuration + { NV4097_SET_TEXTURE_OFFSET, 8 * 16 }, + { NV4097_SET_TEXTURE_CONTROL2, 16 }, + { NV4097_SET_TEXTURE_CONTROL3, 16 }, + { NV4097_SET_VERTEX_TEXTURE_OFFSET, 8 * 4 }, + // Surface configuration + { NV4097_SET_SURFACE_CLIP_HORIZONTAL, 1 }, + { NV4097_SET_SURFACE_CLIP_VERTICAL, 1 }, + { NV4097_SET_SURFACE_COLOR_AOFFSET, 1 }, + { NV4097_SET_SURFACE_COLOR_BOFFSET, 1 }, + { NV4097_SET_SURFACE_COLOR_COFFSET, 1 }, + { NV4097_SET_SURFACE_COLOR_DOFFSET, 1 }, + { NV4097_SET_SURFACE_ZETA_OFFSET, 1 }, + { NV4097_SET_CONTEXT_DMA_COLOR_A, 1 }, + { NV4097_SET_CONTEXT_DMA_COLOR_B, 1 }, + { NV4097_SET_CONTEXT_DMA_COLOR_C, 1 }, + { NV4097_SET_CONTEXT_DMA_COLOR_D, 1 }, + { NV4097_SET_CONTEXT_DMA_ZETA, 1 }, + { NV4097_SET_SURFACE_FORMAT, 1 }, + 
{ NV4097_SET_SURFACE_PITCH_A, 1 }, + { NV4097_SET_SURFACE_PITCH_B, 1 }, + { NV4097_SET_SURFACE_PITCH_C, 1 }, + { NV4097_SET_SURFACE_PITCH_D, 1 }, + { NV4097_SET_SURFACE_PITCH_Z, 1 }, + // Program configuration + { NV4097_SET_TRANSFORM_PROGRAM_START, 1 }, + { NV4097_SET_VERTEX_ATTRIB_OUTPUT_MASK, 1 }, + { NV4097_SET_TRANSFORM_PROGRAM, 512 }, + // Vertex + { NV4097_SET_VERTEX_DATA_ARRAY_FORMAT, 16 }, + { NV4097_SET_VERTEX_DATA_ARRAY_OFFSET, 16 }, + // Raster + { NV4097_SET_ALPHA_TEST_ENABLE, 1 }, + { NV4097_SET_ALPHA_FUNC, 1 }, + { NV4097_SET_ALPHA_REF, 1 }, + { NV4097_SET_FRONT_FACE, 1 }, + }; + + for (u32 reg = 0; reg < m_skippable_registers.size(); ++reg) + { + bool _continue = false; + for (const auto &method : skippable_ranges) + { + if (reg < method.first) + break; + + if (reg - method.first < method.second) + { + // Safe to ignore if value has not changed + m_skippable_registers[reg] = true; + _continue = true; + break; + } + } + + if (_continue) + continue; + + m_skippable_registers[reg] = false; + } + + for (const auto &method : skippable_ranges) + { + for (int subreg = 0; subreg < method.second; ++subreg) + { + // Safe to ignore if value has not changed + verify(HERE), m_skippable_registers[subreg] = true; + } + } + } + + void flattening_pass::optimize(const fifo_buffer_info_t& info, simple_array& commands, const u32* registers) + { + __unused(info); + #if (ENABLE_OPTIMIZATION_DEBUGGING) auto copy = commands; #endif @@ -238,8 +710,6 @@ return false; u32 deferred_primitive_type = UINT32_MAX; bool has_deferred_call = false; - - std::unordered_map register_tracker; // Tracks future register writes auto test_register = [&](u32 reg, u32 value) { @@ -288,6 +758,22 @@ return false; const auto value = command.value; switch (reg) { + case NV4097_INVALIDATE_VERTEX_FILE: // PSLight clears VERTEX_FILE[0-2] + case NV4097_PIPE_NOP: + case NV4097_INVALIDATE_VERTEX_FILE + 2: + case NV4097_INVALIDATE_VERTEX_CACHE_FILE: + case NV4097_INVALIDATE_L2: + case 
NV4097_INVALIDATE_ZCULL: + case (FIFO_DISABLED_COMMAND >> 2): + case (FIFO_PACKET_BEGIN >> 2): + case (FIFO_DRAW_BARRIER >> 2): + case (FIFO_EMPTY >> 2): + case (FIFO_BUSY >> 2): + { + // Ignore these completely + flush_commands_flag = false; + break; + } case NV4097_SET_BEGIN_END: { if (value && value != deferred_primitive_type) @@ -301,6 +787,8 @@ return false; has_deferred_call = true; flush_commands_flag = false; execute_method_flag = false; + + // TODO: If END, insert draw barrier } break; @@ -323,66 +811,37 @@ return false; flush_commands_flag = false; break; } + case NV4097_SET_VERTEX_DATA_BASE_INDEX: + case NV4097_SET_VERTEX_DATA_BASE_OFFSET: + { + // These can be executed when emitting geometry + flush_commands_flag = false; + break; + } default: { - // TODO: Reorder draw commands between synchronization events to maximize batched sizes - static const std::pair skippable_ranges[] = + // Hopefully this is skippable so the batch can keep growing + if (reg >= m_skippable_registers.size()) { - // Texture configuration - { NV4097_SET_TEXTURE_OFFSET, 8 * 16 }, - { NV4097_SET_TEXTURE_CONTROL2, 16 }, - { NV4097_SET_TEXTURE_CONTROL3, 16 }, - { NV4097_SET_VERTEX_TEXTURE_OFFSET, 8 * 4 }, - // Surface configuration - { NV4097_SET_SURFACE_CLIP_HORIZONTAL, 1 }, - { NV4097_SET_SURFACE_CLIP_VERTICAL, 1 }, - { NV4097_SET_SURFACE_COLOR_AOFFSET, 1 }, - { NV4097_SET_SURFACE_COLOR_BOFFSET, 1 }, - { NV4097_SET_SURFACE_COLOR_COFFSET, 1 }, - { NV4097_SET_SURFACE_COLOR_DOFFSET, 1 }, - { NV4097_SET_SURFACE_ZETA_OFFSET, 1 }, - { NV4097_SET_CONTEXT_DMA_COLOR_A, 1 }, - { NV4097_SET_CONTEXT_DMA_COLOR_B, 1 }, - { NV4097_SET_CONTEXT_DMA_COLOR_C, 1 }, - { NV4097_SET_CONTEXT_DMA_COLOR_D, 1 }, - { NV4097_SET_CONTEXT_DMA_ZETA, 1 }, - { NV4097_SET_SURFACE_FORMAT, 1 }, - { NV4097_SET_SURFACE_PITCH_A, 1 }, - { NV4097_SET_SURFACE_PITCH_B, 1 }, - { NV4097_SET_SURFACE_PITCH_C, 1 }, - { NV4097_SET_SURFACE_PITCH_D, 1 }, - { NV4097_SET_SURFACE_PITCH_Z, 1 }, - // Program configuration - { 
NV4097_SET_TRANSFORM_PROGRAM_START, 1 }, - { NV4097_SET_VERTEX_ATTRIB_OUTPUT_MASK, 1 }, - { NV4097_SET_TRANSFORM_PROGRAM, 512 } - }; + // Likely flow control, unskippable + break; + } - if (has_deferred_call) + if (m_skippable_registers[reg]) { - // Hopefully this is skippable so the batch can keep growing - for (const auto &method : skippable_ranges) + if (has_deferred_call) { - if (reg < method.first) - continue; - - if (reg - method.first < method.second) + // Safe to ignore if value has not changed + if (test_register(reg, value)) { - // Safe to ignore if value has not changed - if (test_register(reg, value)) - { - execute_method_flag = false; - flush_commands_flag = false; - } - else - { - set_register(reg, value); - } - + execute_method_flag = false; + flush_commands_flag = false; break; } } } + + set_register(reg, value); break; } } @@ -476,38 +935,281 @@ return false; #endif } - void reordering_pass::optimize(std::vector& commands, const u32* registers) const + void reordering_pass::optimize(const fifo_buffer_info_t& info, simple_array& commands, const u32* registers) { -#if 0 - // Define a draw call - struct texture_entry + if (info.num_draw_calls < 8) { - u32 index = -1u; - u32 address = 0; - u32 filter = 0; - u32 control0 = 0; - u32 control1 = 0; - u32 control2 = 0; - u32 control3 = 0; + // TODO: Better threshold checking + return; + } + + std::unordered_map register_tracker; // Tracks future register writes + auto get_register = [&](u32 reg) + { + auto found = register_tracker.find(reg); + if (found == register_tracker.end()) + { + return registers[reg]; + } + else + { + return found->second; + } }; - struct draw_call + auto set_register = [&](u32 reg, u32 value) { - std::vector instructions; - std::array fragment_texture_state{}; - std::array vertex_texture_state{}; + register_tracker[reg] = value; }; - std::vector draw_calls; + bool recording_changes = false; + bool writing_draw_call = false; + bool has_merged = false; + u32 num_draws_processed = 0; + 
u32 num_draws_merged = 0; + + draw_call *target_bin = nullptr; + const register_pair *rollback_pos = nullptr; + + auto flush_commands = [&](const register_pair* end_pos) mutable + { + if (has_merged) + { + register_pair* mem_ptr = const_cast(bins.front().start_pos); + for (const auto& draw : bins) + { + if (draw.write_prologue) + { + for (u32 n = 0; n < draw.prologue.size(); ++n) + { + const auto e = draw.prologue.get(n); + mem_ptr->reg = e.first; + mem_ptr->value = e.second; + mem_ptr++; + } + } + + mem_ptr->reg = (NV4097_SET_BEGIN_END << 2); + mem_ptr->value = draw.primitive_type; + mem_ptr++; + + for (const auto &inst : draw.draws) + { + *mem_ptr = inst; + mem_ptr++; + } + + mem_ptr->reg = (NV4097_SET_BEGIN_END << 2); + mem_ptr->value = 0; + mem_ptr++; + } + + verify(HERE), mem_ptr <= end_pos; + + for (; mem_ptr <= end_pos; mem_ptr++) + { + mem_ptr->reg = FIFO_DISABLED_COMMAND; + } + } + + bins.clear(); + has_merged = false; + }; + + auto allowed = [](u32 reg) + { + if (reg & ~0xfffc) + return false; + + if (FIFO_control::is_blocking_cmd(reg >> 2)) + return false; + + return true; + }; + +#if (ENABLE_OPTIMIZATION_DEBUGGING) + + auto _get_method_name = [&](u32 reg) -> std::string + { + if (reg == FIFO_DISABLED_COMMAND) + { + return "COMMAND DISABLED"; + } + + if (reg == FIFO_PACKET_BEGIN) + { + return "PACKET BEGIN"; + } + + return rsx::get_method_name(reg >> 2); + }; + + if (user_asked_for_frame_capture) + { + LOG_ERROR(RSX, "-----------------PRE DUMP BEGINS--------------------"); + for (const auto& command : commands) + { + LOG_ERROR(RSX, "[0x%x] %s(0x%x)", command.loc, _get_method_name(command.reg), command.value); + } + LOG_ERROR(RSX, "------------------- DUMP ENDS--------------------"); + } +#endif + + for (const auto& command : commands) + { + bool write = false; + switch (const u32 reg = (command.reg >> 2)) + { + case NV4097_INVALIDATE_VERTEX_FILE: // PSLight clears VERTEX_FILE[0-2] + case NV4097_PIPE_NOP: + case NV4097_INVALIDATE_VERTEX_FILE + 2: + case 
NV4097_INVALIDATE_VERTEX_CACHE_FILE: + case NV4097_INVALIDATE_L2: + case NV4097_INVALIDATE_ZCULL: + case (FIFO_DISABLED_COMMAND >> 2): + case (FIFO_PACKET_BEGIN >> 2): + case (FIFO_DRAW_BARRIER >> 2): + case (FIFO_EMPTY >> 2): + case (FIFO_BUSY >> 2): + { + break; + } + case NV4097_SET_BEGIN_END: + { + if (!command.value) + { + target_bin = nullptr; + recording_changes = true; + writing_draw_call = false; + rollback_pos = &command; + } + else + { + if (bins.empty()) + { + registers_changed.clear(); + target_bin = &bins.emplace_back(); + target_bin->write_prologue = false; + target_bin->start_pos = &command; + target_bin->primitive_type = command.value; + } + else + { + target_bin = nullptr; + + for (auto& draw : bins) + { + if (draw.matches(registers_changed, command.value)) + { + num_draws_merged++; + has_merged = true; + target_bin = &draw; + //target_bin->draws.push_back({ FIFO_DRAW_BARRIER << 2 }); + break; + } + } + + if (!target_bin) + { + target_bin = &bins.emplace_back(); + target_bin->write_prologue = true; + target_bin->prologue.swap(registers_changed); + target_bin->start_pos = &command; + target_bin->primitive_type = command.value; + } + } + + recording_changes = false; + writing_draw_call = true; + num_draws_processed++; + } + + break; + } + default: + { + write = true; + + if (bins.empty()) + { + break; + } + + if (recording_changes) + { + // Stop if any of the following conditions is met + // The draw 'bin' changes more than 16 instructions (scanning performance) + // The number of unique bins is greater than 4 making it non-trivial and likely not worthwhile to scan + + if (!allowed(command.reg)) + { + // TODO: Maintain list of mergable commands + target_bin = nullptr; + + if (recording_changes) + { + recording_changes = false; + registers_changed.clear(); + } + + flush_commands(rollback_pos); + break; + } + + if (bins.size() == 1) + { + bins[0].prologue.add_cmd(command.reg, get_register(reg)); + } + + registers_changed.add_cmd(command.reg, 
command.value); + } + else if (writing_draw_call) + { + target_bin->draws.push_back(command); + } + + break; + } + } + + if (write) + { + set_register(command.reg >> 2, command.value); + } + } + + flush_commands(rollback_pos); + + if (num_draws_merged) + { + LOG_ERROR(RSX, "Merges happened: Draws before: %d, draws merged %d", info.num_draw_calls, num_draws_merged); + } + +#if (ENABLE_OPTIMIZATION_DEBUGGING) + if (user_asked_for_frame_capture) + { + LOG_ERROR(RSX, "----------------POST DUMP BEGINS--------------------"); + for (const auto& command : commands) + { + LOG_ERROR(RSX, "[0x%x] %s(0x%x)", command.loc, _get_method_name(command.reg), command.value); + } + LOG_ERROR(RSX, "------------------- DUMP ENDS--------------------"); + } #endif } } void thread::run_FIFO() { - auto command = fifo_ctrl->read(); + const auto& command = fifo_ctrl->read(); const auto cmd = command.reg; + if (cmd == FIFO::FIFO_BUSY) + { + // Do something else + return; + } + if (cmd == FIFO::FIFO_EMPTY || !Emu.IsRunning()) { if (performance_counters.state == FIFO_state::running) @@ -516,6 +1218,7 @@ return false; performance_counters.state = FIFO_state::empty; } + std::this_thread::yield(); return; } @@ -536,7 +1239,7 @@ return false; } //LOG_WARNING(RSX, "rsx jump(0x%x) #addr=0x%x, cmd=0x%x, get=0x%x, put=0x%x", offs, m_ioAddress + get, cmd, get, put); - fifo_ctrl->set_get(offs); + fifo_ctrl->set_get(offs, offs == command.loc); return; } if ((cmd & RSX_METHOD_NEW_JUMP_CMD_MASK) == RSX_METHOD_NEW_JUMP_CMD) @@ -554,7 +1257,7 @@ return false; } //LOG_WARNING(RSX, "rsx jump(0x%x) #addr=0x%x, cmd=0x%x, get=0x%x, put=0x%x", offs, m_ioAddress + get, cmd, get, put); - fifo_ctrl->set_get(offs); + fifo_ctrl->set_get(offs, offs == command.loc); return; } if ((cmd & RSX_METHOD_CALL_CMD_MASK) == RSX_METHOD_CALL_CMD) @@ -587,7 +1290,7 @@ return false; m_return_addr = -1; return; } - if ((cmd & RSX_METHOD_NOP_MASK) == RSX_METHOD_NOP_CMD) + if (cmd == RSX_METHOD_NOP_CMD) { if (performance_counters.state == 
FIFO_state::running) { @@ -623,18 +1326,26 @@ return false; } u32 count = 1; + auto *command_ptr = &command; + if (cmd == FIFO::FIFO_PACKET_BEGIN) { count = command.value; - command = fifo_ctrl->read(); + command_ptr = &fifo_ctrl->read_unsafe(); } for (u32 i = 0; i < count; ++i) { - if (i) command = fifo_ctrl->read(); + if (i) command_ptr = &fifo_ctrl->read_unsafe(); - const u32 reg = command.reg >> 2; - const u32 value = command.value; + if (command_ptr->reg == FIFO::FIFO_DISABLED_COMMAND) + { + // Placeholder for dropped commands + continue; + } + + const u32 reg = command_ptr->reg >> 2; + const u32& value = command_ptr->value; if (capture_current_frame) { @@ -644,7 +1355,7 @@ return false; { // todo: handle nv406e methods better?, do we care about call/jumps? rsx::frame_capture_data::replay_command replay_cmd; - replay_cmd.rsx_command = std::make_pair(i == 0 ? cmd : 0, value); + replay_cmd.rsx_command = std::make_pair(i == 0 ? command_ptr->reg : 0, value); frame_capture.replay_commands.push_back(replay_cmd); @@ -673,12 +1384,6 @@ return false; } } - if (command.reg == FIFO::FIFO_DISABLED_COMMAND) - { - // Placeholder for dropped commands - continue; - } - method_registers.decode(reg, value); if (auto method = methods[reg]) diff --git a/rpcs3/Emu/RSX/RSXFIFO.h b/rpcs3/Emu/RSX/RSXFIFO.h index 1cd06b9e6f..64a0a8ccdb 100644 --- a/rpcs3/Emu/RSX/RSXFIFO.h +++ b/rpcs3/Emu/RSX/RSXFIFO.h @@ -2,8 +2,19 @@ #include #include +#include +#include + +#include "rsx_utils.h" #include +#include +#include +#include + +#ifndef __unused +#define __unused(expression) do { (void)(expression); } while(0) +#endif struct RsxDmaControl; @@ -17,8 +28,10 @@ namespace rsx { NOP = 0, FIFO_EMPTY = 0xDEADF1F0, + FIFO_BUSY = 0xBABEF1F0, FIFO_PACKET_BEGIN = 0xF1F0, FIFO_DISABLED_COMMAND = 0xF1F4, + FIFO_DRAW_BARRIER = 0xF1F8, }; struct register_pair @@ -26,21 +39,149 @@ namespace rsx u32 reg; u32 value; u32 loc; + u32 reserved; + }; + + struct fifo_buffer_info_t + { + u32 start_loc; + u32 length; 
+ u32 num_draw_calls; + u32 draw_call_distance_weight; + }; + + struct branch_target_info_t + { + u32 branch_target; + u32 branch_origin; + s64 weight; + u64 checksum_16; + u64 reserved; }; struct optimization_pass { - virtual void optimize(std::vector& commands, const u32* registers) const = 0; + virtual void optimize(const fifo_buffer_info_t& info, simple_array& commands, const u32* registers) = 0; }; struct flattening_pass : public optimization_pass { - void optimize(std::vector& commands, const u32* registers) const override; + private: + std::array m_skippable_registers; + + public: + flattening_pass(); + void optimize(const fifo_buffer_info_t& info, simple_array& commands, const u32* registers) override; }; struct reordering_pass : public optimization_pass { - void optimize(std::vector& commands, const u32* registers) const override; + private: + + struct instruction_buffer_t + { + std::unordered_map m_storage; + simple_array m_insertion_order; + + instruction_buffer_t() + { + m_insertion_order.reserve(64); + } + + void add_cmd(u32 reg, u32 value) + { + const auto is_new = std::get<1>(m_storage.insert_or_assign(reg, value)); + if (!is_new) + { + for (auto &loc : m_insertion_order) + { + if (loc == reg) + { + loc |= 0x80000000; + break; + } + } + } + + m_insertion_order.push_back(reg); + } + + void clear() + { + m_storage.clear(); + m_insertion_order.clear(); + } + + void swap(instruction_buffer_t& other) + { + m_storage.swap(other.m_storage); + m_insertion_order.swap(other.m_insertion_order); + } + + auto size() const + { + return m_storage.size(); + } + + inline std::pair get(int index) const + { + const auto key = m_insertion_order[index]; + if (key & 0x80000000) + { + // Disabled by a later write to the same register + // TODO: Track command type registers and avoid this + return { FIFO_DISABLED_COMMAND, 0 }; + } + + const auto value = m_storage.at(key); + return { key, value }; + } + + bool operator == (const instruction_buffer_t& other) const + { + if 
(size() == other.size()) + { + for (const auto &e : other.m_storage) + { + const auto found = m_storage.find(e.first); + if (found == m_storage.end()) + return false; + + if (found->second != e.second) + return false; + } + + return true; + } + + return false; + } + }; + + struct draw_call + { + instruction_buffer_t prologue; + std::vector draws; + bool write_prologue; + u32 primitive_type; + const register_pair* start_pos; + + bool matches(const instruction_buffer_t setup, u32 prim) const + { + if (prim != primitive_type) + return false; + + return prologue == setup; + } + }; + + instruction_buffer_t registers_changed; + std::vector bins; + + std::unordered_multimap m_results_prediction_table; + + public: + void optimize(const fifo_buffer_info_t& info, simple_array& commands, const u32* registers) override; }; class FIFO_control @@ -48,28 +189,58 @@ namespace rsx RsxDmaControl* m_ctrl = nullptr; u32 m_internal_get = 0; + std::shared_ptr m_prefetcher_thread; + u32 m_prefetch_get = 0; + atomic_t m_prefetcher_busy{ false }; + atomic_t m_fifo_busy{ false }; + fifo_buffer_info_t m_prefetcher_info; + bool m_prefetcher_speculating; + std::vector> m_optimization_passes; - std::vector m_queue; + simple_array m_queue; + simple_array m_prefetched_queue; atomic_t m_command_index{ 0 }; - bool is_blocking_cmd(u32 cmd); - bool is_sync_cmd(u32 cmd); + shared_mutex m_prefetch_mutex; // Guards prefetch queue + shared_mutex m_queue_mutex; // Guards primary queue + atomic_t m_ctrl_tag{ 0 }; // 'Guards' control registers - void read_ahead(); - void optimize(); + register_pair empty_cmd { FIFO_EMPTY }; + register_pair busy_cmd { FIFO_BUSY }; + + u32 m_memwatch_addr = 0; + u32 m_memwatch_cmp = 0; + + fifo_buffer_info_t m_fifo_info; + std::unordered_multimap m_branch_prediction_table; + + void read_ahead(fifo_buffer_info_t& info, simple_array& commands, u32& get_pointer); + void optimize(const fifo_buffer_info_t& info, simple_array& commands); void clear_buffer(); + u32 
get_likely_target(u32 source); + void report_branch_miss(u32 source, u32 target, u32 actual); + void report_branch_hit(u32 source, u32 target); + bool test_prefetcher_correctness(u32 actual_target); + public: FIFO_control(rsx::thread* pctrl); ~FIFO_control() {} - void set_get(u32 get); + void set_get(u32 get, bool spinning = false); void set_put(u32 put); - register_pair read(); + const register_pair& read(); + inline const register_pair& read_unsafe(); void register_optimization_pass(optimization_pass* pass); + + void finalize(); + + public: + static bool is_blocking_cmd(u32 cmd); + static bool is_sync_cmd(u32 cmd); }; } } \ No newline at end of file diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 8ee272ab28..65f31c7ef8 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -42,7 +42,135 @@ namespace rsx std::function g_access_violation_handler; thread* g_current_renderer = nullptr; - //TODO: Restore a working shaders cache +#pragma optimize("", off) + void run_tests() + { +#if 0 + if (0) + { + auto _get_method_name = [](u32 reg) -> std::string + { + if (reg == FIFO::FIFO_DISABLED_COMMAND) + { + return "COMMAND DISABLED"; + } + + if (reg == FIFO::FIFO_PACKET_BEGIN) + { + return "PACKET BEGIN"; + } + + return rsx::get_method_name(reg >> 2); + }; + + auto _dump_commands = [&](const std::vector& commands) + { + LOG_ERROR(RSX, "DUMP BEGINS--------------------------------"); + for (const auto &cmd : commands) + { + LOG_ERROR(RSX, "%s (0x%x)", _get_method_name(cmd.reg), cmd.value); + } + LOG_ERROR(RSX, "DUMP ENDS--------------------------------"); + }; + + // Test + std::vector fake_commands = + { + { NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xdeadbeef }, + { NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000}, + { NV4097_INVALIDATE_VERTEX_FILE << 2, 0 }, + { 
NV4097_SET_BEGIN_END << 2, 5 }, + { NV4097_DRAW_ARRAYS << 2, 0xff000000 }, + { NV4097_SET_BEGIN_END << 2, 0}, + + { NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xcafebabe }, + { NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000}, + { NV4097_INVALIDATE_VERTEX_FILE << 2, 0 }, + { NV4097_SET_BEGIN_END << 2, 5 }, + { NV4097_DRAW_ARRAYS << 2, 0xff0000ff }, + { NV4097_SET_BEGIN_END << 2, 0}, + + { NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xdeadbeef }, + { NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000}, + { NV4097_INVALIDATE_VERTEX_FILE << 2, 0 }, + { NV4097_SET_BEGIN_END << 2, 5 }, + { NV4097_DRAW_ARRAYS << 2, 0xff0001fe }, + { NV4097_SET_BEGIN_END << 2, 0}, + + { 0xffffffff, 0 }, + + { NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xcafebabe }, + { NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000}, + { NV4097_INVALIDATE_VERTEX_FILE << 2, 0 }, + { NV4097_SET_BEGIN_END << 2, 5 }, + { NV4097_DRAW_ARRAYS << 2, 0xff0002fd }, + { NV4097_SET_BEGIN_END << 2, 0}, + + { NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xdeadbeef }, + { NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000}, + { NV4097_INVALIDATE_VERTEX_FILE << 2, 0 }, + { NV4097_SET_BEGIN_END << 2, 5 }, + { NV4097_DRAW_ARRAYS << 2, 0xff0003fc }, + { NV4097_SET_BEGIN_END << 2, 0}, + + { NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 
0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xcafebabe }, + { NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000}, + { NV4097_INVALIDATE_VERTEX_FILE << 2, 0 }, + { NV4097_SET_BEGIN_END << 2, 5 }, + { NV4097_DRAW_ARRAYS << 2, 0xff0004fb }, + { NV4097_SET_BEGIN_END << 2, 0}, + + { NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef }, + { (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xdeadbeef }, + { NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000}, + { NV4097_INVALIDATE_VERTEX_FILE << 2, 0 }, + { NV4097_SET_BEGIN_END << 2, 5 }, + { NV4097_DRAW_ARRAYS << 2, 0xff0005fa }, + { NV4097_SET_BEGIN_END << 2, 0}, + + { 0xffffffff, 0xdead }, + }; + + std::vector fake_registers(16384); + std::fill(fake_registers.begin(), fake_registers.end(), 0u); + + FIFO::flattening_pass flattening_pass; + FIFO::reordering_pass reordering_pass; + + FIFO::fifo_buffer_info_t info{ 0, fake_commands.size() * 4, /*7*/18, 0 }; + flattening_pass.optimize(info, fake_commands, fake_registers.data()); + + _dump_commands(fake_commands); + + reordering_pass.optimize(info, fake_commands, fake_registers.data()); + + _dump_commands(fake_commands); + + LOG_ERROR(RSX, "FINISHED TEST"); + } +#endif + } +#pragma optimize("", on) u32 get_address(u32 offset, u32 location) { @@ -97,8 +225,10 @@ namespace rsx return get_current_renderer()->ctxt_addr + offset; default: + { fmt::throw_exception("Invalid location (offset=0x%x, location=0x%x)" HERE, offset, location); } + } } u32 get_vertex_type_size_on_host(vertex_base_type type, u32 size) @@ -289,23 +419,7 @@ namespace rsx conditional_render_test_address = 0; } - rsx::method_registers.current_draw_clause.inline_vertex_array.resize(0); in_begin_end = true; - - switch (rsx::method_registers.current_draw_clause.primitive) - { - case rsx::primitive_type::line_loop: - case rsx::primitive_type::line_strip: - case rsx::primitive_type::polygon: - case 
rsx::primitive_type::quad_strip: - case rsx::primitive_type::triangle_fan: - case rsx::primitive_type::triangle_strip: - // Adjacency matters for these types - rsx::method_registers.current_draw_clause.is_disjoint_primitive = false; - break; - default: - rsx::method_registers.current_draw_clause.is_disjoint_primitive = true; - } } void thread::append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value) @@ -348,15 +462,15 @@ namespace rsx m_graphics_state |= rsx::pipeline_state::framebuffer_reads_dirty; ROP_sync_timestamp = get_system_time(); - for (u8 index = 0; index < rsx::limits::vertex_count; ++index) + for (auto & push_buf : vertex_push_buffers) { //Disabled, see https://github.com/RPCS3/rpcs3/issues/1932 //rsx::method_registers.register_vertex_info[index].size = 0; - vertex_push_buffers[index].clear(); + push_buf.clear(); } - element_push_buffer.resize(0); + element_push_buffer.clear(); if (zcull_ctrl->active) zcull_ctrl->on_draw(); @@ -397,6 +511,7 @@ namespace rsx void thread::on_task() { m_rsx_thread = std::this_thread::get_id(); + run_tests(); if (supports_native_ui) { @@ -430,8 +545,8 @@ namespace rsx fifo_ctrl = std::make_unique<::rsx::FIFO::FIFO_control>(this); - fifo_ctrl->register_optimization_pass(new FIFO::flattening_pass()); - //fifo_ctrl->register_optimization_pass(new FIFO::reordering_pass()); + //fifo_ctrl->register_optimization_pass(new FIFO::flattening_pass()); + //fifo_ctrl->register_optimization_pass(new FIFO::reordering_pass()); // R&C2 - Not working if flattening is also enabled!!! 
//fifo_ctrl->register_optimization_pass(new FIFO::flattening_pass());
 
 		last_flip_time = get_system_time() - 1000000;
 
@@ -539,6 +654,29 @@ namespace rsx
 	void thread::on_exit()
 	{
 		m_rsx_thread_exiting = true;
+
+		if (m_vblank_thread)
+		{
+			m_vblank_thread->join();
+			m_vblank_thread.reset();
+		}
+
+		if (m_decompiler_thread)
+		{
+			m_decompiler_thread->join();
+			m_decompiler_thread.reset();
+		}
+
+		if (fifo_ctrl)
+		{
+			fifo_ctrl->finalize();
+		}
+	}
+
+	std::string thread::get_name() const
+	{
+		return "rsx::thread";
+
 	}
 
 	void thread::fill_scale_offset_data(void *buffer, bool flip_y) const
@@ -740,7 +878,7 @@
 		return t + timestamp_subvalue;
 	}
 
-	gsl::span thread::get_raw_index_array(const std::vector& draw_indexed_clause) const
+	gsl::span thread::get_raw_index_array(const draw_clause& draw_indexed_clause) const
 	{
 		if (element_push_buffer.size())
 		{
@@ -755,49 +893,29 @@
 		bool is_primitive_restart_enabled = rsx::method_registers.restart_index_enabled();
 		u32 primitive_restart_index = rsx::method_registers.restart_index();
 
-		u32 min_index = UINT32_MAX;
-		u32 max_index = 0;
-
-		for (const auto &range : draw_indexed_clause)
-		{
-			const u32 root_index = (range.command_data_offset / type_size) + range.first;
-			min_index = std::min(root_index, min_index);
-			max_index = std::max(root_index + range.count, max_index);
-		}
-
-		const u32 first = min_index;
-		const u32 count = max_index - min_index;
+		const u32 first = draw_indexed_clause.min_index();
+		const u32 count = draw_indexed_clause.get_elements_count();
 
 		const gsl::byte* ptr = static_cast(vm::base(address));
 		return{ ptr + first * type_size, count * type_size };
 	}
 
-	gsl::span thread::get_raw_vertex_buffer(const rsx::data_array_format_info& vertex_array_info, u32 base_offset, const std::vector& vertex_ranges) const
+	gsl::span thread::get_raw_vertex_buffer(const rsx::data_array_format_info& vertex_array_info, u32 base_offset, const draw_clause& draw_array_clause) const
 	{
 		u32 offset = 
vertex_array_info.offset(); u32 address = rsx::get_address(rsx::get_vertex_offset_from_base(base_offset, offset & 0x7fffffff), offset >> 31); u32 element_size = rsx::get_vertex_type_size_on_host(vertex_array_info.type(), vertex_array_info.size()); - u32 min_index = UINT32_MAX; - u32 max_index = 0; - - for (const auto &range : vertex_ranges) - { - const auto root_index = (range.command_data_offset / vertex_array_info.stride()) + range.first; - min_index = std::min(root_index, min_index); - max_index = std::max(root_index + range.count, max_index); - } - - const u32 first = min_index; - const u32 count = max_index - min_index; + const u32 first = draw_array_clause.min_index(); + const u32 count = draw_array_clause.get_elements_count(); const gsl::byte* ptr = gsl::narrow_cast(vm::base(address)); return {ptr + first * vertex_array_info.stride(), count * vertex_array_info.stride() + element_size}; } std::vector> - thread::get_vertex_buffers(const rsx::rsx_state& state, const std::vector& vertex_ranges, const u64 consumed_attrib_mask) const + thread::get_vertex_buffers(const rsx::rsx_state& state, const u64 consumed_attrib_mask) const { std::vector> result; result.reserve(rsx::limits::vertex_count); @@ -815,7 +933,7 @@ namespace rsx { const rsx::data_array_format_info& info = state.vertex_arrays_info[index]; result.push_back(vertex_array_buffer{info.type(), info.size(), info.stride(), - get_raw_vertex_buffer(info, state.vertex_data_base_offset(), vertex_ranges), index, true}); + get_raw_vertex_buffer(info, state.vertex_data_base_offset(), state.current_draw_clause), index, true}); continue; } @@ -854,7 +972,7 @@ namespace rsx { return draw_indexed_array_command { - get_raw_index_array( rsx::method_registers.current_draw_clause.draw_command_ranges) + get_raw_index_array(state.current_draw_clause) }; } @@ -1301,7 +1419,6 @@ namespace rsx if (state.current_draw_clause.command == rsx::draw_command::inlined_array) { vertex_input_layout result = {}; - 
result.interleaved_blocks.reserve(8); interleaved_range_info info = {}; info.interleaved = true; @@ -1336,8 +1453,8 @@ namespace rsx const u32 frequency_divider_mask = rsx::method_registers.frequency_divider_operation_mask(); vertex_input_layout result = {}; - result.interleaved_blocks.reserve(8); - result.referenced_registers.reserve(4); + result.interleaved_blocks.reserve(16); + result.referenced_registers.reserve(16); for (u8 index = 0; index < rsx::limits::vertex_count; ++index) { @@ -1430,7 +1547,7 @@ namespace rsx block.base_offset = base_address; block.attribute_stride = info.stride(); block.memory_location = info.offset() >> 31; - block.locations.reserve(8); + block.locations.reserve(16); block.locations.push_back(index); block.min_divisor = info.frequency(); block.all_modulus = !!(frequency_divider_mask & (1 << index)); diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index 79b0b0b20b..cc19a52f6d 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -528,6 +528,8 @@ namespace rsx virtual void on_decompiler_exit() {} virtual bool on_decompiler_task() { return false; } + virtual void emit_geometry(u32) {} + void run_FIFO(); public: @@ -554,11 +556,11 @@ namespace rsx void read_barrier(u32 memory_address, u32 memory_range); virtual void sync_hint(FIFO_hint hint) {} - gsl::span get_raw_index_array(const std::vector& draw_indexed_clause) const; - gsl::span get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const std::vector& vertex_ranges) const; + gsl::span get_raw_index_array(const draw_clause& draw_indexed_clause) const; + gsl::span get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const draw_clause& draw_array_clause) const; std::vector> - get_vertex_buffers(const rsx::rsx_state& state, const std::vector& vertex_ranges, const u64 consumed_attrib_mask) const; + get_vertex_buffers(const rsx::rsx_state& state, const u64 consumed_attrib_mask) const; std::variant 
get_draw_command(const rsx::rsx_state& state) const; diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index d8c5b8ed01..b74332d652 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -603,7 +603,7 @@ VKGSRender::VKGSRender() : GSRender() std::tie(pipeline_layout, descriptor_layouts) = get_shared_pipeline_layout(*m_device); //Occlusion - m_occlusion_query_pool.create((*m_device), DESCRIPTOR_MAX_DRAW_CALLS); //Enough for 4k draw calls per pass + m_occlusion_query_pool.create((*m_device), OCCLUSION_MAX_POOL_SIZE); for (int n = 0; n < 128; ++n) m_occlusion_query_data[n].driver_handle = n; @@ -619,7 +619,7 @@ VKGSRender::VKGSRender() : GSRender() //VRAM allocation m_attrib_ring_info.create(VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000, "attrib buffer", 0x400000); - m_uniform_buffer_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "uniform buffer"); + m_uniform_buffer_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "uniform buffer"); m_transform_constants_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_TRANSFORM_CONSTANTS_BUFFER_SIZE_M * 0x100000, "transform constants buffer"); m_index_buffer_ring_info.create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, "index buffer"); m_texture_upload_buffer_ring_info.create(VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 32 * 0x100000); @@ -849,11 +849,15 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing) m_flush_requests.post(sync_timestamp == 0ull); has_queue_ref = true; } - else + else if (!vk::is_uninterruptible()) { //Flush primary cb queue to sync pending changes (e.g image transitions!) 
flush_command_queue(); } + else + { + LOG_ERROR(RSX, "Fault in uninterruptible code!"); + } if (sync_timestamp > 0) { @@ -1110,6 +1114,145 @@ void VKGSRender::close_render_pass() render_pass_open = false; } +void VKGSRender::emit_geometry(u32 sub_index) +{ + auto &draw_call = rsx::method_registers.current_draw_clause; + //std::chrono::time_point vertex_start = steady_clock::now(); + + if (sub_index == 0) + { + m_vertex_layout = analyse_inputs_interleaved(); + } + + if (!m_vertex_layout.validate()) + { + // No vertex inputs enabled + draw_call.end(); + return; + } + + if (sub_index > 0 && draw_call.execute_pipeline_dependencies() & rsx::vertex_base_changed) + { + // Rebase vertex bases instead of + for (auto &info : m_vertex_layout.interleaved_blocks) + { + const auto vertex_base_offset = rsx::method_registers.vertex_data_base_offset(); + info.real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(vertex_base_offset, info.base_offset), info.memory_location); + } + } + + const auto old_persistent_buffer = m_persistent_attribute_storage ? m_persistent_attribute_storage->value : null_buffer_view->value; + const auto old_volatile_buffer = m_volatile_attribute_storage ? m_volatile_attribute_storage->value : null_buffer_view->value; + + // Programs data is dependent on vertex state + auto upload_info = upload_vertex_data(); + if (!upload_info.vertex_draw_count) + { + // Malformed vertex setup; abort + return; + } + + //std::chrono::time_point vertex_end = steady_clock::now(); + //m_vertex_upload_time += std::chrono::duration_cast(vertex_end - vertex_start).count(); + + auto persistent_buffer = m_persistent_attribute_storage ? m_persistent_attribute_storage->value : null_buffer_view->value; + auto volatile_buffer = m_volatile_attribute_storage ? 
m_volatile_attribute_storage->value : null_buffer_view->value; + bool update_descriptors = false; + + if (sub_index == 0) + { + // Load program execution environment + load_program_env(upload_info); + update_descriptors = true; + } + else + { + // Update vertex fetch environment + update_vertex_env(upload_info); + + if (persistent_buffer != old_persistent_buffer || volatile_buffer != old_volatile_buffer) + { +/* VkDescriptorSetAllocateInfo alloc_info = {}; + alloc_info.descriptorPool = m_current_frame->descriptor_pool; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = &descriptor_layouts; + alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + + VkDescriptorSet new_descriptor_set; + CHECK_RESULT(vkAllocateDescriptorSets(*m_device, &alloc_info, &new_descriptor_set)); + + VkCopyDescriptorSet copy = {}; + copy.sType = VK_STRUCTURE_TYPE_COPY_DESCRIPTOR_SET; + copy + + m_current_frame->descriptor_set = new_descriptor_set; + m_current_frame->used_descriptors++; + + update_descriptors = true;*/ + } + } + + if (update_descriptors) + { + m_program->bind_uniform(persistent_buffer, "persistent_input_stream", m_current_frame->descriptor_set); + m_program->bind_uniform(volatile_buffer, "volatile_input_stream", m_current_frame->descriptor_set); + + vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &m_current_frame->descriptor_set, 0, nullptr); + } + + //std::chrono::time_point draw_start = steady_clock::now(); + //m_setup_time += std::chrono::duration_cast(draw_start - vertex_end).count(); + + begin_render_pass(); + + if (!upload_info.index_info) + { + if (draw_call.is_single_draw()) + { + vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0); + } + else + { + u32 vertex_offset = 0; + const auto subranges = draw_call.get_subranges(); + for (const auto &range : subranges) + { + vkCmdDraw(*m_current_command_buffer, range.count, 1, vertex_offset, 0); + vertex_offset += 
range.count; + } + } + } + else + { + const VkIndexType index_type = std::get<1>(*upload_info.index_info); + const VkDeviceSize offset = std::get<0>(*upload_info.index_info); + + vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type); + + if (rsx::method_registers.current_draw_clause.is_single_draw()) + { + const u32 index_count = upload_info.vertex_draw_count; + vkCmdDrawIndexed(*m_current_command_buffer, index_count, 1, 0, 0, 0); + } + else + { + u32 vertex_offset = 0; + const auto subranges = draw_call.get_subranges(); + for (const auto &range : subranges) + { + const auto count = get_index_count(draw_call.primitive, range.count); + vkCmdDrawIndexed(*m_current_command_buffer, count, 1, vertex_offset, 0, 0); + vertex_offset += count; + } + } + } + + close_render_pass(); + + //std::chrono::time_point draw_end = steady_clock::now(); + //m_draw_time += std::chrono::duration_cast(draw_end - draw_start).count(); +} + void VKGSRender::end() { if (skip_frame || !framebuffer_status_valid || renderer_unavailable || @@ -1363,31 +1506,6 @@ void VKGSRender::end() std::chrono::time_point program_end = steady_clock::now(); m_setup_time += std::chrono::duration_cast(program_end - program_start).count(); - // Programs data is dependent on vertex state - std::chrono::time_point vertex_start = program_end; - auto upload_info = upload_vertex_data(); - std::chrono::time_point vertex_end = steady_clock::now(); - m_vertex_upload_time += std::chrono::duration_cast(vertex_end - vertex_start).count(); - - if (!upload_info.vertex_draw_count) - { - // Malformed vertex setup; abort - rsx::thread::end(); - return; - } - - // Load program execution environment - program_start = vertex_end; - load_program_env(upload_info); - - VkBufferView persistent_buffer = m_persistent_attribute_storage ? m_persistent_attribute_storage->value : null_buffer_view->value; - VkBufferView volatile_buffer = m_volatile_attribute_storage ? 
m_volatile_attribute_storage->value : null_buffer_view->value; - m_program->bind_uniform(persistent_buffer, "persistent_input_stream", m_current_frame->descriptor_set); - m_program->bind_uniform(volatile_buffer, "volatile_input_stream", m_current_frame->descriptor_set); - - program_end = steady_clock::now(); - m_setup_time += std::chrono::duration_cast(program_end - program_start).count(); - textures_start = program_end; for (int i = 0; i < rsx::limits::fragment_textures_count; ++i) @@ -1453,10 +1571,6 @@ void VKGSRender::end() textures_end = steady_clock::now(); m_textures_upload_time += std::chrono::duration_cast(textures_end - textures_start).count(); - //While vertex upload is an interruptible process, if we made it this far, there's no need to sync anything that occurs past this point - //Only textures are synchronized tightly with the GPU and they have been read back above - vk::enter_uninterruptible(); - u32 occlusion_id = 0; if (m_occlusion_query_active) { @@ -1475,21 +1589,9 @@ void VKGSRender::end() } } - vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline); - vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &m_current_frame->descriptor_set, 0, nullptr); - - update_draw_state(); - - begin_render_pass(); - bool primitive_emulated = false; vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, primitive_emulated); - const bool allow_multidraw = supports_multidraw && !g_cfg.video.disable_FIFO_reordering; - const bool single_draw = (!allow_multidraw || - rsx::method_registers.current_draw_clause.draw_command_ranges.size() <= 1 || - rsx::method_registers.current_draw_clause.is_disjoint_primitive); - if (m_occlusion_query_active && (occlusion_id != UINT32_MAX)) { //Begin query @@ -1500,45 +1602,22 @@ void VKGSRender::end() m_current_command_buffer->flags |= cb_has_occlusion_task; } - if (!upload_info.index_info) - { - if 
(single_draw) - { - vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0); - } - else - { - const auto base_vertex = rsx::method_registers.current_draw_clause.draw_command_ranges.front().first; - for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges) - { - vkCmdDraw(*m_current_command_buffer, range.count, 1, range.first - base_vertex, 0); - } - } - } - else - { - VkIndexType index_type; - const u32 index_count = upload_info.vertex_draw_count; - VkDeviceSize offset; + // While vertex upload is an interruptible process, if we made it this far, there's no need to sync anything that occurs past this point + // Only textures are synchronized tightly with the GPU and they have been read back above + vk::enter_uninterruptible(); - std::tie(offset, index_type) = *upload_info.index_info; - vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type); + vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline); + update_draw_state(); - if (single_draw) - { - vkCmdDrawIndexed(*m_current_command_buffer, index_count, 1, 0, 0, 0); - } - else - { - u32 first_vertex = 0; - for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges) - { - const auto verts = get_index_count(rsx::method_registers.current_draw_clause.primitive, range.count); - vkCmdDrawIndexed(*m_current_command_buffer, verts, 1, first_vertex, 0, 0); - first_vertex += verts; - } - } + u32 sub_index = 0; + rsx::method_registers.current_draw_clause.begin(); + do + { + emit_geometry(sub_index++); } + while (rsx::method_registers.current_draw_clause.next()); + + vk::leave_uninterruptible(); if (m_occlusion_query_active && (occlusion_id != UINT32_MAX)) { @@ -1546,15 +1625,9 @@ void VKGSRender::end() m_occlusion_query_pool.end_query(*m_current_command_buffer, occlusion_id); } - close_render_pass(); - vk::leave_uninterruptible(); - 
m_current_command_buffer->num_draws++; m_rtts.on_write(); - std::chrono::time_point draw_end = steady_clock::now(); - m_draw_time += std::chrono::duration_cast(draw_end - textures_end).count(); - m_draw_calls++; rsx::thread::end(); @@ -2479,29 +2552,38 @@ void VKGSRender::load_program_env(const vk::vertex_upload_info& vertex_info) m_graphics_state &= ~handled_flags; } -static const u32 mr_color_offset[rsx::limits::color_buffers_count] = +void VKGSRender::update_vertex_env(const vk::vertex_upload_info& vertex_info) { - NV4097_SET_SURFACE_COLOR_AOFFSET, - NV4097_SET_SURFACE_COLOR_BOFFSET, - NV4097_SET_SURFACE_COLOR_COFFSET, - NV4097_SET_SURFACE_COLOR_DOFFSET -}; + // Vertex base index = vertex_offset + 132 + // Vertex layout = vertex_offset + 160 -static const u32 mr_color_dma[rsx::limits::color_buffers_count] = -{ - NV4097_SET_CONTEXT_DMA_COLOR_A, - NV4097_SET_CONTEXT_DMA_COLOR_B, - NV4097_SET_CONTEXT_DMA_COLOR_C, - NV4097_SET_CONTEXT_DMA_COLOR_D -}; + std::array vertex_layout; + fill_vertex_layout_state(m_vertex_layout, vertex_info.allocated_vertex_count, vertex_layout.data(), + vertex_info.persistent_window_offset, vertex_info.volatile_window_offset); -static const u32 mr_color_pitch[rsx::limits::color_buffers_count] = -{ - NV4097_SET_SURFACE_PITCH_A, - NV4097_SET_SURFACE_PITCH_B, - NV4097_SET_SURFACE_PITCH_C, - NV4097_SET_SURFACE_PITCH_D -}; + vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_uniform_buffer_ring_info.heap->value, m_vertex_state_buffer_info.offset, 512, + VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); + + vkCmdUpdateBuffer(*m_current_command_buffer, m_uniform_buffer_ring_info.heap->value, m_vertex_state_buffer_info.offset + 132, 4, &vertex_info.vertex_index_base); + + u32 write_offset = m_vertex_state_buffer_info.offset + 160; + s32 *src_ptr = vertex_layout.data(); + + for (const auto& placement : m_vertex_layout.attribute_placement) + { + constexpr u32 data_len = 4 
* sizeof(s32); + if (placement != rsx::attribute_buffer_placement::none) + { + vkCmdUpdateBuffer(*m_current_command_buffer, m_uniform_buffer_ring_info.heap->value, write_offset, data_len, src_ptr); + } + + write_offset += data_len; + src_ptr += 4; + } + + vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_uniform_buffer_ring_info.heap->value, m_vertex_state_buffer_info.offset, 512, + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT); +} void VKGSRender::init_buffers(rsx::framebuffer_creation_context context, bool skip_reading) { @@ -3048,7 +3130,27 @@ void VKGSRender::flip(int buffer) if (!image_to_flip) { - //Read from cell + // Read from cell + const auto range = utils::address_range::start_length(absolute_address, buffer_pitch * buffer_height); + const auto overlap = m_texture_cache.find_texture_from_range(range); + bool flush_queue = false; + + for (const auto & section : overlap) + { + if (section->get_protection() == utils::protection::no) + { + section->copy_texture(false, *m_current_command_buffer, m_swapchain->get_graphics_queue()); + flush_queue = true; + } + } + + if (flush_queue) + { + // Submit for processing to lower hard fault penalty + flush_command_queue(); + } + + m_texture_cache.invalidate_range(range, rsx::invalidation_cause::read, *m_current_command_buffer, m_swapchain->get_graphics_queue()); image_to_flip = m_texture_cache.upload_image_simple(*m_current_command_buffer, absolute_address, buffer_width, buffer_height); } } diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index ac30ccc049..060e22fa99 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -1,4 +1,4 @@ -#pragma once +#pragma once #include "Emu/RSX/GSRender.h" #include "VKHelpers.h" #include "VKTextureCache.h" @@ -403,9 +403,11 @@ private: vk::vertex_upload_info upload_vertex_data(); -public: bool load_program(); - void load_program_env(const 
vk::vertex_upload_info& vertex_info); + void load_program_env(const vk::vertex_upload_info& upload_info); + void update_vertex_env(const vk::vertex_upload_info& upload_info); + +public: void init_buffers(rsx::framebuffer_creation_context context, bool skip_reading = false); void read_buffers(); void write_buffers(); @@ -422,6 +424,7 @@ public: protected: void begin() override; void end() override; + void emit_geometry(u32 sub_index) override; void on_init_thread() override; void on_exit() override; diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index e1526de208..05704b209d 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -32,6 +32,7 @@ #endif #define DESCRIPTOR_MAX_DRAW_CALLS 4096 +#define OCCLUSION_MAX_POOL_SIZE 8192 #define VERTEX_BUFFERS_FIRST_BIND_SLOT 3 #define FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT 2 @@ -652,7 +653,7 @@ namespace vk VkImageTiling tiling, VkImageUsageFlags usage, VkImageCreateFlags image_flags) - : m_device(dev) + : m_device(dev), current_layout(initial_layout) { info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; info.imageType = image_type; @@ -1195,6 +1196,11 @@ namespace vk return commands; } + bool is_recording() const + { + return is_open; + } + void begin() { if (m_submit_fence && is_pending) @@ -2413,8 +2419,8 @@ public: VkQueryPool query_pool = VK_NULL_HANDLE; vk::render_device* owner = nullptr; + std::deque available_slots; std::vector query_active_status; - public: void create(vk::render_device &dev, u32 num_entries) @@ -2428,6 +2434,12 @@ public: owner = &dev; query_active_status.resize(num_entries, false); + available_slots.resize(num_entries); + + for (u32 n = 0; n < num_entries; ++n) + { + available_slots[n] = n; + } } void destroy() @@ -2484,8 +2496,13 @@ public: void reset_query(vk::command_buffer &cmd, u32 index) { - vkCmdResetQueryPool(cmd, query_pool, index, 1); - query_active_status[index] = false; + if (query_active_status[index]) + { + vkCmdResetQueryPool(cmd, 
query_pool, index, 1); + + query_active_status[index] = false; + available_slots.push_back(index); + } } void reset_queries(vk::command_buffer &cmd, std::vector &list) @@ -2505,13 +2522,16 @@ public: u32 find_free_slot() { - for (u32 n = 0; n < query_active_status.size(); n++) + if (available_slots.empty()) { - if (query_active_status[n] == false) - return n; + return -1u; } - return UINT32_MAX; + u32 result = available_slots.front(); + available_slots.pop_front(); + + verify(HERE), !query_active_status[result]; + return result; } }; diff --git a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp index 365894ca76..7a6fb333bc 100644 --- a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp +++ b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp @@ -106,6 +106,12 @@ namespace const u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count(); const u32 min_index = rsx::method_registers.current_draw_clause.min_index(); + //if (rsx::method_registers.current_draw_clause.draw_command_ranges.size() > 1) + //{ + // TODO + //LOG_ERROR(RSX, "REEEEEEEEEEEEEEEEEEEEEEE (prims_emulated=%d)", primitives_emulated); + //} + if (primitives_emulated) { u32 index_count; @@ -165,7 +171,7 @@ namespace command.raw_index_buffer, index_type, rsx::method_registers.current_draw_clause.primitive, rsx::method_registers.restart_index_enabled(), - rsx::method_registers.restart_index(), rsx::method_registers.current_draw_clause.draw_command_ranges, + rsx::method_registers.restart_index(), rsx::method_registers.vertex_data_base_index(), [](auto prim) { return !vk::is_primitive_native(prim); }); if (min_index >= max_index) @@ -227,11 +233,6 @@ namespace vk::vertex_upload_info VKGSRender::upload_vertex_data() { - m_vertex_layout = analyse_inputs_interleaved(); - - if (!m_vertex_layout.validate()) - return {}; - draw_command_visitor visitor(m_index_buffer_ring_info, m_vertex_layout); auto result = std::visit(visitor, get_draw_command(rsx::method_registers)); @@ -258,6 +259,8 @@ 
vk::vertex_upload_info VKGSRender::upload_vertex_data() storage_address = m_vertex_layout.interleaved_blocks[0].real_offset_address + vertex_base; if (auto cached = m_vertex_cache->find_vertex_range(storage_address, VK_FORMAT_R8_UINT, required.first)) { + verify(HERE), cached->local_address == storage_address; + in_cache = true; persistent_range_base = cached->offset_in_heap; } diff --git a/rpcs3/Emu/RSX/rsx_cache.h b/rpcs3/Emu/RSX/rsx_cache.h index 4c2b6e0227..0deac79a72 100644 --- a/rpcs3/Emu/RSX/rsx_cache.h +++ b/rpcs3/Emu/RSX/rsx_cache.h @@ -880,12 +880,42 @@ namespace rsx storage_type* find_vertex_range(uintptr_t local_addr, upload_format fmt, u32 data_length) override { + const auto data_end = local_addr + data_length; + for (auto &v : vertex_ranges[local_addr]) { - if (v.buffer_format == fmt && v.data_length == data_length) + if (v.buffer_format == fmt && v.data_length >= data_length) return &v; } +#if 0 + for (const auto &range : vertex_ranges) + { + if (range.first > local_addr) + continue; + for (const auto &v : range.second) + { + if (v.buffer_format == fmt) + { + const auto entry_end = v.local_address + v.data_length; + if (data_end <= entry_end) + { + const u32 offset = (local_addr - v.local_address); + if (offset % 16) + continue; // TexelBuffer alignment rules + + storage_type e = v; + e.data_length = data_length; + e.local_address = local_addr; + e.offset_in_heap += offset; + + auto& ret = vertex_ranges[local_addr].emplace_back(e); + return &ret; + } + } + } + } +#endif return nullptr; } diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index f2c9d757e7..18c2785cb8 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -423,9 +423,7 @@ namespace rsx { if (arg) { - rsx::method_registers.current_draw_clause.draw_command_ranges.clear(); - rsx::method_registers.current_draw_clause.command = draw_command::none; - rsx::method_registers.current_draw_clause.primitive = to_primitive_type(arg); + 
rsx::method_registers.current_draw_clause.reset(to_primitive_type(arg)); rsxthr->begin(); return; } @@ -453,9 +451,9 @@ namespace rsx else rsx::method_registers.current_draw_clause.is_immediate_draw = false; - if (!(rsx::method_registers.current_draw_clause.draw_command_ranges.empty() && - rsx::method_registers.current_draw_clause.inline_vertex_array.empty())) + if (!rsx::method_registers.current_draw_clause.empty()) { + rsx::method_registers.current_draw_clause.compile(); rsxthr->end(); } } @@ -598,6 +596,30 @@ namespace rsx rsx->m_rtts_dirty = true; } + void set_vertex_base_offset(thread* rsx, u32 reg, u32 arg) + { + if (rsx->in_begin_end) + { + // Revert change to queue later + method_registers.decode(reg, method_registers.register_previous_value); + + // Insert base modifier barrier + method_registers.current_draw_clause.insert_command_barrier(vertex_base_modifier_barrier, arg); + } + } + + void set_index_base_offset(thread* rsx, u32 reg, u32 arg) + { + if (rsx->in_begin_end) + { + // Revert change to queue later + method_registers.decode(reg, method_registers.register_previous_value); + + // Insert base modifier barrier + method_registers.current_draw_clause.insert_command_barrier(index_base_modifier_barrier, arg); + } + } + template struct set_texture_dirty_bit { @@ -1156,6 +1178,13 @@ namespace rsx }; } + namespace fifo + { + void draw_barrier(thread* rsx, u32, u32) + { + } + } + void rsx_state::init() { // Special values set at initialization, these are not set by a context reset @@ -2122,6 +2151,34 @@ namespace rsx return registers[reg] == value; } + u32 draw_clause::execute_pipeline_dependencies() const + { + u32 result = 0; + + for (const auto &barrier : draw_command_barriers[current_range_index]) + { + switch (barrier.type) + { + case primitive_restart_barrier: + break; + case index_base_modifier_barrier: + // Change index base offset + method_registers.decode(NV4097_SET_VERTEX_DATA_BASE_INDEX, barrier.arg); + result |= index_base_changed; + break; + 
case vertex_base_modifier_barrier: + // Change vertex base offset + method_registers.decode(NV4097_SET_VERTEX_DATA_BASE_OFFSET, barrier.arg); + result |= vertex_base_changed; + break; + default: + fmt::throw_exception("Unreachable" HERE); + } + } + + return result; + } + namespace method_detail { template class T, int Index = 0> @@ -2494,6 +2551,7 @@ namespace rsx //Some custom GCM methods methods[GCM_SET_DRIVER_OBJECT] = nullptr; + methods[FIFO::FIFO_DRAW_BARRIER] = nullptr; bind_array(); bind_array(); @@ -2600,6 +2658,8 @@ namespace rsx bind(); bind(); bind(); + bind(); + bind(); //NV308A bind_range(); @@ -2619,6 +2679,8 @@ namespace rsx // custom methods bind(); + // FIFO + bind(); return true; }(); diff --git a/rpcs3/Emu/RSX/rsx_methods.h b/rpcs3/Emu/RSX/rsx_methods.h index 19f6c473f4..8b823eb13d 100644 --- a/rpcs3/Emu/RSX/rsx_methods.h +++ b/rpcs3/Emu/RSX/rsx_methods.h @@ -3,16 +3,21 @@ #include #include #include +#include +#include #include "GCM.h" #include "rsx_decode.h" #include "RSXTexture.h" #include "rsx_vertex_data.h" +#include "rsx_utils.h" #include "Utilities/geometry.h" #include #include +extern u64 get_system_time(); + namespace rsx { enum class draw_command @@ -23,6 +28,39 @@ namespace rsx indexed, }; + enum command_barrier_type : u32 + { + primitive_restart_barrier, + vertex_base_modifier_barrier, + index_base_modifier_barrier + }; + + enum command_execution_flags : u32 + { + vertex_base_changed = (1 << 0), + index_base_changed = (1 << 1) + }; + + struct barrier_t + { + u64 timestamp; + + u32 address; + u32 arg; + u32 flags; + command_barrier_type type; + + bool operator < (const barrier_t& other) const + { + if (address != -1u) + { + return address < other.address; + } + + return timestamp < other.timestamp; + } + }; + struct draw_range_t { u32 command_data_offset = 0; @@ -30,55 +68,273 @@ namespace rsx u32 count = 0; }; - struct draw_clause + class draw_clause { + // Stores the first and count argument from draw/draw indexed parameters between 
begin/end clauses. + simple_array draw_command_ranges; + + // Stores rasterization barriers for primitive types sensitive to adjacency + std::vector> draw_command_barriers; + + // Counter used to parse the commands in order + u32 current_range_index; + + // Location of last execution barrier + u32 last_execution_barrier_index; + + // Helper functions + // Add a new draw command + void append_draw_command(const draw_range_t& range) + { + draw_command_ranges.push_back(range); + draw_command_barriers.push_back({}); + } + + // Insert a new draw command within the others + void insert_draw_command(int index, const draw_range_t& range) + { + auto range_It = draw_command_ranges.begin(); + auto barrier_It = draw_command_barriers.begin(); + + // Because deque::insert fails with initializer list on MSVC + const std::set new_barrier; + + while (index--) + { + ++range_It; + ++barrier_It; + } + + draw_command_ranges.insert(range_It, range); + draw_command_barriers.insert(barrier_It, new_barrier); + + verify(HERE), draw_command_ranges.size() == draw_command_barriers.size(); + } + + public: primitive_type primitive; draw_command command; bool is_immediate_draw; bool is_disjoint_primitive; - std::vector inline_vertex_array; + simple_array inline_vertex_array; + + void insert_command_barrier(command_barrier_type type, u32 arg) + { + verify(HERE), !draw_command_ranges.empty(); + + if (type == primitive_restart_barrier) + { + // Rasterization flow barrier + const auto& last = draw_command_ranges.back(); + const auto address = last.first + last.count; + + const auto command_index = draw_command_ranges.size() - 1; + draw_command_barriers[command_index].insert({ 0, address, arg, 0, type }); + } + else + { + // Execution dependency barrier + append_draw_command({}); + const auto command_index = draw_command_ranges.size() - 1; + + draw_command_barriers[command_index].insert({ get_system_time(), -1u, arg, 0, type }); + last_execution_barrier_index = command_index; + } + } /** - * Stores 
the first and count argument from draw/draw indexed parameters between begin/end clauses. - */ - std::vector draw_command_ranges; + * Optimize commands for rendering + */ + void compile() + { + // TODO + } + + /** + * Insert one command range + */ + + void append(u32 first, u32 count) + { + if (!draw_command_ranges.empty()) + { + auto& last = draw_command_ranges.back(); + + if (last.count == 0) + { + // Special case, usually indicates an execution barrier + last.first = first; + last.count = count; + return; + } + + if (last.first + last.count == first) + { + if (!is_disjoint_primitive) + { + // Insert barrier + insert_command_barrier(primitive_restart_barrier, 0); + } + + last.count += count; + return; + } + + for (int index = last_execution_barrier_index; index < draw_command_ranges.size(); ++index) + { + if (draw_command_ranges[index].first == first && + draw_command_ranges[index].count == count) + { + // Duplicate entry? WTF! + return; + } + + if (draw_command_ranges[index].first > first) + { + insert_draw_command(index, { 0, first, count }); + return; + } + } + } + + append_draw_command({ 0, first, count }); + } /** * Returns how many vertex or index will be consumed by the draw clause. 
*/ u32 get_elements_count() const { - u32 count = 0; - for (const auto &draw : draw_command_ranges) + return get_range().count; + } + + u32 min_index() const + { + return get_range().first; + } + + bool is_single_draw() const + { + if (is_disjoint_primitive) + return true; + + if (draw_command_ranges.empty()) { - count += draw.count; + verify(HERE), !inline_vertex_array.empty(); + return true; } - return count; + verify(HERE), current_range_index != -1u; + for (const auto &barrier : draw_command_barriers[current_range_index]) + { + if (barrier.type == primitive_restart_barrier) + return false; + } + + return true; + } + + bool empty() const + { + return (draw_command_ranges.empty() && inline_vertex_array.empty()); + } + + void reset(rsx::primitive_type type) + { + current_range_index = -1u; + last_execution_barrier_index = 0; + + command = draw_command::none; + primitive = type; + + draw_command_ranges.clear(); + draw_command_barriers.clear(); + inline_vertex_array.clear(); + + switch (primitive) + { + case rsx::primitive_type::line_loop: + case rsx::primitive_type::line_strip: + case rsx::primitive_type::polygon: + case rsx::primitive_type::quad_strip: + case rsx::primitive_type::triangle_fan: + case rsx::primitive_type::triangle_strip: + // Adjacency matters for these types + is_disjoint_primitive = false; + break; + default: + is_disjoint_primitive = true; + } + } + + void begin() + { + current_range_index = 0; + } + + void end() + { + current_range_index = draw_command_ranges.size() - 1; + } + + bool next() + { + current_range_index++; + if (current_range_index >= draw_command_ranges.size()) + { + current_range_index = 0; + return false; + } + + verify(HERE), draw_command_ranges[current_range_index].count != 0; + return true; } /** - * Optimize draw command stream for rendering + * Executes commands required to make the current draw state valid */ - void compile() - { + u32 execute_pipeline_dependencies() const; + const draw_range_t& get_range() const + { + 
verify(HERE), current_range_index < draw_command_ranges.size(); + return draw_command_ranges[current_range_index]; } - /** - * Insert one command range - */ - void append(u32 first, u32 count) + simple_array get_subranges() const { + verify(HERE), !is_single_draw(); - } + const auto range = get_range(); + const auto limit = range.first + range.count; - u32 min_index() - { - LOG_FATAL(RSX, "Unimplemented"); - return 0; + simple_array ret; + u32 previous_barrier = range.first; + u32 vertex_counter = 0; + + for (const auto &barrier : draw_command_barriers[current_range_index]) + { + if (barrier.type != primitive_restart_barrier) + continue; + + if (barrier.address <= range.first) + continue; + + if (barrier.address >= limit) + break; + + const u32 count = barrier.address - previous_barrier; + ret.push_back({ 0, vertex_counter, count }); + previous_barrier = (u32)barrier.address; + vertex_counter += count; + } + + verify(HERE), !ret.empty(), previous_barrier < limit; + ret.push_back({ 0, vertex_counter, limit - previous_barrier }); + + return ret; } }; diff --git a/rpcs3/Emu/RSX/rsx_utils.h b/rpcs3/Emu/RSX/rsx_utils.h index bf5cdb4c34..0515f97c0e 100644 --- a/rpcs3/Emu/RSX/rsx_utils.h +++ b/rpcs3/Emu/RSX/rsx_utils.h @@ -663,4 +663,237 @@ namespace rsx m_data.store(0); } }; + + template + struct simple_array + { + public: + using iterator = Ty * ; + using const_iterator = Ty * const; + + private: + u32 _capacity = 0; + u32 _size = 0; + Ty* _data = nullptr; + + inline u32 offset(const_iterator pos) + { + return (_data) ? 
(pos - _data) : 0; + } + + public: + simple_array() {} + + simple_array(u32 initial_size, const Ty val = {}) + { + reserve(initial_size); + _size = initial_size; + + for (int n = 0; n < initial_size; ++n) + { + _data[n] = val; + } + } + + simple_array(const std::initializer_list& args) + { + reserve(args.size()); + + for (const auto& arg : args) + { + push_back(arg); + } + } + + ~simple_array() + { + if (_data) + { + free(_data); + _data = nullptr; + _size = _capacity = 0; + } + } + + void swap(simple_array& other) noexcept + { + std::swap(_capacity, other._capacity); + std::swap(_size, other._size); + std::swap(_data, other._data); + } + + void reserve(u32 size) + { + if (_capacity > size) + return; + + auto old_data = _data; + auto old_size = _size; + + _data = (Ty*)malloc(sizeof(Ty) * size); + _capacity = size; + + if (old_data) + { + memcpy(_data, old_data, sizeof(Ty) * old_size); + free(old_data); + } + } + + void push_back(const Ty& val) + { + if (_size >= _capacity) + { + reserve(_capacity + 16); + } + + _data[_size++] = val; + } + + void push_back(Ty&& val) + { + if (_size >= _capacity) + { + reserve(_capacity + 16); + } + + _data[_size++] = val; + } + + iterator insert(iterator pos, const Ty& val) + { + verify(HERE), pos >= _data; + const auto _loc = offset(pos); + + if (_size >= _capacity) + { + reserve(_capacity + 16); + pos = _data + _loc; + } + + if (_loc >= _size) + { + _data[_size++] = val; + return pos; + } + + verify(HERE), _loc < _size; + + const u32 remaining = (_size - _loc); + memmove(pos + 1, pos, remaining * sizeof(Ty)); + + *pos = val; + _size++; + + return pos; + } + + iterator insert(iterator pos, Ty&& val) + { + verify(HERE), pos >= _data; + const auto _loc = offset(pos); + + if (_size >= _capacity) + { + reserve(_capacity + 16); + pos = _data + _loc; + } + + if (_loc >= _size) + { + _data[_size++] = val; + return pos; + } + + verify(HERE), _loc < _size; + + const u32 remaining = (_size - _loc); + memmove(pos + 1, pos, remaining * 
sizeof(Ty)); + + *pos = val; + _size++; + + return pos; + } + + void clear() + { + _size = 0; + } + + bool empty() const + { + return _size == 0; + } + + u32 size() const + { + return _size; + } + + u32 capacity() const + { + return _capacity; + } + + Ty& operator[] (u32 index) + { + return _data[index]; + } + + const Ty& operator[] (u32 index) const + { + return _data[index]; + } + + Ty* data() + { + return _data; + } + + const Ty* data() const + { + return _data; + } + + Ty& back() + { + return _data[_size - 1]; + } + + const Ty& back() const + { + return _data[_size - 1]; + } + + Ty& front() + { + return _data[0]; + } + + const Ty& front() const + { + return _data[0]; + } + + iterator begin() + { + return _data; + } + + iterator end() + { + return _data ? _data + _size : nullptr; + } + + const_iterator begin() const + { + return _data; + } + + const_iterator end() const + { + return _data ? _data + _size : nullptr; + } + }; } diff --git a/rpcs3/Emu/RSX/rsx_vertex_data.h b/rpcs3/Emu/RSX/rsx_vertex_data.h index 252020b1d9..4ed65b94da 100644 --- a/rpcs3/Emu/RSX/rsx_vertex_data.h +++ b/rpcs3/Emu/RSX/rsx_vertex_data.h @@ -1,4 +1,4 @@ -#pragma once +#pragma once #include "GCM.h" #include "Utilities/types.h" @@ -64,10 +64,13 @@ struct push_buffer_vertex_info void clear() { - data.resize(0); - attribute_mask = ~0; - vertex_count = 0; - size = 0; + if (size) + { + data.clear(); + attribute_mask = ~0; + vertex_count = 0; + size = 0; + } } u8 get_vertex_size_in_dwords(vertex_base_type type)