From 3b47e433803d3acbd2876f78c913ba468930f76a Mon Sep 17 00:00:00 2001 From: kd-11 Date: Thu, 19 Jul 2018 09:08:20 +0300 Subject: [PATCH] rsx: Synchronization rewritten - Do not do a full sync on a texture read barrier - Avoid calling zcull sync in FIFO spin wait - Do not flush memory to cache from the renderer side; this method is now obsolete --- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 21 ++++----- rpcs3/Emu/RSX/GL/GLGSRender.h | 3 -- rpcs3/Emu/RSX/GL/GLRenderTargets.cpp | 69 ++++++---------------------- rpcs3/Emu/RSX/RSXThread.cpp | 2 +- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 57 ++--------------------- rpcs3/Emu/RSX/VK/VKGSRender.h | 3 -- rpcs3/Emu/RSX/rsx_methods.cpp | 5 +- 7 files changed, 32 insertions(+), 128 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 150c2c490c..18c542931f 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -593,7 +593,6 @@ void GLGSRender::end() m_draw_time += (u32)std::chrono::duration_cast(draw_end - draw_start).count(); m_draw_calls++; - synchronize_buffers(); rsx::thread::end(); } @@ -1100,7 +1099,6 @@ bool GLGSRender::do_method(u32 cmd, u32 arg) if (arg & 0x3) ctx |= rsx::framebuffer_creation_context::context_clear_depth; init_buffers((rsx::framebuffer_creation_context)ctx, true); - synchronize_buffers(); clear_surface(arg); } @@ -1113,10 +1111,16 @@ bool GLGSRender::do_method(u32 cmd, u32 arg) return true; } case NV4097_TEXTURE_READ_SEMAPHORE_RELEASE: - case NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE: - flush_draw_buffers = true; + { + // Texture barrier, seemingly not very useful return true; } + case NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE: + { + //flush_draw_buffers = true; + return true; + } + } return false; } @@ -1695,15 +1699,6 @@ work_item& GLGSRender::post_flush_request(u32 address, gl::texture_cache::thrash return result; } -void GLGSRender::synchronize_buffers() -{ - if (flush_draw_buffers) - { - write_buffers(); - flush_draw_buffers = false; - } -} - bool GLGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) { if (m_gl_texture_cache.blit(src, dst, interpolate, m_rtts)) diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h index c8563adfba..ef9928e856 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.h +++ b/rpcs3/Emu/RSX/GL/GLGSRender.h @@ -325,7 +325,6 @@ private: shared_mutex queue_guard; std::list work_queue; - bool flush_draw_buffers = false; std::thread::id m_thread_id; GLProgramBuffer m_prog_buffer; @@ -369,10 +368,8 @@ private: public: void read_buffers(); - void write_buffers(); void set_viewport(); - void synchronize_buffers(); work_item& post_flush_request(u32 address, gl::texture_cache::thrashed_set& flush_data); bool scaled_image_from_memory(rsx::blit_src_info& src_info, rsx::blit_dst_info& dst_info, bool interpolate) override; diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp index e9662f7bf4..8b76297108 100644 --- a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp +++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp @@ -179,9 +179,6 @@ void GLGSRender::init_buffers(rsx::framebuffer_creation_context context, bool sk return; } - //We are about to change buffers, flush any pending requests for the old buffers - synchronize_buffers(); - m_rtts_dirty = false; zcull_surface_active = false; @@ -475,28 +472,28 @@ void GLGSRender::init_buffers(rsx::framebuffer_creation_context context, bool sk case rsx::surface_target::none: break; case rsx::surface_target::surface_a: - __glcheck draw_fbo.draw_buffer(draw_fbo.color[0]); - __glcheck draw_fbo.read_buffer(draw_fbo.color[0]); + draw_fbo.draw_buffer(draw_fbo.color[0]); + draw_fbo.read_buffer(draw_fbo.color[0]); break; case rsx::surface_target::surface_b: - __glcheck draw_fbo.draw_buffer(draw_fbo.color[1]); - __glcheck draw_fbo.read_buffer(draw_fbo.color[1]); + draw_fbo.draw_buffer(draw_fbo.color[1]); + draw_fbo.read_buffer(draw_fbo.color[1]); break; case rsx::surface_target::surfaces_a_b: - __glcheck draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1] }); - __glcheck draw_fbo.read_buffer(draw_fbo.color[0]); + draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1] }); + draw_fbo.read_buffer(draw_fbo.color[0]); break; case rsx::surface_target::surfaces_a_b_c: - __glcheck draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1], draw_fbo.color[2] }); - __glcheck draw_fbo.read_buffer(draw_fbo.color[0]); + draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1], draw_fbo.color[2] }); + draw_fbo.read_buffer(draw_fbo.color[0]); break; case rsx::surface_target::surfaces_a_b_c_d: - __glcheck draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1], draw_fbo.color[2], draw_fbo.color[3] }); - __glcheck draw_fbo.read_buffer(draw_fbo.color[0]); + draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1], draw_fbo.color[2], draw_fbo.color[3] }); + draw_fbo.read_buffer(draw_fbo.color[0]); break; } @@ -590,7 +587,7 @@ void GLGSRender::read_buffers() { if (!color_buffer.tile) { - __glcheck std::get<1>(m_rtts.m_bound_render_targets[i])->copy_from(color_buffer.ptr, color_format.format, color_format.type); + std::get<1>(m_rtts.m_bound_render_targets[i])->copy_from(color_buffer.ptr, color_format.format, color_format.type); } else { @@ -599,7 +596,7 @@ void GLGSRender::read_buffers() std::unique_ptr buffer(new u8[pitch * height]); color_buffer.read(buffer.get(), width, height, pitch); - __glcheck std::get<1>(m_rtts.m_bound_render_targets[i])->copy_from(buffer.get(), color_format.format, color_format.type); + std::get<1>(m_rtts.m_bound_render_targets[i])->copy_from(buffer.get(), color_format.format, color_format.type); } } } @@ -654,8 +651,8 @@ void GLGSRender::read_buffers() int pixel_size = rsx::internals::get_pixel_size(rsx::method_registers.surface_depth_fmt()); gl::buffer pbo_depth; - __glcheck pbo_depth.create(width * height * pixel_size); - __glcheck pbo_depth.map([&](GLubyte* pixels) + pbo_depth.create(width * height * pixel_size); + pbo_depth.map([&](GLubyte* pixels) { u32 depth_address = rsx::get_address(rsx::method_registers.surface_z_offset(), rsx::method_registers.surface_z_dma()); @@ -679,42 +676,6 @@ void GLGSRender::read_buffers() } }, gl::buffer::access::write); - __glcheck std::get<1>(m_rtts.m_bound_depth_stencil)->copy_from(pbo_depth, depth_format.format, depth_format.type); - } -} - -void GLGSRender::write_buffers() -{ - if (!draw_fbo) - return; - - if (g_cfg.video.write_color_buffers) - { - auto write_color_buffers = [&](int index, int count) - { - for (int i = index; i < index + count; ++i) - { - if (m_surface_info[i].pitch == 0) - continue; - - /**Even tiles are loaded as whole textures during read_buffers from testing. - * Need further evaluation to determine correct behavior. Separate paths for both show no difference, - * but using the GPU to perform the caching is many times faster. - */ - - const u32 range = m_surface_info[i].pitch * m_surface_info[i].height; - m_gl_texture_cache.flush_memory_to_cache(m_surface_info[i].address, range, true, 0xFF); - } - }; - - write_color_buffers(0, 4); - } - - if (g_cfg.video.write_depth_buffer) - { - if (m_depth_surface_info.pitch == 0) return; - - const u32 range = m_depth_surface_info.pitch * m_depth_surface_info.height; - m_gl_texture_cache.flush_memory_to_cache(m_depth_surface_info.address, range, true, 0xFF); + std::get<1>(m_rtts.m_bound_depth_stencil)->copy_from(pbo_depth, depth_format.format, depth_format.type); } } diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 5331f3fbdb..cd0952dd6c 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -587,7 +587,7 @@ namespace rsx } else if (zcull_ctrl->has_pending()) { - zcull_ctrl->sync(this); + //zcull_ctrl->sync(this); } else { diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 4e867408ed..6ad1445df7 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -1491,7 +1491,6 @@ void VKGSRender::end() std::chrono::time_point draw_end = steady_clock::now(); m_draw_time += std::chrono::duration_cast(draw_end - textures_end).count(); - copy_render_targets_to_dma_location(); m_draw_calls++; rsx::thread::end(); @@ -1638,8 +1637,6 @@ void VKGSRender::clear_surface(u32 mask) if (!framebuffer_status_valid) return; - copy_render_targets_to_dma_location(); - float depth_clear = 1.f; u32 stencil_clear = 0; u32 depth_stencil_mask = 0; @@ -1793,53 +1790,6 @@ void VKGSRender::clear_surface(u32 mask) } } -void VKGSRender::sync_at_semaphore_release() -{ - m_flush_draw_buffers = true; -} - -void VKGSRender::copy_render_targets_to_dma_location() -{ - if (!m_flush_draw_buffers) - return; - - if (!g_cfg.video.write_color_buffers && !g_cfg.video.write_depth_buffer) - return; - - //TODO: Make this asynchronous. Should be similar to a glFlush() but in this case its similar to glFinish - //This is due to all the hard waits for fences - //TODO: Use a command buffer array to allow explicit draw command tracking - - vk::enter_uninterruptible(); - - if (g_cfg.video.write_color_buffers) - { - for (u8 index = 0; index < rsx::limits::color_buffers_count; index++) - { - if (!m_surface_info[index].pitch) - continue; - - m_texture_cache.flush_memory_to_cache(m_surface_info[index].address, m_surface_info[index].pitch * m_surface_info[index].height, true, 0xFF, - *m_current_command_buffer, m_swapchain->get_graphics_queue()); - } - } - - if (g_cfg.video.write_depth_buffer) - { - if (m_depth_surface_info.pitch) - { - m_texture_cache.flush_memory_to_cache(m_depth_surface_info.address, m_depth_surface_info.pitch * m_depth_surface_info.height, true, 0xFF, - *m_current_command_buffer, m_swapchain->get_graphics_queue()); - } - } - - vk::leave_uninterruptible(); - - flush_command_queue(); - - m_flush_draw_buffers = false; -} - void VKGSRender::flush_command_queue(bool hard_sync) { close_and_submit_command_buffer({}, m_current_command_buffer->submit_fence); @@ -2192,9 +2142,11 @@ bool VKGSRender::do_method(u32 cmd, u32 arg) clear_surface(arg); return true; case NV4097_TEXTURE_READ_SEMAPHORE_RELEASE: + // Texture barrier, seemingly not very useful + return true; case NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE: - sync_at_semaphore_release(); - return false; //call rsx::thread method implementation + //sync_at_semaphore_release(); + return true; default: return false; } @@ -2541,7 +2493,6 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context) if (m_draw_fbo && !m_rtts_dirty) return; - copy_render_targets_to_dma_location(); m_rtts_dirty = false; u32 clip_width = rsx::method_registers.surface_clip_width(); diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index 7f0658ec81..7e372703a6 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -354,7 +354,6 @@ private: s64 m_flip_time = 0; u8 m_draw_buffers_count = 0; - bool m_flush_draw_buffers = false; shared_mutex m_flush_queue_mutex; flush_request_task m_flush_requests; @@ -380,9 +379,7 @@ private: void clear_surface(u32 mask); void close_and_submit_command_buffer(const std::vector &semaphores, VkFence fence, VkPipelineStageFlags pipeline_stage_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT); void open_command_buffer(); - void sync_at_semaphore_release(); void prepare_rtts(rsx::framebuffer_creation_context context); - void copy_render_targets_to_dma_location(); void flush_command_queue(bool hard_sync = false); void queue_swap_request(); diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index 45c6128d46..3311fec6ed 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -160,6 +160,8 @@ namespace rsx void texture_read_semaphore_release(thread* rsx, u32 _reg, u32 arg) { + // Pipeline barrier seems to be equivalent to a SHADER_READ stage barrier + const u32 index = method_registers.semaphore_offset_4097() >> 4; // lle-gcm likes to inject system reserved semaphores, presumably for system/vsh usage // Avoid calling render to avoid any havoc(flickering) they may cause from invalid flush/write @@ -169,7 +171,6 @@ namespace rsx // } - rsx->sync(); auto& sema = vm::_ref(rsx->label_addr); sema.semaphore[index].val = arg; sema.semaphore[index].pad = 0; @@ -178,6 +179,8 @@ namespace rsx void back_end_write_semaphore_release(thread* rsx, u32 _reg, u32 arg) { + // Full pipeline barrier + const u32 index = method_registers.semaphore_offset_4097() >> 4; if (index > 63 && !rsx->do_method(NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE, arg)) {