From 3b47e433803d3acbd2876f78c913ba468930f76a Mon Sep 17 00:00:00 2001
From: kd-11 <karokidii@gmail.com>
Date: Thu, 19 Jul 2018 09:08:20 +0300
Subject: [PATCH] rsx: Synchronization rewritten

- Do not do a full sync on a texture read barrier
- Avoid calling zcull sync in FIFO spin wait
- Do not flush memory to cache from the renderer side; this method is
  now obsolete

---
 rpcs3/Emu/RSX/GL/GLGSRender.cpp      | 21 ++++-----
 rpcs3/Emu/RSX/GL/GLGSRender.h        |  3 --
 rpcs3/Emu/RSX/GL/GLRenderTargets.cpp | 69 ++++++----------------------
 rpcs3/Emu/RSX/RSXThread.cpp          |  2 +-
 rpcs3/Emu/RSX/VK/VKGSRender.cpp      | 57 ++---------------------
 rpcs3/Emu/RSX/VK/VKGSRender.h        |  3 --
 rpcs3/Emu/RSX/rsx_methods.cpp        |  5 +-
 7 files changed, 32 insertions(+), 128 deletions(-)

diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index 150c2c490c..18c542931f 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -593,7 +593,6 @@ void GLGSRender::end()
 	m_draw_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(draw_end - draw_start).count();
 	m_draw_calls++;
 
-	synchronize_buffers();
 	rsx::thread::end();
 }
 
@@ -1100,7 +1099,6 @@ bool GLGSRender::do_method(u32 cmd, u32 arg)
 			if (arg & 0x3) ctx |= rsx::framebuffer_creation_context::context_clear_depth;
 
 			init_buffers((rsx::framebuffer_creation_context)ctx, true);
-			synchronize_buffers();
 			clear_surface(arg);
 		}
 
@@ -1113,10 +1111,16 @@ bool GLGSRender::do_method(u32 cmd, u32 arg)
 		return true;
 	}
 	case NV4097_TEXTURE_READ_SEMAPHORE_RELEASE:
-	case NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE:
-		flush_draw_buffers = true;
+	{
+		// Texture barrier, seemingly not very useful
 		return true;
 	}
+	case NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE:
+	{
+		// Nothing to do: renderer-side flushing of draw buffers on semaphore release is obsolete
+		return true;
+	}
+	}
 
 	return false;
 }
@@ -1695,15 +1699,6 @@ work_item& GLGSRender::post_flush_request(u32 address, gl::texture_cache::thrash
 	return result;
 }
 
-void GLGSRender::synchronize_buffers()
-{
-	if (flush_draw_buffers)
-	{
-		write_buffers();
-		flush_draw_buffers = false;
-	}
-}
-
 bool GLGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate)
 {
 	if (m_gl_texture_cache.blit(src, dst, interpolate, m_rtts))
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h
index c8563adfba..ef9928e856 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.h
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.h
@@ -325,7 +325,6 @@ private:
 	shared_mutex queue_guard;
 	std::list<work_item> work_queue;
 
-	bool flush_draw_buffers = false;
 	std::thread::id m_thread_id;
 
 	GLProgramBuffer m_prog_buffer;
@@ -369,10 +368,8 @@ private:
 
 public:
 	void read_buffers();
-	void write_buffers();
 	void set_viewport();
 
-	void synchronize_buffers();
 	work_item& post_flush_request(u32 address, gl::texture_cache::thrashed_set& flush_data);
 
 	bool scaled_image_from_memory(rsx::blit_src_info& src_info, rsx::blit_dst_info& dst_info, bool interpolate) override;
diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp
index e9662f7bf4..8b76297108 100644
--- a/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp
+++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.cpp
@@ -179,9 +179,6 @@ void GLGSRender::init_buffers(rsx::framebuffer_creation_context context, bool sk
 		return;
 	}
 
-	//We are about to change buffers, flush any pending requests for the old buffers
-	synchronize_buffers();
-
 	m_rtts_dirty = false;
 	zcull_surface_active = false;
 
@@ -475,28 +472,28 @@ void GLGSRender::init_buffers(rsx::framebuffer_creation_context context, bool sk
 	case rsx::surface_target::none: break;
 
 	case rsx::surface_target::surface_a:
-		__glcheck draw_fbo.draw_buffer(draw_fbo.color[0]);
-		__glcheck draw_fbo.read_buffer(draw_fbo.color[0]);
+		draw_fbo.draw_buffer(draw_fbo.color[0]);
+		draw_fbo.read_buffer(draw_fbo.color[0]);
 		break;
 
 	case rsx::surface_target::surface_b:
-		__glcheck draw_fbo.draw_buffer(draw_fbo.color[1]);
-		__glcheck draw_fbo.read_buffer(draw_fbo.color[1]);
+		draw_fbo.draw_buffer(draw_fbo.color[1]);
+		draw_fbo.read_buffer(draw_fbo.color[1]);
 		break;
 
 	case rsx::surface_target::surfaces_a_b:
-		__glcheck draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1] });
-		__glcheck draw_fbo.read_buffer(draw_fbo.color[0]);
+		draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1] });
+		draw_fbo.read_buffer(draw_fbo.color[0]);
 		break;
 
 	case rsx::surface_target::surfaces_a_b_c:
-		__glcheck draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1], draw_fbo.color[2] });
-		__glcheck draw_fbo.read_buffer(draw_fbo.color[0]);
+		draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1], draw_fbo.color[2] });
+		draw_fbo.read_buffer(draw_fbo.color[0]);
 		break;
 
 	case rsx::surface_target::surfaces_a_b_c_d:
-		__glcheck draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1], draw_fbo.color[2], draw_fbo.color[3] });
-		__glcheck draw_fbo.read_buffer(draw_fbo.color[0]);
+		draw_fbo.draw_buffers({ draw_fbo.color[0], draw_fbo.color[1], draw_fbo.color[2], draw_fbo.color[3] });
+		draw_fbo.read_buffer(draw_fbo.color[0]);
 		break;
 	}
 
@@ -590,7 +587,7 @@ void GLGSRender::read_buffers()
 				{
 					if (!color_buffer.tile)
 					{
-						__glcheck std::get<1>(m_rtts.m_bound_render_targets[i])->copy_from(color_buffer.ptr, color_format.format, color_format.type);
+						std::get<1>(m_rtts.m_bound_render_targets[i])->copy_from(color_buffer.ptr, color_format.format, color_format.type);
 					}
 					else
 					{
@@ -599,7 +596,7 @@ void GLGSRender::read_buffers()
 						std::unique_ptr<u8[]> buffer(new u8[pitch * height]);
 						color_buffer.read(buffer.get(), width, height, pitch);
 
-						__glcheck std::get<1>(m_rtts.m_bound_render_targets[i])->copy_from(buffer.get(), color_format.format, color_format.type);
+						std::get<1>(m_rtts.m_bound_render_targets[i])->copy_from(buffer.get(), color_format.format, color_format.type);
 					}
 				}
 			}
@@ -654,8 +651,8 @@ void GLGSRender::read_buffers()
 		int pixel_size    = rsx::internals::get_pixel_size(rsx::method_registers.surface_depth_fmt());
 		gl::buffer pbo_depth;
 
-		__glcheck pbo_depth.create(width * height * pixel_size);
-		__glcheck pbo_depth.map([&](GLubyte* pixels)
+		pbo_depth.create(width * height * pixel_size);
+		pbo_depth.map([&](GLubyte* pixels)
 		{
 			u32 depth_address = rsx::get_address(rsx::method_registers.surface_z_offset(), rsx::method_registers.surface_z_dma());
 
@@ -679,42 +676,6 @@ void GLGSRender::read_buffers()
 			}
 		}, gl::buffer::access::write);
 
-		__glcheck std::get<1>(m_rtts.m_bound_depth_stencil)->copy_from(pbo_depth, depth_format.format, depth_format.type);
-	}
-}
-
-void GLGSRender::write_buffers()
-{
-	if (!draw_fbo)
-		return;
-
-	if (g_cfg.video.write_color_buffers)
-	{
-		auto write_color_buffers = [&](int index, int count)
-		{
-			for (int i = index; i < index + count; ++i)
-			{
-				if (m_surface_info[i].pitch == 0)
-					continue;
-
-				/**Even tiles are loaded as whole textures during read_buffers from testing.
-				* Need further evaluation to determine correct behavior. Separate paths for both show no difference,
-				* but using the GPU to perform the caching is many times faster.
-				*/
-
-				const u32 range = m_surface_info[i].pitch * m_surface_info[i].height;
-				m_gl_texture_cache.flush_memory_to_cache(m_surface_info[i].address, range, true, 0xFF);
-			}
-		};
-
-		write_color_buffers(0, 4);
-	}
-
-	if (g_cfg.video.write_depth_buffer)
-	{
-		if (m_depth_surface_info.pitch == 0) return;
-
-		const u32 range = m_depth_surface_info.pitch * m_depth_surface_info.height;
-		m_gl_texture_cache.flush_memory_to_cache(m_depth_surface_info.address, range, true, 0xFF);
+		std::get<1>(m_rtts.m_bound_depth_stencil)->copy_from(pbo_depth, depth_format.format, depth_format.type);
 	}
 }
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index 5331f3fbdb..cd0952dd6c 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -587,7 +587,7 @@ namespace rsx
 					}
 					else if (zcull_ctrl->has_pending())
 					{
-						zcull_ctrl->sync(this);
+						// Intentionally empty: zcull_ctrl->sync(this) is no longer called during the FIFO spin wait
 					}
 					else
 					{
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index 4e867408ed..6ad1445df7 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -1491,7 +1491,6 @@ void VKGSRender::end()
 	std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
 	m_draw_time += std::chrono::duration_cast<std::chrono::microseconds>(draw_end - textures_end).count();
 
-	copy_render_targets_to_dma_location();
 	m_draw_calls++;
 
 	rsx::thread::end();
@@ -1638,8 +1637,6 @@ void VKGSRender::clear_surface(u32 mask)
 
 	if (!framebuffer_status_valid) return;
 
-	copy_render_targets_to_dma_location();
-
 	float depth_clear = 1.f;
 	u32   stencil_clear = 0;
 	u32   depth_stencil_mask = 0;
@@ -1793,53 +1790,6 @@ void VKGSRender::clear_surface(u32 mask)
 	}
 }
 
-void VKGSRender::sync_at_semaphore_release()
-{
-	m_flush_draw_buffers = true;
-}
-
-void VKGSRender::copy_render_targets_to_dma_location()
-{
-	if (!m_flush_draw_buffers)
-		return;
-
-	if (!g_cfg.video.write_color_buffers && !g_cfg.video.write_depth_buffer)
-		return;
-
-	//TODO: Make this asynchronous. Should be similar to a glFlush() but in this case its similar to glFinish
-	//This is due to all the hard waits for fences
-	//TODO: Use a command buffer array to allow explicit draw command tracking
-
-	vk::enter_uninterruptible();
-
-	if (g_cfg.video.write_color_buffers)
-	{
-		for (u8 index = 0; index < rsx::limits::color_buffers_count; index++)
-		{
-			if (!m_surface_info[index].pitch)
-				continue;
-
-			m_texture_cache.flush_memory_to_cache(m_surface_info[index].address, m_surface_info[index].pitch * m_surface_info[index].height, true, 0xFF,
-					*m_current_command_buffer, m_swapchain->get_graphics_queue());
-		}
-	}
-
-	if (g_cfg.video.write_depth_buffer)
-	{
-		if (m_depth_surface_info.pitch)
-		{
-			m_texture_cache.flush_memory_to_cache(m_depth_surface_info.address, m_depth_surface_info.pitch * m_depth_surface_info.height, true, 0xFF,
-				*m_current_command_buffer, m_swapchain->get_graphics_queue());
-		}
-	}
-
-	vk::leave_uninterruptible();
-
-	flush_command_queue();
-
-	m_flush_draw_buffers = false;
-}
-
 void VKGSRender::flush_command_queue(bool hard_sync)
 {
 	close_and_submit_command_buffer({}, m_current_command_buffer->submit_fence);
@@ -2192,9 +2142,11 @@ bool VKGSRender::do_method(u32 cmd, u32 arg)
 		clear_surface(arg);
 		return true;
 	case NV4097_TEXTURE_READ_SEMAPHORE_RELEASE:
+		// Texture barrier, seemingly not very useful
+		return true;
 	case NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE:
-		sync_at_semaphore_release();
-		return false; //call rsx::thread method implementation
+		// Nothing to do: the sync_at_semaphore_release() flush path was removed as obsolete
+		return true;
 	default:
 		return false;
 	}
@@ -2541,7 +2493,6 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
 	if (m_draw_fbo && !m_rtts_dirty)
 		return;
 
-	copy_render_targets_to_dma_location();
 	m_rtts_dirty = false;
 
 	u32 clip_width = rsx::method_registers.surface_clip_width();
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index 7f0658ec81..7e372703a6 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -354,7 +354,6 @@ private:
 	s64 m_flip_time = 0;
 
 	u8 m_draw_buffers_count = 0;
-	bool m_flush_draw_buffers = false;
 	
 	shared_mutex m_flush_queue_mutex;
 	flush_request_task m_flush_requests;
@@ -380,9 +379,7 @@ private:
 	void clear_surface(u32 mask);
 	void close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence, VkPipelineStageFlags pipeline_stage_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
 	void open_command_buffer();
-	void sync_at_semaphore_release();
 	void prepare_rtts(rsx::framebuffer_creation_context context);
-	void copy_render_targets_to_dma_location();
 
 	void flush_command_queue(bool hard_sync = false);
 	void queue_swap_request();
diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp
index 45c6128d46..3311fec6ed 100644
--- a/rpcs3/Emu/RSX/rsx_methods.cpp
+++ b/rpcs3/Emu/RSX/rsx_methods.cpp
@@ -160,6 +160,8 @@ namespace rsx
 
 		void texture_read_semaphore_release(thread* rsx, u32 _reg, u32 arg)
 		{
+			// Pipeline barrier seems to be equivalent to a SHADER_READ stage barrier
+
 			const u32 index = method_registers.semaphore_offset_4097() >> 4;
 			// lle-gcm likes to inject system reserved semaphores, presumably for system/vsh usage
 			// Avoid calling render to avoid any havoc(flickering) they may cause from invalid flush/write
@@ -169,7 +171,6 @@ namespace rsx
 				//
 			}
 
-			rsx->sync();
 			auto& sema = vm::_ref<RsxReports>(rsx->label_addr);
 			sema.semaphore[index].val = arg;
 			sema.semaphore[index].pad = 0;
@@ -178,6 +179,8 @@ namespace rsx
 
 		void back_end_write_semaphore_release(thread* rsx, u32 _reg, u32 arg)
 		{
+			// Full pipeline barrier
+
 			const u32 index = method_registers.semaphore_offset_4097() >> 4;
 			if (index > 63 && !rsx->do_method(NV4097_BACK_END_WRITE_SEMAPHORE_RELEASE, arg))
 			{