diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index 4a2cce8500..dca0e5c7fa 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -590,6 +590,7 @@ namespace rsx
 		bool supports_hw_conditional_render;    // Conditional render
 		bool supports_passthrough_dma;          // DMA passthrough
 		bool supports_asynchronous_compute;     // Async compute
+		bool supports_host_gpu_labels;          // Advanced host synchronization
 	};

 	struct sampled_image_descriptor_base;
@@ -859,6 +860,7 @@ namespace rsx
 		void sync();
 		flags32_t read_barrier(u32 memory_address, u32 memory_range, bool unconditional);
 		virtual void sync_hint(FIFO_hint hint, void* args);
+		virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }

 		std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index 8a7a29b843..f9cb10adf2 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -550,11 +550,16 @@ VKGSRender::VKGSRender() : GSRender()
 	// Relaxed query synchronization
 	backend_config.supports_hw_conditional_render = !!g_cfg.video.relaxed_zcull_sync;

+	// Passthrough DMA
+	backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
+
+	// Host sync
+	backend_config.supports_host_gpu_labels = !!g_cfg.video.host_label_synchronization;
+
 	// Async compute and related operations
 	if (g_cfg.video.vk.asynchronous_texture_streaming)
 	{
-		// Optimistic, enable async compute and passthrough DMA
-		backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
+		// Optimistic, enable async compute
 		backend_config.supports_asynchronous_compute = true;

 		if (m_device->get_graphics_queue() == m_device->get_transfer_queue())
@@ -562,10 +567,14 @@ VKGSRender::VKGSRender() : GSRender()
 			rsx_log.error("Cannot run graphics and async transfer in the same queue. Async uploads are disabled. This is a limitation of your GPU");
 			backend_config.supports_asynchronous_compute = false;
 		}
+	}

-		switch (vk::get_driver_vendor())
+	// Sanity checks
+	switch (vk::get_driver_vendor())
+	{
+	case vk::driver_vendor::NVIDIA:
+		if (backend_config.supports_asynchronous_compute)
 		{
-		case vk::driver_vendor::NVIDIA:
 			if (auto chip_family = vk::get_chip_family();
 				chip_family == vk::chip_class::NV_kepler || chip_family == vk::chip_class::NV_maxwell)
 			{
@@ -574,35 +583,47 @@ VKGSRender::VKGSRender() : GSRender()
 				rsx_log.notice("Forcing safe async compute for NVIDIA device to avoid crashing.");
 				g_cfg.video.vk.asynchronous_scheduler.set(vk_gpu_scheduler_mode::safe);
 			}
-			break;
+		}
+		break;
 #if !defined(_WIN32)
-		// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
-		case vk::driver_vendor::RADV:
-		case vk::driver_vendor::AMD:
+	// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
+	case vk::driver_vendor::RADV:
+	case vk::driver_vendor::AMD:
 #if !defined(__linux__)
-		// Intel chipsets would fail on BSD in most cases and DRM_IOCTL_i915_GEM_USERPTR unimplemented
-		case vk::driver_vendor::ANV:
+	// Intel chipsets would fail on BSD in most cases and DRM_IOCTL_i915_GEM_USERPTR unimplemented
+	case vk::driver_vendor::ANV:
 #endif
-			if (backend_config.supports_passthrough_dma)
-			{
-				rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
-				backend_config.supports_passthrough_dma = false;
-			}
-			break;
-#endif
-		case vk::driver_vendor::MVK:
-			// Async compute crashes immediately on Apple GPUs
-			rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
-			backend_config.supports_asynchronous_compute = false;
-			break;
-		default: break;
-		}
-
-		if (backend_config.supports_asynchronous_compute)
+		if (backend_config.supports_passthrough_dma)
 		{
-			// Run only if async compute can be used.
-			g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
+			rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
+			backend_config.supports_passthrough_dma = false;
 		}
+		break;
+#endif
+	case vk::driver_vendor::MVK:
+		// Async compute crashes immediately on Apple GPUs
+		rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
+		backend_config.supports_asynchronous_compute = false;
+		break;
+	default: break;
+	}
+
+	if (backend_config.supports_asynchronous_compute)
+	{
+		// Run only if async compute can be used.
+		g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
+	}
+
+	if (backend_config.supports_host_gpu_labels)
+	{
+		m_host_object_data = std::make_unique<vk::buffer>(*m_device,
+			0x100000,
+			memory_map.device_bar, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
+			VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
+			VMM_ALLOCATION_POOL_SYSTEM);
+
+		m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
+		ensure(m_host_data_ptr->magic == 0xCAFEBABE);
+	}
 }
@@ -629,6 +650,13 @@ VKGSRender::~VKGSRender()
 		g_fxo->get<vk::AsyncTaskScheduler>().destroy();
 	}

+	// Host data
+	if (m_host_object_data)
+	{
+		m_host_object_data->unmap();
+		m_host_object_data.reset();
+	}
+
 	// Clear flush requests
 	m_flush_requests.clear_pending_flag();
@@ -1453,6 +1481,35 @@ void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
 	m_current_command_buffer->begin();
 }

+bool VKGSRender::release_GCM_label(u32 address, u32 args)
+{
+	if (!backend_config.supports_host_gpu_labels)
+	{
+		return false;
+	}
+
+	ensure(m_host_data_ptr);
+	if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
+	{
+		// All texture loads already seen by the host GPU
+		// Wait for all previously submitted labels to be flushed
+		while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
+		{
+			_mm_pause();
+		}
+
+		return false;
+	}
+
+	m_host_data_ptr->last_label_release_event = ++m_host_data_ptr->event_counter;
+
+	const auto mapping = vk::map_dma(address, 4);
+	const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
+	vkCmdUpdateBuffer(*m_current_command_buffer, mapping.second->value, mapping.first, 4, &write_data);
+	flush_command_queue();
+	return true;
+}
+
 void VKGSRender::sync_hint(rsx::FIFO_hint hint, void* args)
 {
 	ensure(args);
@@ -2088,6 +2145,15 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 		m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
 	}

+	if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
+	{
+		vkCmdUpdateBuffer(*m_current_command_buffer,
+			m_host_object_data->value,
+			::offset32(&vk::host_data_t::commands_complete_event),
+			sizeof(u64),
+			const_cast<u64*>(&m_host_data_ptr->event_counter));
+	}
+
 	m_current_command_buffer->end();
 	m_current_command_buffer->tag();
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index 55cf87d404..d56fe9c098 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -117,6 +117,9 @@ private:
 	vk::command_buffer_chunk* m_current_command_buffer = nullptr;
 	VkSemaphore m_dangling_semaphore_signal = VK_NULL_HANDLE;

+	volatile vk::host_data_t* m_host_data_ptr = nullptr;
+	std::unique_ptr<vk::buffer> m_host_object_data;
+
 	VkDescriptorSetLayout descriptor_layouts;
 	VkPipelineLayout pipeline_layout;
@@ -242,6 +245,7 @@ public:
 	void bind_viewport();

 	void sync_hint(rsx::FIFO_hint hint, void* args) override;
+	bool release_GCM_label(u32 address, u32 data) override;

 	void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
 	void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
@@ -259,6 +263,9 @@ public:
 	void begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources) override;
 	void end_conditional_rendering() override;

+	// Host sync object
+	inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }
+
 protected:
 	void clear_surface(u32 mask) override;
 	void begin() override;
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index f37c0c574e..d263528cbc 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -9,6 +9,7 @@
 #include "vkutils/data_heap.h"
 #include "vkutils/image_helpers.h"

+#include "VKGSRender.h"
 #include "../GCM.h"
 #include "../rsx_utils.h"
@@ -1146,6 +1147,17 @@ namespace vk
 			// Release from async chain, the primary chain will acquire later
 			dst_image->queue_release(cmd2, cmd.get_queue_family(), dst_image->current_layout);
 		}
+
+		if (auto rsxthr = rsx::get_current_renderer();
+			rsxthr->get_backend_config().supports_host_gpu_labels)
+		{
+			// Queue a sync update on the CB doing the load
+			auto [host_data, host_buffer] = static_cast<VKGSRender*>(rsxthr)->map_host_object_data();
+			ensure(host_data);
+			const auto event_id = ++host_data->event_counter;
+			host_data->texture_load_request_event = event_id;
+			vkCmdUpdateBuffer(cmd2, host_buffer, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
+		}
 	}

 	void blitter::scale_image(vk::command_buffer& cmd, vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool interpolate, const rsx::typeless_xfer& xfer_info)
diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.h b/rpcs3/Emu/RSX/VK/vkutils/sync.h
index a1d0049ab8..2551c51698 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/sync.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/sync.h
@@ -16,6 +16,16 @@ namespace vk
 		gpu = 1
 	};

+	struct host_data_t // Pick a better name
+	{
+		u64 magic = 0xCAFEBABE;
+		u64 event_counter = 0;
+		u64 texture_load_request_event = 0;
+		u64 texture_load_complete_event = 0;
+		u64 last_label_release_event = 0;
+		u64 commands_complete_event = 0;
+	};
+
 	struct fence
 	{
 		atomic_t<bool> flushed = false;
diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp
index ebba57a77c..ec63a3f052 100644
--- a/rpcs3/Emu/RSX/rsx_methods.cpp
+++ b/rpcs3/Emu/RSX/rsx_methods.cpp
@@ -29,6 +29,35 @@ namespace rsx
 		rsx_log.trace("RSX method 0x%x (arg=0x%x)", reg << 2, arg);
 	}

+	template <bool FlushDMA, bool FlushPipe>
+	void write_gcm_label(thread* rsx, u32 address, u32 data)
+	{
+		const bool is_flip_sema = (address == (rsx->label_addr + 0x10) || address == (rsx->label_addr + 0x30));
+		if (!is_flip_sema)
+		{
+			if constexpr (FlushPipe)
+			{
+				// Ignoring these can cause very poor performance due to timestamp queries taking too long.
+				rsx->sync();
+			}
+
+			if (rsx->get_backend_config().supports_host_gpu_labels &&
+				rsx->release_GCM_label(address, data))
+			{
+				// Backend will handle it, nothing to do.
+				// Implicitly handles DMA sync.
+				return;
+			}
+
+			if constexpr (FlushDMA)
+			{
+				g_fxo->get<rsx::dma_manager>().sync();
+			}
+		}
+
+		vm::_ref<RsxSemaphore>(address).val = data;
+	}
+
 	template<typename Type> struct vertex_data_type_from_element_type;
 	template<> struct vertex_data_type_from_element_type<float> { static const vertex_base_type type = vertex_base_type::f; };
 	template<> struct vertex_data_type_from_element_type<f16> { static const vertex_base_type type = vertex_base_type::sf; };
@@ -74,6 +103,8 @@ namespace rsx
 				rsx->flush_fifo();
 			}

+			//rsx_log.error("Wait for address at 0x%x to change to 0x%x", addr, arg);
+
 			u64 start = get_system_time();
 			while (sema != arg)
 			{
@@ -116,8 +147,6 @@
 		void semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
 		{
-			rsx->sync();
-
 			const u32 offset = method_registers.semaphore_offset_406e();

 			if (offset % 4)
 			{
@@ -144,7 +173,7 @@
 				rsx_log.fatal("NV406E semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr);
 			}

-			vm::_ref<RsxSemaphore>(addr).val = arg;
+			write_gcm_label<false, true>(rsx, addr, arg);
 		}
 	}

@@ -207,11 +236,6 @@
 		void texture_read_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
 		{
 			// Pipeline barrier seems to be equivalent to a SHADER_READ stage barrier
-			g_fxo->get<rsx::dma_manager>().sync();
-			if (g_cfg.video.strict_rendering_mode)
-			{
-				rsx->sync();
-			}

 			// lle-gcm likes to inject system reserved semaphores, presumably for system/vsh usage
 			// Avoid calling render to avoid any havoc(flickering) they may cause from invalid flush/write
@@ -224,14 +248,19 @@
 				return;
 			}

-			vm::_ref<RsxSemaphore>(get_address(offset, method_registers.semaphore_context_dma_4097())).val = arg;
+			if (g_cfg.video.strict_rendering_mode) [[ unlikely ]]
+			{
+				write_gcm_label<true, true>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
+			}
+			else
+			{
+				write_gcm_label<true, false>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
+			}
 		}

 		void back_end_write_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
 		{
 			// Full pipeline barrier
-			g_fxo->get<rsx::dma_manager>().sync();
-			rsx->sync();

 			const u32 offset = method_registers.semaphore_offset_4097();

@@ -243,7 +272,7 @@
 			}

 			const u32 val = (arg & 0xff00ff00) | ((arg & 0xff) << 16) | ((arg >> 16) & 0xff);
-			vm::_ref<RsxSemaphore>(get_address(offset, method_registers.semaphore_context_dma_4097())).val = val;
+			write_gcm_label<true, true>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), val);
 		}

 		/**
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index bbe288c565..c3bac89e37 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -156,6 +156,7 @@ struct cfg_root : cfg::node
 		cfg::_int<1, 1800> vblank_rate{ this, "Vblank Rate", 60, true }; // Changing this from 60 may affect game speed in unexpected ways
 		cfg::_bool vblank_ntsc{ this, "Vblank NTSC Fixup", false, true };
 		cfg::_bool decr_memory_layout{ this, "DECR memory layout", false}; // Force enable increased allowed main memory range as DECR console
+		cfg::_bool host_label_synchronization{ this, "Use Host GPU Labels", false };

 		struct node_vk : cfg::node
 		{
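
Note (not part of the patch): the handshake encoded by the new vk::host_data_t fields can be modelled in isolation. The sketch below is illustrative only; host_data_t mirrors the struct added in vkutils/sync.h, while release_label(), cpu_write_label() and gpu_write_label() are hypothetical stand-ins for VKGSRender::release_GCM_label() and the vkCmdUpdateBuffer writes it records.

// Minimal, self-contained model of the host GPU label handshake (assumed names, no Vulkan).
#include <cstdint>
#include <cstdio>

using u32 = std::uint32_t;
using u64 = std::uint64_t;

struct host_data_t
{
	u64 magic = 0xCAFEBABE;
	u64 event_counter = 0;
	u64 texture_load_request_event = 0;   // bumped when a texture upload is recorded
	u64 texture_load_complete_event = 0;  // written by the GPU once that upload ran
	u64 last_label_release_event = 0;     // last label write handed to the GPU
	u64 commands_complete_event = 0;      // written by the GPU at submit completion
};

// Stand-in for the direct guest-memory write the CPU would otherwise perform.
static void cpu_write_label(u32 /*address*/, u32 value) { std::printf("CPU label write: 0x%x\n", value); }

// Stand-in for recording a vkCmdUpdateBuffer into the current command buffer.
static void gpu_write_label(u32 /*address*/, u32 value) { std::printf("GPU label write queued: 0x%x\n", value); }

// Mirrors the decision in release_GCM_label: route the write through the GPU only
// while a texture load is still outstanding on the device timeline.
bool release_label(host_data_t& hd, u32 address, u32 value)
{
	if (hd.texture_load_complete_event == hd.texture_load_request_event)
	{
		// No pending texture work; drain earlier GPU label writes, then let the
		// caller write the label from the CPU (the real code spins with _mm_pause()).
		while (hd.last_label_release_event > hd.commands_complete_event) {}
		return false;
	}

	hd.last_label_release_event = ++hd.event_counter;
	gpu_write_label(address, value);
	return true;
}

int main()
{
	host_data_t hd;

	// No texture load in flight: the label falls back to a plain CPU write.
	if (!release_label(hd, 0x1000, 0xDEAD)) cpu_write_label(0x1000, 0xDEAD);

	// A texture upload was queued: the label is written by the GPU instead.
	hd.texture_load_request_event = ++hd.event_counter;
	if (!release_label(hd, 0x1000, 0xBEEF)) cpu_write_label(0x1000, 0xBEEF);
}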