diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index 4a2cce8500..dca0e5c7fa 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -590,6 +590,7 @@ namespace rsx
 		bool supports_hw_conditional_render;    // Conditional render
 		bool supports_passthrough_dma;          // DMA passthrough
 		bool supports_asynchronous_compute;     // Async compute
+		bool supports_host_gpu_labels;          // Advanced host synchronization
 	};

 	struct sampled_image_descriptor_base;
@@ -859,6 +860,7 @@ namespace rsx
 		void sync();
 		flags32_t read_barrier(u32 memory_address, u32 memory_range, bool unconditional);
 		virtual void sync_hint(FIFO_hint hint, void* args);
+		virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }

 		std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index 8a7a29b843..f9cb10adf2 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -550,11 +550,16 @@ VKGSRender::VKGSRender() : GSRender()
 	// Relaxed query synchronization
 	backend_config.supports_hw_conditional_render = !!g_cfg.video.relaxed_zcull_sync;

+	// Passthrough DMA
+	backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
+
+	// Host sync
+	backend_config.supports_host_gpu_labels = !!g_cfg.video.host_label_synchronization;
+
 	// Async compute and related operations
 	if (g_cfg.video.vk.asynchronous_texture_streaming)
 	{
-		// Optimistic, enable async compute and passthrough DMA
-		backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
+		// Optimistic, enable async compute
 		backend_config.supports_asynchronous_compute = true;

 		if (m_device->get_graphics_queue() == m_device->get_transfer_queue())
@@ -562,10 +567,14 @@ VKGSRender::VKGSRender() : GSRender()
 			rsx_log.error("Cannot run graphics and async transfer in the same queue. Async uploads are disabled. This is a limitation of your GPU");
 			backend_config.supports_asynchronous_compute = false;
 		}
+	}

-		switch (vk::get_driver_vendor())
+	// Sanity checks
+	switch (vk::get_driver_vendor())
+	{
+	case vk::driver_vendor::NVIDIA:
+		if (backend_config.supports_asynchronous_compute)
 		{
-		case vk::driver_vendor::NVIDIA:
 			if (auto chip_family = vk::get_chip_family();
 				chip_family == vk::chip_class::NV_kepler || chip_family == vk::chip_class::NV_maxwell)
 			{
@@ -574,35 +583,47 @@ VKGSRender::VKGSRender() : GSRender()
 				rsx_log.notice("Forcing safe async compute for NVIDIA device to avoid crashing.");
 				g_cfg.video.vk.asynchronous_scheduler.set(vk_gpu_scheduler_mode::safe);
 			}
-			break;
+		}
+		break;
 #if !defined(_WIN32)
-		// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
-		case vk::driver_vendor::RADV:
-		case vk::driver_vendor::AMD:
+	// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
+	case vk::driver_vendor::RADV:
+	case vk::driver_vendor::AMD:
 #if !defined(__linux__)
-		// Intel chipsets would fail on BSD in most cases and DRM_IOCTL_i915_GEM_USERPTR unimplemented
-		case vk::driver_vendor::ANV:
+	// Intel chipsets would fail on BSD in most cases and DRM_IOCTL_i915_GEM_USERPTR unimplemented
+	case vk::driver_vendor::ANV:
 #endif
-			if (backend_config.supports_passthrough_dma)
-			{
-				rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
-				backend_config.supports_passthrough_dma = false;
-			}
-			break;
-#endif
-		case vk::driver_vendor::MVK:
-			// Async compute crashes immediately on Apple GPUs
-			rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
-			backend_config.supports_asynchronous_compute = false;
-			break;
-		default: break;
-		}
-
-		if (backend_config.supports_asynchronous_compute)
+		if (backend_config.supports_passthrough_dma)
 		{
-			// Run only if async compute can be used.
-			g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
+			rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
+			backend_config.supports_passthrough_dma = false;
 		}
+		break;
+#endif
+	case vk::driver_vendor::MVK:
+		// Async compute crashes immediately on Apple GPUs
+		rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
+		backend_config.supports_asynchronous_compute = false;
+		break;
+	default: break;
+	}
+
+	if (backend_config.supports_asynchronous_compute)
+	{
+		// Run only if async compute can be used.
+		g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
+	}
+
+	if (backend_config.supports_host_gpu_labels)
+	{
+		m_host_object_data = std::make_unique<vk::buffer>(*m_device,
+			0x100000,
+			memory_map.device_bar, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
+			VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
+			VMM_ALLOCATION_POOL_SYSTEM);
+
+		m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
+		ensure(m_host_data_ptr->magic == 0xCAFEBABE);
+	}
 }
@@ -629,6 +650,13 @@ VKGSRender::~VKGSRender()
 		g_fxo->get<vk::AsyncTaskScheduler>().destroy();
 	}

+	// Host data
+	if (m_host_object_data)
+	{
+		m_host_object_data->unmap();
+		m_host_object_data.reset();
+	}
+
 	// Clear flush requests
 	m_flush_requests.clear_pending_flag();
@@ -1453,6 +1481,35 @@ void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
 	m_current_command_buffer->begin();
 }

+bool VKGSRender::release_GCM_label(u32 address, u32 args)
+{
+	if (!backend_config.supports_host_gpu_labels)
+	{
+		return false;
+	}
+
+	ensure(m_host_data_ptr);
+	if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
+	{
+		// All texture loads already seen by the host GPU
+		// Wait for all previously submitted labels to be flushed
+		while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
+		{
+			_mm_pause();
+		}
+
+		return false;
+	}
+
+	m_host_data_ptr->last_label_release_event = ++m_host_data_ptr->event_counter;
+
+	const auto mapping = vk::map_dma(address, 4);
+	const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
+	vkCmdUpdateBuffer(*m_current_command_buffer, mapping.second->value, mapping.first, 4, &write_data);
+	flush_command_queue();
+	return true;
+}
+
 void VKGSRender::sync_hint(rsx::FIFO_hint hint, void* args)
 {
 	ensure(args);
@@ -2088,6 +2145,15 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 		m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
 	}

+	if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
+	{
+		vkCmdUpdateBuffer(*m_current_command_buffer,
+			m_host_object_data->value,
+			::offset32(&vk::host_data_t::commands_complete_event),
+			sizeof(u64),
+			const_cast<u64*>(&m_host_data_ptr->event_counter));
+	}
+
 	m_current_command_buffer->end();
 	m_current_command_buffer->tag();
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index 55cf87d404..d56fe9c098 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -117,6 +117,9 @@ private:
 	vk::command_buffer_chunk* m_current_command_buffer = nullptr;
 	VkSemaphore m_dangling_semaphore_signal = VK_NULL_HANDLE;

+	volatile vk::host_data_t* m_host_data_ptr = nullptr;
+	std::unique_ptr<vk::buffer> m_host_object_data;
+
 	VkDescriptorSetLayout descriptor_layouts;
 	VkPipelineLayout pipeline_layout;
@@ -242,6 +245,7 @@ public:
 	void bind_viewport();

 	void sync_hint(rsx::FIFO_hint hint, void* args) override;
+	bool release_GCM_label(u32 address, u32 data) override;

 	void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
 	void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
@@ -259,6 +263,9 @@ public:
 	void begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources) override;
 	void end_conditional_rendering() override;

+	// Host sync object
+	inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }
+
 protected:
 	void clear_surface(u32 mask) override;
 	void begin() override;
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index f37c0c574e..d263528cbc 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -9,6 +9,7 @@
 #include "vkutils/data_heap.h"
 #include "vkutils/image_helpers.h"

+#include "VKGSRender.h"
 #include "../GCM.h"
 #include "../rsx_utils.h"
@@ -1146,6 +1147,17 @@ namespace vk
 			// Release from async chain, the primary chain will acquire later
 			dst_image->queue_release(cmd2, cmd.get_queue_family(), dst_image->current_layout);
 		}
+
+		if (auto rsxthr = rsx::get_current_renderer();
+			rsxthr->get_backend_config().supports_host_gpu_labels)
+		{
+			// Queue a sync update on the CB doing the load
+			auto [host_data, host_buffer] = static_cast<VKGSRender*>(rsxthr)->map_host_object_data();
+			ensure(host_data);
+			const auto event_id = ++host_data->event_counter;
+			host_data->texture_load_request_event = event_id;
+			vkCmdUpdateBuffer(cmd2, host_buffer, ::offset32(&vk::host_data_t::texture_load_complete_event), sizeof(u64), &event_id);
+		}
 	}

 	void blitter::scale_image(vk::command_buffer& cmd, vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool interpolate, const rsx::typeless_xfer& xfer_info)
diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.h b/rpcs3/Emu/RSX/VK/vkutils/sync.h
index a1d0049ab8..2551c51698 100644
--- a/rpcs3/Emu/RSX/VK/vkutils/sync.h
+++ b/rpcs3/Emu/RSX/VK/vkutils/sync.h
@@ -16,6 +16,16 @@ namespace vk
 		gpu = 1
 	};

+	struct host_data_t // Pick a better name
+	{
+		u64 magic = 0xCAFEBABE;
+		u64 event_counter = 0;
+		u64 texture_load_request_event = 0;
+		u64 texture_load_complete_event = 0;
+		u64 last_label_release_event = 0;
+		u64 commands_complete_event = 0;
+	};
+
 	struct fence
 	{
 		atomic_t<bool> flushed = false;
diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp
index ebba57a77c..ec63a3f052 100644
--- a/rpcs3/Emu/RSX/rsx_methods.cpp
+++ b/rpcs3/Emu/RSX/rsx_methods.cpp
@@ -29,6 +29,35 @@ namespace rsx
 		rsx_log.trace("RSX method 0x%x (arg=0x%x)", reg << 2, arg);
 	}

+	template <bool FlushDMA, bool FlushPipe>
+	void write_gcm_label(thread* rsx, u32 address, u32 data)
+	{
+		const bool is_flip_sema = (address == (rsx->label_addr + 0x10) || address == (rsx->label_addr + 0x30));
+		if (!is_flip_sema)
+		{
+			if constexpr (FlushPipe)
+			{
+				// Ignoring these can cause very poor performance due to timestamp queries taking too long.
+				rsx->sync();
+			}
+
+			if (rsx->get_backend_config().supports_host_gpu_labels &&
+				rsx->release_GCM_label(address, data))
+			{
+				// Backend will handle it, nothing to do.
+				// Implicitly handles DMA sync.
+				return;
+			}
+
+			if constexpr (FlushDMA)
+			{
+				g_fxo->get<rsx::dma_manager>().sync();
+			}
+		}
+
+		vm::_ref<RsxSemaphore>(address).val = data;
+	}
+
 	template<typename Type> struct vertex_data_type_from_element_type;
 	template<> struct vertex_data_type_from_element_type<float> { static const vertex_base_type type = vertex_base_type::f; };
 	template<> struct vertex_data_type_from_element_type<f16> { static const vertex_base_type type = vertex_base_type::sf; };
@@ -74,6 +103,8 @@ namespace rsx
 				rsx->flush_fifo();
 			}

+			//rsx_log.error("Wait for address at 0x%x to change to 0x%x", addr, arg);
+
 			u64 start = get_system_time();
 			while (sema != arg)
 			{
@@ -116,8 +147,6 @@
 		void semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
 		{
-			rsx->sync();
-
 			const u32 offset = method_registers.semaphore_offset_406e();

 			if (offset % 4)
 			{
@@ -144,7 +173,7 @@
 				rsx_log.fatal("NV406E semaphore unexpected address. Please report to the developers. (offset=0x%x, addr=0x%x)", offset, addr);
 			}

-			vm::_ref<RsxSemaphore>(addr).val = arg;
+			write_gcm_label<false, true>(rsx, addr, arg);
 		}
 	}

@@ -207,11 +236,6 @@
 		void texture_read_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
 		{
 			// Pipeline barrier seems to be equivalent to a SHADER_READ stage barrier
-			g_fxo->get<rsx::dma_manager>().sync();
-			if (g_cfg.video.strict_rendering_mode)
-			{
-				rsx->sync();
-			}

 			// lle-gcm likes to inject system reserved semaphores, presumably for system/vsh usage
 			// Avoid calling render to avoid any havoc(flickering) they may cause from invalid flush/write
@@ -224,14 +248,19 @@
 				return;
 			}

-			vm::_ref<RsxSemaphore>(get_address(offset, method_registers.semaphore_context_dma_4097())).val = arg;
+			if (g_cfg.video.strict_rendering_mode) [[ unlikely ]]
+			{
+				write_gcm_label<true, true>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
+			}
+			else
+			{
+				write_gcm_label<true, false>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
+			}
 		}

 		void back_end_write_semaphore_release(thread* rsx, u32 /*reg*/, u32 arg)
 		{
 			// Full pipeline barrier
-			g_fxo->get<rsx::dma_manager>().sync();
-			rsx->sync();

 			const u32 offset = method_registers.semaphore_offset_4097();

@@ -243,7 +272,7 @@
 			}

 			const u32 val = (arg & 0xff00ff00) | ((arg & 0xff) << 16) | ((arg >> 16) & 0xff);
-			vm::_ref<RsxSemaphore>(get_address(offset, method_registers.semaphore_context_dma_4097())).val = val;
+			write_gcm_label<true, true>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), val);
 		}

 		/**
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index bbe288c565..c3bac89e37 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -156,6 +156,7 @@ struct cfg_root : cfg::node
 		cfg::_int<1, 1800> vblank_rate{ this, "Vblank Rate", 60, true }; // Changing this from 60 may affect game speed in unexpected ways
 		cfg::_bool vblank_ntsc{ this, "Vblank NTSC Fixup", false, true };
 		cfg::_bool decr_memory_layout{ this, "DECR memory layout", false}; // Force enable increased allowed main memory range as DECR console
+		cfg::_bool host_label_synchronization{ this, "Use Host GPU Labels", false };

 		struct node_vk : cfg::node
 		{
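
Note (not part of the patch): the handshake encoded by the new vk::host_data_t fields can be modelled in isolation. The sketch below is illustrative only; host_data_t mirrors the struct added in vkutils/sync.h, while release_label(), cpu_write_label() and gpu_write_label() are hypothetical stand-ins for VKGSRender::release_GCM_label() and the vkCmdUpdateBuffer writes it records.

// Minimal, self-contained model of the host GPU label handshake (assumed names, no Vulkan).
#include <cstdint>
#include <cstdio>

using u32 = std::uint32_t;
using u64 = std::uint64_t;

struct host_data_t
{
	u64 magic = 0xCAFEBABE;
	u64 event_counter = 0;
	u64 texture_load_request_event = 0;   // bumped when a texture upload is recorded
	u64 texture_load_complete_event = 0;  // written by the GPU once that upload ran
	u64 last_label_release_event = 0;     // last label write handed to the GPU
	u64 commands_complete_event = 0;      // written by the GPU at submit completion
};

// Stand-in for the direct guest-memory write the CPU would otherwise perform.
static void cpu_write_label(u32 /*address*/, u32 value) { std::printf("CPU label write: 0x%x\n", value); }

// Stand-in for recording a vkCmdUpdateBuffer into the current command buffer.
static void gpu_write_label(u32 /*address*/, u32 value) { std::printf("GPU label write queued: 0x%x\n", value); }

// Mirrors the decision in release_GCM_label: route the write through the GPU only
// while a texture load is still outstanding on the device timeline.
bool release_label(host_data_t& hd, u32 address, u32 value)
{
	if (hd.texture_load_complete_event == hd.texture_load_request_event)
	{
		// No pending texture work; drain earlier GPU label writes, then let the
		// caller write the label from the CPU (the real code spins with _mm_pause()).
		while (hd.last_label_release_event > hd.commands_complete_event) {}
		return false;
	}

	hd.last_label_release_event = ++hd.event_counter;
	gpu_write_label(address, value);
	return true;
}

int main()
{
	host_data_t hd;

	// No texture load in flight: the label falls back to a plain CPU write.
	if (!release_label(hd, 0x1000, 0xDEAD)) cpu_write_label(0x1000, 0xDEAD);

	// A texture upload was queued: the label is written by the GPU instead.
	hd.texture_load_request_event = ++hd.event_counter;
	if (!release_label(hd, 0x1000, 0xBEEF)) cpu_write_label(0x1000, 0xBEEF);
}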