From 77e312fb9993a363ff22d8de90ca29fc42ab8c15 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 28 Feb 2021 21:11:33 +0300 Subject: [PATCH] vk: Add the async task scheduler --- rpcs3/Emu/CMakeLists.txt | 1 + rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp | 161 +++++++++++++++++++++++++ rpcs3/Emu/RSX/VK/VKAsyncScheduler.h | 60 +++++++++ rpcs3/Emu/RSX/VK/VKGSRender.cpp | 13 +- rpcs3/Emu/RSX/VK/VKHelpers.h | 2 +- rpcs3/Emu/RSX/VK/VKResourceManager.cpp | 7 ++ rpcs3/Emu/RSX/VK/VKResourceManager.h | 1 + rpcs3/Emu/RSX/VK/VKTexture.cpp | 13 +- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 2 +- rpcs3/Emu/RSX/VK/vkutils/sync.cpp | 38 ++++-- rpcs3/Emu/RSX/VK/vkutils/sync.h | 11 +- rpcs3/VKGSRender.vcxproj | 2 + rpcs3/VKGSRender.vcxproj.filters | 6 +- 13 files changed, 292 insertions(+), 25 deletions(-) create mode 100644 rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp create mode 100644 rpcs3/Emu/RSX/VK/VKAsyncScheduler.h diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index e0b056e2ba..77b9e52cb0 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -462,6 +462,7 @@ if(TARGET 3rdparty_vulkan) RSX/VK/vkutils/device.cpp RSX/VK/vkutils/sampler.cpp RSX/VK/vkutils/shared.cpp + RSX/VK/VKAsyncScheduler.cpp RSX/VK/VKCommandStream.cpp RSX/VK/VKCommonDecompiler.cpp RSX/VK/VKCompute.cpp diff --git a/rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp b/rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp new file mode 100644 index 0000000000..242d5db72e --- /dev/null +++ b/rpcs3/Emu/RSX/VK/VKAsyncScheduler.cpp @@ -0,0 +1,161 @@ +#include "VKAsyncScheduler.h" +#include "VKHelpers.h" +#include "VKResourceManager.h" + +#include "Emu/IdManager.h" +#include "Utilities/lockless.h" +#include "Utilities/mutex.h" + +#include + +namespace vk +{ + /* Thread entry point. Calls add_ref() on entry and release() on exit; in between it loops until the thread state is aborting, draining m_event_queue and, for every job, waiting on queue1_signal before host-signalling queue2_signal. NOTE(review): no wait or yield is visible between pop_all() calls - confirm pop_all() blocks, otherwise this loop spins. */ void AsyncTaskScheduler::operator()() + { + add_ref(); + + while (thread_ctrl::state() != thread_state::aborting) + { + for (auto&& job : m_event_queue.pop_all()) + { + vk::wait_for_event(job->queue1_signal.get(), GENERAL_WAIT_TIMEOUT); + job->queue2_signal->host_signal(); +
} + } + + release(); + } + + /* Creates m_command_pool for the transfer queue family and fills m_events_pool with events_pool_size pairs of gpu-domain events whose completion id starts at 0. */ void AsyncTaskScheduler::delayed_init() + { + auto pdev = get_current_renderer(); + m_command_pool.create(*const_cast(pdev), pdev->get_transfer_queue_family()); + + for (usz i = 0; i < events_pool_size; ++i) + { + auto ev1 = std::make_unique(*get_current_renderer(), sync_domain::gpu); + auto ev2 = std::make_unique(*get_current_renderer(), sync_domain::gpu); + m_events_pool.emplace_back(std::move(ev1), std::move(ev2), 0ull); + } + } + + /* Arms the next slot of m_events_pool: the index wraps to 0 at events_pool_size, the slot must have completion_eid <= vk::last_completed_event_id(), both events are reset, queue1_signal is recorded on m_current_cb, the slot is pushed to m_event_queue and m_sync_label is set to its queue2_signal. */ void AsyncTaskScheduler::insert_sync_event() + { + ensure(m_current_cb); + + xqueue_event* sync_label; + ensure(m_next_event_id < events_pool_size); + sync_label = &m_events_pool[m_next_event_id]; + + if (++m_next_event_id == events_pool_size) + { + // Wrap + m_next_event_id = 0; + } + + ensure(sync_label->completion_eid <= vk::last_completed_event_id()); + + sync_label->queue1_signal->reset(); + sync_label->queue2_signal->reset(); + sync_label->completion_eid = vk::current_event_id(); + + sync_label->queue1_signal->signal(*m_current_cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0); + + m_event_queue.push(sync_label); + m_sync_label = sync_label->queue2_signal.get(); + } + + /* Assigns thread_state::aborting to the object returned by g_fxo->get(), spins until has_refs() is false, then destroys every command buffer, the command pool and the events pool. */ AsyncTaskScheduler::~AsyncTaskScheduler() + { + *g_fxo->get() = thread_state::aborting; + while (has_refs()) _mm_pause(); + + for (auto& cb : m_async_command_queue) + { + cb.destroy(); + } + + m_async_command_queue.clear(); + m_next_cb_index = 0; + m_command_pool.destroy(); + m_events_pool.clear(); + } + + /* Returns the command buffer to record into under m_submit_mutex, reusing, wrapping to index 0 at VK_MAX_ASYNC_COMPUTE_QUEUES, or creating one as needed; always sets m_sync_required. NOTE(review): the local pdev below is assigned but never read in this function. NOTE(review): m_current_cb and m_last_used_cb point at elements of m_async_command_queue and push_back may reallocate a vector - confirm m_last_used_cb is not dereferenced after growth. */ command_buffer* AsyncTaskScheduler::get_current() + { + std::lock_guard lock(m_submit_mutex); + m_sync_required = true; + + // 0. Anything still active? + if (m_current_cb) + { + return m_current_cb; + } + + // 1. Check if there is a 'next' entry + auto pdev = get_current_renderer(); + if (m_async_command_queue.empty()) + { + delayed_init(); + } + else if (m_next_cb_index < m_async_command_queue.size()) + { + m_current_cb = &m_async_command_queue[m_next_cb_index]; + } + + // 2. 
Create entry + if (!m_current_cb) + { + if (m_next_cb_index == VK_MAX_ASYNC_COMPUTE_QUEUES) + { + m_next_cb_index = 0; + m_current_cb = &m_async_command_queue[m_next_cb_index]; + } + else + { + m_async_command_queue.push_back({}); + m_current_cb = &m_async_command_queue.back(); + m_current_cb->create(m_command_pool, true); + } + } + + m_next_cb_index++; + return m_current_cb; + } + + /* Under m_submit_mutex: if m_sync_required is set, inserts a sync event and clears the flag; returns m_sync_label and resets it to nullptr. */ event* AsyncTaskScheduler::get_primary_sync_label() + { + std::lock_guard lock(m_submit_mutex); + + if (m_sync_required) + { + ensure(m_current_cb); + insert_sync_event(); + m_sync_required = false; + } + + return std::exchange(m_sync_label, nullptr); + } + + /* Under m_submit_mutex: returns immediately when there is no current command buffer; otherwise inserts a pending sync event, ends the command buffer and submits it to the transfer queue, then records it in m_last_used_cb and clears m_current_cb and m_sync_required. */ void AsyncTaskScheduler::flush(VkSemaphore wait_semaphore, VkPipelineStageFlags wait_dst_stage_mask) + { + std::lock_guard lock(m_submit_mutex); + + if (!m_current_cb) + { + return; + } + + if (m_sync_required) + { + insert_sync_event(); + } + + m_current_cb->end(); + m_current_cb->submit(get_current_renderer()->get_transfer_queue(), wait_semaphore, VK_NULL_HANDLE, nullptr, wait_dst_stage_mask, VK_FALSE); + + m_last_used_cb = m_current_cb; + m_current_cb = nullptr; + m_sync_required = false; + } +} diff --git a/rpcs3/Emu/RSX/VK/VKAsyncScheduler.h b/rpcs3/Emu/RSX/VK/VKAsyncScheduler.h new file mode 100644 index 0000000000..3a3fa65bbe --- /dev/null +++ b/rpcs3/Emu/RSX/VK/VKAsyncScheduler.h @@ -0,0 +1,60 @@ +#pragma once + +#include "vkutils/commands.h" +#include "vkutils/sync.h" + +#include "Utilities/Thread.h" + +#define VK_MAX_ASYNC_COMPUTE_QUEUES 256 + +namespace vk +{ + /* One pooled sync slot: two events plus completion_eid, which insert_sync_event sets to vk::current_event_id() when the slot is armed. */ struct xqueue_event + { + std::unique_ptr queue1_signal; + std::unique_ptr queue2_signal; + u64 completion_eid; + }; + + class AsyncTaskScheduler : private rsx::ref_counted + { + // Vulkan resources + std::vector m_async_command_queue; + command_pool m_command_pool; + + // Running state + command_buffer* m_last_used_cb = nullptr; + command_buffer* m_current_cb = nullptr; + usz m_next_cb_index = 0; + + // Sync + event* m_sync_label = nullptr; + bool 
m_sync_required = false; + + /* number of slots created by delayed_init; insert_sync_event wraps its index at this value */ static constexpr u32 events_pool_size = 16384; + std::vector m_events_pool; + atomic_t m_next_event_id = 0; + + lf_queue m_event_queue; + shared_mutex m_submit_mutex; + + void delayed_init(); + void insert_sync_event(); + + public: + AsyncTaskScheduler() = default; + ~AsyncTaskScheduler(); + + command_buffer* get_current(); + event* get_primary_sync_label(); + + void flush(VkSemaphore wait_semaphore = VK_NULL_HANDLE, VkPipelineStageFlags wait_dst_stage_mask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT); + + // Thread entry-point + void operator()(); + + static constexpr auto thread_name = "Vulkan Async Scheduler"sv; + }; + + using async_scheduler_thread = named_thread; +} \ No newline at end of file diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 19e1a493dd..edfd9e78f6 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -1,13 +1,15 @@ #include "stdafx.h" #include "../Overlays/overlay_shader_compile_notification.h" #include "../Overlays/Shaders/shader_loading_dialog_native.h" -#include "VKGSRender.h" -#include "VKHelpers.h" + +#include "VKAsyncScheduler.h" +#include "VKCommandStream.h" #include "VKCommonDecompiler.h" #include "VKCompute.h" +#include "VKGSRender.h" +#include "VKHelpers.h" #include "VKRenderPass.h" #include "VKResourceManager.h" -#include "VKCommandStream.h" #include "vkutils/buffer_object.h" #include "vkutils/scratch.h" @@ -501,6 +503,8 @@ VKGSRender::VKGSRender() : GSRender() m_shaders_cache = std::make_unique(*m_prog_buffer, "vulkan", "v1.91"); + g_fxo->init(); + open_command_buffer(); for (u32 i = 0; i < m_swapchain->get_swap_image_count(); ++i) @@ -1931,6 +1935,9 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore const bool sync_success = g_fxo->get().sync(); const VkBool32 force_flush = !sync_success; + // Flush any asynchronously scheduled jobs + g_fxo->get()->flush(); + if (vk::test_status_interrupt(vk::heap_dirty)) { if 
(m_attrib_ring_info.is_dirty() || diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index b2b854e693..b75dbd78f9 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -68,7 +68,7 @@ namespace vk { upload_contents_async = 1, initialize_image_layout = 2, - preserve_image_layout = 3, + preserve_image_layout = 4, /* 4 is a separate bit; the old value 3 equals upload_contents_async | initialize_image_layout */ // meta-flags upload_contents_inline = 0, diff --git a/rpcs3/Emu/RSX/VK/VKResourceManager.cpp b/rpcs3/Emu/RSX/VK/VKResourceManager.cpp index c4233bfc8f..9e00172714 100644 --- a/rpcs3/Emu/RSX/VK/VKResourceManager.cpp +++ b/rpcs3/Emu/RSX/VK/VKResourceManager.cpp @@ -10,6 +10,7 @@ namespace vk resource_manager g_resource_manager; atomic_t g_event_ctr; + atomic_t g_last_completed_event; constexpr u64 s_vmm_warn_threshold_size = 2000 * 0x100000; // Warn if allocation on a single heap exceeds this value @@ -28,6 +29,11 @@ namespace vk return g_event_ctr.load(); } + u64 last_completed_event_id() + { + return g_last_completed_event.load(); + } + void on_event_completed(u64 event_id, bool flush) { if (!flush && g_cfg.video.multithreaded_rsx) @@ -40,6 +46,7 @@ namespace vk } g_resource_manager.eid_completed(event_id); + /* NOTE(review): load() followed by assignment is two operations, not a single atomic max - confirm on_event_completed is not called concurrently */ g_last_completed_event = std::max(event_id, g_last_completed_event.load()); } static constexpr f32 size_in_GiB(u64 size) diff --git a/rpcs3/Emu/RSX/VK/VKResourceManager.h b/rpcs3/Emu/RSX/VK/VKResourceManager.h index ed5f32a5c2..1e2b8e69d1 100644 --- a/rpcs3/Emu/RSX/VK/VKResourceManager.h +++ b/rpcs3/Emu/RSX/VK/VKResourceManager.h @@ -11,6 +11,7 @@ namespace vk { u64 get_event_id(); u64 current_event_id(); + u64 last_completed_event_id(); void on_event_completed(u64 event_id, bool flush = false); struct eid_scope_t diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index 34c26fcd9b..18ff71ea03 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -1,8 +1,9 @@ #include "stdafx.h" -#include "VKHelpers.h" -#include "VKFormats.h" +#include 
"VKAsyncScheduler.h" #include "VKCompute.h" #include "VKDMA.h" +#include "VKHelpers.h" +#include "VKFormats.h" #include "VKRenderPass.h" #include "VKRenderTargets.h" @@ -800,12 +801,11 @@ namespace vk static const vk::command_buffer& prepare_for_transfer(const vk::command_buffer& primary_cb, vk::image* dst_image, rsx::flags32_t& flags) { const vk::command_buffer* pcmd = nullptr; -#if 0 if (flags & image_upload_options::upload_contents_async) { - auto cb = vk::async_transfer_get_current(); - cb->begin(); - pcmd = cb; + auto async_cmd = g_fxo->get()->get_current(); + async_cmd->begin(); + pcmd = async_cmd; if (!(flags & image_upload_options::preserve_image_layout)) { @@ -813,7 +813,6 @@ namespace vk } } else -#endif { if (vk::is_renderpass_open(primary_cb)) { diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index 28e51ff007..9af8e6ce39 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -137,7 +137,7 @@ namespace vk src->pop_layout(cmd); // Create event object for this transfer and queue signal op - dma_fence = std::make_unique(*m_device); + dma_fence = std::make_unique(*m_device, sync_domain::any); dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); // Set cb flag for queued dma operations diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp index 27e9bfacb4..757080541b 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp @@ -58,10 +58,10 @@ namespace vk return (handle != VK_NULL_HANDLE); } - event::event(const render_device& dev) + event::event(const render_device& dev, sync_domain domain) { m_device = dev; - if (dev.gpu().get_driver_vendor() != driver_vendor::AMD) + /* gpu-domain events always take the VkEvent branch; the vendor check only applies to other domains */ if (domain == sync_domain::gpu || dev.gpu().get_driver_vendor() != driver_vendor::AMD) { VkEventCreateInfo info { @@ -75,14 +75,14 @@ namespace vk { // Work around AMD's broken event signals m_buffer = std::make_unique - ( - 
dev, - 4, - dev.get_memory_mapping().host_visible_coherent, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, - VK_BUFFER_USAGE_TRANSFER_DST_BIT, - 0 - ); + ( + dev, + 4, + dev.get_memory_mapping().host_visible_coherent, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + 0 + ); m_value = reinterpret_cast(m_buffer->map(0, 4)); *m_value = 0xCAFEBABE; @@ -116,6 +116,24 @@ namespace vk } } + /* Sets the event from the host via vkSetEvent; requires m_vk_event (ensure), so it is not usable on the buffer-backed fallback. */ void event::host_signal() const + { + ensure(m_vk_event); + vkSetEvent(m_device, m_vk_event); + } + + /* Resets the VkEvent when present; otherwise rewrites the 0xCAFEBABE marker that the constructor stores in the mapped buffer. */ void event::reset() const + { + if (m_vk_event) [[likely]] + { + vkResetEvent(m_device, m_vk_event); + } + else + { + *m_value = 0xCAFEBABE; + } + } + VkResult event::status() const { if (m_vk_event) [[likely]] diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.h b/rpcs3/Emu/RSX/VK/vkutils/sync.h index 6f030a92c4..822781bbad 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/sync.h +++ b/rpcs3/Emu/RSX/VK/vkutils/sync.h @@ -10,6 +10,12 @@ namespace vk { class command_buffer; + /* Backing selector passed to the event constructor: gpu always uses a VkEvent, any permits the vendor-dependent fallback. */ enum class sync_domain + { + any = 0, + gpu = 1 + }; + struct fence { atomic_t flushed = false; @@ -35,10 +41,13 @@ namespace vk volatile u32* m_value = nullptr; public: - event(const render_device& dev); + event(const render_device& dev, sync_domain domain); ~event(); + void signal(const command_buffer& cmd, VkPipelineStageFlags stages, VkAccessFlags access); + void host_signal() const; VkResult status() const; + void reset() const; }; VkResult wait_for_fence(fence* pFence, u64 timeout = 0ull); diff --git a/rpcs3/VKGSRender.vcxproj b/rpcs3/VKGSRender.vcxproj index 5a267231e3..d4aea30e5a 100644 --- a/rpcs3/VKGSRender.vcxproj +++ b/rpcs3/VKGSRender.vcxproj @@ -19,6 +19,7 @@ + @@ -64,6 +65,7 @@ + diff --git a/rpcs3/VKGSRender.vcxproj.filters b/rpcs3/VKGSRender.vcxproj.filters index 8941a504e3..d3cd6cbff3 100644 --- a/rpcs3/VKGSRender.vcxproj.filters +++ b/rpcs3/VKGSRender.vcxproj.filters @@ -61,9 +61,10 @@ vkutils - + - + + @@ -149,6 +150,7 @@ vkutils +