vk: Add the async task scheduler

This commit is contained in:
kd-11 2021-02-28 21:11:33 +03:00 committed by kd-11
parent cd6ef2958b
commit 77e312fb99
13 changed files with 292 additions and 25 deletions

View file

@ -462,6 +462,7 @@ if(TARGET 3rdparty_vulkan)
RSX/VK/vkutils/device.cpp
RSX/VK/vkutils/sampler.cpp
RSX/VK/vkutils/shared.cpp
RSX/VK/VKAsyncScheduler.cpp
RSX/VK/VKCommandStream.cpp
RSX/VK/VKCommonDecompiler.cpp
RSX/VK/VKCompute.cpp

View file

@ -0,0 +1,161 @@
#include "VKAsyncScheduler.h"
#include "VKHelpers.h"
#include "VKResourceManager.h"
#include "Emu/IdManager.h"
#include "Utilities/lockless.h"
#include "Utilities/mutex.h"
#include <vector>
namespace vk
{
// Worker-thread entry point (named_thread body). Drains the cross-queue
// event queue and bridges each GPU-side signal (queue1) to its host-raised
// twin (queue2) so consumers on the primary queue can observe completion.
void AsyncTaskScheduler::operator()()
{
// Keep this object alive for the duration of the thread body; the
// destructor spins on has_refs() until release() runs below.
add_ref();
while (thread_ctrl::state() != thread_state::aborting)
{
// NOTE(review): if pop_all() returns immediately on an empty queue this
// loop busy-spins; confirm lf_queue provides blocking/backoff semantics.
for (auto&& job : m_event_queue.pop_all())
{
// Wait for the transfer queue to reach the recorded signal, then
// raise the host-visible counterpart.
vk::wait_for_event(job->queue1_signal.get(), GENERAL_WAIT_TIMEOUT);
job->queue2_signal->host_signal();
}
}
release();
}
void AsyncTaskScheduler::delayed_init()
{
auto pdev = get_current_renderer();
m_command_pool.create(*const_cast<render_device*>(pdev), pdev->get_transfer_queue_family());
for (usz i = 0; i < events_pool_size; ++i)
{
auto ev1 = std::make_unique<event>(*get_current_renderer(), sync_domain::gpu);
auto ev2 = std::make_unique<event>(*get_current_renderer(), sync_domain::gpu);
m_events_pool.emplace_back(std::move(ev1), std::move(ev2), 0ull);
}
}
// Takes the next event pair from the ring, records a GPU-side signal on the
// current async command buffer, and hands the pair to the worker thread.
// Callers (get_primary_sync_label, flush) hold m_submit_mutex, which guards
// the ring cursor and m_sync_label.
void AsyncTaskScheduler::insert_sync_event()
{
ensure(m_current_cb);
xqueue_event* sync_label;
ensure(m_next_event_id < events_pool_size);
sync_label = &m_events_pool[m_next_event_id];
if (++m_next_event_id == events_pool_size)
{
// Wrap
m_next_event_id = 0;
}
// A slot may only be reused once the eid stamped on its previous use has
// retired; with events_pool_size (16384) slots in flight this should hold.
ensure(sync_label->completion_eid <= vk::last_completed_event_id());
sync_label->queue1_signal->reset();
sync_label->queue2_signal->reset();
sync_label->completion_eid = vk::current_event_id();
// NOTE(review): TOP_OF_PIPE means the event signals as soon as prior
// commands are reached, not when their work completes - confirm intended.
sync_label->queue1_signal->signal(*m_current_cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0);
m_event_queue.push(sync_label);
// Publish queue2 as the label the primary queue should wait on.
m_sync_label = sync_label->queue2_signal.get();
}
// Tears down the scheduler: stops the worker thread, then destroys Vulkan
// resources in dependency order (command buffers before their pool).
AsyncTaskScheduler::~AsyncTaskScheduler()
{
// Request thread shutdown, then spin until operator() drops its ref
// (add_ref()/release() bracket the thread body).
*g_fxo->get<async_scheduler_thread>() = thread_state::aborting;
while (has_refs()) _mm_pause();
for (auto& cb : m_async_command_queue)
{
cb.destroy();
}
m_async_command_queue.clear();
m_next_cb_index = 0;
m_command_pool.destroy();
m_events_pool.clear();
}
// Returns the command buffer async jobs should record into, lazily creating
// the pool and backing resources on first use. Buffers are handed out
// round-robin, growing the queue up to VK_MAX_ASYNC_COMPUTE_QUEUES entries
// before wrapping. Fix: removed the unused local 'pdev' - it was fetched
// via get_current_renderer() but never read.
command_buffer* AsyncTaskScheduler::get_current()
{
	std::lock_guard lock(m_submit_mutex);

	// Any recording implies a sync event must be emitted before the primary
	// queue consumes the results (see get_primary_sync_label).
	m_sync_required = true;

	// 0. Anything still active?
	if (m_current_cb)
	{
		return m_current_cb;
	}

	// 1. Check if there is a 'next' entry
	if (m_async_command_queue.empty())
	{
		delayed_init();
	}
	else if (m_next_cb_index < m_async_command_queue.size())
	{
		m_current_cb = &m_async_command_queue[m_next_cb_index];
	}

	// 2. Create entry
	if (!m_current_cb)
	{
		if (m_next_cb_index == VK_MAX_ASYNC_COMPUTE_QUEUES)
		{
			// Hard cap reached; wrap and reuse the oldest buffer.
			// NOTE(review): nothing here checks the reused buffer has finished
			// executing - confirm the queue depth guarantees completion.
			m_next_cb_index = 0;
			m_current_cb = &m_async_command_queue[m_next_cb_index];
		}
		else
		{
			m_async_command_queue.push_back({});
			m_current_cb = &m_async_command_queue.back();
			m_current_cb->create(m_command_pool, true);
		}
	}

	m_next_cb_index++;
	return m_current_cb;
}
// Returns (and consumes) the event the primary queue must wait on before
// reading results produced on the transfer queue, or nullptr when no async
// work was recorded since the last call. Emits the sync event lazily.
event* AsyncTaskScheduler::get_primary_sync_label()
{
	std::lock_guard lock(m_submit_mutex);

	if (std::exchange(m_sync_required, false))
	{
		ensure(m_current_cb);
		insert_sync_event();
	}

	// Hand the label to the caller exactly once.
	auto result = m_sync_label;
	m_sync_label = nullptr;
	return result;
}
// Closes the current async command buffer and submits it to the transfer
// queue, optionally waiting on 'wait_semaphore' at 'wait_dst_stage_mask'.
// No-op when nothing has been recorded since the last flush.
void AsyncTaskScheduler::flush(VkSemaphore wait_semaphore, VkPipelineStageFlags wait_dst_stage_mask)
{
std::lock_guard lock(m_submit_mutex);
if (!m_current_cb)
{
return;
}
// Emit the pending sync event before closing the buffer so waiters that
// already fetched a label via get_primary_sync_label() will be signaled.
if (m_sync_required)
{
insert_sync_event();
}
m_current_cb->end();
// NOTE(review): assumes submit(queue, wait_sema, signal_sema, fence, ...)
// ordering - verify against command_buffer::submit's declaration.
m_current_cb->submit(get_current_renderer()->get_transfer_queue(), wait_semaphore, VK_NULL_HANDLE, nullptr, wait_dst_stage_mask, VK_FALSE);
m_last_used_cb = m_current_cb;
m_current_cb = nullptr;
m_sync_required = false;
}
}

View file

@ -0,0 +1,60 @@
#pragma once
#include "vkutils/commands.h"
#include "vkutils/sync.h"
#include "Utilities/Thread.h"
#define VK_MAX_ASYNC_COMPUTE_QUEUES 256
namespace vk
{
// Cross-queue sync slot: pairs a GPU-signaled event (queue1_signal, set on
// the transfer queue) with a host-raised twin (queue2_signal) that the
// primary queue waits on, plus the event id stamped at last use.
struct xqueue_event
{
	std::unique_ptr<event> queue1_signal;
	std::unique_ptr<event> queue2_signal;

	// Default to 0 ("always retired") so a slot constructed without an
	// explicit value never holds an indeterminate eid; insert_sync_event
	// compares this against last_completed_event_id().
	u64 completion_eid = 0;
};
// Schedules work on the device's transfer queue asynchronously to the main
// graphics queue, and hands out sync labels so the primary queue can order
// against it. Also runs as a named_thread (operator()) that converts
// GPU-side event signals into host-visible ones.
class AsyncTaskScheduler : private rsx::ref_counted
{
// Vulkan resources
std::vector<command_buffer> m_async_command_queue;
command_pool m_command_pool;
// Running state
// m_last_used_cb: last buffer submitted by flush(); m_current_cb: buffer
// currently open for recording (null when flushed).
command_buffer* m_last_used_cb = nullptr;
command_buffer* m_current_cb = nullptr;
usz m_next_cb_index = 0;
// Sync
// Pending label handed out (and consumed) by get_primary_sync_label().
event* m_sync_label = nullptr;
// Set when recording starts; cleared once a sync event is emitted.
bool m_sync_required = false;
static constexpr u32 events_pool_size = 16384;
std::vector<xqueue_event> m_events_pool;
// Ring cursor into m_events_pool; guarded by m_submit_mutex in callers.
atomic_t<u32> m_next_event_id = 0;
// Producer: insert_sync_event; consumer: the worker thread (operator()).
lf_queue<xqueue_event*> m_event_queue;
shared_mutex m_submit_mutex;
void delayed_init();
void insert_sync_event();
public:
AsyncTaskScheduler() = default;
~AsyncTaskScheduler();
command_buffer* get_current();
event* get_primary_sync_label();
void flush(VkSemaphore wait_semaphore = VK_NULL_HANDLE, VkPipelineStageFlags wait_dst_stage_mask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
// Thread entry-point
void operator()();
static constexpr auto thread_name = "Vulkan Async Scheduler"sv;
};
using async_scheduler_thread = named_thread<AsyncTaskScheduler>;
}

View file

@ -1,13 +1,15 @@
#include "stdafx.h"
#include "../Overlays/overlay_shader_compile_notification.h"
#include "../Overlays/Shaders/shader_loading_dialog_native.h"
#include "VKGSRender.h"
#include "VKHelpers.h"
#include "VKAsyncScheduler.h"
#include "VKCommandStream.h"
#include "VKCommonDecompiler.h"
#include "VKCompute.h"
#include "VKGSRender.h"
#include "VKHelpers.h"
#include "VKRenderPass.h"
#include "VKResourceManager.h"
#include "VKCommandStream.h"
#include "vkutils/buffer_object.h"
#include "vkutils/scratch.h"
@ -501,6 +503,8 @@ VKGSRender::VKGSRender() : GSRender()
m_shaders_cache = std::make_unique<vk::shader_cache>(*m_prog_buffer, "vulkan", "v1.91");
g_fxo->init<vk::async_scheduler_thread>();
open_command_buffer();
for (u32 i = 0; i < m_swapchain->get_swap_image_count(); ++i)
@ -1931,6 +1935,9 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
const bool sync_success = g_fxo->get<rsx::dma_manager>().sync();
const VkBool32 force_flush = !sync_success;
// Flush any asynchronously scheduled jobs
g_fxo->get<vk::async_scheduler_thread>()->flush();
if (vk::test_status_interrupt(vk::heap_dirty))
{
if (m_attrib_ring_info.is_dirty() ||

View file

@ -68,7 +68,7 @@ namespace vk
{
upload_contents_async = 1,
initialize_image_layout = 2,
preserve_image_layout = 3,
preserve_image_layout = 4,
// meta-flags
upload_contents_inline = 0,

View file

@ -10,6 +10,7 @@ namespace vk
resource_manager g_resource_manager;
atomic_t<u64> g_event_ctr;
atomic_t<u64> g_last_completed_event;
constexpr u64 s_vmm_warn_threshold_size = 2000 * 0x100000; // Warn if allocation on a single heap exceeds this value
@ -28,6 +29,11 @@ namespace vk
return g_event_ctr.load();
}
u64 last_completed_event_id()
{
return g_last_completed_event.load();
}
void on_event_completed(u64 event_id, bool flush)
{
if (!flush && g_cfg.video.multithreaded_rsx)
@ -40,6 +46,7 @@ namespace vk
}
g_resource_manager.eid_completed(event_id);
g_last_completed_event = std::max(event_id, g_last_completed_event.load());
}
static constexpr f32 size_in_GiB(u64 size)

View file

@ -11,6 +11,7 @@ namespace vk
{
u64 get_event_id();
u64 current_event_id();
u64 last_completed_event_id();
void on_event_completed(u64 event_id, bool flush = false);
struct eid_scope_t

View file

@ -1,8 +1,9 @@
#include "stdafx.h"
#include "VKHelpers.h"
#include "VKFormats.h"
#include "VKAsyncScheduler.h"
#include "VKCompute.h"
#include "VKDMA.h"
#include "VKHelpers.h"
#include "VKFormats.h"
#include "VKRenderPass.h"
#include "VKRenderTargets.h"
@ -800,12 +801,11 @@ namespace vk
static const vk::command_buffer& prepare_for_transfer(const vk::command_buffer& primary_cb, vk::image* dst_image, rsx::flags32_t& flags)
{
const vk::command_buffer* pcmd = nullptr;
#if 0
if (flags & image_upload_options::upload_contents_async)
{
auto cb = vk::async_transfer_get_current();
cb->begin();
pcmd = cb;
auto async_cmd = g_fxo->get<vk::async_scheduler_thread>()->get_current();
async_cmd->begin();
pcmd = async_cmd;
if (!(flags & image_upload_options::preserve_image_layout))
{
@ -813,7 +813,6 @@ namespace vk
}
}
else
#endif
{
if (vk::is_renderpass_open(primary_cb))
{

View file

@ -137,7 +137,7 @@ namespace vk
src->pop_layout(cmd);
// Create event object for this transfer and queue signal op
dma_fence = std::make_unique<vk::event>(*m_device);
dma_fence = std::make_unique<vk::event>(*m_device, sync_domain::any);
dma_fence->signal(cmd, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
// Set cb flag for queued dma operations

View file

@ -58,10 +58,10 @@ namespace vk
return (handle != VK_NULL_HANDLE);
}
event::event(const render_device& dev)
event::event(const render_device& dev, sync_domain domain)
{
m_device = dev;
if (dev.gpu().get_driver_vendor() != driver_vendor::AMD)
if (domain == sync_domain::gpu || dev.gpu().get_driver_vendor() != driver_vendor::AMD)
{
VkEventCreateInfo info
{
@ -75,14 +75,14 @@ namespace vk
{
// Work around AMD's broken event signals
m_buffer = std::make_unique<buffer>
(
dev,
4,
dev.get_memory_mapping().host_visible_coherent,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT,
0
);
(
dev,
4,
dev.get_memory_mapping().host_visible_coherent,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT,
0
);
m_value = reinterpret_cast<u32*>(m_buffer->map(0, 4));
*m_value = 0xCAFEBABE;
@ -116,6 +116,24 @@ namespace vk
}
}
// Signals the event from the CPU. Only valid for events backed by a real
// VkEvent; the AMD buffer-fallback path has no host signal here (ensured).
void event::host_signal() const
{
ensure(m_vk_event);
vkSetEvent(m_device, m_vk_event);
}
// Returns the event to the unsignaled state so the object can be reused.
void event::reset() const
{
if (m_vk_event) [[likely]]
{
vkResetEvent(m_device, m_vk_event);
}
else
{
// AMD fallback: event state lives in a mapped buffer; restore the
// canary written by the constructor (presumably what status() checks).
*m_value = 0xCAFEBABE;
}
}
VkResult event::status() const
{
if (m_vk_event) [[likely]]

View file

@ -10,6 +10,12 @@ namespace vk
{
class command_buffer;
// Scope an event participates in. 'gpu' forces a native VkEvent even on AMD
// (where host-visible events are otherwise emulated via a buffer write in
// the event constructor); 'any' permits the buffer fallback for host access.
enum class sync_domain
{
any = 0,
gpu = 1
};
struct fence
{
atomic_t<bool> flushed = false;
@ -35,10 +41,13 @@ namespace vk
volatile u32* m_value = nullptr;
public:
event(const render_device& dev);
event(const render_device& dev, sync_domain domain);
~event();
void signal(const command_buffer& cmd, VkPipelineStageFlags stages, VkAccessFlags access);
void host_signal() const;
VkResult status() const;
void reset() const;
};
VkResult wait_for_fence(fence* pFence, u64 timeout = 0ull);

View file

@ -19,6 +19,7 @@
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Emu\RSX\VK\VKAsyncScheduler.h" />
<ClInclude Include="Emu\RSX\VK\VKCommandStream.h" />
<ClInclude Include="Emu\RSX\VK\VKCommonDecompiler.h" />
<ClInclude Include="Emu\RSX\VK\VKCompute.h" />
@ -64,6 +65,7 @@
<ClInclude Include="Emu\RSX\VK\VulkanAPI.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="Emu\RSX\VK\VKAsyncScheduler.cpp" />
<ClCompile Include="Emu\RSX\VK\VKCommandStream.cpp" />
<ClCompile Include="Emu\RSX\VK\VKCommonDecompiler.cpp" />
<ClCompile Include="Emu\RSX\VK\VKCompute.cpp" />

View file

@ -64,6 +64,7 @@
</ClCompile>
<ClCompile Include="Emu\RSX\VK\VKOverlays.cpp" />
<ClCompile Include="Emu\RSX\VK\VKCompute.cpp" />
<ClCompile Include="Emu\RSX\VK\VKAsyncScheduler.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="Emu\RSX\VK\VKCommonDecompiler.h" />
@ -149,6 +150,7 @@
<ClInclude Include="Emu\RSX\VK\vkutils\image_helpers.h">
<Filter>vkutils</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\VK\VKAsyncScheduler.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="vkutils">