vk/rsx: Tuning and optimization for host labels

kd-11 authored 2022-02-20 01:48:43 +03:00; committed by kd-11
parent 24587ab459
commit da559b5568
3 changed files with 90 additions and 29 deletions

@@ -615,16 +615,23 @@ VKGSRender::VKGSRender() : GSRender()
 	}
 
 	if (backend_config.supports_host_gpu_labels)
 	{
+		if (backend_config.supports_passthrough_dma)
+		{
 			m_host_object_data = std::make_unique<vk::buffer>(*m_device,
-				0x100000,
-				memory_map.device_bar, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
+				0x10000,
+				memory_map.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
 				VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
 				VMM_ALLOCATION_POOL_SYSTEM);
 
 			m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
 			ensure(m_host_data_ptr->magic == 0xCAFEBABE);
+		}
+		else
+		{
+			rsx_log.error("Your GPU/driver does not support extensions required to enable passthrough DMA emulation. Host GPU labels will be disabled.");
+		}
 	}
 }
 
 VKGSRender::~VKGSRender()
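
Note: the hunk above shrinks the label buffer from 1 MiB (0x100000) to 64 KiB (0x10000) and moves it off the device BAR into plain host-visible, host-coherent memory, with a clear error path when passthrough DMA is unsupported. For readers outside the codebase, a minimal sketch of the underlying pattern, persistently mapped coherent memory plus a placement-new'd, magic-checked control block, might look like this; vkMapMemory is real Vulkan API, while the struct and function names are illustrative, not rpcs3's:

#include <vulkan/vulkan.h>
#include <cassert>
#include <cstdint>
#include <new>

struct host_labels_t
{
	std::uint64_t magic = 0xCAFEBABE;
	std::uint64_t event_counter = 0;
	std::uint64_t commands_complete_event = 0;
};

host_labels_t* map_host_labels(VkDevice dev, VkDeviceMemory coherent_mem)
{
	void* ptr = nullptr;

	// Map once and keep the pointer for the allocation's lifetime.
	// HOST_COHERENT memory needs no vkFlushMappedMemoryRanges traffic.
	[[maybe_unused]] const VkResult res = vkMapMemory(dev, coherent_mem, 0, VK_WHOLE_SIZE, 0, &ptr);
	assert(res == VK_SUCCESS && ptr);

	// Construct the shared control block in place; the magic doubles as a
	// cheap proof that the mapping is actually CPU-readable.
	auto* labels = new (ptr) host_labels_t();
	assert(labels->magic == 0xCAFEBABE);
	return labels;
}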
@@ -1493,20 +1500,67 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
 	{
 		// All texture loads already seen by the host GPU
 		// Wait for all previously submitted labels to be flushed
+		if (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
+		{
+			//const u64 wait_start = utils::get_tsc();
 			while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
 			{
 				_mm_pause();
+
+				if (thread_ctrl::state() == thread_state::aborting)
+				{
+					break;
+				}
 			}
+
+			//const u64 now = utils::get_tsc();
+			//const u64 divisor = utils::get_tsc_freq() / 1000000;
+			//const u64 full_duration = (now - m_host_data_ptr->last_label_request_timestamp) / divisor;
+			//const u64 wait_duration = (now - wait_start) / divisor;
+			//rsx_log.error("GPU sync took [%llu, %llu] microseconds to complete", full_duration, wait_duration);
+		}
+
 		return false;
 	}
 
-	m_host_data_ptr->last_label_release_event = ++m_host_data_ptr->event_counter;
 	const auto mapping = vk::map_dma(address, 4);
 	const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
 
+	if (!dynamic_cast<vk::memory_block_host*>(mapping.second->memory.get()))
+	{
+		// NVIDIA GPUs can disappoint when DMA blocks straddle VirtualAlloc boundaries.
+		// Take the L and try the fallback.
+		rsx_log.warning("Host label update at 0x%x was not possible.", address);
+		return false;
+	}
+
+	m_host_data_ptr->last_label_release_event = ++m_host_data_ptr->event_counter;
+	//m_host_data_ptr->last_label_request_timestamp = utils::get_tsc();
+
+	if (m_host_data_ptr->texture_load_request_event > m_host_data_ptr->last_label_submit_event)
+	{
+		if (vk::is_renderpass_open(*m_current_command_buffer))
+		{
+			vk::end_renderpass(*m_current_command_buffer);
+		}
+
 		vkCmdUpdateBuffer(*m_current_command_buffer, mapping.second->value, mapping.first, 4, &write_data);
 		flush_command_queue();
+	}
+	else
+	{
+		auto cmd = m_secondary_cb_list.next();
+		cmd->begin();
+		vkCmdUpdateBuffer(*cmd, mapping.second->value, mapping.first, 4, &write_data);
+		vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
+		cmd->end();
+
+		vk::queue_submit_t submit_info = { m_device->get_graphics_queue(), nullptr };
+		cmd->submit(submit_info);
+
+		m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
+	}
+
 	return true;
 }
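
The protocol at work here is a pair of monotonic counters shared through that mapped buffer: the CPU stamps each label write with ++event_counter into last_label_release_event, the GPU copies the stamp into commands_complete_event as its work drains, and last_label_submit_event records what has actually been pushed to the queue so a redundant submit can be skipped. The ordering fix in the same hunk matters too: the counter is now incremented only after the memory_block_host check, so a label that takes the CPU fallback no longer leaves behind a release event the GPU would never complete. A toy model of the CPU-side wait, including the new abort check that prevents a shutdown deadlock (standard headers are real; the counters and their wiring are illustrative):

#include <atomic>
#include <cstdint>
#include <immintrin.h> // _mm_pause

// Written by the submitting thread when a label is queued.
std::atomic<std::uint64_t> release_event{0};
// Written back by the (simulated) GPU as the work drains.
std::atomic<std::uint64_t> complete_event{0};

void wait_for_gpu(const std::atomic<bool>& aborting)
{
	// Same shape as the loop in release_GCM_label: spin politely with a
	// pause hint, but bail out on shutdown so exit never deadlocks.
	while (release_event.load(std::memory_order_acquire) >
	       complete_event.load(std::memory_order_acquire))
	{
		_mm_pause();

		if (aborting.load(std::memory_order_relaxed))
			break;
	}
}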
@@ -2145,13 +2199,15 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
 		m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
 	}
 
-	if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
+	if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->last_label_submit_event)
 	{
 		vkCmdUpdateBuffer(*m_current_command_buffer,
 			m_host_object_data->value,
 			::offset32(&vk::host_data_t::commands_complete_event),
 			sizeof(u64),
-			const_cast<u64*>(&m_host_data_ptr->event_counter));
+			const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
+
+		m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
 	}
 
 	m_current_command_buffer->end();
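
This hunk re-keys the pending-work test on last_label_submit_event and snapshots last_label_release_event rather than the live event_counter, so the value baked into the command buffer cannot race ahead of what was actually released. The write itself uses vkCmdUpdateBuffer, which embeds the payload in the command stream and is meant for exactly this kind of tiny update (at most 65536 bytes). A standalone sketch of the pattern, with offsetof standing in for ::offset32 and all handles hypothetical:

#include <vulkan/vulkan.h>
#include <cstddef> // offsetof
#include <cstdint>

struct labels_t
{
	std::uint64_t event_counter;
	std::uint64_t commands_complete_event;
};

void signal_commands_complete(VkCommandBuffer cmd, VkBuffer label_buffer, std::uint64_t value)
{
	// vkCmdUpdateBuffer copies the payload into the command buffer itself,
	// so no staging memory is needed for an 8-byte write like this one.
	vkCmdUpdateBuffer(cmd,
		label_buffer,
		offsetof(labels_t, commands_complete_event), // the ::offset32 equivalent
		sizeof(std::uint64_t),
		&value);
}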

@@ -23,7 +23,9 @@ namespace vk
 		u64 texture_load_request_event = 0;
 		u64 texture_load_complete_event = 0;
 		u64 last_label_release_event = 0;
+		u64 last_label_submit_event = 0;
 		u64 commands_complete_event = 0;
+		u64 last_label_request_timestamp = 0;
 	};
 
 	struct fence
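
For reference, the roles of these counters as used in the hunks above. The diff does not show field order outside its context lines, so this annotated copy is a reconstruction, with the roles of the magic and event_counter fields inferred from their call sites:

#include <cstdint>
using u64 = std::uint64_t; // rpcs3's alias

struct host_data_t
{
	u64 magic = 0xCAFEBABE;               // checked after mapping; proves the block is CPU-visible
	u64 event_counter = 0;                // monotonic label ID source, bumped by the CPU only
	u64 texture_load_request_event = 0;   // advanced when texture uploads are queued
	u64 texture_load_complete_event = 0;  // advanced as those uploads finish (per its name)
	u64 last_label_release_event = 0;     // newest label ID handed to the backend
	u64 last_label_submit_event = 0;      // newest label ID actually submitted to the queue (new)
	u64 commands_complete_event = 0;      // written back by the GPU as command buffers drain
	u64 last_label_request_timestamp = 0; // TSC stamp for the commented-out profiling code (new)
};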

@@ -35,24 +35,27 @@ namespace rsx
 		const bool is_flip_sema = (address == (rsx->label_addr + 0x10) || address == (rsx->label_addr + 0x30));
 		if (!is_flip_sema)
 		{
-			if constexpr (FlushPipe)
-			{
-				// Ignoring these can cause very poor performance due to timestamp queries taking too long.
-				rsx->sync();
-			}
-
-			if (rsx->get_backend_config().supports_host_gpu_labels &&
-				rsx->release_GCM_label(address, data))
-			{
-				// Backend will handle it, nothing to do.
-				// Implicitly handles DMA sync.
-				return;
-			}
+			// First, queue the GPU work. If it flushes the queue for us, the following routines will be faster.
+			const bool handled = rsx->get_backend_config().supports_host_gpu_labels && rsx->release_GCM_label(address, data);
 
 			if constexpr (FlushDMA)
 			{
+				// If the backend handled the request, this call will basically be a NOP
 				g_fxo->get<rsx::dma_manager>().sync();
 			}
 
+			if constexpr (FlushPipe)
+			{
+				// Manually flush the pipeline.
+				// It is possible to stream report writes using the host GPU, but that generates too much submit traffic.
+				rsx->sync();
+			}
+
+			if (handled)
+			{
+				// Backend will handle it, nothing to write.
+				return;
+			}
 		}
 
 		vm::_ref<RsxSemaphore>(address).val = data;
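
The rewrite above inverts the old order: the label is handed to the GPU first, so that any flush performed by release_GCM_label makes the subsequent FlushDMA/FlushPipe syncs nearly free, and the CPU-side fallback write happens only when the backend declines. A condensed sketch of that control flow; the backend type and its hooks are stand-ins for illustration, not the real rpcs3 interfaces:

#include <cstdint>

struct rsx_backend
{
	bool supports_host_gpu_labels = false;
	bool try_release_label(std::uint32_t, std::uint32_t) { return false; } // queue a GPU-side write; false = declined
	void sync_dma() {}                                                     // wait on the DMA manager
	void sync_pipeline() {}                                                // wait on the command pipeline
	void cpu_write_label(std::uint32_t, std::uint32_t) {}                  // plain CPU memory write
};

template <bool FlushDMA, bool FlushPipe>
void write_label(rsx_backend& rsx, std::uint32_t addr, std::uint32_t data)
{
	// 1. Hand the write to the GPU first; a submit here makes later syncs cheap.
	const bool handled = rsx.supports_host_gpu_labels && rsx.try_release_label(addr, data);

	// 2. Sync as the method requires. If the backend already flushed for us,
	//    these calls are close to no-ops.
	if constexpr (FlushDMA)
		rsx.sync_dma();
	if constexpr (FlushPipe)
		rsx.sync_pipeline();

	// 3. Fall back to the CPU-side write only when the GPU path declined.
	if (!handled)
		rsx.cpu_write_label(addr, data);
}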
@@ -255,7 +258,7 @@ namespace rsx
 		}
 		else
 		{
-			write_gcm_label<true, false>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
+			write_gcm_label<false, false>(rsx, get_address(offset, method_registers.semaphore_context_dma_4097()), arg);
 		}
 	}
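
Assuming the two template parameters are <FlushDMA, FlushPipe>, as in the sketch above, this last hunk stops forcing a DMA-manager sync on the context-DMA-4097 semaphore path; once the backend queues the label first, the explicit flush is redundant whenever the GPU path handles the write.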