From 8800c104763ca872576cb2beb3f70c31dd758fd4 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Thu, 19 Jul 2018 19:57:01 +0300
Subject: [PATCH] zcull synchronization tweaks

- Implement forced reading when calling update method to sync partial lists
- Defer conditional render evaluation and use a read barrier to avoid extra work
- Fix HLE gcm library when binding tiles & zcull RAM
---
 rpcs3/Emu/Cell/Modules/cellGcmSys.cpp |  2 +
 rpcs3/Emu/RSX/RSXThread.cpp           | 58 +++++++++++++++++++++------
 rpcs3/Emu/RSX/RSXThread.h             | 15 +++++--
 rpcs3/Emu/RSX/VK/VKGSRender.cpp       | 40 ++++++++++++++++--
 rpcs3/Emu/RSX/VK/VKGSRender.h         | 25 ++++++++----
 rpcs3/Emu/RSX/rsx_methods.cpp         |  9 ++---
 6 files changed, 116 insertions(+), 33 deletions(-)

diff --git a/rpcs3/Emu/Cell/Modules/cellGcmSys.cpp b/rpcs3/Emu/Cell/Modules/cellGcmSys.cpp
index 87e7458cf4..11ed6fc465 100644
--- a/rpcs3/Emu/Cell/Modules/cellGcmSys.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellGcmSys.cpp
@@ -698,6 +698,7 @@ void cellGcmSetZcull(u8 index, u32 offset, u32 width, u32 height, u32 cullStart,
 	zcull.sFunc = sFunc;
 	zcull.sRef = sRef;
 	zcull.sMask = sMask;
+	zcull.binded = (zCullFormat > 0);
 
 	vm::_ptr<CellGcmZcullInfo>(m_config->zculls_addr)[index] = zcull.pack();
 }
@@ -1261,6 +1262,7 @@ s32 cellGcmSetTile(u8 index, u8 location, u32 offset, u32 size, u32 pitch, u8 co
 	tile.comp = comp;
 	tile.base = base;
 	tile.bank = bank;
+	tile.binded = (pitch > 0);
 
 	vm::_ptr<CellGcmTileInfo>(m_config->tiles_addr)[index] = tile.pack();
 
 	return CELL_OK;
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index cd0952dd6c..af0034f7f3 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -275,6 +275,15 @@ namespace rsx
 
 	void thread::begin()
 	{
+		if (conditional_render_enabled && conditional_render_test_address)
+		{
+			// Evaluate conditional rendering test
+			zcull_ctrl->read_barrier(this, conditional_render_test_address, 4);
+			vm::ptr<CellGcmReportData> result = vm::cast(conditional_render_test_address);
+			conditional_render_test_failed = (result->value == 0);
+			conditional_render_test_address = 0;
+		}
+
 		rsx::method_registers.current_draw_clause.inline_vertex_array.resize(0);
 		in_begin_end = true;
@@ -2719,7 +2728,8 @@ namespace rsx
 			{
 				verify(HERE), query->pending;
 
-				if (!result && query->num_draws)
+				const bool implemented = (writer.type == CELL_GCM_ZPASS_PIXEL_CNT || writer.type == CELL_GCM_ZCULL_STATS3);
+				if (implemented && !result && query->num_draws)
 				{
 					get_occlusion_query_result(query);
@@ -2784,13 +2794,23 @@ namespace rsx
 		m_cycles_delay = min_zcull_cycles_delay;
 	}
 
-	void ZCULL_control::update(::rsx::thread* ptimer)
+	void ZCULL_control::update(::rsx::thread* ptimer, u32 sync_address)
 	{
 		m_tsc++;
 
 		if (m_pending_writes.empty())
 			return;
 
+		if (!sync_address)
+		{
+			const auto& front = m_pending_writes.front();
+			if (!front.sink || m_tsc < front.due_tsc)
+			{
+				// Avoid spamming backend with report status updates
+				return;
+			}
+		}
+
 		u32 stat_tag_to_remove = m_statistics_tag_id;
 		u32 processed = 0;
 		for (auto &writer : m_pending_writes)
@@ -2810,13 +2830,21 @@ namespace rsx
 			auto query = writer.query;
 			u32 result = m_statistics_map[writer.counter_tag];
 
+			const bool force_read = (sync_address != 0);
+			if (force_read && writer.sink == sync_address)
+			{
+				// Forced reads end here
+				sync_address = 0;
+			}
+
 			if (query)
 			{
 				verify(HERE), query->pending;
 
-				if (UNLIKELY(writer.due_tsc < m_tsc))
+				const bool implemented = (writer.type == CELL_GCM_ZPASS_PIXEL_CNT || writer.type == CELL_GCM_ZCULL_STATS3);
+				if (force_read || writer.due_tsc < m_tsc)
 				{
-					if (!result && query->num_draws)
+					if (implemented && !result && query->num_draws)
 					{
 						get_occlusion_query_result(query);
@@ -2834,12 +2862,7 @@ namespace rsx
 				}
 				else
 				{
-					if (result || !query->num_draws)
-					{
-						//Not necessary to read the result anymore
-						discard_occlusion_query(query);
-					}
-					else
+					if (implemented && !result && query->num_draws)
 					{
 						//Maybe we get lucky and results are ready
 						if (check_occlusion_query_status(query))
@@ -2857,6 +2880,11 @@ namespace rsx
 							break;
 						}
 					}
+					else
+					{
+						//Not necessary to read the result anymore
+						discard_occlusion_query(query);
+					}
 				}
 
 				query->pending = false;
@@ -2903,14 +2931,20 @@ namespace rsx
 			return;
 
 		const auto memory_end = memory_address + memory_range;
+		u32 sync_address = 0;
+
 		for (const auto &writer : m_pending_writes)
 		{
 			if (writer.sink >= memory_address && writer.sink < memory_end)
 			{
-				sync(ptimer);
-				return;
+				sync_address = writer.sink;
 			}
 		}
+
+		if (sync_address)
+		{
+			update(ptimer, sync_address);
+		}
 	}
 }
diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index 16cf1309c0..fbfdd0f814 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -118,6 +118,11 @@ namespace rsx
 		lock_wait = 4 // Puller is processing a lock acquire
 	};
 
+	enum FIFO_hint : u8
+	{
+		hint_conditional_render_eval = 1
+	};
+
 	u32 get_vertex_type_size_on_host(vertex_base_type type, u32 size);
 
 	u32 get_address(u32 offset, u32 location);
@@ -230,7 +235,7 @@ namespace rsx
 		queued_report_write* forwarder;
 		vm::addr_t sink;
 
-		u32 due_tsc;
+		u64 due_tsc;
 	};
 
 	struct ZCULL_control
@@ -249,7 +254,7 @@ namespace rsx
 		occlusion_query_info* m_current_task = nullptr;
 		u32 m_statistics_tag_id = 0;
-		u32 m_tsc = 0;
+		u64 m_tsc = 0;
 		u32 m_cycles_delay = max_zcull_cycles_delay;
 
 		std::vector<queued_report_write> m_pending_writes;
@@ -278,8 +283,8 @@ namespace rsx
 		// Conditionally sync any pending writes if range overlaps
 		void read_barrier(class ::rsx::thread* ptimer, u32 memory_address, u32 memory_range);
 
-		// Call once every 'tick' to update
-		void update(class ::rsx::thread* ptimer);
+		// Call once every 'tick' to update, optional address provided to partially sync until address is processed
+		void update(class ::rsx::thread* ptimer, u32 sync_address = 0);
 
 		// Draw call notification
 		void on_draw();
@@ -433,6 +438,7 @@ namespace rsx
 
 		atomic_t async_tasks_pending{ 0 };
 
+		u32 conditional_render_test_address = 0;
 		bool conditional_render_test_failed = false;
 		bool conditional_render_enabled = false;
 		bool zcull_stats_enabled = false;
@@ -482,6 +488,7 @@ namespace rsx
 		// sync
 		void sync();
 		void read_barrier(u32 memory_address, u32 memory_range);
+		virtual void sync_hint(FIFO_hint hint) {}
 
 		gsl::span get_raw_index_array(const std::vector >& draw_indexed_clause) const;
 		gsl::span get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const std::vector>& vertex_ranges) const;
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index 6ad1445df7..902fffd462 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -816,19 +816,29 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 
 	if (sync_timestamp > 0)
 	{
-		//Wait for earliest cb submitted after the sync timestamp to finish
+		// Wait for earliest cb submitted after the sync timestamp to finish
 		command_buffer_chunk *target_cb = nullptr;
 		for (auto &cb : m_primary_cb_list)
 		{
-			if (cb.pending && cb.last_sync >= sync_timestamp)
+			if (cb.last_sync >= sync_timestamp)
 			{
+				if (!cb.pending)
+				{
+					target_cb = nullptr;
+					break;
+				}
+				if (target_cb == nullptr || target_cb->last_sync > cb.last_sync)
+				{
					target_cb = &cb;
+				}
 			}
 		}
 
 		if (target_cb)
+		{
 			target_cb->wait();
+		}
 	}
 
 	if (has_queue_ref)
@@ -1435,6 +1445,8 @@ void VKGSRender::end()
 		m_occlusion_query_pool.begin_query(*m_current_command_buffer, occlusion_id);
 		m_occlusion_map[m_active_query_info->driver_handle].indices.push_back(occlusion_id);
 		m_occlusion_map[m_active_query_info->driver_handle].command_buffer_to_wait = m_current_command_buffer;
+
+		m_current_command_buffer->flags |= cb_has_occlusion_task;
 	}
 
 	if (!upload_info.index_info)
@@ -1486,6 +1498,7 @@ void VKGSRender::end()
 	close_render_pass();
 	vk::leave_uninterruptible();
 
+	m_current_command_buffer->num_draws++;
 	m_rtts.on_write();
 
 	std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
@@ -1834,6 +1847,22 @@ void VKGSRender::flush_command_queue(bool hard_sync)
 	open_command_buffer();
 }
 
+void VKGSRender::sync_hint(rsx::FIFO_hint hint)
+{
+	if (hint == rsx::FIFO_hint::hint_conditional_render_eval)
+	{
+		if (m_current_command_buffer->flags & cb_has_occlusion_task)
+		{
+			// Occlusion test result evaluation is coming up, avoid a hard sync
+			if (!m_flush_requests.pending())
+			{
+				m_flush_requests.post(false);
+				m_flush_requests.remove_one();
+			}
+		}
+	}
+}
+
 void VKGSRender::advance_queued_frames()
 {
 	//Check all other frames for completion and clear resources
@@ -3290,8 +3319,11 @@ void VKGSRender::end_occlusion_query(rsx::reports::occlusion_query_info* query)
 	//Avoid stalling later if this query is already tied to a report
 	if (query->num_draws && query->owned && !m_flush_requests.pending())
 	{
-		m_flush_requests.post(false);
-		m_flush_requests.remove_one();
+		if (0)//m_current_command_buffer->flags & cb_has_occlusion_task)
+		{
+			m_flush_requests.post(false);
+			m_flush_requests.remove_one();
+		}
 	}
 }
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index 7e372703a6..e4c06181c2 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -47,11 +47,19 @@ namespace vk
 
 extern u64 get_system_time();
 
+enum command_buffer_data_flag
+{
+	cb_has_occlusion_task = 1
+};
+
 struct command_buffer_chunk: public vk::command_buffer
 {
 	VkFence submit_fence = VK_NULL_HANDLE;
 	VkDevice m_device = VK_NULL_HANDLE;
 
+	u32 num_draws = 0;
+	u32 flags = 0;
+
 	std::atomic_bool pending = { false };
 	std::atomic<u64> last_sync = { 0 };
 	shared_mutex guard_mutex;
@@ -90,12 +98,17 @@ struct command_buffer_chunk: public vk::command_buffer
 			wait();
 
 		CHECK_RESULT(vkResetCommandBuffer(commands, 0));
+		num_draws = 0;
+		flags = 0;
 	}
 
 	bool poke()
 	{
 		reader_lock lock(guard_mutex);
 
+		if (!pending)
+			return true;
+
 		if (vkGetFenceStatus(m_device, submit_fence) == VK_SUCCESS)
 		{
 			lock.upgrade();
@@ -117,14 +130,8 @@ struct command_buffer_chunk: public vk::command_buffer
 		if (!pending)
 			return;
 
-		switch(vkGetFenceStatus(m_device, submit_fence))
-		{
-		case VK_SUCCESS:
-			break;
-		case VK_NOT_READY:
-			CHECK_RESULT(vkWaitForFences(m_device, 1, &submit_fence, VK_TRUE, UINT64_MAX));
-			break;
-		}
+		// NOTE: vkWaitForFences is slower than polling fence status at least on NV
+		while (vkGetFenceStatus(m_device, submit_fence) == VK_NOT_READY);
 
 		lock.upgrade();
@@ -406,6 +413,8 @@ public:
 	void write_buffers();
 	void set_viewport();
 
+	void sync_hint(rsx::FIFO_hint hint) override;
+
 	void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
 	void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
 	bool check_occlusion_query_status(rsx::reports::occlusion_query_info* query) override;
diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp
index 3311fec6ed..cc93b3426f 100644
--- a/rpcs3/Emu/RSX/rsx_methods.cpp
+++ b/rpcs3/Emu/RSX/rsx_methods.cpp
@@ -521,9 +521,10 @@ namespace rsx
 				return;
 			}
 
-			rsx->sync();
-			vm::ptr<CellGcmReportData> result = address_ptr;
-			rsx->conditional_render_test_failed = (result->value == 0);
+			// Defer conditional render evaluation
+			rsx->sync_hint(FIFO_hint::hint_conditional_render_eval);
+			rsx->conditional_render_test_address = address_ptr;
+			rsx->conditional_render_test_failed = false;
 		}
 
 		void set_zcull_render_enable(thread* rsx, u32, u32 arg)
@@ -1809,8 +1810,6 @@ namespace rsx
 		bind();
 		bind();
 		bind();
-		bind();
-		bind();
 		bind();
 		bind();
 		bind();
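
Notes (not part of the patch): the sketch below is a minimal, self-contained illustration of the deferred conditional-render evaluation the patch introduces — the FIFO method handler only records the report address and hints the backend, and the test is resolved lazily at the start of the next draw through a ranged read barrier. All names here (fifo_state, set_conditional_render, begin_draw and the callback parameters) are simplified stand-ins for illustration, not the actual RPCS3 interfaces.

// Sketch of the deferred conditional-render evaluation pattern, under the
// assumptions stated above.
#include <cstdint>
#include <functional>

struct fifo_state
{
	bool conditional_render_enabled = false;
	bool conditional_render_test_failed = false;
	std::uint32_t conditional_render_test_address = 0;
};

// FIFO method handler: instead of a hard sync, remember where the report will
// land and give the backend a chance to flush pending occlusion work early.
void set_conditional_render(fifo_state& rsx, std::uint32_t report_address,
                            const std::function<void()>& sync_hint)
{
	sync_hint();                                 // soft hint, no stall
	rsx.conditional_render_test_address = report_address;
	rsx.conditional_render_test_failed = false;  // assume pass until evaluated
}

// Start of the next draw: resolve the test lazily, syncing only the report
// that overlaps the stored address.
void begin_draw(fifo_state& rsx,
                const std::function<void(std::uint32_t, std::uint32_t)>& read_barrier,
                const std::function<std::uint32_t(std::uint32_t)>& report_value)
{
	if (rsx.conditional_render_enabled && rsx.conditional_render_test_address)
	{
		read_barrier(rsx.conditional_render_test_address, 4);  // partial sync, not a full flush
		rsx.conditional_render_test_failed =
			(report_value(rsx.conditional_render_test_address) == 0);
		rsx.conditional_render_test_address = 0;
	}
}

The point of the design is that the expensive wait only happens if a draw actually follows while the test address is still pending, and even then only the overlapping report is forced out (update with a sync_address in the patch) rather than a full sync of every pending write.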