From 9d981de96d6084e68eae139b483f6d6bffc48320 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Sun, 25 Aug 2019 18:47:49 +0300
Subject: [PATCH] rsx: Fix offloader deadlock

- Do not allow the offloader to handle its own faults; serialize them on
  the RSX thread instead. This approach introduces a GPU race condition
  that should be avoided with improved synchronization.
- TODO: Use proper GPU-side synchronization to avoid this situation
---
 rpcs3/Emu/RSX/GL/GLGSRender.cpp | 18 ++++---
 rpcs3/Emu/RSX/GL/GLGSRender.h   |  5 +-
 rpcs3/Emu/RSX/RSXOffload.cpp    | 92 +++++++++++++++++++++++++++++----
 rpcs3/Emu/RSX/RSXOffload.h      | 11 ++++
 rpcs3/Emu/RSX/RSXThread.cpp     | 23 +--------
 rpcs3/Emu/RSX/RSXThread.h       | 27 ++++------
 rpcs3/Emu/RSX/VK/VKGSRender.cpp | 65 ++++++++++++++++++-----
 rpcs3/Emu/RSX/VK/VKGSRender.h   | 14 ++++-
 8 files changed, 183 insertions(+), 72 deletions(-)

diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index f4d3ba3387..38b95c5388 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -895,7 +895,6 @@ void GLGSRender::on_init_thread()
 	m_video_output_pass.create();
 
 	m_gl_texture_cache.initialize();
-	m_thread_id = std::this_thread::get_id();
 
 	if (!supports_native_ui)
 	{
@@ -1821,7 +1820,7 @@ void GLGSRender::flip(int buffer, bool emu_flip)
 
 bool GLGSRender::on_access_violation(u32 address, bool is_writing)
 {
-	const bool can_flush = (std::this_thread::get_id() == m_thread_id);
+	const bool can_flush = (std::this_thread::get_id() == m_rsx_thread);
 	const rsx::invalidation_cause cause = is_writing ? (can_flush ? rsx::invalidation_cause::write : rsx::invalidation_cause::deferred_write) : (can_flush ? rsx::invalidation_cause::read : rsx::invalidation_cause::deferred_read);
@@ -1848,14 +1847,13 @@ bool GLGSRender::on_access_violation(u32 address, bool is_writing)
 	return true;
 }
 
-void GLGSRender::on_invalidate_memory_range(const utils::address_range &range)
+void GLGSRender::on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause)
 {
-	//Discard all memory in that range without bothering with writeback (Force it for strict?)
 	gl::command_context cmd{ gl_state };
-	auto data = std::move(m_gl_texture_cache.invalidate_range(cmd, range, rsx::invalidation_cause::unmap));
+	auto data = std::move(m_gl_texture_cache.invalidate_range(cmd, range, cause));
 	AUDIT(data.empty());
 
-	if (data.violation_handled)
+	if (cause == rsx::invalidation_cause::unmap && data.violation_handled)
 	{
 		m_gl_texture_cache.purge_unreleased_sections();
 		{
@@ -1865,6 +1863,14 @@ void GLGSRender::on_invalidate_memory_range(const utils::address_range &range)
 	}
 }
 
+void GLGSRender::on_semaphore_acquire_wait()
+{
+	if (!work_queue.empty())
+	{
+		do_local_task(rsx::FIFO_state::lock_wait);
+	}
+}
+
 void GLGSRender::do_local_task(rsx::FIFO_state state)
 {
 	if (!work_queue.empty())
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h
index 485b4b25f6..8641d7c660 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.h
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.h
@@ -327,8 +327,6 @@ private:
 	shared_mutex queue_guard;
 	std::list<work_item> work_queue;
 
-	std::thread::id m_thread_id;
-
 	GLProgramBuffer m_prog_buffer;
 	draw_context_t m_decompiler_context;
@@ -397,8 +395,9 @@ protected:
 	void do_local_task(rsx::FIFO_state state) override;
 
 	bool on_access_violation(u32 address, bool is_writing) override;
-	void on_invalidate_memory_range(const utils::address_range &range) override;
+	void on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause) override;
 	void notify_tile_unbound(u32 tile) override;
+	void on_semaphore_acquire_wait() override;
 
 	std::array<std::vector<gsl::byte>, 4> copy_render_targets_to_memory() override;
 	std::array<std::vector<gsl::byte>, 2> copy_depth_stencil_buffer_to_memory() override;
diff --git a/rpcs3/Emu/RSX/RSXOffload.cpp b/rpcs3/Emu/RSX/RSXOffload.cpp
index 1ebf68d4c0..92d8954549 100644
--- a/rpcs3/Emu/RSX/RSXOffload.cpp
+++ b/rpcs3/Emu/RSX/RSXOffload.cpp
@@ -3,6 +3,8 @@
 #include "Common/BufferUtils.h"
 #include "Emu/System.h"
 #include "RSXOffload.h"
+#include "RSXThread.h"
+#include "rsx_utils.h"
 
 #include <atomic>
 #include <thread>
@@ -27,6 +29,9 @@ namespace rsx
 		return;
 	}
 
+	// Register thread id
+	m_thread_id = std::this_thread::get_id();
+
 	if (g_cfg.core.thread_scheduler_enabled)
 	{
 		thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx));
@@ -36,22 +41,21 @@
 	{
 		if (m_enqueued_count.load() != m_processed_count)
 		{
-			for (auto slice = m_work_queue.pop_all(); slice; slice.pop_front())
+			for (m_current_job = m_work_queue.pop_all(); m_current_job; m_current_job.pop_front())
 			{
-				auto task = *slice;
-				switch (task.type)
+				switch (m_current_job->type)
 				{
 				case raw_copy:
-					memcpy(task.dst, task.src, task.length);
+					memcpy(m_current_job->dst, m_current_job->src, m_current_job->length);
 					break;
 				case vector_copy:
-					memcpy(task.dst, task.opt_storage.data(), task.length);
+					memcpy(m_current_job->dst, m_current_job->opt_storage.data(), m_current_job->length);
 					break;
 				case index_emulate:
 					write_index_array_for_non_indexed_non_native_primitive_to_buffer(
-						reinterpret_cast<char*>(task.dst),
-						static_cast<rsx::primitive_type>(task.aux_param0),
-						task.length);
+						reinterpret_cast<char*>(m_current_job->dst),
+						static_cast<rsx::primitive_type>(m_current_job->aux_param0),
+						m_current_job->length);
 					break;
 				default:
 					ASSUME(0);
@@ -116,6 +120,11 @@
 	}
 
 	// Synchronization
+	bool dma_manager::is_current_thread() const
+	{
+		return (std::this_thread::get_id() == m_thread_id);
+	}
+
 	void dma_manager::sync()
 	{
 		if (LIKELY(m_enqueued_count.load() == m_processed_count))
@@ -124,8 +133,25 @@
 			return;
 		}
 
-		while (m_enqueued_count.load() != m_processed_count)
-			_mm_pause();
+		if (auto rsxthr = get_current_renderer(); rsxthr->is_current_thread())
+		{
+			if (m_mem_fault_flag)
+			{
+				// Abort if offloader is in recovery mode
+				return;
+			}
+
+			while (m_enqueued_count.load() != m_processed_count)
+			{
+				rsxthr->on_semaphore_acquire_wait();
+				_mm_pause();
+			}
+		}
+		else
+		{
+			while (m_enqueued_count.load() != m_processed_count)
+				_mm_pause();
+		}
 	}
 
 	void dma_manager::join()
 	{
 		m_worker_state = thread_state::finished;
 		sync();
 	}
+
+	void dma_manager::set_mem_fault_flag()
+	{
+		verify("Access denied" HERE), is_current_thread();
+		m_mem_fault_flag.release(true);
+	}
+
+	void dma_manager::clear_mem_fault_flag()
+	{
+		verify("Access denied" HERE), is_current_thread();
+		m_mem_fault_flag.release(false);
+	}
+
+	// Fault recovery
+	utils::address_range dma_manager::get_fault_range(bool writing) const
+	{
+		verify(HERE), m_current_job;
+
+		void *address = nullptr;
+		u32 range = m_current_job->length;
+
+		switch (m_current_job->type)
+		{
+		case raw_copy:
+			address = (writing) ? m_current_job->dst : m_current_job->src;
+			break;
+		case vector_copy:
+			verify(HERE), writing;
+			address = m_current_job->dst;
+			break;
+		case index_emulate:
+			verify(HERE), writing;
+			address = m_current_job->dst;
+			range = get_index_count(static_cast<rsx::primitive_type>(m_current_job->aux_param0), m_current_job->length);
+			break;
+		default:
+			ASSUME(0);
+			fmt::throw_exception("Unreachable" HERE);
+		}
+
+		const uintptr_t addr = uintptr_t(address);
+		const uintptr_t base = uintptr_t(vm::g_base_addr);
+
+		verify(HERE), addr > base;
+		return utils::address_range::start_length(u32(addr - base), range);
+	}
 }
diff --git a/rpcs3/Emu/RSX/RSXOffload.h b/rpcs3/Emu/RSX/RSXOffload.h
index 7f1b4e34ee..92ba45d8f1 100644
--- a/rpcs3/Emu/RSX/RSXOffload.h
+++ b/rpcs3/Emu/RSX/RSXOffload.h
@@ -3,9 +3,11 @@
 #include "Utilities/types.h"
 #include "Utilities/lockless.h"
 #include "Utilities/Thread.h"
+#include "Utilities/address_range.h"
 #include "gcm_enums.h"
 
 #include <vector>
+#include <thread>
 
 namespace rsx
 {
@@ -42,9 +44,12 @@
 	};
 
 	lf_queue<transfer_job> m_work_queue;
+	lf_queue_slice<transfer_job> m_current_job;
 	atomic_t<u64> m_enqueued_count{ 0 };
 	volatile u64 m_processed_count = 0;
 	thread_state m_worker_state = thread_state::detached;
+	std::thread::id m_thread_id;
+	atomic_t<bool> m_mem_fault_flag{ false };
 
 	// TODO: Improved benchmarks here; value determined by profiling on a Ryzen CPU, rounded to the nearest 512 bytes
 	const u32 max_immediate_transfer_size = 3584;
@@ -63,8 +68,14 @@
 	void emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count);
 
 	// Synchronization
+	bool is_current_thread() const;
 	void sync();
 	void join();
+	void set_mem_fault_flag();
+	void clear_mem_fault_flag();
+
+	// Fault recovery
+	utils::address_range get_fault_range(bool writing) const;
 };
 
 extern dma_manager g_dma_manager;
diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
index 8e7c4135f1..3cfb338302 100644
--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -917,27 +917,6 @@ namespace rsx
 		fmt::throw_exception("ill-formed draw command" HERE);
 	}
 
-	void thread::do_internal_task()
-	{
-		if (m_internal_tasks.empty())
-		{
-			std::this_thread::yield();
-		}
-		else
-		{
-			fmt::throw_exception("Disabled" HERE);
-			//std::lock_guard lock(m_mtx_task);
-
-			//internal_task_entry &front = m_internal_tasks.front();
-
-			//if (front.callback())
-			//{
-			//	front.promise.set_value();
-			//	m_internal_tasks.pop_front();
-			//}
-		}
-	}
-
 	void thread::do_local_task(FIFO_state state)
 	{
 		if (async_flip_requested & flip_request::emu_requested)
@@ -2465,7 +2444,7 @@ namespace rsx
 		if (!m_invalidated_memory_range.valid())
 			return;
 
-		on_invalidate_memory_range(m_invalidated_memory_range);
+		on_invalidate_memory_range(m_invalidated_memory_range, rsx::invalidation_cause::unmap);
 		m_invalidated_memory_range.invalidate();
 	}
diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h
index 2825979372..d8426be35b 100644
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@@ -12,6 +12,7 @@
 #include "rsx_methods.h"
 #include "rsx_utils.h"
 #include "Overlays/overlays.h"
+#include "Common/texture_cache_utils.h"
 
 #include "Utilities/Thread.h"
 #include "Utilities/geometry.h"
@@ -418,8 +419,8 @@ protected:
 	std::thread::id m_rsx_thread;
-	atomic_t<bool> m_rsx_thread_exiting{true};
-	s32 m_return_addr{-1}, restore_ret{-1};
+	atomic_t<bool> m_rsx_thread_exiting{ true };
+
 	std::array<push_buffer_vertex_info, 16> vertex_push_buffers;
 	std::vector<u32> element_push_buffer;
@@ -433,6 +434,7 @@
 	// FIFO
 	std::unique_ptr<FIFO::FIFO_control> fifo_ctrl;
 	FIFO::flattening_helper m_flattener;
+	s32 m_return_addr{ -1 }, restore_ret{ -1 };
 
 	// Occlusion query
 	bool zcull_surface_active = false;
@@ -605,7 +607,7 @@
 	virtual void flip(int buffer, bool emu_flip = false) = 0;
 	virtual u64 timestamp();
 	virtual bool on_access_violation(u32 /*address*/, bool /*is_writing*/) { return false; }
-	virtual void on_invalidate_memory_range(const address_range & /*range*/) {}
+	virtual void on_invalidate_memory_range(const address_range & /*range*/, rsx::invalidation_cause) {}
 	virtual void notify_tile_unbound(u32 /*tile*/) {}
 
 	// zcull
@@ -661,18 +663,6 @@ private:
 	shared_mutex m_mtx_task;
 
-	struct internal_task_entry
-	{
-		std::function<bool()> callback;
-		//std::promise<void> promise;
-
-		internal_task_entry(std::function<bool()> callback) : callback(std::move(callback))
-		{
-		}
-	};
-
-	std::deque<internal_task_entry> m_internal_tasks;
-	void do_internal_task();
 
 	void handle_emu_flip(u32 buffer);
 	void handle_invalidated_memory_range();
@@ -732,7 +722,7 @@
 	/**
 	 * Notify to check internal state during semaphore wait
	 */
-	void on_semaphore_acquire_wait() { do_local_task(FIFO_state::lock_wait); }
+	virtual void on_semaphore_acquire_wait() {}
 
 	/**
	 * Copy rtt values to buffer.
@@ -767,7 +757,10 @@
 	void pause();
 	void unpause();
 
-	//Get RSX approximate load in %
+	// Get RSX approximate load in %
 	u32 get_load();
+
+	// Returns true if the current thread is the active RSX thread
+	bool is_current_thread() const { return std::this_thread::get_id() == m_rsx_thread; }
 };
}
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
index aa6ec8b93d..93f2882122 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -662,10 +662,29 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 
 	if (result.num_flushable > 0)
 	{
-		const bool is_rsxthr = std::this_thread::get_id() == m_rsx_thread;
-		bool has_queue_ref = false;
+		if (rsx::g_dma_manager.is_current_thread())
+		{
+			// The offloader thread cannot handle flush requests
+			verify(HERE), m_queue_status.load() == flush_queue_state::ok;
 
-		if (!is_rsxthr)
+			m_offloader_fault_range = rsx::g_dma_manager.get_fault_range(is_writing);
+			m_offloader_fault_cause = (is_writing) ? rsx::invalidation_cause::write : rsx::invalidation_cause::read;
+
+			rsx::g_dma_manager.set_mem_fault_flag();
+			m_queue_status |= flush_queue_state::deadlock;
+
+			// Wait for deadlock to clear
+			while (m_queue_status & flush_queue_state::deadlock)
+			{
+				_mm_pause();
+			}
+
+			rsx::g_dma_manager.clear_mem_fault_flag();
+			return true;
+		}
+
+		bool has_queue_ref = false;
+		if (!is_current_thread())
 		{
 			//Always submit primary cb to ensure state consistency (flush pending changes such as image transitions)
 			vm::temporary_unlock();
@@ -703,14 +722,14 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
 	return true;
 }
 
-void VKGSRender::on_invalidate_memory_range(const utils::address_range &range)
+void VKGSRender::on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause)
 {
 	std::lock_guard lock(m_secondary_cb_guard);
-	auto data = std::move(m_texture_cache.invalidate_range(m_secondary_command_buffer, range, rsx::invalidation_cause::unmap));
+	auto data = std::move(m_texture_cache.invalidate_range(m_secondary_command_buffer, range, cause));
 
 	AUDIT(data.empty());
-	if (data.violation_handled)
+	if (cause == rsx::invalidation_cause::unmap && data.violation_handled)
 	{
 		m_texture_cache.purge_unreleased_sections();
 		{
@@ -720,6 +739,14 @@ void VKGSRender::on_invalidate_memory_range(const utils::address_range &range)
 	}
 }
 
+void VKGSRender::on_semaphore_acquire_wait()
+{
+	if (m_flush_requests.pending() || m_queue_status & flush_queue_state::deadlock)
+	{
+		do_local_task(rsx::FIFO_state::lock_wait);
+	}
+}
+
 void VKGSRender::notify_tile_unbound(u32 tile)
 {
 	//TODO: Handle texture writeback
@@ -2326,16 +2353,28 @@ void VKGSRender::frame_context_cleanup(frame_context_t *ctx, bool free_resources)
 
 void VKGSRender::do_local_task(rsx::FIFO_state state)
 {
+	if (m_queue_status & flush_queue_state::deadlock)
+	{
+		// Clear offloader deadlock
+		// NOTE: It is not possible to handle regular flush requests before this is cleared
+		// NOTE: This may cause graphics corruption due to unsynchronized modification
+		flush_command_queue();
+		on_invalidate_memory_range(m_offloader_fault_range, m_offloader_fault_cause);
+		m_queue_status.clear(flush_queue_state::deadlock);
+	}
+
 	if (m_flush_requests.pending())
 	{
-		std::lock_guard lock(m_flush_queue_mutex);
+		if (m_flush_queue_mutex.try_lock())
+		{
+			// TODO: Determine if a hard sync is necessary
+			// Pipeline barriers later may do a better job synchronizing than wholly stalling the pipeline
+			flush_command_queue();
 
-		//TODO: Determine if a hard sync is necessary
-		//Pipeline barriers later may do a better job synchronizing than wholly stalling the pipeline
-		flush_command_queue();
-
-		m_flush_requests.clear_pending_flag();
-		m_flush_requests.consumer_wait();
+			m_flush_requests.clear_pending_flag();
+			m_flush_requests.consumer_wait();
+			m_flush_queue_mutex.unlock();
+		}
 	}
 	else if (!in_begin_end && state != rsx::FIFO_state::lock_wait)
 	{
diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h
index 501d919b5e..c65c2152da 100644
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@@ -294,6 +294,12 @@ struct flush_request_task
 	}
 };
 
+enum flush_queue_state : u32
+{
+	ok = 0,
+	deadlock = 1
+};
+
 class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
 {
 private:
@@ -404,6 +410,11 @@ private:
 	shared_mutex m_flush_queue_mutex;
 	flush_request_task m_flush_requests;
 
+	// Offloader thread deadlock recovery
+	rsx::atomic_bitmask_t<flush_queue_state> m_queue_status;
+	utils::address_range m_offloader_fault_range;
+	rsx::invalidation_cause m_offloader_fault_cause;
+
 	bool m_render_pass_open = false;
 	u64 m_current_renderpass_key = 0;
 	VkRenderPass m_cached_renderpass = VK_NULL_HANDLE;
@@ -488,7 +499,8 @@ protected:
 	void notify_tile_unbound(u32 tile) override;
 
 	bool on_access_violation(u32 address, bool is_writing) override;
-	void on_invalidate_memory_range(const utils::address_range &range) override;
+	void on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause) override;
+	void on_semaphore_acquire_wait() override;
 	bool on_decompiler_task() override;
 };
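
Note on the recovery protocol: the handshake this patch adds is spread across three files, so the following is a minimal, self-contained sketch of the control flow for reference. The names mirror the patch (queue_status, mem_fault_flag, do_local_task, sync), but everything below is a simplified stand-in rather than rpcs3 code: plain std::atomic replaces atomic_t/atomic_bitmask_t, std::this_thread::yield() replaces _mm_pause(), a puts() call stands in for the flush/invalidate work, and a single simulated fault replaces the real DMA queue.

```cpp
#include <atomic>
#include <cstdio>
#include <thread>

// Simplified stand-ins for the patch's flush_queue_state / atomic_bitmask_t.
enum flush_queue_state : unsigned { ok = 0, deadlock = 1 };

std::atomic<unsigned> queue_status{ ok };      // ~ VKGSRender::m_queue_status
std::atomic<bool>     mem_fault_flag{ false }; // ~ dma_manager::m_mem_fault_flag

// Offloader side: on a fault it cannot service itself, publish the fault,
// raise the deadlock bit, and spin until the RSX thread has recovered.
void offloader_fault_handler()
{
    mem_fault_flag.store(true);              // set_mem_fault_flag()
    queue_status.fetch_or(deadlock);         // m_queue_status |= deadlock

    while (queue_status.load() & deadlock)   // wait for RSX to clear it
        std::this_thread::yield();           // stands in for _mm_pause()

    mem_fault_flag.store(false);             // clear_mem_fault_flag()
}

// RSX side: do_local_task() checks the deadlock bit before regular flush
// requests, resolves the fault on the offloader's behalf, then clears it.
void rsx_do_local_task()
{
    if (queue_status.load() & deadlock)
    {
        std::puts("rsx: flush + invalidate offloader fault range");
        queue_status.fetch_and(~unsigned(deadlock)); // m_queue_status.clear(deadlock)
    }
}

// RSX side: dma_manager::sync() must not block blindly. It aborts if the
// offloader is mid-recovery, and otherwise keeps pumping local tasks.
void rsx_sync(const std::atomic<bool>& offloader_busy)
{
    if (mem_fault_flag.load())
        return; // offloader is in recovery mode; waiting here would deadlock

    while (offloader_busy.load())
    {
        rsx_do_local_task(); // ~ on_semaphore_acquire_wait() -> do_local_task()
        std::this_thread::yield();
    }
}

int main()
{
    std::atomic<bool> offloader_busy{ true };
    std::thread offloader([&] {
        offloader_fault_handler(); // simulate one fault mid-transfer
        offloader_busy.store(false);
    });

    rsx_sync(offloader_busy);     // may abort early if a fault is pending
    while (offloader_busy.load()) // the RSX main loop keeps servicing tasks
    {
        rsx_do_local_task();
        std::this_thread::yield();
    }
    offloader.join();
}
```

The invariant the patch enforces: only the RSX thread may resolve faults. The offloader parks itself behind the deadlock bit instead of flushing, every RSX-side wait loop keeps pumping do_local_task() via on_semaphore_acquire_wait(), and sync() refuses to block on an offloader that is itself waiting on RSX, which is exactly the cycle that used to deadlock.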