From bcdf91edbb794bfb2e1344ee6f8e05c409270dee Mon Sep 17 00:00:00 2001
From: Rui Pinheiro
Date: Thu, 1 Nov 2018 01:31:12 +0000
Subject: [PATCH] Misc. Texture Cache fixes

---
 rpcs3/Emu/RSX/Common/texture_cache.h       |  71 ++++++-----
 rpcs3/Emu/RSX/Common/texture_cache_utils.h | 142 ++++++++++++++++++++-
 rpcs3/Emu/RSX/GL/GLTextureCache.h          |  91 +++++--------
 rpcs3/Emu/RSX/VK/VKTextureCache.h          |  77 ++++-------
 rpcs3/Emu/RSX/rsx_cache.h                  |  12 +-
 5 files changed, 245 insertions(+), 148 deletions(-)

diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h
index 1288a36d27..8ad581edd6 100644
--- a/rpcs3/Emu/RSX/Common/texture_cache.h
+++ b/rpcs3/Emu/RSX/Common/texture_cache.h
@@ -55,6 +55,7 @@ namespace rsx
 			std::vector<section_storage_type*> sections_to_exclude; // These sections are do be excluded from protection manipulation (subtracted from other sections)
 			u32 num_flushable = 0;
 			u64 cache_tag = 0;
+			address_range fault_range;
 			address_range invalidate_range;
 
@@ -300,7 +301,7 @@ namespace rsx
 
 	public:
 		virtual void destroy() = 0;
 		virtual bool is_depth_texture(u32, u32) = 0;
-		virtual void on_section_destroyed(section_storage_type& section)
+		virtual void on_section_destroyed(section_storage_type& /*section*/)
 		{}
 
@@ -405,6 +406,24 @@ namespace rsx
 				}
 
 				surface->flush(std::forward<Args>(extras)...);
+
+				// Exclude this region when flushing other sections that should not trample it
+				// If we overlap an excluded RO, set it as dirty
+				for (auto &other : data.sections_to_exclude)
+				{
+					AUDIT(other != surface);
+					if (!other->is_flushable())
+					{
+						if (other->overlaps(*surface, section_bounds::full_range))
+						{
+							other->set_dirty(true);
+						}
+					}
+					else if (surface->last_write_tag > other->last_write_tag)
+					{
+						other->add_flush_exclusion(surface->get_confirmed_range());
+					}
+				}
 			}
 
 			data.flushed = true;
@@ -483,7 +502,7 @@ namespace rsx
 
 				// Sanity checks
 				AUDIT(exclusion_range.is_page_range());
-				AUDIT(!exclusion_range.overlaps(data.fault_range));
+				AUDIT(data.cause.is_read() && !excluded->is_flushable() || !exclusion_range.overlaps(data.fault_range));
 
 				// Apply exclusion
 				ranges_to_unprotect.exclude(exclusion_range);
@@ -590,10 +609,6 @@ namespace rsx
 			const auto new_range = tex.get_min_max(invalidate_range, bounds).to_page_range();
 			AUDIT(new_range.is_page_range() && invalidate_range.inside(new_range));
 
-			const s32 signed_distance = tex.signed_distance(fault_range, section_bounds::locked_range);
-			const s32 distance = signed_distance < 0 ? -signed_distance : signed_distance;
-			const bool is_after_fault = (signed_distance >= 0);
-
 			// The various chaining policies behave differently
 			bool extend_invalidate_range = tex.overlaps(fault_range, bounds);
 
@@ -662,7 +677,7 @@ namespace rsx
 			AUDIT(fault_range_in.valid());
 			address_range fault_range = fault_range_in.to_page_range();
 
-			auto trampled_set = std::move(get_intersecting_set(fault_range));
+			intersecting_set trampled_set = std::move(get_intersecting_set(fault_range));
 
 			thrashed_set result = {};
 			result.cause = cause;
@@ -685,11 +700,12 @@ namespace rsx
 				{
 					if (g_cfg.video.strict_texture_flushing && tex.is_flushable())
 					{
-						// TODO: Flush only the part outside the fault_range
-						LOG_TODO(RSX, "Flushable section data may have been lost");
+						tex.add_flush_exclusion(fault_range);
+					}
+					else
+					{
+						tex.set_dirty(true);
 					}
-
-					tex.set_dirty(true);
 				}
 			}
 
@@ -729,8 +745,7 @@ namespace rsx
 				if (
 					// RO sections during a read invalidation can be ignored (unless there are flushables in trampled_set, since those could overwrite RO data)
-					// TODO: Also exclude RO sections even if there are flushables
-					(invalidation_keep_ro_during_read && !trampled_set.has_flushables && cause.is_read() && tex.get_protection() == utils::protection::ro) ||
+					(invalidation_keep_ro_during_read && !trampled_set.has_flushables && cause.is_read() && !tex.is_flushable()) ||
 					// Sections that are not fully contained in invalidate_range can be ignored
 					!tex.inside(trampled_set.invalidate_range, bounds) ||
 					// Unsynchronized sections (or any flushable when skipping flushes) that do not overlap the fault range directly can also be ignored
@@ -1080,36 +1095,25 @@ namespace rsx
 		template <typename ...FlushArgs, typename ...Args>
 		void lock_memory_region(image_storage_type* image, const address_range &rsx_range, u32 width, u32 height, u32 pitch, const std::tuple<FlushArgs...>& flush_extras, Args&&... extras)
 		{
-			AUDIT(g_cfg.video.write_color_buffers); // this method is only called when WCB is enabled
+			AUDIT(g_cfg.video.write_color_buffers || g_cfg.video.write_depth_buffer); // this method is only called when either WCB or WDB are enabled
 
 			std::lock_guard lock(m_cache_mutex);
 
 			// Find a cached section to use
-			section_storage_type& region = *find_cached_texture(rsx_range, true, false);
-
-			if (!region.is_locked())
-			{
-				// Invalidate sections from surface cache occupying same address range
-				std::apply(&texture_cache::invalidate_range_impl_base, std::tuple_cat(std::make_tuple(this, rsx_range, invalidation_cause::superseded_by_fbo), flush_extras));
-			}
+			section_storage_type& region = *find_cached_texture(rsx_range, true, true, width, height);
 
 			// Prepare and initialize fbo region
 			if (region.exists() && region.get_context() != texture_upload_context::framebuffer_storage)
 			{
-				AUDIT(region.matches(rsx_range));
-
 				//This space was being used for other purposes other than framebuffer storage
 				//Delete used resources before attaching it to framebuffer memory
 				read_only_tex_invalidate = true;
+			}
 
-				// We are going to reprotect this section in a second, so discard it here
-				if (region.is_locked())
-				{
-					region.discard();
-				}
-
-				// Destroy the resources
-				region.destroy();
+			if (!region.is_locked() || region.get_context() != texture_upload_context::framebuffer_storage)
+			{
+				// Invalidate sections from surface cache occupying same address range
+				std::apply(&texture_cache::invalidate_range_impl_base, std::tuple_cat(std::make_tuple(this, rsx_range, invalidation_cause::superseded_by_fbo), flush_extras));
 			}
 
 			if (!region.is_locked() || region.can_be_reused())
@@ -1129,6 +1133,9 @@ namespace rsx
 			}
 
 			region.create(width, height, 1, 1, image, pitch, false, std::forward<Args>(extras)...);
+			region.reprotect(utils::protection::no, { 0, rsx_range.length() });
+			tag_framebuffer(region.get_section_base());
+
 			region.set_dirty(false);
 			region.touch(m_cache_update_tag);
 
@@ -1143,8 +1150,6 @@ namespace rsx
 				AUDIT(m_flush_always_cache.find(region.get_section_range()) != m_flush_always_cache.end());
 			}
 
-			// Delay protection until here in case the invalidation block above has unprotected pages in this range
-			region.reprotect(utils::protection::no, { 0, rsx_range.length() });
 			update_cache_tag();
 
 #ifdef TEXTURE_CACHE_DEBUG
diff --git a/rpcs3/Emu/RSX/Common/texture_cache_utils.h b/rpcs3/Emu/RSX/Common/texture_cache_utils.h
index 21a5a0d247..9ed28cf3e4 100644
--- a/rpcs3/Emu/RSX/Common/texture_cache_utils.h
+++ b/rpcs3/Emu/RSX/Common/texture_cache_utils.h
@@ -1001,6 +1001,8 @@ namespace rsx
 		rsx::texture_upload_context context = rsx::texture_upload_context::shader_read;
 		rsx::texture_dimension_extended image_type = rsx::texture_dimension_extended::texture_dimension_2d;
 
+		address_range_vector flush_exclusions; // Address ranges that will be skipped during flush
+
 		predictor_type *m_predictor = nullptr;
 		size_t m_predictor_key_hash = 0;
 		predictor_entry_type *m_predictor_entry = nullptr;
@@ -1015,9 +1017,9 @@ namespace rsx
 		}
 
 		cached_texture_section() = default;
-		cached_texture_section(ranged_storage_block_type *block) : m_block(block), m_storage(&block->get_storage()), m_tex_cache(&block->get_texture_cache()), m_predictor(&m_tex_cache->get_predictor())
+		cached_texture_section(ranged_storage_block_type *block)
 		{
-			update_unreleased();
+			initialize(block);
 		}
 
 		void initialize(ranged_storage_block_type *block)
@@ -1073,6 +1075,8 @@ namespace rsx
 			context = rsx::texture_upload_context::shader_read;
 			image_type = rsx::texture_dimension_extended::texture_dimension_2d;
 
+			flush_exclusions.clear();
+
 			// Set to dirty
 			set_dirty(true);
 
@@ -1324,6 +1328,8 @@ namespace rsx
 			{
 				get_predictor_entry().on_flush();
 			}
+
+			flush_exclusions.clear();
 		}
 
 		void on_speculative_flush()
@@ -1346,12 +1352,144 @@ namespace rsx
 			{
 				m_tex_cache->on_misprediction();
 			}
+
+			flush_exclusions.clear();
 		}
 
+		/**
+		 * Flush
+		 */
+	private:
+		void imp_flush_memcpy(u32 vm_dst, u8* src, u32 len) const
+		{
+			u8 *dst = get_ptr<u8>(vm_dst);
+			address_range copy_range = address_range::start_length(vm_dst, len);
+
+			if (flush_exclusions.empty() || !copy_range.overlaps(flush_exclusions))
+			{
+				// Normal case = no flush exclusions, or no overlap
+				memcpy(dst, src, len);
+				return;
+			}
+			else if (copy_range.inside(flush_exclusions))
+			{
+				// Nothing to copy
+				return;
+			}
+
+			// Otherwise, we need to filter the memcpy with our flush exclusions
+			// Should be relatively rare
+			address_range_vector vec;
+			vec.merge(copy_range);
+			vec.exclude(flush_exclusions);
+
+			for (const auto& rng : vec)
+			{
+				if (!rng.valid())
+					continue;
+
+				AUDIT(rng.inside(copy_range));
+				u32 offset = rng.start - vm_dst;
+				memcpy(dst + offset, src + offset, rng.length());
+			}
+		}
+
+		void imp_flush()
+		{
+			AUDIT(synchronized);
+
+			ASSERT(real_pitch > 0);
+
+			// Calculate valid range
+			const auto valid_range = get_confirmed_range();
+			AUDIT(valid_range.valid());
+			const auto valid_length = valid_range.length();
+			const auto valid_offset = valid_range.start - get_section_base();
+			AUDIT(valid_length > 0);
+
+			// Obtain pointers to the source and destination memory regions
+			u8 *src = static_cast<u8*>(derived()->map_synchronized(valid_offset, valid_length));
+			u32 dst = valid_range.start;
+			ASSERT(src != nullptr);
+
+			// Copy from src to dst
+			if (real_pitch >= rsx_pitch || valid_length <= rsx_pitch)
+			{
+				imp_flush_memcpy(dst, src, valid_length);
+			}
+			else
+			{
+				ASSERT(valid_length % rsx_pitch == 0);
+
+				u8 *_src = src;
+				u32 _dst = dst;
+				const auto num_rows = valid_length / rsx_pitch;
+				for (u32 row = 0; row < num_rows; ++row)
+				{
+					imp_flush_memcpy(_dst, _src, real_pitch);
+					_src += real_pitch;
+					_dst += rsx_pitch;
+				}
+			}
+		}
+
+
+	public:
+		// Returns false if there was a cache miss
+		template <typename ...Args>
+		bool flush(Args&&... extras)
+		{
+			if (flushed) return true;
+			bool miss = false;
+
+			// Sanity checks
+			ASSERT(exists());
+			AUDIT(is_locked());
+
+			// If we are fully inside the flush exclusions regions, we just mark ourselves as flushed and return
+			if (get_confirmed_range().inside(flush_exclusions))
+			{
+				flushed = true;
+				flush_exclusions.clear();
+				on_flush(miss);
+				return !miss;
+			}
+
+			// If we are not synchronized, we must synchronize before proceeding (hard fault)
+			if (!synchronized)
+			{
+				LOG_WARNING(RSX, "Cache miss at address 0x%X. This is gonna hurt...", get_section_base());
+				derived()->synchronize(true, std::forward<Args>(extras)...);
+				miss = true;
+
+				ASSERT(synchronized); // TODO ruipin: This might be possible in OGL. Revisit
+			}
+
+			// Copy flush result to guest memory
+			imp_flush();
+
+			// Finish up
+			// Its highly likely that this surface will be reused, so we just leave resources in place
+			flushed = true;
+			derived()->finish_flush();
+			flush_exclusions.clear();
+			on_flush(miss);
+
+			return !miss;
+		}
+
+		void add_flush_exclusion(const address_range& rng)
+		{
+			AUDIT(exists() && is_locked() && is_flushable());
+			const auto _rng = rng.get_intersect(get_section_range());
+			flush_exclusions.merge(_rng);
+		}
+
 		/**
 		 * Misc
 		 */
+	public:
 		predictor_entry_type& get_predictor_entry()
 		{
 			// If we don't have a predictor entry, or the key has changed
diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h
index d4e67e7202..cf17abdfca 100644
--- a/rpcs3/Emu/RSX/GL/GLTextureCache.h
+++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h
@@ -484,46 +484,39 @@ namespace gl
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE);
 		}
 
-		bool flush()
+
+		/**
+		 * Flush
+		 */
+		void synchronize(bool blocking)
 		{
-			ASSERT(exists());
+			if (synchronized)
+				return;
 
-			if (flushed) return true; //Already written, ignore
-			AUDIT(is_locked());
+			copy_texture(blocking);
 
-			bool result = true;
-			if (!synchronized)
+			if (blocking)
 			{
-				LOG_WARNING(RSX, "Cache miss at address 0x%X. This is gonna hurt...", get_section_base());
-				copy_texture(true);
-
-				if (!synchronized)
-				{
-					LOG_WARNING(RSX, "Nothing to copy; Setting section to readable and moving on...");
-					protect(utils::protection::ro);
-					return false;
-				}
-
-				result = false;
+				m_fence.wait_for_signal();
 			}
+		}
 
-			verify(HERE), real_pitch > 0;
+		void* map_synchronized(u32 offset, u32 size)
+		{
+			AUDIT(synchronized);
 
-			m_fence.wait_for_signal();
-			flushed = true;
-
-			const auto valid_range = get_confirmed_range_delta();
-			const u32 valid_offset = valid_range.first;
-			const u32 valid_length = valid_range.second;
-			AUDIT(valid_length > 0);
-
-			void *dst = get_ptr(get_section_base() + valid_offset);
 			glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id);
-			void *src = glMapBufferRange(GL_PIXEL_PACK_BUFFER, valid_offset, valid_length, GL_MAP_READ_BIT);
+			return glMapBufferRange(GL_PIXEL_PACK_BUFFER, offset, size, GL_MAP_READ_BIT);
+		}
 
-			//throw if map failed since we'll segfault anyway
-			verify(HERE), src != nullptr;
+		void finish_flush()
+		{
+			// Free resources
+			glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
+			glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
+
+			// Shuffle
 			bool require_manual_shuffle = false;
 			if (pack_unpack_swap_bytes)
 			{
@@ -531,27 +524,10 @@ namespace gl
 				require_manual_shuffle = true;
 			}
 
-			if (real_pitch >= rsx_pitch || valid_length <= rsx_pitch)
-			{
-				memcpy(dst, src, valid_length);
-			}
-			else
-			{
-				if (valid_length % rsx_pitch)
-				{
-					fmt::throw_exception("Unreachable" HERE);
-				}
-
-				u8 *_src = (u8*)src;
-				u8 *_dst = (u8*)dst;
-				const auto num_rows = valid_length / rsx_pitch;
-				for (u32 row = 0; row < num_rows; ++row)
-				{
-					memcpy(_dst, _src, real_pitch);
-					_src += real_pitch;
-					_dst += rsx_pitch;
-				}
-			}
+			const auto valid_range = get_confirmed_range_delta();
+			const u32 valid_offset = valid_range.first;
+			const u32 valid_length = valid_range.second;
+			void *dst = get_ptr(get_section_base() + valid_offset);
 
 			if (require_manual_shuffle)
 			{
@@ -560,6 +536,7 @@ namespace gl
 			}
 			else if (pack_unpack_swap_bytes && ::gl::get_driver_caps().vendor_AMD)
 			{
+				//AMD driver bug - cannot use pack_swap_bytes
 				//Manually byteswap texel data
 				switch (type)
 				{
@@ -609,15 +586,13 @@ namespace gl
 					}
 				}
 			}
-
-			glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
-			glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
-
-			baseclass::on_flush(!result);
-
-			return result;
 		}
+
+
+		/**
+		 * Misc
+		 */
 		void destroy()
 		{
 			if (!is_locked() && pbo_id == 0 && vram_texture == nullptr && m_fence.is_empty() && managed_texture.get() == nullptr)
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h
index a596971376..c39139b1c8 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.h
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h
@@ -310,69 +310,38 @@ namespace vk
 			sync_timestamp = get_system_time();
 		}
 
-		bool flush(vk::command_buffer& cmd, VkQueue submit_queue)
+		/**
+		 * Flush
+		 */
+		void synchronize(bool blocking, vk::command_buffer& cmd, VkQueue submit_queue)
 		{
-			ASSERT(exists());
-
-			if (flushed) return true;
-			AUDIT(is_locked());
+			if (synchronized)
+				return;
 
 			if (m_device == nullptr)
 			{
 				m_device = &cmd.get_command_pool().get_owner();
 			}
 
-			// Return false if a flush occured 'late', i.e we had a miss
-			bool result = true;
-
-			if (!synchronized)
-			{
-				LOG_WARNING(RSX, "Cache miss at address 0x%X. This is gonna hurt...", get_section_base());
-				copy_texture(true, cmd, submit_queue);
-				result = false;
-			}
-
-			verify(HERE), real_pitch > 0;
-			flushed = true;
-
-			const auto valid_range = get_confirmed_range_delta();
-			const u32 valid_offset = valid_range.first;
-			const u32 valid_length = valid_range.second;
-			AUDIT(valid_length > 0);
-
-			void* pixels_src = dma_buffer->map(valid_offset, valid_length);
-			void* pixels_dst = get_ptr(get_section_base() + valid_offset);
-
-			if (real_pitch >= rsx_pitch || valid_length <= rsx_pitch)
-			{
-				memcpy(pixels_dst, pixels_src, valid_length);
-			}
-			else
-			{
-				if (valid_length % rsx_pitch)
-				{
-					fmt::throw_exception("Unreachable" HERE);
-				}
-
-				const u32 num_rows = valid_length / rsx_pitch;
-				auto _src = (u8*)pixels_src;
-				auto _dst = (u8*)pixels_dst;
-
-				for (u32 y = 0; y < num_rows; ++y)
-				{
-					memcpy(_dst, _src, real_pitch);
-					_src += real_pitch;
-					_dst += real_pitch;
-				}
-			}
-
-			dma_buffer->unmap();
-			baseclass::on_flush(!result);
-
-			//Its highly likely that this surface will be reused, so we just leave resources in place
-			return result;
+			copy_texture(blocking, cmd, submit_queue);
 		}
 
+		void* map_synchronized(u32 offset, u32 size)
+		{
+			AUDIT(synchronized);
+
+			return dma_buffer->map(offset, size);
+		}
+
+		void finish_flush()
+		{
+			dma_buffer->unmap();
+		}
+
+
+		/**
+		 * Misc
+		 */
 		void set_unpack_swap_bytes(bool swap_bytes)
 		{
 			pack_unpack_swap_bytes = swap_bytes;
diff --git a/rpcs3/Emu/RSX/rsx_cache.h b/rpcs3/Emu/RSX/rsx_cache.h
index 42c5b5708e..52509f40b5 100644
--- a/rpcs3/Emu/RSX/rsx_cache.h
+++ b/rpcs3/Emu/RSX/rsx_cache.h
@@ -225,6 +225,11 @@ namespace rsx
 			return get_bounds(bounds).overlaps(other);
 		}
 
+		inline bool overlaps(const address_range_vector &other, section_bounds bounds) const
+		{
+			return get_bounds(bounds).overlaps(other);
+		}
+
 		inline bool overlaps(const buffered_section &other, section_bounds bounds) const
 		{
 			return get_bounds(bounds).overlaps(other.get_bounds(bounds));
 		}
@@ -235,6 +240,11 @@ namespace rsx
 			return get_bounds(bounds).inside(other);
 		}
 
+		inline bool inside(const address_range_vector &other, section_bounds bounds) const
+		{
+			return get_bounds(bounds).inside(other);
+		}
+
 		inline bool inside(const buffered_section &other, section_bounds bounds) const
 		{
 			return get_bounds(bounds).inside(other.get_bounds(bounds));
 		}
@@ -316,7 +326,7 @@ namespace rsx
 		 * Super Pointer
 		 */
 		template <typename T = void>
-		inline T* get_ptr(u32 address)
+		inline T* get_ptr(u32 address) const
 		{
 			return reinterpret_cast<T*>(vm::g_sudo_addr + address);
 		}