From d846142f0cfeda666c1fbd62eb93d4cdb051aec5 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 24 Jul 2022 20:28:57 +0300 Subject: [PATCH] vk: Reimplement compliant async texture streaming - Use CONCURRENT queue access instead of fighting with queue acquire/release via submit chains. The minor benefits of forcing EXCLUSIVE mode are buried under the huge penalty of multiple vkQueueSubmit. Batching submits does not help alleviate this situation. We simply must avoid interrupting execution. --- rpcs3/Emu/RSX/VK/VKDraw.cpp | 17 +++----- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 30 +++---------- rpcs3/Emu/RSX/VK/VKGSRender.h | 1 - rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 45 ++++++++++++++------ rpcs3/Emu/RSX/VK/VKTextureCache.h | 5 ++- rpcs3/Emu/RSX/VK/upscalers/fsr1/fsr_pass.cpp | 2 +- rpcs3/Emu/RSX/VK/vkutils/image.cpp | 44 ++++++++++++++++--- rpcs3/Emu/RSX/VK/vkutils/image.h | 10 +++-- rpcs3/Emu/RSX/VK/vkutils/scratch.cpp | 2 +- 9 files changed, 95 insertions(+), 61 deletions(-) diff --git a/rpcs3/Emu/RSX/VK/VKDraw.cpp b/rpcs3/Emu/RSX/VK/VKDraw.cpp index 568d72f869..892328fa80 100644 --- a/rpcs3/Emu/RSX/VK/VKDraw.cpp +++ b/rpcs3/Emu/RSX/VK/VKDraw.cpp @@ -386,20 +386,13 @@ void VKGSRender::load_texture_env() // We have to do this here, because we have to assume the CB will be dumped auto& async_task_scheduler = g_fxo->get(); - if (async_task_scheduler.is_recording()) + if (async_task_scheduler.is_recording() && + !async_task_scheduler.is_host_mode()) { - if (async_task_scheduler.is_host_mode()) + // Sync any async scheduler tasks + if (auto ev = async_task_scheduler.get_primary_sync_label()) { - flush_command_queue(); - ensure(!async_task_scheduler.is_recording()); - } - else - { - // Sync any async scheduler tasks - if (auto ev = async_task_scheduler.get_primary_sync_label()) - { - ev->gpu_wait(*m_current_command_buffer); - } + ev->gpu_wait(*m_current_command_buffer); } } } diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 
c503ff21d0..5876a9b573 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -2252,11 +2252,8 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore m_current_command_buffer->end(); m_current_command_buffer->tag(); - // Flush any asynchronously scheduled jobs - // So this is a bit trippy, but, in this case, the primary CB contains the 'release' operations, not the acquire ones. - // The CB that comes in after this submit will acquire the yielded resources automatically. - // This means the primary CB is the precursor to the async CB not the other way around. - // Async CB should wait for the primary CB to signal. + // Supporting concurrent access vastly simplifies this logic. + // Instead of doing CB slice injection, we can just chain these together logically with the async stream going first vk::queue_submit_t primary_submit_info{ m_device->get_graphics_queue(), pFence }; vk::queue_submit_t secondary_submit_info{}; @@ -2265,28 +2262,20 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore primary_submit_info.wait_on(wait_semaphore, pipeline_stage_flags); } - if (const auto wait_sema = std::exchange(m_dangling_semaphore_signal, nullptr)) - { - // TODO: Sync on VS stage - primary_submit_info.wait_on(wait_sema, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); - } - auto& async_scheduler = g_fxo->get(); - const bool require_secondary_flush = async_scheduler.is_recording(); - if (async_scheduler.is_recording()) { if (async_scheduler.is_host_mode()) { - // Inject dependency chain using semaphores. - // HEAD = externally synchronized. - // TAIL = insert dangling wait, from the async CB to the next CB down. 
- m_dangling_semaphore_signal = *async_scheduler.get_sema(); - secondary_submit_info.queue_signal(m_dangling_semaphore_signal); + const VkSemaphore async_sema = *async_scheduler.get_sema(); + secondary_submit_info.queue_signal(async_sema); + primary_submit_info.wait_on(async_sema, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); // Delay object destruction by one cycle vk::get_resource_manager()->push_down_current_scope(); } + + async_scheduler.flush(secondary_submit_info, force_flush); } if (signal_semaphore) @@ -2296,11 +2285,6 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore m_current_command_buffer->submit(primary_submit_info, force_flush); - if (require_secondary_flush) - { - async_scheduler.flush(secondary_submit_info, force_flush); - } - m_queue_status.clear(flush_queue_state::flushing); } diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index b8138f15e8..8a3fc3e893 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -115,7 +115,6 @@ private: vk::command_pool m_command_buffer_pool; vk::command_buffer_chain m_primary_cb_list; vk::command_buffer_chunk* m_current_command_buffer = nullptr; - VkSemaphore m_dangling_semaphore_signal = VK_NULL_HANDLE; volatile vk::host_data_t* m_host_data_ptr = nullptr; std::unique_ptr m_host_object_data; diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index 934c599744..6f7721b2ac 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -7,7 +7,7 @@ namespace vk { - u64 hash_image_properties(VkFormat format, u16 w, u16 h, u16 d, u16 mipmaps, VkImageType type, VkImageCreateFlags create_flags) + u64 hash_image_properties(VkFormat format, u16 w, u16 h, u16 d, u16 mipmaps, VkImageType type, VkImageCreateFlags create_flags, VkSharingMode sharing_mode) { /** * Key layout: @@ -17,7 +17,8 @@ namespace vk * 40-48: Depth (Max 255) * 48-54: Mipmaps (Max 63) <- We have some room here, it 
is not possible to have more than 12 mip levels on PS3 and 16 on PC is pushing it. * 54-56: Type (Max 3) - * 56-64: Flags (Max 255) <- We have some room here, we only care about a small subset of create flags. + * 56-57: Sharing (Max 1) <- Boolean. Exclusive = 0, shared = 1 + * 57-64: Flags (Max 127) <- We have some room here, we only care about a small subset of create flags. */ ensure(static_cast(format) < 0xFF); return (static_cast(format) & 0xFF) | @@ -26,7 +27,8 @@ namespace vk (static_cast(d) << 40) | (static_cast(mipmaps) << 48) | (static_cast(type) << 54) | - (static_cast(create_flags) << 56); + (static_cast(sharing_mode) << 56) | + (static_cast(create_flags) << 57); } texture_cache::cached_image_reference_t::cached_image_reference_t(texture_cache* parent, std::unique_ptr& previous) @@ -44,7 +46,7 @@ namespace vk data->current_queue_family = VK_QUEUE_FAMILY_IGNORED; // Move this object to the cached image pool - const auto key = hash_image_properties(data->format(), data->width(), data->height(), data->depth(), data->mipmaps(), data->info.imageType, data->info.flags); + const auto key = hash_image_properties(data->format(), data->width(), data->height(), data->depth(), data->mipmaps(), data->info.imageType, data->info.flags, data->info.sharingMode); std::lock_guard lock(parent->m_cached_pool_lock); if (!parent->m_cache_is_exiting) @@ -506,13 +508,13 @@ namespace vk return result; } - std::unique_ptr texture_cache::find_cached_image(VkFormat format, u16 w, u16 h, u16 d, u16 mipmaps, VkImageType type, VkImageCreateFlags create_flags, VkImageUsageFlags usage) + std::unique_ptr texture_cache::find_cached_image(VkFormat format, u16 w, u16 h, u16 d, u16 mipmaps, VkImageType type, VkImageCreateFlags create_flags, VkImageUsageFlags usage, VkSharingMode sharing) { reader_lock lock(m_cached_pool_lock); if (!m_cached_images.empty()) { - const u64 desired_key = hash_image_properties(format, w, h, d, mipmaps, type, create_flags); + const u64 desired_key = 
hash_image_properties(format, w, h, d, mipmaps, type, create_flags, sharing); lock.upgrade(); for (auto it = m_cached_images.begin(); it != m_cached_images.end(); ++it) @@ -538,7 +540,7 @@ namespace vk const VkFormat dst_format = vk::get_compatible_sampler_format(m_formats_support, gcm_format); const u16 layers = (view_type == VK_IMAGE_VIEW_TYPE_CUBE) ? 6 : 1; - auto image = find_cached_image(dst_format, w, h, d, mips, image_type, image_flags, usage_flags); + auto image = find_cached_image(dst_format, w, h, d, mips, image_type, image_flags, usage_flags, VK_SHARING_MODE_EXCLUSIVE); if (!image) { @@ -546,7 +548,7 @@ namespace vk image_type, dst_format, w, h, d, mips, layers, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, - VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, image_flags | VK_IMAGE_CREATE_ALLOW_NULL, + VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, image_flags | VK_IMAGE_CREATE_ALLOW_NULL_RPCS3, VMM_ALLOCATION_POOL_TEXTURE_CACHE, rsx::classify_format(gcm_format)); if (!image->value) @@ -823,7 +825,18 @@ namespace vk if (region.exists()) { image = dynamic_cast(region.get_raw_texture()); - if ((flags & texture_create_flags::do_not_reuse) || !image || region.get_image_type() != type || image->depth() != depth) // TODO + bool reusable = true; + + if (flags & texture_create_flags::do_not_reuse) + { + reusable = false; + } + else if (flags & texture_create_flags::shareable) + { + reusable = (image && image->sharing_mode() == VK_SHARING_MODE_CONCURRENT); + } + + if (!reusable || !image || region.get_image_type() != type || image->depth() != depth) // TODO { // Incompatible view/type region.destroy(); @@ -860,14 +873,20 @@ namespace vk { const bool is_cubemap = type == rsx::texture_dimension_extended::texture_dimension_cubemap; const VkFormat vk_format = get_compatible_sampler_format(m_formats_support, gcm_format); - const VkImageCreateFlags create_flags = is_cubemap ? 
VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT : 0; + VkImageCreateFlags create_flags = is_cubemap ? VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT : 0; + VkSharingMode sharing_mode = (flags & texture_create_flags::shareable) ? VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE; - if (auto found = find_cached_image(vk_format, width, height, depth, mipmaps, image_type, create_flags, usage_flags)) + if (auto found = find_cached_image(vk_format, width, height, depth, mipmaps, image_type, create_flags, usage_flags, sharing_mode)) { image = found.release(); } else { + if (sharing_mode == VK_SHARING_MODE_CONCURRENT) + { + create_flags |= VK_IMAGE_CREATE_SHAREABLE_RPCS3; + } + image = new vk::viewable_image(*m_device, m_memory_types.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, image_type, vk_format, @@ -946,7 +965,9 @@ namespace vk } } - const rsx::flags32_t create_flags = g_fxo->get().is_host_mode() ? texture_create_flags::do_not_reuse : 0; + const rsx::flags32_t create_flags = g_fxo->get().is_host_mode() + ? 
(texture_create_flags::shareable | texture_create_flags::do_not_reuse) + : 0; auto section = create_new_texture(cmd, rsx_range, width, height, depth, mipmaps, pitch, gcm_format, context, type, swizzled, rsx::component_order::default_, create_flags); diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index f63806ec9c..507225bccd 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -390,7 +390,8 @@ namespace vk enum texture_create_flags : u32 { initialize_image_contents = 1, - do_not_reuse = 2 + do_not_reuse = 2, + shareable = 4 }; void on_section_destroyed(cached_texture_section& tex) override; @@ -421,7 +422,7 @@ namespace vk vk::image* get_template_from_collection_impl(const std::vector& sections_to_transfer) const; - std::unique_ptr find_cached_image(VkFormat format, u16 w, u16 h, u16 d, u16 mipmaps, VkImageType type, VkImageCreateFlags create_flags, VkImageUsageFlags usage); + std::unique_ptr find_cached_image(VkFormat format, u16 w, u16 h, u16 d, u16 mipmaps, VkImageType type, VkImageCreateFlags create_flags, VkImageUsageFlags usage, VkSharingMode sharing); protected: vk::image_view* create_temporary_subresource_view_impl(vk::command_buffer& cmd, vk::image* source, VkImageType image_type, VkImageViewType view_type, diff --git a/rpcs3/Emu/RSX/VK/upscalers/fsr1/fsr_pass.cpp b/rpcs3/Emu/RSX/VK/upscalers/fsr1/fsr_pass.cpp index 6b51a98c7e..1dcee7d934 100644 --- a/rpcs3/Emu/RSX/VK/upscalers/fsr1/fsr_pass.cpp +++ b/rpcs3/Emu/RSX/VK/upscalers/fsr1/fsr_pass.cpp @@ -210,7 +210,7 @@ namespace vk VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_TILING_OPTIMAL, usage, - VK_IMAGE_CREATE_ALLOW_NULL, // Allow creation to fail if there is no memory + VK_IMAGE_CREATE_ALLOW_NULL_RPCS3, // Allow creation to fail if there is no memory VMM_ALLOCATION_POOL_SWAPCHAIN, RSX_FORMAT_CLASS_COLOR); }; diff --git a/rpcs3/Emu/RSX/VK/vkutils/image.cpp b/rpcs3/Emu/RSX/VK/vkutils/image.cpp index 265e68c153..3f03185017 100644 --- 
a/rpcs3/Emu/RSX/VK/vkutils/image.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/image.cpp @@ -71,6 +71,18 @@ namespace vk info.initialLayout = initial_layout; info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + // Declared outside the branch so the array is still alive when create_impl() runs below + u32 queue_families[2] = {}; + if (image_flags & VK_IMAGE_CREATE_SHAREABLE_RPCS3) + { + queue_families[0] = dev.get_graphics_queue_family(); + queue_families[1] = dev.get_transfer_queue_family(); + + info.sharingMode = VK_SHARING_MODE_CONCURRENT; + info.queueFamilyIndexCount = 2; + info.pQueueFamilyIndices = queue_families; + } + create_impl(dev, access_flags, memory_type, allocation_pool); m_storage_aspect = get_aspect_flags(format); @@ -101,8 +113,8 @@ namespace vk ensure(!value && !memory); validate(dev, info); - const bool nullable = !!(info.flags & VK_IMAGE_CREATE_ALLOW_NULL); - info.flags &= ~VK_IMAGE_CREATE_ALLOW_NULL; + const bool nullable = !!(info.flags & VK_IMAGE_CREATE_ALLOW_NULL_RPCS3); + info.flags &= ~VK_IMAGE_CREATE_SPECIAL_FLAGS_RPCS3; CHECK_RESULT(vkCreateImage(m_device, &info, nullptr, &value)); @@ -170,6 +182,11 @@ namespace vk return info.imageType; } + VkSharingMode image::sharing_mode() const + { + return info.sharingMode; + } + VkImageAspectFlags image::aspect() const { return m_storage_aspect; @@ -210,8 +227,14 @@ namespace vk { ensure(m_layout_stack.empty()); ensure(current_queue_family != cmd.get_queue_family()); - VkImageSubresourceRange range = { aspect(), 0, mipmaps(), 0, layers() }; - change_image_layout(cmd, value, current_layout, new_layout, range, current_queue_family, cmd.get_queue_family(), 0u, ~0u); + + if (info.sharingMode == VK_SHARING_MODE_EXCLUSIVE || current_layout != new_layout) + { + VkImageSubresourceRange range = { aspect(), 0, mipmaps(), 0, layers() }; + const u32 src_queue_family = info.sharingMode == VK_SHARING_MODE_EXCLUSIVE ? current_queue_family : VK_QUEUE_FAMILY_IGNORED; + const u32 dst_queue_family = info.sharingMode == VK_SHARING_MODE_EXCLUSIVE ? 
cmd.get_queue_family() : VK_QUEUE_FAMILY_IGNORED; + change_image_layout(cmd, value, current_layout, new_layout, range, src_queue_family, dst_queue_family, 0u, ~0u); + } current_layout = new_layout; current_queue_family = cmd.get_queue_family(); @@ -221,8 +244,17 @@ namespace vk { ensure(current_queue_family == src_queue_cmd.get_queue_family()); ensure(m_layout_stack.empty()); - VkImageSubresourceRange range = { aspect(), 0, mipmaps(), 0, layers() }; - change_image_layout(src_queue_cmd, value, current_layout, new_layout, range, current_queue_family, dst_queue_family, ~0u, 0u); + // CONCURRENT images need no ownership transfer, so both barrier queue families are VK_QUEUE_FAMILY_IGNORED + if (info.sharingMode == VK_SHARING_MODE_EXCLUSIVE || current_layout != new_layout) + { + VkImageSubresourceRange range = { aspect(), 0, mipmaps(), 0, layers() }; + const u32 src_queue_family = info.sharingMode == VK_SHARING_MODE_EXCLUSIVE ? current_queue_family : VK_QUEUE_FAMILY_IGNORED; + const u32 dst_queue_family2 = info.sharingMode == VK_SHARING_MODE_EXCLUSIVE ? dst_queue_family : VK_QUEUE_FAMILY_IGNORED; + change_image_layout(src_queue_cmd, value, current_layout, new_layout, range, src_queue_family, dst_queue_family2, ~0u, 0u); + } + + current_layout = new_layout; + current_queue_family = dst_queue_family; } void image::change_layout(const command_buffer& cmd, VkImageLayout new_layout) diff --git a/rpcs3/Emu/RSX/VK/vkutils/image.h b/rpcs3/Emu/RSX/VK/vkutils/image.h index 8a570b7ddf..4a0372d226 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/image.h +++ b/rpcs3/Emu/RSX/VK/vkutils/image.h @@ -18,9 +18,12 @@ namespace vk { enum : u32// special remap_encoding enums { - VK_REMAP_IDENTITY = 0xCAFEBABE, // Special view encoding to return an identity image view - VK_REMAP_VIEW_MULTISAMPLED = 0xDEADBEEF, // Special encoding for multisampled images; returns a multisampled image view - VK_IMAGE_CREATE_ALLOW_NULL = 0x80000000, // Special flag that allows null images to be created if there is no memory + VK_REMAP_IDENTITY = 0xCAFEBABE, // Special view encoding to return an identity image view + 
VK_REMAP_VIEW_MULTISAMPLED = 0xDEADBEEF, // Special encoding for multisampled images; returns a multisampled image view + VK_IMAGE_CREATE_ALLOW_NULL_RPCS3 = 0x80000000, // Special flag that allows null images to be created if there is no memory + VK_IMAGE_CREATE_SHAREABLE_RPCS3 = 0x40000000, // Special flag to create a shareable image + + VK_IMAGE_CREATE_SPECIAL_FLAGS_RPCS3 = (VK_IMAGE_CREATE_ALLOW_NULL_RPCS3 | VK_IMAGE_CREATE_SHAREABLE_RPCS3) }; class image @@ -73,6 +76,7 @@ namespace vk u8 samples() const; VkFormat format() const; VkImageType type() const; + VkSharingMode sharing_mode() const; VkImageAspectFlags aspect() const; rsx::format_class format_class() const; diff --git a/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp b/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp index a07a50d59c..7b47108a3d 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp @@ -97,7 +97,7 @@ namespace vk auto& tex = g_null_image_views[type]; tex = std::make_unique(*g_render_device, g_render_device->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, image_type, VK_FORMAT_B8G8R8A8_UNORM, size, size, 1, 1, num_layers, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, - VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, flags | VK_IMAGE_CREATE_ALLOW_NULL, VMM_ALLOCATION_POOL_SCRATCH); + VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, flags | VK_IMAGE_CREATE_ALLOW_NULL_RPCS3, VMM_ALLOCATION_POOL_SCRATCH); if (!tex->value) {