diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.cpp b/rpcs3/Emu/RSX/Common/TextureUtils.cpp
index ea0cd557f1..f855e22357 100644
--- a/rpcs3/Emu/RSX/Common/TextureUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/TextureUtils.cpp
@@ -296,6 +296,55 @@ struct copy_rgb655_block_swizzled
 
 namespace
 {
+	/**
+	 * Generates copy instructions required to build the texture GPU side without actually copying anything.
+	 * Returns a set of addresses and data lengths to use. This can be used to generate a GPU task to avoid CPU doing the heavy lifting.
+	 */
+	std::vector<rsx::memory_transfer_cmd>
+	build_transfer_cmds(const void* src, u16 block_size_in_bytes, u16 width_in_block, u16 row_count, u16 depth, u8 border, u32 dst_pitch_in_block, u32 src_pitch_in_block)
+	{
+		std::vector<rsx::memory_transfer_cmd> result;
+
+		if (src_pitch_in_block == dst_pitch_in_block && !border)
+		{
+			// Fast copy
+			rsx::memory_transfer_cmd cmd;
+			cmd.src = src;
+			cmd.dst = nullptr;
+			cmd.length = src_pitch_in_block * block_size_in_bytes * row_count * depth;
+			return { cmd };
+		}
+
+		const u32 width_in_bytes = width_in_block * block_size_in_bytes;
+		const u32 src_pitch_in_bytes = src_pitch_in_block * block_size_in_bytes;
+		const u32 dst_pitch_in_bytes = dst_pitch_in_block * block_size_in_bytes;
+
+		const u32 h_porch = border * block_size_in_bytes;
+		const u32 v_porch = src_pitch_in_bytes * border;
+
+		auto src_ = static_cast<const u8*>(src) + h_porch;
+		auto dst_ = static_cast<const u8*>(nullptr);
+
+		for (int layer = 0; layer < depth; ++layer)
+		{
+			// Front
+			src_ += v_porch;
+
+			for (int row = 0; row < row_count; ++row)
+			{
+				rsx::memory_transfer_cmd cmd{ dst_, src_, width_in_bytes };
+				result.push_back(cmd);
+				src_ += src_pitch_in_bytes;
+				dst_ += dst_pitch_in_bytes;
+			}
+
+			// Back
+			src_ += v_porch;
+		}
+
+		return result;
+	}
+
 	/**
 	 * Texture upload template.
 	 *
@@ -533,7 +582,7 @@ namespace rsx
 		return get_subresources_layout_impl(texture);
 	}
 
-	texture_memory_info upload_texture_subresource(gsl::span<std::byte> dst_buffer, const rsx::subresource_layout& src_layout, int format, bool is_swizzled, const texture_uploader_capabilities& caps)
+	texture_memory_info upload_texture_subresource(gsl::span<std::byte> dst_buffer, const rsx::subresource_layout& src_layout, int format, bool is_swizzled, texture_uploader_capabilities& caps)
 	{
 		u16 w = src_layout.width_in_block;
 		u16 h = src_layout.height_in_block;
@@ -644,6 +693,11 @@ namespace rsx
 				// Remove the VTC tiling to support ATI and Vulkan.
 				copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), w, h, depth, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
 			}
+			else if (caps.supports_zero_copy)
+			{
+				result.require_upload = true;
+				result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), 8, w, h, depth, 0, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
+			}
 			else
 			{
 				copy_unmodified_block::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
@@ -661,6 +715,11 @@ namespace rsx
 				// Remove the VTC tiling to support ATI and Vulkan.
 				copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), w, h, depth, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
 			}
+			else if (caps.supports_zero_copy)
+			{
+				result.require_upload = true;
+				result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), 16, w, h, depth, 0, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
+			}
 			else
 			{
 				copy_unmodified_block::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
@@ -676,57 +735,73 @@ namespace rsx
 			{
 				if (word_size == 1)
 				{
-					if (is_swizzled)
-						copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
-					else
-						copy_unmodified_block::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
-				}
-				else if (caps.supports_byteswap)
-				{
-					result.require_swap = true;
-					result.element_size = word_size;
-					result.block_length = words_per_block;
-
-					if (word_size == 2)
+					if (caps.supports_zero_copy)
 					{
-						if (is_swizzled)
-						{
-							if (((word_size * words_per_block) & 3) == 0 && caps.supports_hw_deswizzle)
-							{
-								result.require_deswizzle = true;
-							}
-						}
-
-						if (is_swizzled && !result.require_deswizzle)
-							copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
-						else
-							copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+						result.require_upload = true;
+						result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
 					}
-					else if (word_size == 4)
+					else if (is_swizzled)
 					{
-						result.require_deswizzle = (is_swizzled && caps.supports_hw_deswizzle);
-
-						if (is_swizzled && !caps.supports_hw_deswizzle)
-							copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
-						else
-							copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+						copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+					}
+					else
+					{
+						copy_unmodified_block::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
 					}
 				}
 				else
 				{
-					if (word_size == 2)
+					bool require_cpu_swizzle = !caps.supports_hw_deswizzle;
+					bool require_cpu_byteswap = !caps.supports_byteswap;
+
+					if (is_swizzled && caps.supports_hw_deswizzle)
 					{
-						if (is_swizzled)
-							copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+						if (word_size == 4 || (((word_size * words_per_block) & 3) == 0))
+						{
+							result.require_deswizzle = true;
+						}
 						else
-							copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
-					}
-					else if (word_size == 4)
-					{
-						if (is_swizzled)
-							copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
-						else
-							copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+						{
+							require_cpu_swizzle = true;
+						}
+					}
+
+					if (!require_cpu_byteswap && !require_cpu_swizzle)
+					{
+						result.require_deswizzle = is_swizzled;
+						result.require_swap = true;
+						result.element_size = word_size;
+
+						if (caps.supports_zero_copy)
+						{
+							result.require_upload = true;
+							result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), word_size * words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+						}
+						else if (word_size == 2)
+						{
+							copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+						}
+						else if (word_size == 4)
+						{
+							copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+						}
+					}
+					else
+					{
+						if (word_size == 2)
+						{
+							if (is_swizzled)
+								copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+							else
+								copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const be_t<u16>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+						}
+						else if (word_size == 4)
+						{
+							if (is_swizzled)
+								copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
+							else
+								copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const be_t<u32>>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
+						}
 					}
 				}
 			}
diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.h b/rpcs3/Emu/RSX/Common/TextureUtils.h
index 1339e73e5c..3c576ecc05 100644
--- a/rpcs3/Emu/RSX/Common/TextureUtils.h
+++ b/rpcs3/Emu/RSX/Common/TextureUtils.h
@@ -112,12 +112,22 @@ namespace rsx
 		u32 pitch_in_block;
 	};
 
+	struct memory_transfer_cmd
+	{
+		const void* dst;
+		const void* src;
+		u32 length;
+	};
+
 	struct texture_memory_info
 	{
 		int element_size;
 		int block_length;
 		bool require_swap;
 		bool require_deswizzle;
+		bool require_upload;
+
+		std::vector<memory_transfer_cmd> deferred_cmds;
 	};
 
 	struct texture_uploader_capabilities
@@ -125,6 +135,7 @@ namespace rsx
 		bool supports_byteswap;
 		bool supports_vtc_decoding;
 		bool supports_hw_deswizzle;
+		bool supports_zero_copy;
 		usz alignment;
 	};
 
@@ -143,7 +154,7 @@ namespace rsx
 	std::vector<subresource_layout> get_subresources_layout(const rsx::fragment_texture &texture);
 	std::vector<subresource_layout> get_subresources_layout(const rsx::vertex_texture &texture);
 
-	texture_memory_info upload_texture_subresource(gsl::span<std::byte> dst_buffer, const subresource_layout &src_layout, int format, bool is_swizzled, const texture_uploader_capabilities& caps);
+	texture_memory_info upload_texture_subresource(gsl::span<std::byte> dst_buffer, const subresource_layout &src_layout, int format, bool is_swizzled, texture_uploader_capabilities& caps);
 
 	u8 get_format_block_size_in_bytes(int format);
 	u8 get_format_block_size_in_texel(int format);
diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp
index 4ba4c01c73..d55f974ff4 100644
--- a/rpcs3/Emu/RSX/GL/GLTexture.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp
@@ -645,7 +645,7 @@ namespace gl
 		const std::vector<rsx::subresource_layout> &input_layouts, bool is_swizzled, GLenum gl_format, GLenum gl_type, std::vector<std::byte>& staging_buffer)
 	{
-		rsx::texture_uploader_capabilities caps{ true, false, false, 4 };
+		rsx::texture_uploader_capabilities caps{ true, false, false, false, 4 };
 
 		pixel_unpack_settings unpack_settings;
 		unpack_settings.row_length(0).alignment(4);
diff --git a/rpcs3/Emu/RSX/VK/VKDMA.cpp b/rpcs3/Emu/RSX/VK/VKDMA.cpp
index e1d96f1637..1e13291416 100644
--- a/rpcs3/Emu/RSX/VK/VKDMA.cpp
+++ b/rpcs3/Emu/RSX/VK/VKDMA.cpp
@@ -178,7 +178,7 @@ namespace vk
 		return inheritance_info.parent->head();
 	}
 
-	void dma_block::set_parent(command_buffer& cmd, dma_block* parent)
+	void dma_block::set_parent(const command_buffer& cmd, dma_block* parent)
 	{
 		ensure(parent);
 		if (inheritance_info.parent == parent)
@@ -206,7 +206,7 @@ namespace vk
 		}
 	}
 
-	void dma_block::extend(command_buffer& cmd, const render_device &dev, usz new_size)
+	void dma_block::extend(const command_buffer& cmd, const render_device &dev, usz new_size)
 	{
 		ensure(allocated_memory);
 		if (new_size <= allocated_memory->size())
@@ -244,7 +244,7 @@ namespace vk
 		return (allocated_memory) ? allocated_memory->size() : 0;
 	}
 
-	std::pair<u32, buffer*> map_dma(command_buffer& cmd, u32 local_address, u32 length)
+	std::pair<u32, buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length)
 	{
 		const auto map_range = utils::address_range::start_length(local_address, length);
 		const auto first_block = (local_address & s_dma_block_mask);
diff --git a/rpcs3/Emu/RSX/VK/VKDMA.h b/rpcs3/Emu/RSX/VK/VKDMA.h
index 827238941c..6dbd6104f5 100644
--- a/rpcs3/Emu/RSX/VK/VKDMA.h
+++ b/rpcs3/Emu/RSX/VK/VKDMA.h
@@ -4,7 +4,7 @@
 
 namespace vk
 {
-	std::pair<u32, buffer*> map_dma(command_buffer& cmd, u32 local_address, u32 length);
+	std::pair<u32, buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length);
 	void load_dma(u32 local_address, u32 length);
 	void flush_dma(u32 local_address, u32 length);
 
@@ -52,7 +52,7 @@ namespace vk
 		dma_block* head();
 		const dma_block* head() const;
 
-		void set_parent(command_buffer& cmd, dma_block* parent);
-		void extend(command_buffer& cmd, const render_device& dev, usz new_size);
+		void set_parent(const command_buffer& cmd, dma_block* parent);
+		void extend(const command_buffer& cmd, const render_device& dev, usz new_size);
 	};
 }
diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h
index f63ed77086..ab1bf0408b 100644
--- a/rpcs3/Emu/RSX/VK/VKHelpers.h
+++ b/rpcs3/Emu/RSX/VK/VKHelpers.h
@@ -68,7 +68,7 @@ namespace vk
 	 * Then copy all layers into dst_image.
 	 * dst_image must be in TRANSFER_DST_OPTIMAL layout and upload_buffer have TRANSFER_SRC_BIT usage flag.
 	 */
-	void copy_mipmaped_image_using_buffer(VkCommandBuffer cmd, vk::image* dst_image,
+	void copy_mipmaped_image_using_buffer(const vk::command_buffer& cmd, vk::image* dst_image,
 		const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
 		VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align = 0);
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index bd58801462..aa2239fe88 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -2,6 +2,7 @@
 #include "VKHelpers.h"
 #include "VKFormats.h"
 #include "VKCompute.h"
+#include "VKDMA.h"
 #include "VKRenderPass.h"
 #include "VKRenderTargets.h"
 
@@ -800,7 +801,7 @@ namespace vk
 		ensure(dst_offset <= scratch_buf->size());
 	}
 
-	void copy_mipmaped_image_using_buffer(VkCommandBuffer cmd, vk::image* dst_image,
+	void copy_mipmaped_image_using_buffer(const vk::command_buffer& cmd, vk::image* dst_image,
 		const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
 		VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align)
 	{
@@ -808,7 +809,7 @@ namespace vk
 		u32 block_in_pixel = rsx::get_format_block_size_in_texel(format);
 		u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);
 
-		rsx::texture_uploader_capabilities caps{ true, false, true, heap_align };
+		rsx::texture_uploader_capabilities caps{ true, false, true, true, heap_align };
 		rsx::texture_memory_info opt{};
 		bool check_caps = true;
 
@@ -820,6 +821,9 @@ namespace vk
 		std::vector<VkBufferCopy> buffer_copies;
 		copy_regions.reserve(subresource_layout.size());
 
+		VkBuffer read_buffer = upload_heap.heap->value;
+		VkDeviceSize offset_in_read_buffer = 0;
+
 		if (vk::is_renderpass_open(cmd))
 		{
 			vk::end_renderpass(cmd);
 		}
@@ -877,6 +881,33 @@ namespace vk
 			copy_info.imageSubresource.mipLevel = layout.level;
 			copy_info.bufferRowLength = std::max(block_in_pixel * row_pitch / block_size_in_bytes, layout.width_in_texel);
 
+			if (opt.require_upload)
+			{
+				ensure(!opt.deferred_cmds.empty());
+
+				auto base_addr = static_cast<const u8*>(opt.deferred_cmds.front().src);
+				auto end_addr = static_cast<const u8*>(opt.deferred_cmds.back().src) + opt.deferred_cmds.back().length;
+				auto data_length = end_addr - base_addr;
+				u64 src_address = 0;
+
+				if (uptr(base_addr) > uptr(vm::g_sudo_addr))
+				{
+					src_address = uptr(base_addr) - uptr(vm::g_sudo_addr);
+				}
+				else
+				{
+					src_address = uptr(base_addr) - uptr(vm::g_base_addr);
+				}
+
+				auto dma_mapping = vk::map_dma(cmd, static_cast<u32>(src_address), static_cast<u32>(data_length));
+				vk::load_dma(src_address, data_length);
+
+				read_buffer = dma_mapping.second->value;
+				offset_in_read_buffer = dma_mapping.first;
+
+				copy_info.bufferOffset = offset_in_read_buffer;
+			}
+
 			if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
 			{
 				if (!scratch_buf)
 				{
@@ -892,11 +923,25 @@ namespace vk
 				}
 
 				// Copy from upload heap to scratch mem
-				buffer_copies.push_back({});
-				auto& copy = buffer_copies.back();
-				copy.srcOffset = offset_in_buffer;
-				copy.dstOffset = scratch_offset;
-				copy.size = image_linear_size;
+				if (!opt.deferred_cmds.empty())
+				{
+					for (const auto& copy_cmd : opt.deferred_cmds)
+					{
+						buffer_copies.push_back({});
+						auto& copy = buffer_copies.back();
+						copy.srcOffset = uptr(copy_cmd.dst) + offset_in_read_buffer;
+						copy.dstOffset = scratch_offset;
+						copy.size = copy_cmd.length;
+					}
+				}
+				else
+				{
+					buffer_copies.push_back({});
+					auto& copy = buffer_copies.back();
+					copy.srcOffset = offset_in_buffer;
+					copy.dstOffset = scratch_offset;
+					copy.size = image_linear_size;
+				}
 
 				// Point data source to scratch mem
 				copy_info.bufferOffset = scratch_offset;
@@ -904,12 +949,17 @@ namespace vk
 				scratch_offset += image_linear_size;
 				ensure((scratch_offset + image_linear_size) <= scratch_buf->size()); // "Out of scratch memory"
 			}
+			else if (opt.require_upload)
+			{
+				copy_info.bufferRowLength = std::max(block_in_pixel * layout.pitch_in_block, layout.width_in_texel);
+			}
 		}
 
 		if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
 		{
 			ensure(scratch_buf);
-			vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
+
+			vkCmdCopyBuffer(cmd, read_buffer, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
 
 			insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
 				VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
@@ -957,7 +1007,7 @@ namespace vk
 		}
 		else
 		{
-			vkCmdCopyBufferToImage(cmd, upload_heap.heap->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
+			vkCmdCopyBufferToImage(cmd, read_buffer, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
 		}
 	}
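
Note on the deferred-upload contract introduced by this patch (illustrative, not part of the diff): when a backend sets caps.supports_zero_copy, upload_texture_subresource no longer writes pixel data into dst_buffer. It returns require_upload = true together with deferred_cmds, where each command's src points into guest memory, dst is a linear destination offset encoded as a pointer built from a null base, and length is the byte count for one row (or the whole subresource on the fast path). The sketch below is a CPU-side equivalent of replaying those commands; the struct is a local stand-in mirroring the one added to TextureUtils.h, and execute_transfer_cmds/dst_base are hypothetical names, not RPCS3 symbols.

#include <cstdint>
#include <cstring>
#include <vector>

// Local stand-in mirroring rsx::memory_transfer_cmd from this patch.
struct memory_transfer_cmd
{
	const void* dst;      // linear destination offset, encoded as a pointer from a null base
	const void* src;      // pointer into guest memory
	std::uint32_t length; // bytes to copy
};

// CPU-side equivalent of replaying the deferred commands. The Vulkan backend performs
// the same work on the GPU by translating each command into a buffer copy instead.
void execute_transfer_cmds(std::uint8_t* dst_base, const std::vector<memory_transfer_cmd>& cmds)
{
	for (const auto& cmd : cmds)
	{
		const auto dst_offset = reinterpret_cast<std::uintptr_t>(cmd.dst);
		std::memcpy(dst_base + dst_offset, cmd.src, cmd.length);
	}
}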
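
Because texture_uploader_capabilities is brace-initialized positionally by the backends, the new supports_zero_copy member sits between supports_hw_deswizzle and alignment. The stand-in below only restates the field order from the header change so the initializers in GLTexture.cpp and VKTexture.cpp are easier to read; 256 is a placeholder for heap_align, whose real value comes from the Vulkan heap alignment.

#include <cstddef>

// Stand-in mirroring rsx::texture_uploader_capabilities after this patch.
struct texture_uploader_capabilities
{
	bool supports_byteswap;
	bool supports_vtc_decoding;
	bool supports_hw_deswizzle;
	bool supports_zero_copy; // new member added by this patch
	std::size_t alignment;   // usz in RPCS3
};

// GLTexture.cpp: byteswap only, zero-copy stays off.
texture_uploader_capabilities gl_caps{ true, false, false, false, 4 };

// VKTexture.cpp: byteswap, hw deswizzle and zero-copy enabled (placeholder for heap_align).
texture_uploader_capabilities vk_caps{ true, false, true, true, 256 };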
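
The comment kept in VKHelpers.h still requires dst_image to be in TRANSFER_DST_OPTIMAL layout and the source buffer to carry TRANSFER_SRC usage before copy_mipmaped_image_using_buffer is recorded. A minimal sketch of that layout transition in plain Vulkan is shown below; cmd, image and mipmap_count are assumed to come from the caller, and this is not code from the RPCS3 helpers.

// Transition the destination image to TRANSFER_DST_OPTIMAL before recording the copies.
VkImageMemoryBarrier barrier{};
barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.image = image; // the VkImage backing dst_image
barrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, mipmap_count, 0, 1 };
barrier.srcAccessMask = 0;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;

vkCmdPipelineBarrier(cmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
	0, 0, nullptr, 0, nullptr, 1, &barrier);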