rsx/vk: DMA stuff

Author: kd-11
Date: 2020-12-17 21:33:56 +03:00
Committed by: kd-11
Parent: b96864c7e6
Commit: 7766076042
7 changed files with 195 additions and 59 deletions


@@ -296,6 +296,55 @@ struct copy_rgb655_block_swizzled
namespace
{
/**
* Generates the copy instructions required to build the texture on the GPU without actually copying anything.
* Returns a set of addresses and data lengths to use. This can be used to generate a GPU task, keeping the heavy lifting off the CPU.
*/
std::vector<rsx::memory_transfer_cmd>
build_transfer_cmds(const void* src, u16 block_size_in_bytes, u16 width_in_block, u16 row_count, u16 depth, u8 border, u32 dst_pitch_in_block, u32 src_pitch_in_block)
{
std::vector<rsx::memory_transfer_cmd> result;
if (src_pitch_in_block == dst_pitch_in_block && !border)
{
// Fast copy
rsx::memory_transfer_cmd cmd;
cmd.src = src;
cmd.dst = nullptr;
cmd.length = src_pitch_in_block * block_size_in_bytes * row_count * depth;
return { cmd };
}
const u32 width_in_bytes = width_in_block * block_size_in_bytes;
const u32 src_pitch_in_bytes = src_pitch_in_block * block_size_in_bytes;
const u32 dst_pitch_in_bytes = dst_pitch_in_block * block_size_in_bytes;
const u32 h_porch = border * block_size_in_bytes;
const u32 v_porch = src_pitch_in_bytes * border;
auto src_ = static_cast<const char*>(src) + h_porch;
auto dst_ = static_cast<const char*>(nullptr);
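// dst_ is deliberately based at nullptr: the emitted dst fields are relative
// offsets which the consumer rebases onto the real destination buffer.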
for (int layer = 0; layer < depth; ++layer)
{
// Front
src_ += v_porch;
for (int row = 0; row < row_count; ++row)
{
rsx::memory_transfer_cmd cmd{ dst_, src_, width_in_bytes };
result.push_back(cmd);
src_ += src_pitch_in_bytes;
dst_ += dst_pitch_in_bytes;
}
// Back
src_ += v_porch;
}
return result;
}
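// Illustrative usage sketch, not part of this commit (dimensions made up):
// size the staging range from the generated commands before recording copies.
//
//   const auto cmds = build_transfer_cmds(level_data, 4, 256, 256, 1, 0, 256, 320);
//   usz upload_size = 0;
//   for (const auto& cmd : cmds)
//   {
//       upload_size += cmd.length; // cmd.dst holds the matching relative offset
//   }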
/**
* Texture upload template.
*
@@ -533,7 +582,7 @@ namespace rsx
return get_subresources_layout_impl(texture);
}
texture_memory_info upload_texture_subresource(gsl::span<std::byte> dst_buffer, const rsx::subresource_layout& src_layout, int format, bool is_swizzled, const texture_uploader_capabilities& caps)
texture_memory_info upload_texture_subresource(gsl::span<std::byte> dst_buffer, const rsx::subresource_layout& src_layout, int format, bool is_swizzled, texture_uploader_capabilities& caps)
{
u16 w = src_layout.width_in_block;
u16 h = src_layout.height_in_block;
@@ -644,6 +693,11 @@ namespace rsx
// Remove the VTC tiling to support ATI and Vulkan.
copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), w, h, depth, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
}
else if (caps.supports_zero_copy)
{
result.require_upload = true;
result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), 8, w, h, depth, 0, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
}
else
{
copy_unmodified_block::copy_mipmap_level(as_span_workaround<u64>(dst_buffer), as_const_span<const u64>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u64>(w, caps.alignment), src_layout.pitch_in_block);
@@ -661,6 +715,11 @@ namespace rsx
// Remove the VTC tiling to support ATI and Vulkan.
copy_unmodified_block_vtc::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), w, h, depth, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
}
else if (caps.supports_zero_copy)
{
result.require_upload = true;
result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), 16, w, h, depth, 0, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
}
else
{
copy_unmodified_block::copy_mipmap_level(as_span_workaround<u128>(dst_buffer), as_const_span<const u128>(src_layout.data), 1, w, h, depth, 0, get_row_pitch_in_block<u128>(w, caps.alignment), src_layout.pitch_in_block);
@@ -676,39 +735,54 @@ namespace rsx
{
if (word_size == 1)
{
if (is_swizzled)
if (caps.supports_zero_copy)
{
result.require_upload = true;
result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (is_swizzled)
{
copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
}
else
{
copy_unmodified_block::copy_mipmap_level(as_span_workaround<u8>(dst_buffer), as_const_span<const u8>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (caps.supports_byteswap)
}
else
{
result.require_swap = true;
result.element_size = word_size;
result.block_length = words_per_block;
bool require_cpu_swizzle = !caps.supports_hw_deswizzle;
bool require_cpu_byteswap = !caps.supports_byteswap;
if (word_size == 2)
if (is_swizzled && caps.supports_hw_deswizzle)
{
if (is_swizzled)
{
if (((word_size * words_per_block) & 3) == 0 && caps.supports_hw_deswizzle)
if (word_size == 4 || (((word_size * words_per_block) & 3) == 0))
{
result.require_deswizzle = true;
}
else
{
require_cpu_swizzle = true;
}
}
if (is_swizzled && !result.require_deswizzle)
copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
if (!require_cpu_byteswap && !require_cpu_swizzle)
{
result.require_deswizzle = is_swizzled;
result.require_swap = true;
result.element_size = word_size;
if (caps.supports_zero_copy)
{
result.require_upload = true;
result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), word_size * words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 2)
{
copy_unmodified_block::copy_mipmap_level(as_span_workaround<u16>(dst_buffer), as_const_span<const u16>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 4)
{
result.require_deswizzle = (is_swizzled && caps.supports_hw_deswizzle);
if (is_swizzled && !caps.supports_hw_deswizzle)
copy_unmodified_block_swizzled::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
copy_unmodified_block::copy_mipmap_level(as_span_workaround<u32>(dst_buffer), as_const_span<const u32>(src_layout.data), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
}
@@ -730,6 +804,7 @@ namespace rsx
}
}
}
}
return result;
}


@@ -112,12 +112,22 @@ namespace rsx
u32 pitch_in_block;
};
struct memory_transfer_cmd
{
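// When produced by build_transfer_cmds, dst holds a relative offset (the
// generator bases it at nullptr) rather than a live pointer.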
const void* dst;
const void* src;
u32 length;
};
struct texture_memory_info
{
int element_size;
int block_length;
bool require_swap;
bool require_deswizzle;
bool require_upload;
std::vector<memory_transfer_cmd> deferred_cmds;
};
struct texture_uploader_capabilities
@@ -125,6 +135,7 @@ namespace rsx
bool supports_byteswap;
bool supports_vtc_decoding;
bool supports_hw_deswizzle;
bool supports_zero_copy;
usz alignment;
};
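// Call sites use positional aggregate init, so the new supports_zero_copy
// member slots in fourth. Restated with designated initializers purely for
// illustration (mirrors the Vulkan call site below; not commit code):
//
//   rsx::texture_uploader_capabilities caps
//   {
//       .supports_byteswap = true,
//       .supports_vtc_decoding = false,
//       .supports_hw_deswizzle = true,
//       .supports_zero_copy = true,
//       .alignment = heap_align
//   };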
@@ -143,7 +154,7 @@ namespace rsx
std::vector<subresource_layout> get_subresources_layout(const rsx::fragment_texture &texture);
std::vector<subresource_layout> get_subresources_layout(const rsx::vertex_texture &texture);
texture_memory_info upload_texture_subresource(gsl::span<std::byte> dst_buffer, const subresource_layout &src_layout, int format, bool is_swizzled, const texture_uploader_capabilities& caps);
texture_memory_info upload_texture_subresource(gsl::span<std::byte> dst_buffer, const subresource_layout &src_layout, int format, bool is_swizzled, texture_uploader_capabilities& caps);
u8 get_format_block_size_in_bytes(int format);
u8 get_format_block_size_in_texel(int format);


@@ -645,7 +645,7 @@ namespace gl
const std::vector<rsx::subresource_layout> &input_layouts,
bool is_swizzled, GLenum gl_format, GLenum gl_type, std::vector<std::byte>& staging_buffer)
{
rsx::texture_uploader_capabilities caps{ true, false, false, 4 };
rsx::texture_uploader_capabilities caps{ true, false, false, false, 4 };
pixel_unpack_settings unpack_settings;
unpack_settings.row_length(0).alignment(4);


@@ -178,7 +178,7 @@ namespace vk
return inheritance_info.parent->head();
}
void dma_block::set_parent(command_buffer& cmd, dma_block* parent)
void dma_block::set_parent(const command_buffer& cmd, dma_block* parent)
{
ensure(parent);
if (inheritance_info.parent == parent)
@@ -206,7 +206,7 @@ namespace vk
}
}
void dma_block::extend(command_buffer& cmd, const render_device &dev, usz new_size)
void dma_block::extend(const command_buffer& cmd, const render_device &dev, usz new_size)
{
ensure(allocated_memory);
if (new_size <= allocated_memory->size())
@@ -244,7 +244,7 @@ namespace vk
return (allocated_memory) ? allocated_memory->size() : 0;
}
std::pair<u32, vk::buffer*> map_dma(command_buffer& cmd, u32 local_address, u32 length)
std::pair<u32, vk::buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length)
{
const auto map_range = utils::address_range::start_length(local_address, length);
const auto first_block = (local_address & s_dma_block_mask);
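// Sketch of the bucketing above (the block size is an assumption for
// illustration; the real s_dma_block_mask is defined elsewhere in VKDMA.cpp):
//
//   constexpr u32 assumed_block_size = 0x400000;             // assumed 4 MiB blocks
//   constexpr u32 assumed_block_mask = ~(assumed_block_size - 1);
//   const u32 first_block = 0x30123456 & assumed_block_mask; // -> 0x30000000
//
// All guest addresses inside one block share a backing vk::buffer, so the
// lookup by first_block finds (or creates) the block covering map_range.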


@@ -4,7 +4,7 @@
namespace vk
{
std::pair<u32, vk::buffer*> map_dma(command_buffer& cmd, u32 local_address, u32 length);
std::pair<u32, vk::buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length);
void load_dma(u32 local_address, u32 length);
void flush_dma(u32 local_address, u32 length);
@@ -52,7 +52,7 @@ namespace vk
dma_block* head();
const dma_block* head() const;
void set_parent(command_buffer& cmd, dma_block* parent);
void extend(command_buffer& cmd, const render_device& dev, usz new_size);
void set_parent(const command_buffer& cmd, dma_block* parent);
void extend(const command_buffer& cmd, const render_device& dev, usz new_size);
};
}


@@ -68,7 +68,7 @@ namespace vk
* Then copy all layers into dst_image.
* dst_image must be in TRANSFER_DST_OPTIMAL layout and upload_buffer must have the TRANSFER_SRC_BIT usage flag.
*/
void copy_mipmaped_image_using_buffer(VkCommandBuffer cmd, vk::image* dst_image,
void copy_mipmaped_image_using_buffer(const vk::command_buffer& cmd, vk::image* dst_image,
const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align = 0);
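// Hedged call sketch (identifiers illustrative, not from this commit):
// upload all mip levels of a swizzled ARGB8 texture via the upload heap.
//
//   vk::copy_mipmaped_image_using_buffer(cmd, dst_image, layouts,
//       CELL_GCM_TEXTURE_A8R8G8B8, true /* is_swizzled */, mipmap_count,
//       VK_IMAGE_ASPECT_COLOR_BIT, upload_heap);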


@@ -2,6 +2,7 @@
#include "VKHelpers.h"
#include "VKFormats.h"
#include "VKCompute.h"
#include "VKDMA.h"
#include "VKRenderPass.h"
#include "VKRenderTargets.h"
@@ -800,7 +801,7 @@ namespace vk
ensure(dst_offset <= scratch_buf->size());
}
void copy_mipmaped_image_using_buffer(VkCommandBuffer cmd, vk::image* dst_image,
void copy_mipmaped_image_using_buffer(const vk::command_buffer& cmd, vk::image* dst_image,
const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 mipmap_count,
VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align)
{
@@ -808,7 +809,7 @@ namespace vk
u32 block_in_pixel = rsx::get_format_block_size_in_texel(format);
u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);
rsx::texture_uploader_capabilities caps{ true, false, true, heap_align };
rsx::texture_uploader_capabilities caps{ true, false, true, true, heap_align };
rsx::texture_memory_info opt{};
bool check_caps = true;
@@ -820,6 +821,9 @@ namespace vk
std::vector<VkBufferCopy> buffer_copies;
copy_regions.reserve(subresource_layout.size());
VkBuffer read_buffer = upload_heap.heap->value;
VkDeviceSize offset_in_read_buffer = 0;
if (vk::is_renderpass_open(cmd))
{
vk::end_renderpass(cmd);
@@ -877,6 +881,33 @@ namespace vk
copy_info.imageSubresource.mipLevel = layout.level;
copy_info.bufferRowLength = std::max<u32>(block_in_pixel * row_pitch / block_size_in_bytes, layout.width_in_texel);
if (opt.require_upload)
{
ensure(!opt.deferred_cmds.empty());
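// build_transfer_cmds walks the source monotonically, so the first and
// last commands bound the entire guest range that has to be mapped.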
auto base_addr = static_cast<const char*>(opt.deferred_cmds.front().src);
auto end_addr = static_cast<const char*>(opt.deferred_cmds.back().src) + opt.deferred_cmds.back().length;
auto data_length = end_addr - base_addr;
u64 src_address = 0;
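// The source pointer may come from either host mapping of guest memory
// (the vm::g_sudo_addr mirror or the vm::g_base_addr base); subtract the
// matching base to recover the RSX-visible address.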
if (uptr(base_addr) > uptr(vm::g_sudo_addr))
{
src_address = uptr(base_addr) - uptr(vm::g_sudo_addr);
}
else
{
src_address = uptr(base_addr) - uptr(vm::g_base_addr);
}
auto dma_mapping = vk::map_dma(cmd, static_cast<u32>(src_address), static_cast<u32>(data_length));
vk::load_dma(src_address, data_length);
read_buffer = dma_mapping.second->value;
offset_in_read_buffer = dma_mapping.first;
copy_info.bufferOffset = offset_in_read_buffer;
}
if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
{
if (!scratch_buf)
@@ -892,11 +923,25 @@ namespace vk
}
// Copy from upload heap to scratch mem
if (!opt.deferred_cmds.empty())
{
for (const auto& copy_cmd : opt.deferred_cmds)
{
buffer_copies.push_back({});
auto& copy = buffer_copies.back();
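// copy_cmd.dst was generated relative to nullptr, so its integer value is
// exactly this row's source offset within the DMA-mapped read buffer.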
copy.srcOffset = uptr(copy_cmd.dst) + offset_in_read_buffer;
copy.dstOffset = scratch_offset;
copy.size = copy_cmd.length;
}
}
else
{
buffer_copies.push_back({});
auto& copy = buffer_copies.back();
copy.srcOffset = offset_in_buffer;
copy.dstOffset = scratch_offset;
copy.size = image_linear_size;
}
// Point data source to scratch mem
copy_info.bufferOffset = scratch_offset;
@@ -904,12 +949,17 @@ namespace vk
scratch_offset += image_linear_size;
ensure((scratch_offset + image_linear_size) <= scratch_buf->size()); // "Out of scratch memory"
}
else if (opt.require_upload)
{
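// Zero-copy sources the image directly from guest memory, so the row
// length must describe the guest pitch rather than a repacked pitch.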
copy_info.bufferRowLength = std::max<u32>(block_in_pixel * layout.pitch_in_block, layout.width_in_texel);
}
}
if (opt.require_swap || opt.require_deswizzle || requires_depth_processing)
{
ensure(scratch_buf);
vkCmdCopyBuffer(cmd, upload_heap.heap->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
vkCmdCopyBuffer(cmd, read_buffer, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
@@ -957,7 +1007,7 @@ namespace vk
}
else
{
vkCmdCopyBufferToImage(cmd, upload_heap.heap->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
vkCmdCopyBufferToImage(cmd, read_buffer, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
}
}