From 3afc379746bfe8328214bdb9ef40a8ac1ebb58c6 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Wed, 20 Oct 2021 13:46:12 +0300 Subject: [PATCH] rsx: Import, rebase and clean up the old detiling patches from 2021 --- rpcs3/Emu/RSX/Common/tiled_dma_copy.hpp | 186 ++++++++++++++++++++++++ rpcs3/Emu/RSX/VK/VKPresent.cpp | 18 ++- rpcs3/Emu/RSX/VK/VKRenderTargets.cpp | 25 ++++ rpcs3/Emu/RSX/VK/VKTexture.cpp | 2 +- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 2 +- rpcs3/Emu/RSX/VK/VKTextureCache.h | 31 ++++ rpcs3/Emu/RSX/rsx_utils.h | 4 +- rpcs3/emucore.vcxproj | 2 + rpcs3/emucore.vcxproj.filters | 3 + 9 files changed, 269 insertions(+), 4 deletions(-) create mode 100644 rpcs3/Emu/RSX/Common/tiled_dma_copy.hpp diff --git a/rpcs3/Emu/RSX/Common/tiled_dma_copy.hpp b/rpcs3/Emu/RSX/Common/tiled_dma_copy.hpp new file mode 100644 index 0000000000..39e88a36f6 --- /dev/null +++ b/rpcs3/Emu/RSX/Common/tiled_dma_copy.hpp @@ -0,0 +1,186 @@ +#pragma once + +#include +#include + +// This is a 1:1 port of the GPU code for my own sanity when debugging misplaced bits +// For a high-level explanation, read https://envytools.readthedocs.io/en/latest/hw/memory/vram.html +namespace rsx +{ + struct detiler_config + { + uint32_t prime; + uint32_t factor; + uint32_t num_tiles_per_row; + uint32_t tile_base_address; + uint32_t tile_size; + uint32_t tile_offset; + uint32_t tile_pitch; + uint32_t tile_bank; + uint32_t image_width; + uint32_t image_height; + uint32_t image_bpp; + }; + +#define RSX_TILE_WIDTH 256 +#define RSX_TILE_HEIGHT 64 +#define RSX_DMA_OP_ENCODE_TILE 0 +#define RSX_DMA_OP_DECODE_TILE 1 + + static void tiled_dma_copy(const uint32_t row, const uint32_t col, const detiler_config& conf, char* tiled_data, char* linear_data, int direction) + { + const uint32_t row_offset = (row * conf.tile_pitch) + conf.tile_base_address + conf.tile_offset; + const uint32_t this_address = row_offset + (col * conf.image_bpp); + + // 1. Calculate row_addr + const uint32_t texel_offset = (this_address - conf.tile_base_address) / RSX_TILE_WIDTH; + // Calculate coordinate of the tile grid we're supposed to be in + const uint32_t tile_x = texel_offset % conf.num_tiles_per_row; + const uint32_t tile_y = (texel_offset / conf.num_tiles_per_row) / RSX_TILE_HEIGHT; + // Calculate the grid offset for the tile selected and add the base offset. It's supposed to affect the bank stuff in the next step + const uint32_t tile_id = tile_y * conf.num_tiles_per_row + tile_x; + const uint32_t tile_selector = (tile_id + (conf.tile_base_address >> 14)) & 0x3ffff; + // Calculate row address + const uint32_t row_address = (tile_selector >> 2) & 0xffff; + + // 2. Calculate bank selector + // There's a lot of weird math here, but it's just a variant of (tile_selector % 4) to pick a value between [0..3] + uint32_t bank_selector = 0; + const uint32_t bank_distribution_lookup[16] = { 0, 1, 2, 3, 2, 3, 0, 1, 1, 2, 3, 0, 3, 0, 1, 2 }; + + if (conf.factor == 1) + { + bank_selector = (tile_selector & 3); + } + else if (conf.factor == 2) + { + const uint32_t idx = ((tile_selector + ((tile_y & 1) << 1)) & 3) * 4 + (tile_y & 3); + bank_selector = bank_distribution_lookup[idx]; + } + else if (conf.factor >= 4) + { + const uint32_t idx = (tile_selector & 3) * 4 + (tile_y & 3); + bank_selector = bank_distribution_lookup[idx]; + } + bank_selector = (bank_selector + conf.tile_bank) & 3; + + // 3. Calculate column selector + uint32_t column_selector = 0; + const uint32_t line_offset_in_tile = (texel_offset / conf.num_tiles_per_row) % RSX_TILE_HEIGHT; + // Calculate column_selector by bit-twiddling line offset and the other calculated parameter bits: + // column_selector[9:7] = line_offset_in_tile[5:3] + // column_selector[6:4] = this_address[7:5] + // column_selector[3:2] = line_offset_in_tile[1:0] + // column_selector[1:0] = 0 + column_selector |= ((line_offset_in_tile >> 3) & 0x7) << 7; + column_selector |= ((this_address >> 5) & 0x7) << 4; + column_selector |= ((line_offset_in_tile >> 0) & 0x3) << 2; + + // 4. Calculate partition selector (0 or 1) + const uint32_t partition_selector = (((line_offset_in_tile >> 2) & 1) + ((this_address >> 6) & 1)) & 1; + + // 5. Build tiled address + uint32_t tile_address = 0; + // tile_address[31:16] = row_adr[15:0] + // tile_address[15:14] = bank_sel[1:0] + // tile_address[13:8] = column_sel[9:4] + // tile_address[7:7] = partition_sel[0:0] + // tile_address[6:5] = column_sel[3:2] + // tile_address[4:0] = this_address[4:0] + tile_address |= ((row_address >> 0) & 0xFFFF) << 16; + tile_address |= ((bank_selector >> 0) & 0x3) << 14; + tile_address |= ((column_selector >> 4) & 0x3F) << 8; + tile_address |= ((partition_selector >> 0) & 0x1) << 7; + tile_address |= ((column_selector >> 2) & 0x3) << 5; + tile_address |= ((this_address >> 0) & 0x1F) << 0; + // Twiddle bits 9 and 10 + tile_address ^= (((tile_address >> 12) ^ ((bank_selector ^ tile_selector) & 1) ^ (tile_address >> 14)) & 1) << 9; + tile_address ^= ((tile_address >> 11) & 1) << 10; + + // Calculate relative addresses and sample + uint32_t linear_image_offset = (row * conf.tile_pitch) + (col * conf.image_bpp); + uint32_t tile_data_offset = tile_address - (conf.tile_base_address + conf.tile_offset); + + if (tile_data_offset >= conf.tile_size) + { + // Do not touch anything out of bounds + return; + } + + if (direction == RSX_DMA_OP_ENCODE_TILE) + { + std::memcpy(tiled_data + tile_data_offset, linear_data + linear_image_offset, conf.image_bpp); + } + else + { + std::memcpy(linear_data + linear_image_offset, tiled_data + tile_data_offset, conf.image_bpp); + } + } + + // Entry point. In GPU code this is handled by dispatch + main + template + void tile_texel_data(void* dst, const void* src, uint32_t base_address, uint32_t base_offset, uint32_t tile_size, uint8_t bank_sense, uint16_t row_pitch_in_bytes, uint16_t image_width, uint16_t image_height) + { + // Some constants + auto get_prime_factor = [](uint32_t pitch) -> std::pair + { + const uint32_t base = (pitch >> 8); + if ((pitch & (pitch - 1)) == 0) + { + return { 1u, base }; + } + + for (const auto prime : { 3, 5, 7, 11, 13 }) + { + if ((base % prime) == 0) + { + return { prime, base / prime }; + } + } + + // rsx_log.error("Unexpected pitch value 0x%x", pitch); + return {}; + }; + + const auto [prime, factor] = get_prime_factor(row_pitch_in_bytes); + const uint32_t tiles_per_row = prime * factor; + constexpr int op = Reverse ? RSX_DMA_OP_DECODE_TILE : RSX_DMA_OP_ENCODE_TILE; + + auto src2 = static_cast(const_cast(src)); + auto dst2 = static_cast(dst); + + const detiler_config dconf = { + .prime = prime, + .factor = factor, + .num_tiles_per_row = tiles_per_row, + .tile_base_address = base_address, + .tile_size = tile_size, + .tile_offset = base_offset, + .tile_pitch = row_pitch_in_bytes, + .tile_bank = bank_sense, + .image_width = image_width, + .image_height = image_height, + .image_bpp = sizeof(T) + }; + + for (u16 row = 0; row < image_height; ++row) + { + for (u16 col = 0; col < image_width; ++col) + { + if constexpr (op == RSX_DMA_OP_DECODE_TILE) + { + tiled_dma_copy(row, col, dconf, src2, dst2, op); + } + else + { + tiled_dma_copy(row, col, dconf, dst2, src2, op); + } + } + } + } + +#undef RSX_TILE_WIDTH +#undef RSX_TILE_HEIGHT +#undef RSX_DMA_OP_ENCODE_TILE +#undef RSX_DMA_OP_DECODE_TILE +} diff --git a/rpcs3/Emu/RSX/VK/VKPresent.cpp b/rpcs3/Emu/RSX/VK/VKPresent.cpp index 2f69505b96..4cbf0a9065 100644 --- a/rpcs3/Emu/RSX/VK/VKPresent.cpp +++ b/rpcs3/Emu/RSX/VK/VKPresent.cpp @@ -14,6 +14,11 @@ extern atomic_t g_user_asked_for_screenshot; extern atomic_t g_recording_mode; +namespace vk +{ + u32 g_debug_vis_address = 0; +} + void VKGSRender::reinitialize_swapchain() { m_swapchain_dims.width = m_frame->client_width(); @@ -477,7 +482,18 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info) present_info.format = av_format; present_info.address = rsx::get_address(display_buffers[info.buffer].offset, CELL_GCM_LOCATION_LOCAL); - image_to_flip = get_present_source(&present_info, avconfig); + if (vk::g_debug_vis_address) + { + //std::vector temp_data(5120 * 1024); + //std::memcpy(temp_data.data(), vm::get_super_ptr(vk::g_debug_vis_address), 5120 * 1024); + //rsx::untile_texel_data(vm::get_super_ptr(vk::g_debug_vis_address), temp_data.data(), vk::g_debug_vis_address, 0, 0, 5120, 1280, 720); + image_to_flip = m_texture_cache.upload_image_simple(*m_current_command_buffer, VK_FORMAT_B8G8R8A8_UNORM, vk::g_debug_vis_address, 1280, 720, 5120); + vk::g_debug_vis_address = 0; + } + else + { + image_to_flip = get_present_source(&present_info, avconfig); + } if (avconfig.stereo_mode != stereo_render_mode_options::disabled) [[unlikely]] { diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp index 48d693f7a0..9509894a04 100644 --- a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp +++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp @@ -1,6 +1,9 @@ #include "VKRenderTargets.h" #include "VKResourceManager.h" #include "Emu/RSX/rsx_methods.h" +#include "Emu/RSX/RSXThread.h" + +#include "Emu/RSX/Common/tiled_dma_copy.hpp" namespace vk { @@ -678,6 +681,28 @@ namespace vk subres.depth = 1; subres.data = { vm::get_super_ptr(base_addr), static_cast::size_type>(rsx_pitch * surface_height * samples_y) }; + // FIXME: Move to GPU queue + std::vector ext_data; + const auto range = get_memory_range(); + + if (auto region = rsx::get_current_renderer()->get_tiled_memory_region(range)) + { + auto real_data = vm::get_super_ptr(range.start); + ext_data.resize(region.tile->size); + rsx::tile_texel_data( + ext_data.data(), + real_data, + region.base_address, + range.start - region.base_address, + region.tile->size, + region.tile->bank, + region.tile->pitch, + subres.width_in_block, + subres.height_in_block + ); + subres.data = ext_data; + } + if (g_cfg.video.resolution_scale_percent == 100 && spp == 1) [[likely]] { push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index 78e5004f64..5de83a08fe 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -1009,7 +1009,7 @@ namespace vk { caps.supports_byteswap = (image_linear_size >= 1024); caps.supports_hw_deswizzle = caps.supports_byteswap; - caps.supports_zero_copy = caps.supports_byteswap; + caps.supports_zero_copy = false;// caps.supports_byteswap; caps.supports_vtc_decoding = false; check_caps = false; } diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index 1a46702e81..dd65f9470a 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -1333,7 +1333,7 @@ namespace vk void* mem = image->memory->map(0, layout.rowPitch * height); - auto src = vm::_ptr(address); + auto src = vm::get_super_ptr(address); auto dst = static_cast(mem); // TODO: SSE optimization diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index e2aace5832..f8f9fa0830 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -8,11 +8,18 @@ #include "vkutils/image_helpers.h" #include "../Common/texture_cache.h" + #include "Emu/Cell/timers.hpp" #include #include +#define DEBUG_DMA_TILING 1 + +#if DEBUG_DMA_TILING +#include "../Common/tiled_dma_copy.hpp" +#endif + namespace vk { class cached_texture_section; @@ -286,6 +293,30 @@ namespace vk const auto range = (context == rsx::texture_upload_context::framebuffer_storage) ? get_section_range() : get_confirmed_range(); vk::flush_dma(range.start, range.length()); +#if DEBUG_DMA_TILING + // Are we a tiled region? + if (const auto tiled_region = rsx::get_current_renderer()->get_tiled_memory_region(range)) + { + auto real_data = vm::get_super_ptr(range.start); + auto out_data = std::vector(tiled_region.tile->size); + rsx::tile_texel_data( + out_data.data(), + real_data, + tiled_region.base_address, + range.start - tiled_region.base_address, + tiled_region.tile->size, + tiled_region.tile->bank, + tiled_region.tile->pitch, + width, + height + ); + const auto available_tile_size = tiled_region.tile->size - (range.start - tiled_region.base_address); + const auto max_content_size = tiled_region.tile->pitch * utils::align(height, 64); + const auto write_length = std::min(max_content_size, available_tile_size); + std::memcpy(real_data, out_data.data(), write_length); + } +#endif + if (is_swizzled()) { // This format is completely worthless to CPU processing algorithms where cache lines on die are linear. diff --git a/rpcs3/Emu/RSX/rsx_utils.h b/rpcs3/Emu/RSX/rsx_utils.h index 7ecfe48857..4bb1b634fd 100644 --- a/rpcs3/Emu/RSX/rsx_utils.h +++ b/rpcs3/Emu/RSX/rsx_utils.h @@ -288,7 +288,9 @@ namespace rsx static inline u32 get_location(u32 addr) { - return (addr >= rsx::constants::local_mem_base) ? + // We don't really care about the actual memory map, it shouldn't be possible to use the mmio bar region anyway + constexpr address_range local_mem_range = address_range::start_length(rsx::constants::local_mem_base, 0x1000'0000); + return local_mem_range.overlaps(addr) ? CELL_GCM_LOCATION_LOCAL : CELL_GCM_LOCATION_MAIN; } diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index 4aaaf7ef2d..9854e484ae 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -550,6 +550,7 @@ + @@ -905,6 +906,7 @@ + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index fb5bda5d41..b780716bbf 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -2436,5 +2436,8 @@ Emu\GPU\RSX\Program\Snippets\RSXProg + + Emu\GPU\RSX\Program\Snippets + \ No newline at end of file