rsx: Blit engine clipping fixes

- Do not round up sub-pixel offsets, round down instead
- Do not allow incomplete sources for hw blit transfer
- Reimplement src clipping (slice_h)
- Check 'area' of incoming texels and correct for them before RTT lookup/transfer
- Filter out incomplete targets when performing RTT lookup (those contributing 1 texel or less)
This commit is contained in:
kd-11 2019-03-25 16:21:19 +03:00 committed by kd-11
parent f30af3ccd2
commit 443fde760f
3 changed files with 106 additions and 89 deletions

View file

@ -2255,6 +2255,18 @@ namespace rsx
u16 dst_w = dst.clip_width; u16 dst_w = dst.clip_width;
u16 dst_h = dst.clip_height; u16 dst_h = dst.clip_height;
if (UNLIKELY((src_h + src.offset_y) > src.height))
{
src_h = src.height - src.offset_y;
dst_h = u16(src_h * scale_y + 0.000001f);
}
if (UNLIKELY((src_w + src.offset_x) > src.width))
{
src_w = src.width - src.offset_x;
dst_w = u16(src_w * scale_x + 0.000001f);
}
if (dst.scale_y < 0.f) if (dst.scale_y < 0.f)
{ {
typeless_info.flip_vertical = true; typeless_info.flip_vertical = true;
@ -2267,14 +2279,47 @@ namespace rsx
src_address += (src.width - src_w) * src_bpp; src_address += (src.width - src_w) * src_bpp;
} }
auto rtt_lookup = [&m_rtts, &cmd](u32 address, u32 width, u32 height, u32 pitch, u32 bpp, bool allow_clipped) -> typename surface_store_type::surface_overlap_info auto rtt_lookup = [&m_rtts, &cmd, &scale_x, &scale_y, this](u32 address, u32 width, u32 height, u32 pitch, u8 bpp, bool allow_clipped) -> typename surface_store_type::surface_overlap_info
{ {
const auto list = m_rtts.get_merged_texture_memory_region(cmd, address, width, height, pitch, bpp); const auto list = m_rtts.get_merged_texture_memory_region(cmd, address, width, height, pitch, bpp);
if (list.empty() || (list.back().is_clipped && !allow_clipped)) if (list.empty())
{ {
return {}; return {};
} }
if (list.back().is_clipped && !allow_clipped)
{
for (auto It = list.rbegin(); It != list.rend(); ++It)
{
if (!It->is_clipped)
{
return *It;
}
auto _w = u32(It->width * It->surface->get_bpp()) / bpp;
auto _h = u32(It->height);
get_rsx_dimensions(_w, _h, It->surface);
if (_w < width)
{
if ((_w * scale_x) <= 1.f)
continue;
}
if (_h < height)
{
if ((_h * scale_y) <= 1.f)
continue;
}
// Some surface exists, but its size is questionable
// Opt to re-upload (needs WCB/WDB to work properly)
break;
}
return {};
}
return list.back(); return list.back();
}; };
@ -2283,20 +2328,18 @@ namespace rsx
dst_is_render_target = dst_subres.surface != nullptr; dst_is_render_target = dst_subres.surface != nullptr;
// TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate // TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate
auto src_subres = rtt_lookup(src_address, src_w, src_h, src.pitch, src_bpp, true); auto src_subres = rtt_lookup(src_address, src_w, src_h, src.pitch, src_bpp, false);
src_is_render_target = src_subres.surface != nullptr; src_is_render_target = src_subres.surface != nullptr;
// Always use GPU blit if src or dst is in the surface store // Always use GPU blit if src or dst is in the surface store
if (!g_cfg.video.use_gpu_texture_scaling && !(src_is_render_target || dst_is_render_target)) if (!g_cfg.video.use_gpu_texture_scaling && !(src_is_render_target || dst_is_render_target))
return false; return false;
// Check if trivial memcpy can perform the same task // Check if trivial memcpy can perform the same task
// Used to copy programs and arbitrary data to the GPU in some cases // Used to copy programs and arbitrary data to the GPU in some cases
if (!src_is_render_target && !dst_is_render_target && dst_is_argb8 == src_is_argb8 && !dst.swizzled) if (!src_is_render_target && !dst_is_render_target && dst_is_argb8 == src_is_argb8 && !dst.swizzled)
{ {
if ((src.slice_h == 1 && dst.clip_height == 1) || if ((src_h == 1 && dst_h == 1) || (dst_w == src_w && dst_h == src_h && src.pitch == dst.pitch))
(dst.clip_width == src.width && dst.clip_height == src.slice_h && src.pitch == dst.pitch))
{ {
if (dst.scale_x > 0.f && dst.scale_y > 0.f) if (dst.scale_x > 0.f && dst.scale_y > 0.f)
{ {
@ -2372,18 +2415,15 @@ namespace rsx
{ {
dst_dimensions.height = std::max(src_subres.surface->get_surface_height(), dst.height); dst_dimensions.height = std::max(src_subres.surface->get_surface_height(), dst.height);
} }
else if (dst.max_tile_h > dst.height) else if (LIKELY(dst_dimensions.width == 1280 || dst_dimensions.width == 2560))
{ {
// Optimizations table based on common width/height pairings. If we guess wrong, the upload resolver will fix it anyway // Optimizations table based on common width/height pairings. If we guess wrong, the upload resolver will fix it anyway
// TODO: Add more entries based on empirical data // TODO: Add more entries based on empirical data
if (LIKELY(dst_dimensions.width == 1280)) dst_dimensions.height = std::max<s32>(dst.height, 720);
{ }
dst_dimensions.height = std::max<s32>(dst.height, 720); else
} {
else //LOG_TRACE(RSX, "Blit transfer to surface with dims %dx%d", dst_dimensions.width, dst.height);
{
dst_dimensions.height = std::min((s32)dst.max_tile_h, 1024);
}
} }
} }
@ -2540,23 +2580,29 @@ namespace rsx
if (!vram_texture) if (!vram_texture)
{ {
// Translate src_area into the declared block
src_area.x1 += src.offset_x;
src_area.x2 += src.offset_x;
src_area.y1 += src.offset_y;
src_area.y2 += src.offset_y;
lock.upgrade(); lock.upgrade();
const auto rsx_range = address_range::start_length(src_address, src.pitch * src.slice_h); const auto rsx_range = address_range::start_length(src.rsx_address, src.pitch * src.height);
invalidate_range_impl_base(cmd, rsx_range, invalidation_cause::read, std::forward<Args>(extras)...); invalidate_range_impl_base(cmd, rsx_range, invalidation_cause::read, std::forward<Args>(extras)...);
const u16 _width = src.pitch / src_bpp; const u16 _width = src.pitch / src_bpp;
std::vector<rsx_subresource_layout> subresource_layout; std::vector<rsx_subresource_layout> subresource_layout;
rsx_subresource_layout subres = {}; rsx_subresource_layout subres = {};
subres.width_in_block = _width; subres.width_in_block = _width;
subres.height_in_block = src.slice_h; subres.height_in_block = src.height;
subres.pitch_in_block = _width; subres.pitch_in_block = _width;
subres.depth = 1; subres.depth = 1;
subres.data = { (const gsl::byte*)src.pixels, src.pitch * src.slice_h }; subres.data = { reinterpret_cast<const gsl::byte*>(vm::base(src.rsx_address)), src.pitch * src.height };
subresource_layout.push_back(subres); subresource_layout.push_back(subres);
const u32 gcm_format = src_is_argb8 ? CELL_GCM_TEXTURE_A8R8G8B8 : CELL_GCM_TEXTURE_R5G6B5; const u32 gcm_format = src_is_argb8 ? CELL_GCM_TEXTURE_A8R8G8B8 : CELL_GCM_TEXTURE_R5G6B5;
vram_texture = upload_image_from_cpu(cmd, rsx_range, _width, src.slice_h, 1, 1, src.pitch, gcm_format, texture_upload_context::blit_engine_src, vram_texture = upload_image_from_cpu(cmd, rsx_range, _width, src.height, 1, 1, src.pitch, gcm_format, texture_upload_context::blit_engine_src,
subresource_layout, rsx::texture_dimension_extended::texture_dimension_2d, dst.swizzled)->get_raw_texture(); subresource_layout, rsx::texture_dimension_extended::texture_dimension_2d, dst.swizzled)->get_raw_texture();
typeless_info.src_context = texture_upload_context::blit_engine_src; typeless_info.src_context = texture_upload_context::blit_engine_src;
@ -2610,7 +2656,7 @@ namespace rsx
// Need to calculate the minimum required size that will fit the data, anchored on the rsx_address // Need to calculate the minimum required size that will fit the data, anchored on the rsx_address
// If the application starts off with an 'inset' section, the guessed dimensions may not fit! // If the application starts off with an 'inset' section, the guessed dimensions may not fit!
const u32 write_end = dst_address + (dst.pitch * dst.clip_height); const u32 write_end = dst_address + (dst.pitch * dst_h);
const u32 expected_end = dst.rsx_address + (dst.pitch * dst_dimensions.height); const u32 expected_end = dst.rsx_address + (dst.pitch * dst_dimensions.height);
const u32 section_length = std::max(write_end, expected_end) - dst.rsx_address; const u32 section_length = std::max(write_end, expected_end) - dst.rsx_address;

View file

@ -806,6 +806,7 @@ namespace rsx
namespace nv3089 namespace nv3089
{ {
#pragma optimize("", off)
void image_in(thread *rsx, u32 _reg, u32 arg) void image_in(thread *rsx, u32 _reg, u32 arg)
{ {
const rsx::blit_engine::transfer_operation operation = method_registers.blit_engine_operation(); const rsx::blit_engine::transfer_operation operation = method_registers.blit_engine_operation();
@ -822,11 +823,28 @@ namespace rsx
const blit_engine::transfer_interpolator in_inter = method_registers.blit_engine_input_inter(); const blit_engine::transfer_interpolator in_inter = method_registers.blit_engine_input_inter();
rsx::blit_engine::transfer_source_format src_color_format = method_registers.blit_engine_src_color_format(); rsx::blit_engine::transfer_source_format src_color_format = method_registers.blit_engine_src_color_format();
const f32 in_x = std::ceil(method_registers.blit_engine_in_x()); const f32 scale_x = method_registers.blit_engine_ds_dx();
const f32 in_y = std::ceil(method_registers.blit_engine_in_y()); const f32 scale_y = method_registers.blit_engine_dt_dy();
//Clipping // NOTE: Do not round these values up!
//Validate that clipping rect will fit onto both src and dst regions // Sub-pixel offsets are used to signify pixel centers and do not mean to read from the next block (fill convention)
auto in_x = (u16)std::floor(method_registers.blit_engine_in_x());
auto in_y = (u16)std::floor(method_registers.blit_engine_in_y());
if ((in_x + in_y) && fabsf(fabsf(scale_x * scale_y) - 1.f) > 0.000001f)
{
// Scaling operation, check for subpixel correction offsets
if (in_x > 1 || in_y > 1)
{
LOG_ERROR(RSX, "Unexpected blit input setup; DT/S=%f,%f, in_x=%d, in_y=%d", scale_x, scale_y, in_x, in_y);
}
if (in_x == 1) in_x = 0;
if (in_y == 1) in_y = 0;
}
// Clipping
// Validate that clipping rect will fit onto both src and dst regions
const u16 clip_w = std::min(method_registers.blit_engine_clip_width(), out_w); const u16 clip_w = std::min(method_registers.blit_engine_clip_width(), out_w);
const u16 clip_h = std::min(method_registers.blit_engine_clip_height(), out_h); const u16 clip_h = std::min(method_registers.blit_engine_clip_height(), out_h);
@ -852,9 +870,12 @@ namespace rsx
u16 in_pitch = method_registers.blit_engine_input_pitch(); u16 in_pitch = method_registers.blit_engine_input_pitch();
if (in_origin != blit_engine::transfer_origin::corner) switch (in_origin)
{ {
// Probably refers to texel geometry which would affect clipping algorithm slightly when rounding texel addresses case blit_engine::transfer_origin::corner:
case blit_engine::transfer_origin::center:
break;
default:
LOG_WARNING(RSX, "NV3089_IMAGE_IN_SIZE: unknown origin (%d)", (u8)in_origin); LOG_WARNING(RSX, "NV3089_IMAGE_IN_SIZE: unknown origin (%d)", (u8)in_origin);
} }
@ -903,14 +924,14 @@ namespace rsx
out_pitch = out_bpp * out_w; out_pitch = out_bpp * out_w;
} }
const u32 in_offset = u32(in_x * in_bpp + in_pitch * in_y); const u32 in_offset = in_x * in_bpp + in_pitch * in_y;
const s32 out_offset = out_x * out_bpp + out_pitch * out_y; const s32 out_offset = out_x * out_bpp + out_pitch * out_y;
const tiled_region src_region = rsx->get_tiled_address(src_offset + in_offset, src_dma & 0xf); const u32 src_address = get_address(src_offset, src_dma);
const tiled_region dst_region = rsx->get_tiled_address(dst_offset + out_offset, dst_dma & 0xf); const u32 dst_address = get_address(dst_offset, dst_dma);
u8* pixels_src = src_region.tile ? src_region.ptr + src_region.base : src_region.ptr; u8* pixels_src = vm::_ptr<u8>(src_address + in_offset);
u8* pixels_dst = vm::_ptr<u8>(get_address(dst_offset + out_offset, dst_dma)); u8* pixels_dst = vm::_ptr<u8>(dst_address + out_offset);
const auto read_address = get_address(src_offset, src_dma); const auto read_address = get_address(src_offset, src_dma);
rsx->read_barrier(read_address, in_pitch * (in_h - 1) + (in_w * in_bpp)); rsx->read_barrier(read_address, in_pitch * (in_h - 1) + (in_w * in_bpp));
@ -936,9 +957,6 @@ namespace rsx
} }
} }
f32 scale_x = method_registers.blit_engine_ds_dx();
f32 scale_y = method_registers.blit_engine_dt_dy();
u32 convert_w = (u32)(std::abs(scale_x) * in_w); u32 convert_w = (u32)(std::abs(scale_x) * in_w);
u32 convert_h = (u32)(std::abs(scale_y) * in_h); u32 convert_h = (u32)(std::abs(scale_y) * in_h);
@ -949,60 +967,20 @@ namespace rsx
return; return;
} }
u32 slice_h = clip_h;
blit_src_info src_info = {}; blit_src_info src_info = {};
blit_dst_info dst_info = {}; blit_dst_info dst_info = {};
if (src_region.tile)
{
switch(src_region.tile->comp)
{
case CELL_GCM_COMPMODE_C32_2X2:
slice_h *= 2;
src_info.compressed_y = true;
case CELL_GCM_COMPMODE_C32_2X1:
src_info.compressed_x = true;
break;
}
u32 size = slice_h * in_pitch;
if (size > src_region.tile->size - src_region.base)
{
u32 diff = size - (src_region.tile->size - src_region.base);
slice_h -= diff / in_pitch + (diff % in_pitch ? 1 : 0);
}
}
if (dst_region.tile)
{
switch (dst_region.tile->comp)
{
case CELL_GCM_COMPMODE_C32_2X2:
dst_info.compressed_y = true;
case CELL_GCM_COMPMODE_C32_2X1:
dst_info.compressed_x = true;
break;
}
dst_info.max_tile_h = static_cast<u16>((dst_region.tile->size - dst_region.base) / out_pitch);
}
if (!g_cfg.video.force_cpu_blit_processing && (dst_dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER || src_dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER)) if (!g_cfg.video.force_cpu_blit_processing && (dst_dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER || src_dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER))
{ {
//For now, only use this for actual scaled images, there are use cases that should not go through 3d engine, e.g program ucode transfer
//TODO: Figure out more instances where we can use this without problems
//NOTE: In cases where slice_h is modified due to compression (read from tiled memory), the new value (clip_h * 2) does not matter if memory is on the GPU
src_info.format = src_color_format; src_info.format = src_color_format;
src_info.origin = in_origin; src_info.origin = in_origin;
src_info.width = in_w; src_info.width = in_w;
src_info.height = in_h; src_info.height = in_h;
src_info.pitch = in_pitch; src_info.pitch = in_pitch;
src_info.slice_h = slice_h;
src_info.offset_x = (u16)in_x; src_info.offset_x = (u16)in_x;
src_info.offset_y = (u16)in_y; src_info.offset_y = (u16)in_y;
src_info.rsx_address = src_address;
src_info.pixels = pixels_src; src_info.pixels = pixels_src;
src_info.rsx_address = get_address(src_offset, src_dma);
dst_info.format = dst_color_format; dst_info.format = dst_color_format;
dst_info.width = convert_w; dst_info.width = convert_w;
@ -1016,8 +994,8 @@ namespace rsx
dst_info.pitch = out_pitch; dst_info.pitch = out_pitch;
dst_info.scale_x = scale_x; dst_info.scale_x = scale_x;
dst_info.scale_y = scale_y; dst_info.scale_y = scale_y;
dst_info.rsx_address = dst_address;
dst_info.pixels = pixels_dst; dst_info.pixels = pixels_dst;
dst_info.rsx_address = get_address(dst_offset, dst_dma);
dst_info.swizzled = (method_registers.blit_engine_context_surface() == blit_engine::context_surface::swizzle2d); dst_info.swizzled = (method_registers.blit_engine_context_surface() == blit_engine::context_surface::swizzle2d);
if (rsx->scaled_image_from_memory(src_info, dst_info, in_inter == blit_engine::transfer_interpolator::foh)) if (rsx->scaled_image_from_memory(src_info, dst_info, in_inter == blit_engine::transfer_interpolator::foh))
@ -1067,6 +1045,7 @@ namespace rsx
convert_w != out_w || convert_h != out_h; convert_w != out_w || convert_h != out_h;
const bool need_convert = out_format != in_format || std::abs(scale_x) != 1.0 || std::abs(scale_y) != 1.0; const bool need_convert = out_format != in_format || std::abs(scale_x) != 1.0 || std::abs(scale_y) != 1.0;
const u32 slice_h = std::ceil(f32(clip_h + clip_y) / scale_x);
if (method_registers.blit_engine_context_surface() != blit_engine::context_surface::swizzle2d) if (method_registers.blit_engine_context_surface() != blit_engine::context_surface::swizzle2d)
{ {
@ -1137,7 +1116,7 @@ namespace rsx
temp3.reset(new u8[out_pitch * (out_h - 1) + (out_bpp * out_w)]); temp3.reset(new u8[out_pitch * (out_h - 1) + (out_bpp * out_w)]);
convert_scale_image(temp3.get(), out_format, out_w, out_h, out_pitch, convert_scale_image(temp3.get(), out_format, out_w, out_h, out_pitch,
pixels_src, in_format, in_w, in_h, in_pitch, clip_h, in_inter == blit_engine::transfer_interpolator::foh); pixels_src, in_format, in_w, in_h, in_pitch, slice_h, in_inter == blit_engine::transfer_interpolator::foh);
} }
pixels_src = temp3.get(); pixels_src = temp3.get();
@ -1201,6 +1180,7 @@ namespace rsx
std::memcpy(pixels_dst, swizzled_pixels, out_bpp * sw_width * sw_height); std::memcpy(pixels_dst, swizzled_pixels, out_bpp * sw_width * sw_height);
} }
} }
#pragma optimize("", on)
} }
namespace nv0039 namespace nv0039

View file

@ -103,13 +103,9 @@ namespace rsx
u16 offset_y; u16 offset_y;
u16 width; u16 width;
u16 height; u16 height;
u16 slice_h;
u16 pitch; u16 pitch;
void *pixels;
bool compressed_x;
bool compressed_y;
u32 rsx_address; u32 rsx_address;
void *pixels;
}; };
struct blit_dst_info struct blit_dst_info
@ -124,16 +120,11 @@ namespace rsx
u16 clip_y; u16 clip_y;
u16 clip_width; u16 clip_width;
u16 clip_height; u16 clip_height;
u16 max_tile_h;
f32 scale_x; f32 scale_x;
f32 scale_y; f32 scale_y;
bool swizzled;
void *pixels;
bool compressed_x;
bool compressed_y;
u32 rsx_address; u32 rsx_address;
void *pixels;
bool swizzled;
}; };
static const std::pair<std::array<u8, 4>, std::array<u8, 4>> default_remap_vector = static const std::pair<std::array<u8, 4>, std::array<u8, 4>> default_remap_vector =