rsx: Attempt to reduce stencil load overhead for nvidia cards

This commit is contained in:
kd-11 2019-06-08 23:47:46 +03:00 committed by kd-11
parent ca82dd7200
commit 8a1cf2c913
5 changed files with 75 additions and 9 deletions

View file

@ -148,6 +148,7 @@ namespace rsx
flags32_t memory_usage_flags = surface_usage_flags::unknown;
flags32_t state_flags = surface_state_flags::ready;
flags32_t msaa_flags = surface_state_flags::ready;
flags32_t stencil_init_flags = 0;
union
{
@ -462,9 +463,15 @@ namespace rsx
}
}
void on_write_copy(u64 write_tag = 0)
// Notifies the surface that it was written to by a copy operation.
// write_tag: forwarded unchanged to on_write().
// keep_optimizations: when true, the stencil-init tracking state is left untouched;
// when false (default) and this is a depth surface, the stencil contents are marked as unknown.
void on_write_copy(u64 write_tag = 0, bool keep_optimizations = false)
{
// Copies always flag the surface with require_unresolve
on_write(write_tag, rsx::surface_state_flags::require_unresolve);
if (!keep_optimizations && is_depth_surface())
{
// A successful write-copy occurred, cannot guarantee flat contents in stencil area.
// Bit 9 lies inside the 0xFF00 range that consumers of stencil_init_flags test
// before substituting a plain clear for a full stencil transfer.
stencil_init_flags |= (1 << 9);
}
}
// Returns the rect area occupied by this surface expressed as an 8bpp image with no AA

View file

@ -261,6 +261,7 @@ struct gl_render_target_traits
surface->set_rsx_pitch((u16)pitch);
surface->queue_tag(address);
surface->last_use_tag = 0;
surface->stencil_init_flags = 0;
surface->memory_usage_flags = rsx::surface_usage_flags::unknown;
}

View file

@ -1882,7 +1882,7 @@ void VKGSRender::clear_surface(u32 mask)
//clip region
std::tie(scissor_x, scissor_y, scissor_w, scissor_h) = rsx::clip_region<u16>(fb_width, fb_height, scissor_x, scissor_y, scissor_w, scissor_h, true);
VkClearRect region = { { { scissor_x, scissor_y },{ scissor_w, scissor_h } }, 0, 1 };
VkClearRect region = { { { scissor_x, scissor_y }, { scissor_w, scissor_h } }, 0, 1 };
const bool require_mem_load = (scissor_w * scissor_h) < (fb_width * fb_height);
auto surface_depth_format = rsx::method_registers.surface_depth_fmt();
@ -1910,6 +1910,12 @@ void VKGSRender::clear_surface(u32 mask)
depth_stencil_clear_values.depthStencil.stencil = clear_stencil;
depth_stencil_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
// Stencil-init tracking is only maintained for multisampled depth-stencil surfaces
if (ds->samples() > 1)
{
// Clear covers the whole surface (no memory load needed): drop every flag bit above the low byte
if (!require_mem_load) ds->stencil_init_flags &= 0xFF;
// Record the stencil clear value in the low bits.
// NOTE(review): the previous low byte is kept and the new value is OR-ed into it rather than
// replacing it - confirm this is intended when successive clears use different stencil values.
ds->stencil_init_flags |= clear_stencil;
}
}
if ((mask & 0x3) != 0x3 && !require_mem_load && ds->state_flags & rsx::surface_state_flags::erase_bkgnd)
@ -2470,6 +2476,21 @@ bool VKGSRender::load_program()
vk::get_compare_func(rsx::method_registers.back_stencil_func()),
0xFF, 0xFF); //write mask, func_mask, ref are dynamic
}
// If the bound depth-stencil surface is multisampled and its stencil plane is still
// considered flat (no bit set in the 0xFF00 range of stencil_init_flags), check whether
// the pipeline being built is able to modify stencil values.
if (auto ds = m_rtts.m_bound_depth_stencil.second;
	ds && ds->samples() > 1 && !(ds->stencil_init_flags & 0xFF00))
{
	// Front and back faces carry independent stencil ops, so both must be inspected.
	// Checking only the front face would miss pipelines that write stencil on back faces.
	const auto& front = properties.state.ds.front;
	const auto& back = properties.state.ds.back;

	if (front.failOp != VK_STENCIL_OP_KEEP ||
		front.depthFailOp != VK_STENCIL_OP_KEEP ||
		front.passOp != VK_STENCIL_OP_KEEP ||
		back.failOp != VK_STENCIL_OP_KEEP ||
		back.depthFailOp != VK_STENCIL_OP_KEEP ||
		back.passOp != VK_STENCIL_OP_KEEP)
	{
		// Set bit 8 to signal that a full bit-wise stencil transfer is required
		ds->stencil_init_flags |= (1 << 8);
	}
}
}
const auto rasterization_samples = u8((m_current_renderpass_key >> 16) & 0xF);

View file

@ -359,12 +359,16 @@ namespace vk
}
vk::image *target_image = (samples() > 1) ? get_resolve_target() : this;
bool memory_load = true;
if (dst_area.x1 == 0 && dst_area.y1 == 0 &&
unsigned(dst_area.x2) == target_image->width() && unsigned(dst_area.y2) == target_image->height())
{
// Skip a bunch of useless work
state_flags &= ~(rsx::surface_state_flags::erase_bkgnd);
msaa_flags = rsx::surface_state_flags::ready;
memory_load = false;
stencil_init_flags = src_texture->stencil_init_flags;
}
else if (state_flags & rsx::surface_state_flags::erase_bkgnd)
{
@ -387,7 +391,7 @@ namespace vk
dst_area,
/*linear?*/false, /*depth?(unused)*/false, typeless_info);
on_write_copy();
on_write_copy(0, !memory_load);
if (!read_access && samples() > 1)
{
@ -570,6 +574,7 @@ namespace rsx
sink->state_flags = rsx::surface_state_flags::erase_bkgnd;
sink->native_component_map = ref->native_component_map;
sink->sample_layout = ref->sample_layout;
sink->stencil_init_flags = ref->stencil_init_flags;
sink->native_pitch = u16(prev.width * ref->get_bpp() * ref->samples_x);
sink->surface_width = prev.width;
sink->surface_height = prev.height;
@ -631,6 +636,7 @@ namespace rsx
surface->rsx_pitch = (u16)pitch;
surface->queue_tag(address);
surface->last_use_tag = 0;
surface->stencil_init_flags = 0;
surface->memory_usage_flags = rsx::surface_usage_flags::unknown;
}

View file

@ -2,6 +2,7 @@
#include "VKResolveHelper.h"
#include "VKRenderPass.h"
#include "VKRenderTargets.h"
namespace
{
@ -83,10 +84,26 @@ namespace vk
else
{
initialize_pass(g_depth_resolver, dev);
initialize_pass(g_stencil_resolver, dev);
g_depth_resolver->run(cmd, src, dst, renderpass);
g_stencil_resolver->run(cmd, src, dst, renderpass);
// Chance for optimization here: If the stencil buffer was not used, simply perform a clear operation
const auto stencil_init_flags = vk::as_rtt(src)->stencil_init_flags;
if (stencil_init_flags & 0xFF00)
{
initialize_pass(g_stencil_resolver, dev);
g_stencil_resolver->run(cmd, src, dst, renderpass);
}
else
{
VkClearDepthStencilValue clear{ 1.f, stencil_init_flags & 0xFF };
VkImageSubresourceRange range{ VK_IMAGE_ASPECT_STENCIL_BIT, 0, 1, 0, 1 };
dst->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
vkCmdClearDepthStencilImage(cmd, dst->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, &clear, 1, &range);
dst->pop_layout(cmd);
}
vk::as_rtt(dst)->stencil_init_flags = stencil_init_flags;
}
}
else
@ -128,10 +145,24 @@ namespace vk
else
{
initialize_pass(g_depth_unresolver, dev);
initialize_pass(g_stencil_unresolver, dev);
g_depth_unresolver->run(cmd, dst, src, renderpass);
g_stencil_unresolver->run(cmd, dst, src, renderpass);
// Chance for optimization here: If the stencil buffer was not used, simply perform a clear operation
const auto stencil_init_flags = vk::as_rtt(dst)->stencil_init_flags;
if (stencil_init_flags & 0xFF00)
{
initialize_pass(g_stencil_unresolver, dev);
g_stencil_unresolver->run(cmd, dst, src, renderpass);
}
else
{
VkClearDepthStencilValue clear{ 1.f, stencil_init_flags & 0xFF };
VkImageSubresourceRange range{ VK_IMAGE_ASPECT_STENCIL_BIT, 0, 1, 0, 1 };
dst->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
vkCmdClearDepthStencilImage(cmd, dst->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, &clear, 1, &range);
dst->pop_layout(cmd);
}
}
}
else