From 8a1cf2c913772153e1e9ca6cb0e9857f9371a1f1 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sat, 8 Jun 2019 23:47:46 +0300 Subject: [PATCH] rsx: Attempt to reduce stencil load overhead for nvidia cards --- rpcs3/Emu/RSX/Common/surface_utils.h | 9 +++++- rpcs3/Emu/RSX/GL/GLRenderTargets.h | 1 + rpcs3/Emu/RSX/VK/VKGSRender.cpp | 23 ++++++++++++++- rpcs3/Emu/RSX/VK/VKRenderTargets.h | 8 +++++- rpcs3/Emu/RSX/VK/VKResolveHelper.cpp | 43 ++++++++++++++++++++++++---- 5 files changed, 75 insertions(+), 9 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/surface_utils.h b/rpcs3/Emu/RSX/Common/surface_utils.h index 6dc68f6c49..0129f16493 100644 --- a/rpcs3/Emu/RSX/Common/surface_utils.h +++ b/rpcs3/Emu/RSX/Common/surface_utils.h @@ -148,6 +148,7 @@ namespace rsx flags32_t memory_usage_flags = surface_usage_flags::unknown; flags32_t state_flags = surface_state_flags::ready; flags32_t msaa_flags = surface_state_flags::ready; + flags32_t stencil_init_flags = 0; union { @@ -462,9 +463,15 @@ namespace rsx } } - void on_write_copy(u64 write_tag = 0) + void on_write_copy(u64 write_tag = 0, bool keep_optimizations = false) { on_write(write_tag, rsx::surface_state_flags::require_unresolve); + + if (!keep_optimizations && is_depth_surface()) + { + // A successful write-copy occurred, cannot guarantee flat contents in stencil area + stencil_init_flags |= (1 << 9); + } } // Returns the rect area occupied by this surface expressed as an 8bpp image with no AA diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.h b/rpcs3/Emu/RSX/GL/GLRenderTargets.h index 045d531805..70d9873b11 100644 --- a/rpcs3/Emu/RSX/GL/GLRenderTargets.h +++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.h @@ -261,6 +261,7 @@ struct gl_render_target_traits surface->set_rsx_pitch((u16)pitch); surface->queue_tag(address); surface->last_use_tag = 0; + surface->stencil_init_flags = 0; surface->memory_usage_flags = rsx::surface_usage_flags::unknown; } diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 
deae932b66..249db2a3aa 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -1882,7 +1882,7 @@ void VKGSRender::clear_surface(u32 mask) //clip region std::tie(scissor_x, scissor_y, scissor_w, scissor_h) = rsx::clip_region(fb_width, fb_height, scissor_x, scissor_y, scissor_w, scissor_h, true); - VkClearRect region = { { { scissor_x, scissor_y },{ scissor_w, scissor_h } }, 0, 1 }; + VkClearRect region = { { { scissor_x, scissor_y }, { scissor_w, scissor_h } }, 0, 1 }; const bool require_mem_load = (scissor_w * scissor_h) < (fb_width * fb_height); auto surface_depth_format = rsx::method_registers.surface_depth_fmt(); @@ -1910,6 +1910,12 @@ void VKGSRender::clear_surface(u32 mask) depth_stencil_clear_values.depthStencil.stencil = clear_stencil; depth_stencil_mask |= VK_IMAGE_ASPECT_STENCIL_BIT; + + if (ds->samples() > 1) + { + if (!require_mem_load) ds->stencil_init_flags = 0; + ds->stencil_init_flags |= clear_stencil; + } } if ((mask & 0x3) != 0x3 && !require_mem_load && ds->state_flags & rsx::surface_state_flags::erase_bkgnd) @@ -2470,6 +2476,21 @@ bool VKGSRender::load_program() vk::get_compare_func(rsx::method_registers.back_stencil_func()), 0xFF, 0xFF); //write mask, func_mask, ref are dynamic } + + if (auto ds = m_rtts.m_bound_depth_stencil.second; + ds && ds->samples() > 1 && !(ds->stencil_init_flags & 0xFF00)) + { + if (properties.state.ds.front.failOp != VK_STENCIL_OP_KEEP || + properties.state.ds.front.depthFailOp != VK_STENCIL_OP_KEEP || + properties.state.ds.front.passOp != VK_STENCIL_OP_KEEP || + properties.state.ds.back.failOp != VK_STENCIL_OP_KEEP || + properties.state.ds.back.depthFailOp != VK_STENCIL_OP_KEEP || + properties.state.ds.back.passOp != VK_STENCIL_OP_KEEP) + { + // Front or back stencil ops can modify stencil: set bit 8 (0x100) to signal that a full bit-wise transfer is required + ds->stencil_init_flags |= (1 << 8); + } + } } const auto rasterization_samples = u8((m_current_renderpass_key >> 16) & 0xF); diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.h 
b/rpcs3/Emu/RSX/VK/VKRenderTargets.h index 082485ab92..4785de57f5 100644 --- a/rpcs3/Emu/RSX/VK/VKRenderTargets.h +++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.h @@ -359,12 +359,16 @@ namespace vk } vk::image *target_image = (samples() > 1) ? get_resolve_target() : this; + bool memory_load = true; if (dst_area.x1 == 0 && dst_area.y1 == 0 && unsigned(dst_area.x2) == target_image->width() && unsigned(dst_area.y2) == target_image->height()) { // Skip a bunch of useless work state_flags &= ~(rsx::surface_state_flags::erase_bkgnd); msaa_flags = rsx::surface_state_flags::ready; + + memory_load = false; + stencil_init_flags = src_texture->stencil_init_flags; } else if (state_flags & rsx::surface_state_flags::erase_bkgnd) { @@ -387,7 +391,7 @@ namespace vk dst_area, /*linear?*/false, /*depth?(unused)*/false, typeless_info); - on_write_copy(); + on_write_copy(0, !memory_load); if (!read_access && samples() > 1) { @@ -570,6 +574,7 @@ namespace rsx sink->state_flags = rsx::surface_state_flags::erase_bkgnd; sink->native_component_map = ref->native_component_map; sink->sample_layout = ref->sample_layout; + sink->stencil_init_flags = ref->stencil_init_flags; sink->native_pitch = u16(prev.width * ref->get_bpp() * ref->samples_x); sink->surface_width = prev.width; sink->surface_height = prev.height; @@ -631,6 +636,7 @@ namespace rsx surface->rsx_pitch = (u16)pitch; surface->queue_tag(address); surface->last_use_tag = 0; + surface->stencil_init_flags = 0; surface->memory_usage_flags = rsx::surface_usage_flags::unknown; } diff --git a/rpcs3/Emu/RSX/VK/VKResolveHelper.cpp b/rpcs3/Emu/RSX/VK/VKResolveHelper.cpp index c7677a5226..5fafa2142c 100644 --- a/rpcs3/Emu/RSX/VK/VKResolveHelper.cpp +++ b/rpcs3/Emu/RSX/VK/VKResolveHelper.cpp @@ -2,6 +2,7 @@ #include "VKResolveHelper.h" #include "VKRenderPass.h" +#include "VKRenderTargets.h" namespace { @@ -83,10 +84,26 @@ namespace vk else { initialize_pass(g_depth_resolver, dev); - initialize_pass(g_stencil_resolver, dev); - g_depth_resolver->run(cmd, 
src, dst, renderpass); - g_stencil_resolver->run(cmd, src, dst, renderpass); + + // Chance for optimization here: If the stencil buffer was not used, simply perform a clear operation + const auto stencil_init_flags = vk::as_rtt(src)->stencil_init_flags; + if (stencil_init_flags & 0xFF00) + { + initialize_pass(g_stencil_resolver, dev); + g_stencil_resolver->run(cmd, src, dst, renderpass); + } + else + { + VkClearDepthStencilValue clear{ 1.f, stencil_init_flags & 0xFF }; + VkImageSubresourceRange range{ VK_IMAGE_ASPECT_STENCIL_BIT, 0, 1, 0, 1 }; + + dst->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + vkCmdClearDepthStencilImage(cmd, dst->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, &clear, 1, &range); + dst->pop_layout(cmd); + } + + vk::as_rtt(dst)->stencil_init_flags = stencil_init_flags; } } else @@ -128,10 +145,24 @@ namespace vk else { initialize_pass(g_depth_unresolver, dev); - initialize_pass(g_stencil_unresolver, dev); - g_depth_unresolver->run(cmd, dst, src, renderpass); - g_stencil_unresolver->run(cmd, dst, src, renderpass); + + // Chance for optimization here: If the stencil buffer was not used, simply perform a clear operation + const auto stencil_init_flags = vk::as_rtt(dst)->stencil_init_flags; + if (stencil_init_flags & 0xFF00) + { + initialize_pass(g_stencil_unresolver, dev); + g_stencil_unresolver->run(cmd, dst, src, renderpass); + } + else + { + VkClearDepthStencilValue clear{ 1.f, stencil_init_flags & 0xFF }; + VkImageSubresourceRange range{ VK_IMAGE_ASPECT_STENCIL_BIT, 0, 1, 0, 1 }; + + dst->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + vkCmdClearDepthStencilImage(cmd, dst->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, &clear, 1, &range); + dst->pop_layout(cmd); + } } } else