mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-07 07:21:25 +12:00
Move code to cpp (#9938)
* GL: move GLOverlays code to cpp * GL: move GLCompute code to cpp * VK: move VKOverlays code to cpp * VK: move VKCompute code to cpp
This commit is contained in:
parent
9cbe77904d
commit
cbd895a29c
19 changed files with 2578 additions and 2344 deletions
|
@ -430,10 +430,12 @@ target_sources(rpcs3_emu PRIVATE
|
||||||
RSX/Capture/rsx_capture.cpp
|
RSX/Capture/rsx_capture.cpp
|
||||||
RSX/Capture/rsx_replay.cpp
|
RSX/Capture/rsx_replay.cpp
|
||||||
RSX/GL/GLCommonDecompiler.cpp
|
RSX/GL/GLCommonDecompiler.cpp
|
||||||
|
RSX/GL/GLCompute.cpp
|
||||||
RSX/GL/GLDraw.cpp
|
RSX/GL/GLDraw.cpp
|
||||||
RSX/GL/GLFragmentProgram.cpp
|
RSX/GL/GLFragmentProgram.cpp
|
||||||
RSX/GL/GLGSRender.cpp
|
RSX/GL/GLGSRender.cpp
|
||||||
RSX/GL/GLHelpers.cpp
|
RSX/GL/GLHelpers.cpp
|
||||||
|
RSX/GL/GLOverlays.cpp
|
||||||
RSX/GL/GLPipelineCompiler.cpp
|
RSX/GL/GLPipelineCompiler.cpp
|
||||||
RSX/GL/GLPresent.cpp
|
RSX/GL/GLPresent.cpp
|
||||||
RSX/GL/GLRenderTargets.cpp
|
RSX/GL/GLRenderTargets.cpp
|
||||||
|
@ -462,6 +464,7 @@ if(TARGET 3rdparty_vulkan)
|
||||||
RSX/VK/vkutils/shared.cpp
|
RSX/VK/vkutils/shared.cpp
|
||||||
RSX/VK/VKCommandStream.cpp
|
RSX/VK/VKCommandStream.cpp
|
||||||
RSX/VK/VKCommonDecompiler.cpp
|
RSX/VK/VKCommonDecompiler.cpp
|
||||||
|
RSX/VK/VKCompute.cpp
|
||||||
RSX/VK/VKDMA.cpp
|
RSX/VK/VKDMA.cpp
|
||||||
RSX/VK/VKDraw.cpp
|
RSX/VK/VKDraw.cpp
|
||||||
RSX/VK/VKFormats.cpp
|
RSX/VK/VKFormats.cpp
|
||||||
|
@ -470,6 +473,7 @@ if(TARGET 3rdparty_vulkan)
|
||||||
RSX/VK/VKGSRender.cpp
|
RSX/VK/VKGSRender.cpp
|
||||||
RSX/VK/VKHelpers.cpp
|
RSX/VK/VKHelpers.cpp
|
||||||
RSX/VK/VKMemAlloc.cpp
|
RSX/VK/VKMemAlloc.cpp
|
||||||
|
RSX/VK/VKOverlays.cpp
|
||||||
RSX/VK/VKPipelineCompiler.cpp
|
RSX/VK/VKPipelineCompiler.cpp
|
||||||
RSX/VK/VKPresent.cpp
|
RSX/VK/VKPresent.cpp
|
||||||
RSX/VK/VKProgramPipeline.cpp
|
RSX/VK/VKProgramPipeline.cpp
|
||||||
|
|
297
rpcs3/Emu/RSX/GL/GLCompute.cpp
Normal file
297
rpcs3/Emu/RSX/GL/GLCompute.cpp
Normal file
|
@ -0,0 +1,297 @@
|
||||||
|
#include "GLCompute.h"
|
||||||
|
#include "Utilities/StrUtil.h"
|
||||||
|
|
||||||
|
namespace gl
|
||||||
|
{
|
||||||
|
void compute_task::initialize()
|
||||||
|
{
|
||||||
|
// Set up optimal kernel size
|
||||||
|
const auto& caps = gl::get_driver_caps();
|
||||||
|
if (caps.vendor_AMD || caps.vendor_MESA)
|
||||||
|
{
|
||||||
|
optimal_group_size = 64;
|
||||||
|
unroll_loops = false;
|
||||||
|
}
|
||||||
|
else if (caps.vendor_NVIDIA)
|
||||||
|
{
|
||||||
|
optimal_group_size = 32;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
optimal_group_size = 128;
|
||||||
|
}
|
||||||
|
|
||||||
|
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, reinterpret_cast<GLint*>(&max_invocations_x));
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_task::create()
|
||||||
|
{
|
||||||
|
if (!compiled)
|
||||||
|
{
|
||||||
|
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
|
||||||
|
m_shader.compile();
|
||||||
|
|
||||||
|
m_program.create();
|
||||||
|
m_program.attach(m_shader);
|
||||||
|
m_program.link();
|
||||||
|
|
||||||
|
compiled = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_task::destroy()
|
||||||
|
{
|
||||||
|
if (compiled)
|
||||||
|
{
|
||||||
|
m_program.remove();
|
||||||
|
m_shader.remove();
|
||||||
|
|
||||||
|
compiled = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dispatches the compute program over an invocations_x * invocations_y * 1 grid
// of work groups. The program that was current on entry is made current again
// before returning.
void compute_task::run(u32 invocations_x, u32 invocations_y)
{
	// Remember whichever program the caller had active
	GLint previous_program;
	glGetIntegerv(GL_CURRENT_PROGRAM, &previous_program);

	// Let the derived task bind its buffers, then dispatch
	bind_resources();
	m_program.use();
	glDispatchCompute(invocations_x, invocations_y, 1);

	glUseProgram(previous_program);
}
|
||||||
|
|
||||||
|
// Dispatches at least num_invocations work groups.
//
// If the count fits within max_invocations_x, a single row is dispatched.
// Otherwise the work is spread over a roughly square X*Y grid, with Y rounded up
// so that X*Y always covers num_invocations.
//
// The previous code only ever added one extra row to floor(sqrt(n)), and added
// none when n was a multiple of X. Since n can exceed X*X by up to 2*X, that
// could dispatch fewer groups than requested (e.g. n = X*X + 2*X gave X*X).
void compute_task::run(u32 num_invocations)
{
	if (num_invocations <= max_invocations_x) [[likely]]
	{
		run(num_invocations, 1);
		return;
	}

	// Since all the invocations will run, the optimal distribution is sqrt(count).
	// The cast truncates, which equals floor() for a non-negative value.
	// num_invocations is at least 1 on this path, so invocations_x is never 0.
	const u32 invocations_x = static_cast<u32>(std::sqrt(num_invocations));

	// Ceil division, written so that it cannot overflow u32
	u32 invocations_y = num_invocations / invocations_x;
	if (num_invocations % invocations_x)
	{
		invocations_y++;
	}

	run(invocations_x, invocations_y);
}
|
||||||
|
|
||||||
|
// Sets up the default GLSL fragments used by build(): a kernel that reads one
// word, passes it through the %f placeholder function and writes it back, the
// statement that advances the index, and the closing brace of main().
cs_shuffle_base::cs_shuffle_base()
{
	// Closes main()
	suffix = "}\n";

	// Moves on to the next word
	loop_advance = " index++;\n";

	// %f is substituted with the function name passed to build()
	work_kernel =
		" value = data[index];\n"
		" data[index] = %f(value);\n";
}
|
||||||
|
|
||||||
|
void cs_shuffle_base::build(const char* function_name, u32 _kernel_size)
|
||||||
|
{
|
||||||
|
// Initialize to allow detecting optimal settings
|
||||||
|
initialize();
|
||||||
|
|
||||||
|
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
||||||
|
|
||||||
|
m_src =
|
||||||
|
"#version 430\n"
|
||||||
|
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
|
||||||
|
"layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n"
|
||||||
|
"%ub"
|
||||||
|
"\n"
|
||||||
|
"#define KERNEL_SIZE %ks\n"
|
||||||
|
"\n"
|
||||||
|
"// Generic swap routines\n"
|
||||||
|
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
||||||
|
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
||||||
|
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
|
||||||
|
"\n"
|
||||||
|
"// Depth format conversions\n"
|
||||||
|
"#define d24f_to_f32(bits) (bits << 7)\n"
|
||||||
|
"#define f32_to_d24f(bits) (bits >> 7)\n"
|
||||||
|
"\n"
|
||||||
|
"uint linear_invocation_id()\n"
|
||||||
|
"{\n"
|
||||||
|
" uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
|
||||||
|
" return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n"
|
||||||
|
"}\n"
|
||||||
|
"\n"
|
||||||
|
"%md"
|
||||||
|
"void main()\n"
|
||||||
|
"{\n"
|
||||||
|
" uint invocation_id = linear_invocation_id();\n"
|
||||||
|
" uint index = invocation_id * KERNEL_SIZE;\n"
|
||||||
|
" uint value;\n"
|
||||||
|
" %vars"
|
||||||
|
"\n";
|
||||||
|
|
||||||
|
const std::pair<std::string, std::string> syntax_replace[] =
|
||||||
|
{
|
||||||
|
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
|
||||||
|
{ "%ws", std::to_string(optimal_group_size) },
|
||||||
|
{ "%ks", std::to_string(kernel_size) },
|
||||||
|
{ "%vars", variables },
|
||||||
|
{ "%f", function_name },
|
||||||
|
{ "%ub", uniforms },
|
||||||
|
{ "%md", method_declarations }
|
||||||
|
};
|
||||||
|
|
||||||
|
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||||
|
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
||||||
|
|
||||||
|
if (kernel_size <= 1)
|
||||||
|
{
|
||||||
|
m_src += " {\n" + work_kernel + " }\n";
|
||||||
|
}
|
||||||
|
else if (unroll_loops)
|
||||||
|
{
|
||||||
|
work_kernel += loop_advance + "\n";
|
||||||
|
|
||||||
|
m_src += std::string
|
||||||
|
(
|
||||||
|
" //Unrolled loop\n"
|
||||||
|
" {\n"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Assemble body with manual loop unroll to try loweing GPR usage
|
||||||
|
for (u32 n = 0; n < kernel_size; ++n)
|
||||||
|
{
|
||||||
|
m_src += work_kernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
m_src += " }\n";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
||||||
|
m_src += " {\n";
|
||||||
|
m_src += work_kernel;
|
||||||
|
m_src += loop_advance;
|
||||||
|
m_src += " }\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
m_src += suffix;
|
||||||
|
}
|
||||||
|
|
||||||
|
void cs_shuffle_base::bind_resources()
|
||||||
|
{
|
||||||
|
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the shuffle kernel over data_length bytes of data, starting at data_offset.
//
// The byte count is rounded up to a whole number of work groups; each work group
// covers optimal_group_size * kernel_size 32-bit words. If the rounded range does
// not fit inside the buffer, an error is logged and the dispatch still goes ahead.
void cs_shuffle_base::run(const gl::buffer* data, u32 data_length, u32 data_offset)
{
	// Recorded for bind_resources(), which the dispatch below calls
	m_data = data;
	m_data_offset = data_offset;
	m_data_length = data_length;

	const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
	const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation);
	const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;

	const auto num_bytes_required = num_bytes_to_process + data_offset;
	if (num_bytes_required > data->size())
	{
		// Technically robust buffer access should keep the driver from crashing in OOB situations.
		// The message now separates its two sentences with a space and reports the
		// same quantity the check above compares against (offset included).
		rsx_log.error("Inadequate buffer length submitted for a compute operation. "
			"Required=%d bytes, Available=%d bytes", num_bytes_required, data->size());
	}

	compute_task::run(num_invocations);
}
|
||||||
|
|
||||||
|
// Builds the compute shader for this conversion. For each output word the kernel
// reads two input words (a depth word and a word whose low byte is the stencil),
// packs them as (f32_to_d24f(depth) << 8) | stencil and stores the byte-swapped
// result.
cs_shuffle_d32fx8_to_x8d24f::cs_shuffle_d32fx8_to_x8d24f()
{
	uniforms = "uniform uint in_ptr, out_ptr;\n";

	// The shader shifts in_ptr/out_ptr right by 2 to get indices into uint data[]
	variables =
		" uint in_offset = in_ptr >> 2;\n"
		" uint out_offset = out_ptr >> 2;\n"
		" uint depth, stencil;\n";

	// The store indexes with out_offset. It previously used the raw out_ptr,
	// which left out_offset unused and put the write at the wrong element
	// whenever out_ptr was non-zero.
	work_kernel =
		" depth = data[index * 2 + in_offset];\n"
		" stencil = data[index * 2 + (in_offset + 1)] & 0xFFu;\n"
		" value = f32_to_d24f(depth) << 8;\n"
		" value |= stencil;\n"
		" data[index + out_offset] = bswap_u32(value);\n";

	cs_shuffle_base::build("");
}
|
||||||
|
|
||||||
|
void cs_shuffle_d32fx8_to_x8d24f::bind_resources()
|
||||||
|
{
|
||||||
|
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Converts num_texels texels located at src_offset into the region at dst_offset
// of the same buffer. The source is sized at 8 bytes per texel and the
// destination at 4 bytes per texel.
void cs_shuffle_d32fx8_to_x8d24f::run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
	// The bound window starts at the lower offset and ends where the higher region ends
	const bool dst_is_lower = src_offset > dst_offset;
	const u32 window_start = dst_is_lower ? dst_offset : src_offset;
	const u32 window_end = dst_is_lower ? (src_offset + num_texels * 8) : (dst_offset + num_texels * 4);
	m_ssbo_length = window_end - window_start;

	// The shader receives both offsets relative to the start of the window
	m_program.uniforms["in_ptr"] = src_offset - window_start;
	m_program.uniforms["out_ptr"] = dst_offset - window_start;

	cs_shuffle_base::run(data, num_texels * 4, window_start);
}
|
||||||
|
|
||||||
|
// Builds the compute shader for this conversion. For each input word the kernel
// byte-swaps it, splits it into a stencil value (low 8 bits) and a depth value
// (remaining bits), and writes two output words: d24f_to_f32(depth), then stencil.
cs_shuffle_x8d24f_to_d32fx8::cs_shuffle_x8d24f_to_d32fx8()
{
	// The shader shifts in_ptr/out_ptr right by 2 to get indices into uint data[]
	variables =
		" uint in_offset = in_ptr >> 2;\n"
		" uint out_offset = out_ptr >> 2;\n"
		" uint depth, stencil;\n";

	uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n";

	work_kernel =
		" value = data[index + in_offset];\n"
		" value = bswap_u32(value);\n"
		" stencil = (value & 0xFFu);\n"
		" depth = (value >> 8);\n"
		" data[index * 2 + out_offset] = d24f_to_f32(depth);\n"
		" data[index * 2 + (out_offset + 1)] = stencil;\n";

	cs_shuffle_base::build("");
}
|
||||||
|
|
||||||
|
void cs_shuffle_x8d24f_to_d32fx8::bind_resources()
|
||||||
|
{
|
||||||
|
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Converts num_texels texels located at src_offset into the region at dst_offset
// of the same buffer. The source is sized at 4 bytes per texel and the
// destination at 8 bytes per texel.
void cs_shuffle_x8d24f_to_d32fx8::run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
{
	// The bound window starts at the lower offset and ends where the higher region ends
	const bool dst_is_lower = src_offset > dst_offset;
	const u32 window_start = dst_is_lower ? dst_offset : src_offset;
	const u32 window_end = dst_is_lower ? (src_offset + num_texels * 4) : (dst_offset + num_texels * 8);
	m_ssbo_length = window_end - window_start;

	// The shader receives both offsets relative to the start of the window
	m_program.uniforms["in_ptr"] = src_offset - window_start;
	m_program.uniforms["out_ptr"] = dst_offset - window_start;

	cs_shuffle_base::run(data, num_texels * 4, window_start);
}
|
||||||
|
}
|
|
@ -1,10 +1,8 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "Utilities/StrUtil.h"
|
|
||||||
#include "Emu/IdManager.h"
|
#include "Emu/IdManager.h"
|
||||||
#include "GLHelpers.h"
|
#include "GLHelpers.h"
|
||||||
|
|
||||||
#include "util/asm.hpp"
|
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
namespace gl
|
namespace gl
|
||||||
|
@ -22,88 +20,14 @@ namespace gl
|
||||||
u32 optimal_kernel_size = 1;
|
u32 optimal_kernel_size = 1;
|
||||||
u32 max_invocations_x = 65535;
|
u32 max_invocations_x = 65535;
|
||||||
|
|
||||||
void initialize()
|
void initialize();
|
||||||
{
|
void create();
|
||||||
// Set up optimal kernel size
|
void destroy();
|
||||||
const auto& caps = gl::get_driver_caps();
|
|
||||||
if (caps.vendor_AMD || caps.vendor_MESA)
|
|
||||||
{
|
|
||||||
optimal_group_size = 64;
|
|
||||||
unroll_loops = false;
|
|
||||||
}
|
|
||||||
else if (caps.vendor_NVIDIA)
|
|
||||||
{
|
|
||||||
optimal_group_size = 32;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
optimal_group_size = 128;
|
|
||||||
}
|
|
||||||
|
|
||||||
glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, reinterpret_cast<GLint*>(&max_invocations_x));
|
virtual void bind_resources() {}
|
||||||
}
|
|
||||||
|
|
||||||
void create()
|
void run(u32 invocations_x, u32 invocations_y);
|
||||||
{
|
void run(u32 num_invocations);
|
||||||
if (!compiled)
|
|
||||||
{
|
|
||||||
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
|
|
||||||
m_shader.compile();
|
|
||||||
|
|
||||||
m_program.create();
|
|
||||||
m_program.attach(m_shader);
|
|
||||||
m_program.link();
|
|
||||||
|
|
||||||
compiled = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void destroy()
|
|
||||||
{
|
|
||||||
if (compiled)
|
|
||||||
{
|
|
||||||
m_program.remove();
|
|
||||||
m_shader.remove();
|
|
||||||
|
|
||||||
compiled = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void bind_resources()
|
|
||||||
{}
|
|
||||||
|
|
||||||
void run(u32 invocations_x, u32 invocations_y)
|
|
||||||
{
|
|
||||||
GLint old_program;
|
|
||||||
glGetIntegerv(GL_CURRENT_PROGRAM, &old_program);
|
|
||||||
|
|
||||||
bind_resources();
|
|
||||||
m_program.use();
|
|
||||||
glDispatchCompute(invocations_x, invocations_y, 1);
|
|
||||||
|
|
||||||
glUseProgram(old_program);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(u32 num_invocations)
|
|
||||||
{
|
|
||||||
u32 invocations_x, invocations_y;
|
|
||||||
if (num_invocations <= max_invocations_x) [[likely]]
|
|
||||||
{
|
|
||||||
invocations_x = num_invocations;
|
|
||||||
invocations_y = 1;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Since all the invocations will run, the optimal distribution is sqrt(count)
|
|
||||||
const u32 optimal_length = static_cast<u32>(floor(std::sqrt(num_invocations)));
|
|
||||||
invocations_x = optimal_length;
|
|
||||||
invocations_y = invocations_x;
|
|
||||||
|
|
||||||
if (num_invocations % invocations_x) invocations_y++;
|
|
||||||
}
|
|
||||||
|
|
||||||
run(invocations_x, invocations_y);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct cs_shuffle_base : compute_task
|
struct cs_shuffle_base : compute_task
|
||||||
|
@ -115,130 +39,13 @@ namespace gl
|
||||||
|
|
||||||
std::string uniforms, variables, work_kernel, loop_advance, suffix, method_declarations;
|
std::string uniforms, variables, work_kernel, loop_advance, suffix, method_declarations;
|
||||||
|
|
||||||
cs_shuffle_base()
|
cs_shuffle_base();
|
||||||
{
|
|
||||||
work_kernel =
|
|
||||||
" value = data[index];\n"
|
|
||||||
" data[index] = %f(value);\n";
|
|
||||||
|
|
||||||
loop_advance =
|
void build(const char* function_name, u32 _kernel_size = 0);
|
||||||
" index++;\n";
|
|
||||||
|
|
||||||
suffix =
|
void bind_resources() override;
|
||||||
"}\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
void build(const char* function_name, u32 _kernel_size = 0)
|
void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0);
|
||||||
{
|
|
||||||
// Initialize to allow detecting optimal settings
|
|
||||||
initialize();
|
|
||||||
|
|
||||||
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
|
||||||
|
|
||||||
m_src =
|
|
||||||
"#version 430\n"
|
|
||||||
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
|
|
||||||
"layout(binding=%loc, std430) buffer ssbo{ uint data[]; };\n"
|
|
||||||
"%ub"
|
|
||||||
"\n"
|
|
||||||
"#define KERNEL_SIZE %ks\n"
|
|
||||||
"\n"
|
|
||||||
"// Generic swap routines\n"
|
|
||||||
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
|
||||||
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
|
||||||
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
|
|
||||||
"\n"
|
|
||||||
"// Depth format conversions\n"
|
|
||||||
"#define d24f_to_f32(bits) (bits << 7)\n"
|
|
||||||
"#define f32_to_d24f(bits) (bits >> 7)\n"
|
|
||||||
"\n"
|
|
||||||
"uint linear_invocation_id()\n"
|
|
||||||
"{\n"
|
|
||||||
" uint size_in_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);\n"
|
|
||||||
" return (gl_GlobalInvocationID.y * size_in_x) + gl_GlobalInvocationID.x;\n"
|
|
||||||
"}\n"
|
|
||||||
"\n"
|
|
||||||
"%md"
|
|
||||||
"void main()\n"
|
|
||||||
"{\n"
|
|
||||||
" uint invocation_id = linear_invocation_id();\n"
|
|
||||||
" uint index = invocation_id * KERNEL_SIZE;\n"
|
|
||||||
" uint value;\n"
|
|
||||||
" %vars"
|
|
||||||
"\n";
|
|
||||||
|
|
||||||
const std::pair<std::string, std::string> syntax_replace[] =
|
|
||||||
{
|
|
||||||
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) },
|
|
||||||
{ "%ws", std::to_string(optimal_group_size) },
|
|
||||||
{ "%ks", std::to_string(kernel_size) },
|
|
||||||
{ "%vars", variables },
|
|
||||||
{ "%f", function_name },
|
|
||||||
{ "%ub", uniforms },
|
|
||||||
{ "%md", method_declarations }
|
|
||||||
};
|
|
||||||
|
|
||||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
|
||||||
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
|
||||||
|
|
||||||
if (kernel_size <= 1)
|
|
||||||
{
|
|
||||||
m_src += " {\n" + work_kernel + " }\n";
|
|
||||||
}
|
|
||||||
else if (unroll_loops)
|
|
||||||
{
|
|
||||||
work_kernel += loop_advance + "\n";
|
|
||||||
|
|
||||||
m_src += std::string
|
|
||||||
(
|
|
||||||
" //Unrolled loop\n"
|
|
||||||
" {\n"
|
|
||||||
);
|
|
||||||
|
|
||||||
// Assemble body with manual loop unroll to try loweing GPR usage
|
|
||||||
for (u32 n = 0; n < kernel_size; ++n)
|
|
||||||
{
|
|
||||||
m_src += work_kernel;
|
|
||||||
}
|
|
||||||
|
|
||||||
m_src += " }\n";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
|
||||||
m_src += " {\n";
|
|
||||||
m_src += work_kernel;
|
|
||||||
m_src += loop_advance;
|
|
||||||
m_src += " }\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
m_src += suffix;
|
|
||||||
}
|
|
||||||
|
|
||||||
void bind_resources() override
|
|
||||||
{
|
|
||||||
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_data_length);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(const gl::buffer* data, u32 data_length, u32 data_offset = 0)
|
|
||||||
{
|
|
||||||
m_data = data;
|
|
||||||
m_data_offset = data_offset;
|
|
||||||
m_data_length = data_length;
|
|
||||||
|
|
||||||
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
|
|
||||||
const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation);
|
|
||||||
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
|
|
||||||
|
|
||||||
if ((num_bytes_to_process + data_offset) > data->size())
|
|
||||||
{
|
|
||||||
// Technically robust buffer access should keep the driver from crashing in OOB situations
|
|
||||||
rsx_log.error("Inadequate buffer length submitted for a compute operation."
|
|
||||||
"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
|
|
||||||
}
|
|
||||||
|
|
||||||
compute_task::run(num_invocations);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct cs_shuffle_16 : cs_shuffle_base
|
struct cs_shuffle_16 : cs_shuffle_base
|
||||||
|
@ -272,97 +79,22 @@ namespace gl
|
||||||
{
|
{
|
||||||
u32 m_ssbo_length = 0;
|
u32 m_ssbo_length = 0;
|
||||||
|
|
||||||
cs_shuffle_d32fx8_to_x8d24f()
|
cs_shuffle_d32fx8_to_x8d24f();
|
||||||
{
|
|
||||||
uniforms = "uniform uint in_ptr, out_ptr;\n";
|
|
||||||
|
|
||||||
variables =
|
void bind_resources() override;
|
||||||
" uint in_offset = in_ptr >> 2;\n"
|
|
||||||
" uint out_offset = out_ptr >> 2;\n"
|
|
||||||
" uint depth, stencil;\n";
|
|
||||||
|
|
||||||
work_kernel =
|
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels);
|
||||||
" depth = data[index * 2 + in_offset];\n"
|
|
||||||
" stencil = data[index * 2 + (in_offset + 1)] & 0xFFu;\n"
|
|
||||||
" value = f32_to_d24f(depth) << 8;\n"
|
|
||||||
" value |= stencil;\n"
|
|
||||||
" data[index + out_ptr] = bswap_u32(value);\n";
|
|
||||||
|
|
||||||
cs_shuffle_base::build("");
|
|
||||||
}
|
|
||||||
|
|
||||||
void bind_resources() override
|
|
||||||
{
|
|
||||||
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
|
|
||||||
{
|
|
||||||
u32 data_offset;
|
|
||||||
if (src_offset > dst_offset)
|
|
||||||
{
|
|
||||||
data_offset = dst_offset;
|
|
||||||
m_ssbo_length = (src_offset + num_texels * 8) - data_offset;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
data_offset = src_offset;
|
|
||||||
m_ssbo_length = (dst_offset + num_texels * 4) - data_offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
m_program.uniforms["in_ptr"] = src_offset - data_offset;
|
|
||||||
m_program.uniforms["out_ptr"] = dst_offset - data_offset;
|
|
||||||
cs_shuffle_base::run(data, num_texels * 4, data_offset);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct cs_shuffle_x8d24f_to_d32fx8 : cs_shuffle_base
|
struct cs_shuffle_x8d24f_to_d32fx8 : cs_shuffle_base
|
||||||
{
|
{
|
||||||
u32 m_ssbo_length = 0;
|
u32 m_ssbo_length = 0;
|
||||||
|
|
||||||
cs_shuffle_x8d24f_to_d32fx8()
|
cs_shuffle_x8d24f_to_d32fx8();
|
||||||
{
|
|
||||||
uniforms = "uniform uint texel_count, in_ptr, out_ptr;\n";
|
|
||||||
|
|
||||||
variables =
|
void bind_resources() override;
|
||||||
" uint in_offset = in_ptr >> 2;\n"
|
|
||||||
" uint out_offset = out_ptr >> 2;\n"
|
|
||||||
" uint depth, stencil;\n";
|
|
||||||
|
|
||||||
work_kernel =
|
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels);
|
||||||
" value = data[index + in_offset];\n"
|
|
||||||
" value = bswap_u32(value);\n"
|
|
||||||
" stencil = (value & 0xFFu);\n"
|
|
||||||
" depth = (value >> 8);\n"
|
|
||||||
" data[index * 2 + out_offset] = d24f_to_f32(depth);\n"
|
|
||||||
" data[index * 2 + (out_offset + 1)] = stencil;\n";
|
|
||||||
|
|
||||||
cs_shuffle_base::build("");
|
|
||||||
}
|
|
||||||
|
|
||||||
void bind_resources() override
|
|
||||||
{
|
|
||||||
m_data->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), m_data_offset, m_ssbo_length);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(const gl::buffer* data, u32 src_offset, u32 dst_offset, u32 num_texels)
|
|
||||||
{
|
|
||||||
u32 data_offset;
|
|
||||||
if (src_offset > dst_offset)
|
|
||||||
{
|
|
||||||
data_offset = dst_offset;
|
|
||||||
m_ssbo_length = (src_offset + num_texels * 4) - data_offset;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
data_offset = src_offset;
|
|
||||||
m_ssbo_length = (dst_offset + num_texels * 8) - data_offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
m_program.uniforms["in_ptr"] = src_offset - data_offset;
|
|
||||||
m_program.uniforms["out_ptr"] = dst_offset - data_offset;
|
|
||||||
cs_shuffle_base::run(data, num_texels * 4, data_offset);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include "util/logs.hpp"
|
||||||
#include "util/types.hpp"
|
#include "util/types.hpp"
|
||||||
#include "Utilities/geometry.h"
|
#include "Utilities/geometry.h"
|
||||||
#include "OpenGL.h"
|
#include "OpenGL.h"
|
||||||
|
|
648
rpcs3/Emu/RSX/GL/GLOverlays.cpp
Normal file
648
rpcs3/Emu/RSX/GL/GLOverlays.cpp
Normal file
|
@ -0,0 +1,648 @@
|
||||||
|
#include "GLOverlays.h"
|
||||||
|
|
||||||
|
extern u64 get_system_time();
|
||||||
|
|
||||||
|
namespace gl
|
||||||
|
{
|
||||||
|
void overlay_pass::create()
|
||||||
|
{
|
||||||
|
if (!compiled)
|
||||||
|
{
|
||||||
|
fs.create(::glsl::program_domain::glsl_fragment_program, fs_src);
|
||||||
|
fs.compile();
|
||||||
|
|
||||||
|
vs.create(::glsl::program_domain::glsl_vertex_program, vs_src);
|
||||||
|
vs.compile();
|
||||||
|
|
||||||
|
program_handle.create();
|
||||||
|
program_handle.attach(vs);
|
||||||
|
program_handle.attach(fs);
|
||||||
|
program_handle.link();
|
||||||
|
|
||||||
|
fbo.create();
|
||||||
|
|
||||||
|
m_sampler.create();
|
||||||
|
m_sampler.apply_defaults(input_filter);
|
||||||
|
|
||||||
|
m_vertex_data_buffer.create();
|
||||||
|
|
||||||
|
int old_vao;
|
||||||
|
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
||||||
|
|
||||||
|
m_vao.create();
|
||||||
|
m_vao.bind();
|
||||||
|
|
||||||
|
m_vao.array_buffer = m_vertex_data_buffer;
|
||||||
|
auto ptr = buffer_pointer(&m_vao);
|
||||||
|
m_vao[0] = ptr;
|
||||||
|
|
||||||
|
glBindVertexArray(old_vao);
|
||||||
|
|
||||||
|
compiled = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void overlay_pass::destroy()
|
||||||
|
{
|
||||||
|
if (compiled)
|
||||||
|
{
|
||||||
|
program_handle.remove();
|
||||||
|
vs.remove();
|
||||||
|
fs.remove();
|
||||||
|
|
||||||
|
fbo.remove();
|
||||||
|
m_vao.remove();
|
||||||
|
m_vertex_data_buffer.remove();
|
||||||
|
|
||||||
|
m_sampler.remove();
|
||||||
|
|
||||||
|
compiled = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void overlay_pass::emit_geometry()
|
||||||
|
{
|
||||||
|
int old_vao;
|
||||||
|
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
||||||
|
|
||||||
|
m_vao.bind();
|
||||||
|
glDrawArrays(primitives, 0, num_drawable_elements);
|
||||||
|
|
||||||
|
glBindVertexArray(old_vao);
|
||||||
|
}
|
||||||
|
|
||||||
|
void overlay_pass::run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending)
|
||||||
|
{
|
||||||
|
if (!compiled)
|
||||||
|
{
|
||||||
|
rsx_log.error("You must initialize overlay passes with create() before calling run()");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
GLint program;
|
||||||
|
GLint old_fbo;
|
||||||
|
GLint depth_func;
|
||||||
|
GLint viewport[4];
|
||||||
|
GLboolean color_writes[4];
|
||||||
|
GLboolean depth_write;
|
||||||
|
|
||||||
|
GLint blend_src_rgb;
|
||||||
|
GLint blend_src_a;
|
||||||
|
GLint blend_dst_rgb;
|
||||||
|
GLint blend_dst_a;
|
||||||
|
GLint blend_eq_a;
|
||||||
|
GLint blend_eq_rgb;
|
||||||
|
|
||||||
|
if (target_texture)
|
||||||
|
{
|
||||||
|
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo);
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, fbo.id());
|
||||||
|
|
||||||
|
if (depth_target)
|
||||||
|
{
|
||||||
|
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, target_texture, 0);
|
||||||
|
glDrawBuffer(GL_NONE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
GLenum buffer = GL_COLOR_ATTACHMENT0;
|
||||||
|
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, target_texture, 0);
|
||||||
|
glDrawBuffers(1, &buffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!target_texture || glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE)
|
||||||
|
{
|
||||||
|
// Push rasterizer state
|
||||||
|
glGetIntegerv(GL_VIEWPORT, viewport);
|
||||||
|
glGetBooleanv(GL_COLOR_WRITEMASK, color_writes);
|
||||||
|
glGetBooleanv(GL_DEPTH_WRITEMASK, &depth_write);
|
||||||
|
glGetIntegerv(GL_CURRENT_PROGRAM, &program);
|
||||||
|
glGetIntegerv(GL_DEPTH_FUNC, &depth_func);
|
||||||
|
|
||||||
|
GLboolean scissor_enabled = glIsEnabled(GL_SCISSOR_TEST);
|
||||||
|
GLboolean depth_test_enabled = glIsEnabled(GL_DEPTH_TEST);
|
||||||
|
GLboolean cull_face_enabled = glIsEnabled(GL_CULL_FACE);
|
||||||
|
GLboolean blend_enabled = glIsEnabledi(GL_BLEND, 0);
|
||||||
|
GLboolean stencil_test_enabled = glIsEnabled(GL_STENCIL_TEST);
|
||||||
|
|
||||||
|
if (use_blending)
|
||||||
|
{
|
||||||
|
glGetIntegerv(GL_BLEND_SRC_RGB, &blend_src_rgb);
|
||||||
|
glGetIntegerv(GL_BLEND_SRC_ALPHA, &blend_src_a);
|
||||||
|
glGetIntegerv(GL_BLEND_DST_RGB, &blend_dst_rgb);
|
||||||
|
glGetIntegerv(GL_BLEND_DST_ALPHA, &blend_dst_a);
|
||||||
|
glGetIntegerv(GL_BLEND_EQUATION_RGB, &blend_eq_rgb);
|
||||||
|
glGetIntegerv(GL_BLEND_EQUATION_ALPHA, &blend_eq_a);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set initial state
|
||||||
|
glViewport(region.x1, region.y1, region.width(), region.height());
|
||||||
|
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
|
||||||
|
glDepthMask(depth_target ? GL_TRUE : GL_FALSE);
|
||||||
|
|
||||||
|
// Disabling depth test will also disable depth writes which is not desired
|
||||||
|
glDepthFunc(GL_ALWAYS);
|
||||||
|
glEnable(GL_DEPTH_TEST);
|
||||||
|
|
||||||
|
if (scissor_enabled) glDisable(GL_SCISSOR_TEST);
|
||||||
|
if (cull_face_enabled) glDisable(GL_CULL_FACE);
|
||||||
|
if (stencil_test_enabled) glDisable(GL_STENCIL_TEST);
|
||||||
|
|
||||||
|
if (use_blending)
|
||||||
|
{
|
||||||
|
if (!blend_enabled)
|
||||||
|
glEnablei(GL_BLEND, 0);
|
||||||
|
|
||||||
|
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
|
||||||
|
glBlendEquation(GL_FUNC_ADD);
|
||||||
|
}
|
||||||
|
else if (blend_enabled)
|
||||||
|
{
|
||||||
|
glDisablei(GL_BLEND, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Render
|
||||||
|
program_handle.use();
|
||||||
|
on_load();
|
||||||
|
bind_resources();
|
||||||
|
emit_geometry();
|
||||||
|
|
||||||
|
// Clean up
|
||||||
|
if (target_texture)
|
||||||
|
{
|
||||||
|
if (depth_target)
|
||||||
|
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
|
||||||
|
else
|
||||||
|
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
|
||||||
|
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, old_fbo);
|
||||||
|
}
|
||||||
|
|
||||||
|
glUseProgram(program);
|
||||||
|
|
||||||
|
glViewport(viewport[0], viewport[1], viewport[2], viewport[3]);
|
||||||
|
glColorMask(color_writes[0], color_writes[1], color_writes[2], color_writes[3]);
|
||||||
|
glDepthMask(depth_write);
|
||||||
|
glDepthFunc(depth_func);
|
||||||
|
|
||||||
|
if (!depth_test_enabled) glDisable(GL_DEPTH_TEST);
|
||||||
|
if (scissor_enabled) glEnable(GL_SCISSOR_TEST);
|
||||||
|
if (cull_face_enabled) glEnable(GL_CULL_FACE);
|
||||||
|
if (stencil_test_enabled) glEnable(GL_STENCIL_TEST);
|
||||||
|
|
||||||
|
if (use_blending)
|
||||||
|
{
|
||||||
|
if (!blend_enabled)
|
||||||
|
glDisablei(GL_BLEND, 0);
|
||||||
|
|
||||||
|
glBlendFuncSeparate(blend_src_rgb, blend_dst_rgb, blend_src_a, blend_dst_a);
|
||||||
|
glBlendEquationSeparate(blend_eq_rgb, blend_eq_a);
|
||||||
|
}
|
||||||
|
else if (blend_enabled)
|
||||||
|
{
|
||||||
|
glEnablei(GL_BLEND, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
rsx_log.error("Overlay pass failed because framebuffer was not complete. Run with debug output enabled to diagnose the problem");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Builds the GLSL sources used to draw UI overlay primitives.
// The uniforms referenced by these shaders (viewport, ui_scale, time, color,
// sampler_mode, pulse_glow, blur_strength, clip_region, clip_bounds) are
// written by ui_overlay_renderer::run() before each draw.
ui_overlay_renderer::ui_overlay_renderer()
{
	// Smooth filtering required for inputs
	input_filter = GL_LINEAR;

	// Vertex stage: in_pos.xy is scaled by ui_scale and flipped vertically,
	// snapped to the viewport grid and expanded to [-1, 1]; in_pos.zw is
	// forwarded as the texture coordinate. The clip rectangle is converted to
	// window coordinates once per primitive (flat output).
	const char* const vertex_source =
		"#version 420\n\n"
		"layout(location=0) in vec4 in_pos;\n"
		"layout(location=0) out vec2 tc0;\n"
		"layout(location=1) flat out vec4 clip_rect;\n"
		"uniform vec4 ui_scale;\n"
		"uniform vec4 viewport;\n"
		"uniform vec4 clip_bounds;\n"
		"\n"
		"vec2 snap_to_grid(vec2 normalized)\n"
		"{\n"
		" return (floor(normalized * viewport.xy) + 0.5) / viewport.xy;\n"
		"}\n"
		"\n"
		"vec4 clip_to_ndc(const in vec4 coord)\n"
		"{\n"
		" vec4 ret = (coord * ui_scale.zwzw) / ui_scale.xyxy;\n"
		" ret.yw = 1. - ret.yw;\n"
		" return ret;\n"
		"}\n"
		"\n"
		"vec4 ndc_to_window(const in vec4 coord)\n"
		"{\n"
		" return fma(coord, viewport.xyxy, viewport.zwzw);\n"
		"}\n"
		"\n"
		"void main()\n"
		"{\n"
		" tc0.xy = in_pos.zw;\n"
		" clip_rect = ndc_to_window(clip_to_ndc(clip_bounds)).xwzy; // Swap y1 and y2 due to flipped origin!\n"
		" vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n"
		" pos.xy = snap_to_grid(pos.xy);\n"
		" gl_Position = (pos + pos) - 1.;\n"
		"}\n";

	// Fragment stage: optional rectangular clipping, optional alpha pulse
	// driven by sin(time), then one of three colour sources chosen by
	// sampler_mode (1 = 2D image on unit 31 with optional blur, 2 = glyph
	// array on unit 30, anything else = flat colour).
	const char* const fragment_source =
		"#version 420\n\n"
		"layout(binding=31) uniform sampler2D fs0;\n"
		"layout(binding=30) uniform sampler2DArray fs1;\n"
		"layout(location=0) in vec2 tc0;\n"
		"layout(location=1) flat in vec4 clip_rect;\n"
		"layout(location=0) out vec4 ocol;\n"
		"uniform vec4 color;\n"
		"uniform float time;\n"
		"uniform int sampler_mode;\n"
		"uniform int pulse_glow;\n"
		"uniform int clip_region;\n"
		"uniform int blur_strength;\n"
		"\n"
		"vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n"
		"{\n"
		" vec2 coords[9];\n"
		" coords[0] = coord - tex_offset\n;"
		" coords[1] = coord + vec2(0., -tex_offset.y);\n"
		" coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n"
		" coords[3] = coord + vec2(-tex_offset.x, 0.);\n"
		" coords[4] = coord;\n"
		" coords[5] = coord + vec2(tex_offset.x, 0.);\n"
		" coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n"
		" coords[7] = coord + vec2(0., tex_offset.y);\n"
		" coords[8] = coord + tex_offset;\n"
		"\n"
		" float weights[9] =\n"
		" {\n"
		" 1., 2., 1.,\n"
		" 2., 4., 2.,\n"
		" 1., 2., 1.\n"
		" };\n"
		"\n"
		" vec4 blurred = vec4(0.);\n"
		" for (int n = 0; n < 9; ++n)\n"
		" {\n"
		" blurred += texture(tex, coords[n]) * weights[n];\n"
		" }\n"
		"\n"
		" return blurred / 16.f;\n"
		"}\n"
		"\n"
		"vec4 sample_image(sampler2D tex, vec2 coord)\n"
		"{\n"
		" vec4 original = texture(tex, coord);\n"
		" if (blur_strength == 0) return original;\n"
		" \n"
		" vec2 constraints = 1.f / vec2(640, 360);\n"
		" vec2 res_offset = 1.f / textureSize(fs0, 0);\n"
		" vec2 tex_offset = max(res_offset, constraints);\n"
		"\n"
		" // Sample triangle pattern and average\n"
		" // TODO: Nicer looking gaussian blur with less sampling\n"
		" vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n"
		" vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n"
		" vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n"
		"\n"
		" vec4 blurred = blur0 + blur1 + blur2;\n"
		" blurred /= 3.;\n"
		" return mix(original, blurred, float(blur_strength) / 100.);\n"
		"}\n"
		"\n"
		"void main()\n"
		"{\n"
		" if (clip_region != 0)\n"
		" {"
		" if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n"
		" gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n"
		" {\n"
		" discard;\n"
		" return;\n"
		" }\n"
		" }\n"
		"\n"
		" vec4 diff_color = color;\n"
		" if (pulse_glow != 0)\n"
		" diff_color.a *= (sin(time) + 1.f) * 0.5f;\n"
		"\n"
		" switch (sampler_mode)\n"
		" {\n"
		" case 1:\n"
		" ocol = sample_image(fs0, tc0) * diff_color;\n"
		" break;\n"
		" case 2:\n"
		" ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))) * diff_color;\n"
		" break;\n"
		" default:\n"
		" ocol = diff_color;\n"
		" break;\n"
		" }\n"
		"}\n";

	vs_src = vertex_source;
	fs_src = fragment_source;
}
|
||||||
|
|
||||||
|
// Uploads the RGBA8 image described by `desc` into a new 2D texture, wraps it
// in a swizzled view and stores both in the renderer's caches.
//   temp_resource == false: the texture joins `resources` and the view is keyed
//                           by the current size of `view_cache`.
//   temp_resource == true : texture and view are keyed by the address of `desc`,
//                           and the texture is tagged with `owner_uid`.
// Returns a non-owning pointer to the cached view.
gl::texture_view* ui_overlay_renderer::load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid)
{
	auto image = std::make_unique<gl::texture>(GL_TEXTURE_2D, desc->w, desc->h, 1, 1, GL_RGBA8);
	image->copy_from(desc->data, gl::texture::format::rgba, gl::texture::type::uint_8_8_8_8, {});

	// Channel remap applied when sampling through the view
	GLenum swizzle[] = { GL_RED, GL_ALPHA, GL_BLUE, GL_GREEN };
	auto image_view = std::make_unique<gl::texture_view>(image.get(), swizzle);

	// Grab the raw pointer before ownership moves into a cache
	gl::texture_view* const handle = image_view.get();

	if (temp_resource)
	{
		const u64 key = reinterpret_cast<u64>(desc);
		temp_image_cache[key] = std::make_pair(owner_uid, std::move(image));
		temp_view_cache[key] = std::move(image_view);
		return handle;
	}

	resources.push_back(std::move(image));
	view_cache[view_cache.size()] = std::move(image_view);
	return handle;
}
|
||||||
|
|
||||||
|
void ui_overlay_renderer::create()
|
||||||
|
{
|
||||||
|
overlay_pass::create();
|
||||||
|
|
||||||
|
rsx::overlays::resource_config configuration;
|
||||||
|
configuration.load_files();
|
||||||
|
|
||||||
|
for (const auto &res : configuration.texture_raw_data)
|
||||||
|
{
|
||||||
|
load_simple_image(res.get(), false, UINT32_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
configuration.free_resources();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ui_overlay_renderer::destroy()
|
||||||
|
{
|
||||||
|
temp_image_cache.clear();
|
||||||
|
resources.clear();
|
||||||
|
font_cache.clear();
|
||||||
|
overlay_pass::destroy();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Discards every temporary image (and its view) whose stored owner id equals
// `key`. Matching cache keys are collected first so the cache is never
// modified while it is being iterated.
void ui_overlay_renderer::remove_temp_resources(u64 key)
{
	std::vector<u64> stale_keys;

	for (const auto& [image_key, owned_image] : temp_image_cache)
	{
		// owned_image is an (owner uid, texture) pair
		if (owned_image.first == key)
			stale_keys.push_back(image_key);
	}

	for (const auto& stale : stale_keys)
	{
		temp_image_cache.erase(stale);
		temp_view_cache.erase(stale);
	}
}
|
||||||
|
|
||||||
|
// Returns the glyph texture view for `font`, keyed by the font's address.
// A cached view is reused only while its image dimensions still equal the
// font's current glyph data dimensions; otherwise the glyph data is uploaded
// again into a fresh R8 2D array texture that replaces the cached entry.
gl::texture_view* ui_overlay_renderer::find_font(rsx::overlays::font* font)
{
	const auto font_size = font->get_glyph_data_dimensions();
	const u64 key = reinterpret_cast<u64>(font);

	if (const auto cached = view_cache.find(key); cached != view_cache.end())
	{
		const auto cached_size = cached->second->image()->size3D();
		const bool same_dimensions =
			font_size.width == cached_size.width &&
			font_size.height == cached_size.height &&
			font_size.depth == cached_size.depth;

		if (same_dimensions)
			return cached->second.get();
	}

	// Create font file
	std::vector<u8> glyph_bytes;
	font->get_glyph_data(glyph_bytes);

	auto glyph_texture = std::make_unique<gl::texture>(GL_TEXTURE_2D_ARRAY, font_size.width, font_size.height, font_size.depth, 1, GL_R8);
	glyph_texture->copy_from(glyph_bytes.data(), gl::texture::format::r, gl::texture::type::ubyte, {});

	// Broadcast the single red channel to all four components
	GLenum swizzle[] = { GL_RED, GL_RED, GL_RED, GL_RED };
	auto glyph_view = std::make_unique<gl::texture_view>(glyph_texture.get(), swizzle);

	// Grab the raw pointer before ownership moves into the cache
	gl::texture_view* const handle = glyph_view.get();
	font_cache[key] = std::move(glyph_texture);
	view_cache[key] = std::move(glyph_view);

	return handle;
}
|
||||||
|
|
||||||
|
// Looks up the temporary view cached for `desc` (keyed by its address) and
// uploads the image as a temporary resource owned by `owner_uid` on a miss.
gl::texture_view* ui_overlay_renderer::find_temp_image(rsx::overlays::image_info* desc, u32 owner_uid)
{
	const auto key = reinterpret_cast<u64>(desc);

	if (const auto hit = temp_view_cache.find(key); hit != temp_view_cache.end())
		return hit->second.get();

	return load_simple_image(desc, true, owner_uid);
}
|
||||||
|
|
||||||
|
// Records the overlay primitive type and selects the matching GL primitive
// mode. Quad lists share GL_TRIANGLE_STRIP with triangle strips;
// emit_geometry() draws them as one four-vertex strip per quad.
// Throws through fmt::throw_exception for any other primitive type.
void ui_overlay_renderer::set_primitive_type(rsx::overlays::primitive_type type)
{
	using primitive_type = rsx::overlays::primitive_type;

	// Stored before validation, exactly as the dispatch below relies on
	m_current_primitive_type = type;

	if (type == primitive_type::quad_list || type == primitive_type::triangle_strip)
	{
		primitives = GL_TRIANGLE_STRIP;
	}
	else if (type == primitive_type::line_list)
	{
		primitives = GL_LINES;
	}
	else if (type == primitive_type::line_strip)
	{
		primitives = GL_LINE_STRIP;
	}
	else
	{
		fmt::throw_exception("Unexpected primitive type %d", static_cast<s32>(type));
	}
}
|
||||||
|
|
||||||
|
void ui_overlay_renderer::emit_geometry()
|
||||||
|
{
|
||||||
|
if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list)
|
||||||
|
{
|
||||||
|
// Emulate quads with disjointed triangle strips
|
||||||
|
int num_quads = num_drawable_elements / 4;
|
||||||
|
std::vector<GLint> firsts;
|
||||||
|
std::vector<GLsizei> counts;
|
||||||
|
|
||||||
|
firsts.resize(num_quads);
|
||||||
|
counts.resize(num_quads);
|
||||||
|
|
||||||
|
for (int n = 0; n < num_quads; ++n)
|
||||||
|
{
|
||||||
|
firsts[n] = (n * 4);
|
||||||
|
counts[n] = 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
int old_vao;
|
||||||
|
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
||||||
|
|
||||||
|
m_vao.bind();
|
||||||
|
glMultiDrawArrays(GL_TRIANGLE_STRIP, firsts.data(), counts.data(), num_quads);
|
||||||
|
|
||||||
|
glBindVertexArray(old_vao);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
overlay_pass::emit_geometry();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ui_overlay_renderer::run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui)
|
||||||
|
{
|
||||||
|
program_handle.uniforms["viewport"] = color4f(static_cast<f32>(viewport.width()), static_cast<f32>(viewport.height()), static_cast<f32>(viewport.x1), static_cast<f32>(viewport.y1));
|
||||||
|
program_handle.uniforms["ui_scale"] = color4f(static_cast<f32>(ui.virtual_width), static_cast<f32>(ui.virtual_height), 1.f, 1.f);
|
||||||
|
program_handle.uniforms["time"] = static_cast<f32>(get_system_time() / 1000) * 0.005f;
|
||||||
|
|
||||||
|
saved_sampler_state save_30(30, m_sampler);
|
||||||
|
saved_sampler_state save_31(31, m_sampler);
|
||||||
|
|
||||||
|
for (auto &cmd : ui.get_compiled().draw_commands)
|
||||||
|
{
|
||||||
|
set_primitive_type(cmd.config.primitives);
|
||||||
|
upload_vertex_data(cmd.verts.data(), ::size32(cmd.verts));
|
||||||
|
num_drawable_elements = ::size32(cmd.verts);
|
||||||
|
GLint texture_read = GL_TRUE;
|
||||||
|
|
||||||
|
switch (cmd.config.texture_ref)
|
||||||
|
{
|
||||||
|
case rsx::overlays::image_resource_id::game_icon:
|
||||||
|
case rsx::overlays::image_resource_id::backbuffer:
|
||||||
|
//TODO
|
||||||
|
case rsx::overlays::image_resource_id::none:
|
||||||
|
{
|
||||||
|
texture_read = GL_FALSE;
|
||||||
|
glBindTexture(GL_TEXTURE_2D, GL_NONE);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case rsx::overlays::image_resource_id::raw_image:
|
||||||
|
{
|
||||||
|
glBindTexture(GL_TEXTURE_2D, find_temp_image(static_cast<rsx::overlays::image_info*>(cmd.config.external_data_ref), ui.uid)->id());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case rsx::overlays::image_resource_id::font_file:
|
||||||
|
{
|
||||||
|
texture_read = (GL_TRUE + 1);
|
||||||
|
glActiveTexture(GL_TEXTURE0 + 30);
|
||||||
|
glBindTexture(GL_TEXTURE_2D_ARRAY, find_font(cmd.config.font_ref)->id());
|
||||||
|
glActiveTexture(GL_TEXTURE0 + 31);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
glBindTexture(GL_TEXTURE_2D, view_cache[cmd.config.texture_ref - 1]->id());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
program_handle.uniforms["color"] = cmd.config.color;
|
||||||
|
program_handle.uniforms["sampler_mode"] = texture_read;
|
||||||
|
program_handle.uniforms["pulse_glow"] = static_cast<s32>(cmd.config.pulse_glow);
|
||||||
|
program_handle.uniforms["blur_strength"] = static_cast<s32>(cmd.config.blur_strength);
|
||||||
|
program_handle.uniforms["clip_region"] = static_cast<s32>(cmd.config.clip_region);
|
||||||
|
program_handle.uniforms["clip_bounds"] = cmd.config.clip_rect;
|
||||||
|
overlay_pass::run(viewport, target, false, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
ui.update();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Builds the GLSL sources for the video-out calibration pass. The uniforms
// (gamma, limit_range, stereo, stereo_image_count) are written by
// video_out_calibration_pass::run().
video_out_calibration_pass::video_out_calibration_pass()
{
	input_filter = GL_LINEAR;

	// Vertex stage: positions and texture coordinates come from constant
	// tables indexed by gl_VertexID, so no vertex attributes are consumed.
	const char* const vertex_source =
		"#version 420\n\n"
		"layout(location=0) out vec2 tc0;\n"
		"\n"
		"void main()\n"
		"{\n"
		" vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n"
		" vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n"
		" tc0 = coords[gl_VertexID % 4];\n"
		" vec2 pos = positions[gl_VertexID % 4];\n"
		" gl_Position = vec4(pos, 0., 1.);\n"
		"}\n";

	// Fragment stage: reads the source image (combining a left/right pair into
	// one colour when stereo is set), applies the gamma power curve and
	// optionally remaps the result into the limited range.
	const char* const fragment_source =
		"#version 420\n\n"
		"layout(binding=31) uniform sampler2D fs0;\n"
		"layout(binding=30) uniform sampler2D fs1;\n"
		"layout(location=0) in vec2 tc0;\n"
		"layout(location=0) out vec4 ocol;\n"
		"\n"
		"uniform float gamma;\n"
		"uniform int limit_range;\n"
		"uniform int stereo;\n"
		"uniform int stereo_image_count;\n"
		"\n"
		"vec4 read_source()\n"
		"{\n"
		" if (stereo == 0) return texture(fs0, tc0);\n"
		"\n"
		" vec4 left, right;\n"
		" if (stereo_image_count == 2)\n"
		" {\n"
		" left = texture(fs0, tc0);\n"
		" right = texture(fs1, tc0);\n"
		" }\n"
		" else\n"
		" {\n"
		" vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n"
		" vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n"
		" left = texture(fs0, coord_left);\n"
		" right = texture(fs0, coord_right);\n"
		" }\n"
		"\n"
		" return vec4(left.r, right.g, right.b, 1.);\n"
		"}\n"
		"\n"
		"void main()\n"
		"{\n"
		" vec4 color = read_source();\n"
		" color.rgb = pow(color.rgb, vec3(gamma));\n"
		" if (limit_range > 0)\n"
		" ocol = ((color * 220.) + 16.) / 255.;\n"
		" else\n"
		" ocol = color;\n"
		"}\n";

	vs_src = vertex_source;
	fs_src = fragment_source;
}
|
||||||
|
|
||||||
|
// Runs the calibration shader over `viewport`, with no target texture and
// blending disabled.
//   source      - source[0] is bound on unit 31 and source[1] on unit 30;
//                 source[1] == GL_NONE means only one image is present
//   gamma       - exponent applied to the colour channels by the shader
//   limited_rgb - selects the shader's limited-range remap
//   _3d         - selects the shader's stereo path
void video_out_calibration_pass::run(const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d)
{
	const int image_count = (source[1] == GL_NONE) ? 1 : 2;

	program_handle.uniforms["gamma"] = gamma;
	program_handle.uniforms["limit_range"] = static_cast<int>(limited_rgb);
	program_handle.uniforms["stereo"] = static_cast<int>(_3d);
	program_handle.uniforms["stereo_image_count"] = image_count;

	// Each guard is followed immediately by the bind for its unit; keep this order
	saved_sampler_state unit31_guard(31, m_sampler);
	glBindTexture(GL_TEXTURE_2D, source[0]);

	saved_sampler_state unit30_guard(30, m_sampler);
	glBindTexture(GL_TEXTURE_2D, source[1]);

	overlay_pass::run(viewport, GL_NONE, false, false);
}
|
||||||
|
}
|
|
@ -1,13 +1,12 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "util/types.hpp"
|
#include "util/types.hpp"
|
||||||
#include "GLHelpers.h"
|
|
||||||
#include "../Overlays/overlays.h"
|
#include "../Overlays/overlays.h"
|
||||||
|
#include "GLTexture.h"
|
||||||
|
#include "Emu/RSX/rsx_utils.h"
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
extern u64 get_system_time();
|
|
||||||
|
|
||||||
namespace gl
|
namespace gl
|
||||||
{
|
{
|
||||||
struct overlay_pass
|
struct overlay_pass
|
||||||
|
@ -53,61 +52,8 @@ namespace gl
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
void create()
|
void create();
|
||||||
{
|
void destroy();
|
||||||
if (!compiled)
|
|
||||||
{
|
|
||||||
fs.create(::glsl::program_domain::glsl_fragment_program, fs_src);
|
|
||||||
fs.compile();
|
|
||||||
|
|
||||||
vs.create(::glsl::program_domain::glsl_vertex_program, vs_src);
|
|
||||||
vs.compile();
|
|
||||||
|
|
||||||
program_handle.create();
|
|
||||||
program_handle.attach(vs);
|
|
||||||
program_handle.attach(fs);
|
|
||||||
program_handle.link();
|
|
||||||
|
|
||||||
fbo.create();
|
|
||||||
|
|
||||||
m_sampler.create();
|
|
||||||
m_sampler.apply_defaults(input_filter);
|
|
||||||
|
|
||||||
m_vertex_data_buffer.create();
|
|
||||||
|
|
||||||
int old_vao;
|
|
||||||
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
|
||||||
|
|
||||||
m_vao.create();
|
|
||||||
m_vao.bind();
|
|
||||||
|
|
||||||
m_vao.array_buffer = m_vertex_data_buffer;
|
|
||||||
auto ptr = buffer_pointer(&m_vao);
|
|
||||||
m_vao[0] = ptr;
|
|
||||||
|
|
||||||
glBindVertexArray(old_vao);
|
|
||||||
|
|
||||||
compiled = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void destroy()
|
|
||||||
{
|
|
||||||
if (compiled)
|
|
||||||
{
|
|
||||||
program_handle.remove();
|
|
||||||
vs.remove();
|
|
||||||
fs.remove();
|
|
||||||
|
|
||||||
fbo.remove();
|
|
||||||
m_vao.remove();
|
|
||||||
m_vertex_data_buffer.remove();
|
|
||||||
|
|
||||||
m_sampler.remove();
|
|
||||||
|
|
||||||
compiled = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void on_load() {}
|
virtual void on_load() {}
|
||||||
virtual void on_unload() {}
|
virtual void on_unload() {}
|
||||||
|
@ -121,155 +67,9 @@ namespace gl
|
||||||
m_vertex_data_buffer.data(elements_count * sizeof(T), data);
|
m_vertex_data_buffer.data(elements_count * sizeof(T), data);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void emit_geometry()
|
virtual void emit_geometry();
|
||||||
{
|
|
||||||
int old_vao;
|
|
||||||
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
|
||||||
|
|
||||||
m_vao.bind();
|
void run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending = false);
|
||||||
glDrawArrays(primitives, 0, num_drawable_elements);
|
|
||||||
|
|
||||||
glBindVertexArray(old_vao);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(const areau& region, GLuint target_texture, bool depth_target, bool use_blending = false)
|
|
||||||
{
|
|
||||||
if (!compiled)
|
|
||||||
{
|
|
||||||
rsx_log.error("You must initialize overlay passes with create() before calling run()");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
GLint program;
|
|
||||||
GLint old_fbo;
|
|
||||||
GLint depth_func;
|
|
||||||
GLint viewport[4];
|
|
||||||
GLboolean color_writes[4];
|
|
||||||
GLboolean depth_write;
|
|
||||||
|
|
||||||
GLint blend_src_rgb;
|
|
||||||
GLint blend_src_a;
|
|
||||||
GLint blend_dst_rgb;
|
|
||||||
GLint blend_dst_a;
|
|
||||||
GLint blend_eq_a;
|
|
||||||
GLint blend_eq_rgb;
|
|
||||||
|
|
||||||
if (target_texture)
|
|
||||||
{
|
|
||||||
glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo);
|
|
||||||
glBindFramebuffer(GL_FRAMEBUFFER, fbo.id());
|
|
||||||
|
|
||||||
if (depth_target)
|
|
||||||
{
|
|
||||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, target_texture, 0);
|
|
||||||
glDrawBuffer(GL_NONE);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
GLenum buffer = GL_COLOR_ATTACHMENT0;
|
|
||||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, target_texture, 0);
|
|
||||||
glDrawBuffers(1, &buffer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!target_texture || glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE)
|
|
||||||
{
|
|
||||||
// Push rasterizer state
|
|
||||||
glGetIntegerv(GL_VIEWPORT, viewport);
|
|
||||||
glGetBooleanv(GL_COLOR_WRITEMASK, color_writes);
|
|
||||||
glGetBooleanv(GL_DEPTH_WRITEMASK, &depth_write);
|
|
||||||
glGetIntegerv(GL_CURRENT_PROGRAM, &program);
|
|
||||||
glGetIntegerv(GL_DEPTH_FUNC, &depth_func);
|
|
||||||
|
|
||||||
GLboolean scissor_enabled = glIsEnabled(GL_SCISSOR_TEST);
|
|
||||||
GLboolean depth_test_enabled = glIsEnabled(GL_DEPTH_TEST);
|
|
||||||
GLboolean cull_face_enabled = glIsEnabled(GL_CULL_FACE);
|
|
||||||
GLboolean blend_enabled = glIsEnabledi(GL_BLEND, 0);
|
|
||||||
GLboolean stencil_test_enabled = glIsEnabled(GL_STENCIL_TEST);
|
|
||||||
|
|
||||||
if (use_blending)
|
|
||||||
{
|
|
||||||
glGetIntegerv(GL_BLEND_SRC_RGB, &blend_src_rgb);
|
|
||||||
glGetIntegerv(GL_BLEND_SRC_ALPHA, &blend_src_a);
|
|
||||||
glGetIntegerv(GL_BLEND_DST_RGB, &blend_dst_rgb);
|
|
||||||
glGetIntegerv(GL_BLEND_DST_ALPHA, &blend_dst_a);
|
|
||||||
glGetIntegerv(GL_BLEND_EQUATION_RGB, &blend_eq_rgb);
|
|
||||||
glGetIntegerv(GL_BLEND_EQUATION_ALPHA, &blend_eq_a);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set initial state
|
|
||||||
glViewport(region.x1, region.y1, region.width(), region.height());
|
|
||||||
glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
|
|
||||||
glDepthMask(depth_target ? GL_TRUE : GL_FALSE);
|
|
||||||
|
|
||||||
// Disabling depth test will also disable depth writes which is not desired
|
|
||||||
glDepthFunc(GL_ALWAYS);
|
|
||||||
glEnable(GL_DEPTH_TEST);
|
|
||||||
|
|
||||||
if (scissor_enabled) glDisable(GL_SCISSOR_TEST);
|
|
||||||
if (cull_face_enabled) glDisable(GL_CULL_FACE);
|
|
||||||
if (stencil_test_enabled) glDisable(GL_STENCIL_TEST);
|
|
||||||
|
|
||||||
if (use_blending)
|
|
||||||
{
|
|
||||||
if (!blend_enabled)
|
|
||||||
glEnablei(GL_BLEND, 0);
|
|
||||||
|
|
||||||
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
|
|
||||||
glBlendEquation(GL_FUNC_ADD);
|
|
||||||
}
|
|
||||||
else if (blend_enabled)
|
|
||||||
{
|
|
||||||
glDisablei(GL_BLEND, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Render
|
|
||||||
program_handle.use();
|
|
||||||
on_load();
|
|
||||||
bind_resources();
|
|
||||||
emit_geometry();
|
|
||||||
|
|
||||||
// Clean up
|
|
||||||
if (target_texture)
|
|
||||||
{
|
|
||||||
if (depth_target)
|
|
||||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
|
|
||||||
else
|
|
||||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
|
|
||||||
|
|
||||||
glBindFramebuffer(GL_FRAMEBUFFER, old_fbo);
|
|
||||||
}
|
|
||||||
|
|
||||||
glUseProgram(program);
|
|
||||||
|
|
||||||
glViewport(viewport[0], viewport[1], viewport[2], viewport[3]);
|
|
||||||
glColorMask(color_writes[0], color_writes[1], color_writes[2], color_writes[3]);
|
|
||||||
glDepthMask(depth_write);
|
|
||||||
glDepthFunc(depth_func);
|
|
||||||
|
|
||||||
if (!depth_test_enabled) glDisable(GL_DEPTH_TEST);
|
|
||||||
if (scissor_enabled) glEnable(GL_SCISSOR_TEST);
|
|
||||||
if (cull_face_enabled) glEnable(GL_CULL_FACE);
|
|
||||||
if (stencil_test_enabled) glEnable(GL_STENCIL_TEST);
|
|
||||||
|
|
||||||
if (use_blending)
|
|
||||||
{
|
|
||||||
if (!blend_enabled)
|
|
||||||
glDisablei(GL_BLEND, 0);
|
|
||||||
|
|
||||||
glBlendFuncSeparate(blend_src_rgb, blend_dst_rgb, blend_src_a, blend_dst_a);
|
|
||||||
glBlendEquationSeparate(blend_eq_rgb, blend_eq_a);
|
|
||||||
}
|
|
||||||
else if (blend_enabled)
|
|
||||||
{
|
|
||||||
glEnablei(GL_BLEND, 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
rsx_log.error("Overlay pass failed because framebuffer was not complete. Run with debug output enabled to diagnose the problem");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ui_overlay_renderer : public overlay_pass
|
struct ui_overlay_renderer : public overlay_pass
|
||||||
|
@ -282,443 +82,30 @@ namespace gl
|
||||||
std::unordered_map<u64, std::unique_ptr<gl::texture_view>> view_cache;
|
std::unordered_map<u64, std::unique_ptr<gl::texture_view>> view_cache;
|
||||||
rsx::overlays::primitive_type m_current_primitive_type = rsx::overlays::primitive_type::quad_list;
|
rsx::overlays::primitive_type m_current_primitive_type = rsx::overlays::primitive_type::quad_list;
|
||||||
|
|
||||||
ui_overlay_renderer()
|
ui_overlay_renderer();
|
||||||
{
|
|
||||||
vs_src =
|
|
||||||
"#version 420\n\n"
|
|
||||||
"layout(location=0) in vec4 in_pos;\n"
|
|
||||||
"layout(location=0) out vec2 tc0;\n"
|
|
||||||
"layout(location=1) flat out vec4 clip_rect;\n"
|
|
||||||
"uniform vec4 ui_scale;\n"
|
|
||||||
"uniform vec4 viewport;\n"
|
|
||||||
"uniform vec4 clip_bounds;\n"
|
|
||||||
"\n"
|
|
||||||
"vec2 snap_to_grid(vec2 normalized)\n"
|
|
||||||
"{\n"
|
|
||||||
" return (floor(normalized * viewport.xy) + 0.5) / viewport.xy;\n"
|
|
||||||
"}\n"
|
|
||||||
"\n"
|
|
||||||
"vec4 clip_to_ndc(const in vec4 coord)\n"
|
|
||||||
"{\n"
|
|
||||||
" vec4 ret = (coord * ui_scale.zwzw) / ui_scale.xyxy;\n"
|
|
||||||
" ret.yw = 1. - ret.yw;\n"
|
|
||||||
" return ret;\n"
|
|
||||||
"}\n"
|
|
||||||
"\n"
|
|
||||||
"vec4 ndc_to_window(const in vec4 coord)\n"
|
|
||||||
"{\n"
|
|
||||||
" return fma(coord, viewport.xyxy, viewport.zwzw);\n"
|
|
||||||
"}\n"
|
|
||||||
"\n"
|
|
||||||
"void main()\n"
|
|
||||||
"{\n"
|
|
||||||
" tc0.xy = in_pos.zw;\n"
|
|
||||||
" clip_rect = ndc_to_window(clip_to_ndc(clip_bounds)).xwzy; // Swap y1 and y2 due to flipped origin!\n"
|
|
||||||
" vec4 pos = vec4(clip_to_ndc(in_pos).xy, 0.5, 1.);\n"
|
|
||||||
" pos.xy = snap_to_grid(pos.xy);\n"
|
|
||||||
" gl_Position = (pos + pos) - 1.;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
fs_src =
|
gl::texture_view* load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid);
|
||||||
"#version 420\n\n"
|
|
||||||
"layout(binding=31) uniform sampler2D fs0;\n"
|
|
||||||
"layout(binding=30) uniform sampler2DArray fs1;\n"
|
|
||||||
"layout(location=0) in vec2 tc0;\n"
|
|
||||||
"layout(location=1) flat in vec4 clip_rect;\n"
|
|
||||||
"layout(location=0) out vec4 ocol;\n"
|
|
||||||
"uniform vec4 color;\n"
|
|
||||||
"uniform float time;\n"
|
|
||||||
"uniform int sampler_mode;\n"
|
|
||||||
"uniform int pulse_glow;\n"
|
|
||||||
"uniform int clip_region;\n"
|
|
||||||
"uniform int blur_strength;\n"
|
|
||||||
"\n"
|
|
||||||
"vec4 blur_sample(sampler2D tex, vec2 coord, vec2 tex_offset)\n"
|
|
||||||
"{\n"
|
|
||||||
" vec2 coords[9];\n"
|
|
||||||
" coords[0] = coord - tex_offset\n;"
|
|
||||||
" coords[1] = coord + vec2(0., -tex_offset.y);\n"
|
|
||||||
" coords[2] = coord + vec2(tex_offset.x, -tex_offset.y);\n"
|
|
||||||
" coords[3] = coord + vec2(-tex_offset.x, 0.);\n"
|
|
||||||
" coords[4] = coord;\n"
|
|
||||||
" coords[5] = coord + vec2(tex_offset.x, 0.);\n"
|
|
||||||
" coords[6] = coord + vec2(-tex_offset.x, tex_offset.y);\n"
|
|
||||||
" coords[7] = coord + vec2(0., tex_offset.y);\n"
|
|
||||||
" coords[8] = coord + tex_offset;\n"
|
|
||||||
"\n"
|
|
||||||
" float weights[9] =\n"
|
|
||||||
" {\n"
|
|
||||||
" 1., 2., 1.,\n"
|
|
||||||
" 2., 4., 2.,\n"
|
|
||||||
" 1., 2., 1.\n"
|
|
||||||
" };\n"
|
|
||||||
"\n"
|
|
||||||
" vec4 blurred = vec4(0.);\n"
|
|
||||||
" for (int n = 0; n < 9; ++n)\n"
|
|
||||||
" {\n"
|
|
||||||
" blurred += texture(tex, coords[n]) * weights[n];\n"
|
|
||||||
" }\n"
|
|
||||||
"\n"
|
|
||||||
" return blurred / 16.f;\n"
|
|
||||||
"}\n"
|
|
||||||
"\n"
|
|
||||||
"vec4 sample_image(sampler2D tex, vec2 coord)\n"
|
|
||||||
"{\n"
|
|
||||||
" vec4 original = texture(tex, coord);\n"
|
|
||||||
" if (blur_strength == 0) return original;\n"
|
|
||||||
" \n"
|
|
||||||
" vec2 constraints = 1.f / vec2(640, 360);\n"
|
|
||||||
" vec2 res_offset = 1.f / textureSize(fs0, 0);\n"
|
|
||||||
" vec2 tex_offset = max(res_offset, constraints);\n"
|
|
||||||
"\n"
|
|
||||||
" // Sample triangle pattern and average\n"
|
|
||||||
" // TODO: Nicer looking gaussian blur with less sampling\n"
|
|
||||||
" vec4 blur0 = blur_sample(tex, coord + vec2(-res_offset.x, 0.), tex_offset);\n"
|
|
||||||
" vec4 blur1 = blur_sample(tex, coord + vec2(res_offset.x, 0.), tex_offset);\n"
|
|
||||||
" vec4 blur2 = blur_sample(tex, coord + vec2(0., res_offset.y), tex_offset);\n"
|
|
||||||
"\n"
|
|
||||||
" vec4 blurred = blur0 + blur1 + blur2;\n"
|
|
||||||
" blurred /= 3.;\n"
|
|
||||||
" return mix(original, blurred, float(blur_strength) / 100.);\n"
|
|
||||||
"}\n"
|
|
||||||
"\n"
|
|
||||||
"void main()\n"
|
|
||||||
"{\n"
|
|
||||||
" if (clip_region != 0)\n"
|
|
||||||
" {"
|
|
||||||
" if (gl_FragCoord.x < clip_rect.x || gl_FragCoord.x > clip_rect.z ||\n"
|
|
||||||
" gl_FragCoord.y < clip_rect.y || gl_FragCoord.y > clip_rect.w)\n"
|
|
||||||
" {\n"
|
|
||||||
" discard;\n"
|
|
||||||
" return;\n"
|
|
||||||
" }\n"
|
|
||||||
" }\n"
|
|
||||||
"\n"
|
|
||||||
" vec4 diff_color = color;\n"
|
|
||||||
" if (pulse_glow != 0)\n"
|
|
||||||
" diff_color.a *= (sin(time) + 1.f) * 0.5f;\n"
|
|
||||||
"\n"
|
|
||||||
" switch (sampler_mode)\n"
|
|
||||||
" {\n"
|
|
||||||
" case 1:\n"
|
|
||||||
" ocol = sample_image(fs0, tc0) * diff_color;\n"
|
|
||||||
" break;\n"
|
|
||||||
" case 2:\n"
|
|
||||||
" ocol = texture(fs1, vec3(tc0.x, fract(tc0.y), trunc(tc0.y))) * diff_color;\n"
|
|
||||||
" break;\n"
|
|
||||||
" default:\n"
|
|
||||||
" ocol = diff_color;\n"
|
|
||||||
" break;\n"
|
|
||||||
" }\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
// Smooth filtering required for inputs
|
void create();
|
||||||
input_filter = GL_LINEAR;
|
void destroy();
|
||||||
}
|
|
||||||
|
|
||||||
gl::texture_view* load_simple_image(rsx::overlays::image_info* desc, bool temp_resource, u32 owner_uid)
|
void remove_temp_resources(u64 key);
|
||||||
{
|
|
||||||
auto tex = std::make_unique<gl::texture>(GL_TEXTURE_2D, desc->w, desc->h, 1, 1, GL_RGBA8);
|
|
||||||
tex->copy_from(desc->data, gl::texture::format::rgba, gl::texture::type::uint_8_8_8_8, {});
|
|
||||||
|
|
||||||
GLenum remap[] = { GL_RED, GL_ALPHA, GL_BLUE, GL_GREEN };
|
gl::texture_view* find_font(rsx::overlays::font* font);
|
||||||
auto view = std::make_unique<gl::texture_view>(tex.get(), remap);
|
|
||||||
|
|
||||||
auto result = view.get();
|
gl::texture_view* find_temp_image(rsx::overlays::image_info* desc, u32 owner_uid);
|
||||||
if (!temp_resource)
|
|
||||||
{
|
|
||||||
resources.push_back(std::move(tex));
|
|
||||||
view_cache[view_cache.size()] = std::move(view);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
u64 key = reinterpret_cast<u64>(desc);
|
|
||||||
temp_image_cache[key] = std::make_pair(owner_uid, std::move(tex));
|
|
||||||
temp_view_cache[key] = std::move(view);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
void set_primitive_type(rsx::overlays::primitive_type type);
|
||||||
}
|
|
||||||
|
|
||||||
void create()
|
void emit_geometry() override;
|
||||||
{
|
|
||||||
overlay_pass::create();
|
|
||||||
|
|
||||||
rsx::overlays::resource_config configuration;
|
void run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui);
|
||||||
configuration.load_files();
|
|
||||||
|
|
||||||
for (const auto &res : configuration.texture_raw_data)
|
|
||||||
{
|
|
||||||
load_simple_image(res.get(), false, UINT32_MAX);
|
|
||||||
}
|
|
||||||
|
|
||||||
configuration.free_resources();
|
|
||||||
}
|
|
||||||
|
|
||||||
void destroy()
|
|
||||||
{
|
|
||||||
temp_image_cache.clear();
|
|
||||||
resources.clear();
|
|
||||||
font_cache.clear();
|
|
||||||
overlay_pass::destroy();
|
|
||||||
}
|
|
||||||
|
|
||||||
void remove_temp_resources(u64 key)
{
	// Temporary images are stored as (owner uid, texture) pairs; 'key' is the
	// owner uid whose images must be dropped.
	// Collect first, erase afterwards, so the cache is never modified while
	// it is being iterated.
	std::vector<u64> stale_keys;
	for (const auto& [image_key, entry] : temp_image_cache)
	{
		if (entry.first != key)
		{
			continue;
		}

		stale_keys.push_back(image_key);
	}

	// Remove both the texture and the view that was created for it
	for (const auto& stale : stale_keys)
	{
		temp_image_cache.erase(stale);
		temp_view_cache.erase(stale);
	}
}
|
|
||||||
|
|
||||||
gl::texture_view* find_font(rsx::overlays::font *font)
{
	// Returns the cached glyph-atlas view for 'font', (re)building the texture
	// when there is no cached entry or its dimensions no longer match.
	const auto wanted_size = font->get_glyph_data_dimensions();

	// The font object's address is used as the cache key
	const u64 key = reinterpret_cast<u64>(font);

	if (const auto cached = view_cache.find(key); cached != view_cache.end())
	{
		const auto cached_size = cached->second->image()->size3D();
		const bool same_dimensions =
			wanted_size.width == cached_size.width &&
			wanted_size.height == cached_size.height &&
			wanted_size.depth == cached_size.depth;

		if (same_dimensions)
		{
			return cached->second.get();
		}
	}

	// Create font file
	std::vector<u8> glyph_data;
	font->get_glyph_data(glyph_data);

	// Single-channel 2D array texture holding the glyph data
	auto glyph_texture = std::make_unique<gl::texture>(GL_TEXTURE_2D_ARRAY, wanted_size.width, wanted_size.height, wanted_size.depth, 1, GL_R8);
	glyph_texture->copy_from(glyph_data.data(), gl::texture::format::r, gl::texture::type::ubyte, {});

	// Every output channel reads the red channel
	GLenum remap[] = { GL_RED, GL_RED, GL_RED, GL_RED };
	auto glyph_view = std::make_unique<gl::texture_view>(glyph_texture.get(), remap);

	// Grab the raw pointer before ownership moves into the caches
	gl::texture_view* const result = glyph_view.get();
	font_cache[key] = std::move(glyph_texture);
	view_cache[key] = std::move(glyph_view);

	return result;
}
|
|
||||||
|
|
||||||
gl::texture_view* find_temp_image(rsx::overlays::image_info *desc, u32 owner_uid)
{
	// Temporary images are keyed by the address of their descriptor
	const u64 key = reinterpret_cast<u64>(desc);

	if (const auto cached = temp_view_cache.find(key); cached != temp_view_cache.end())
	{
		return cached->second.get();
	}

	// Not cached yet: upload it as a temporary resource tagged with the owner uid
	return load_simple_image(desc, true, owner_uid);
}
|
|
||||||
|
|
||||||
void set_primitive_type(rsx::overlays::primitive_type type)
{
	using primitive_type = rsx::overlays::primitive_type;

	// Remember the logical type; emit_geometry() checks it to special-case quad lists
	m_current_primitive_type = type;

	// Translate the overlay primitive type to the GL topology used for drawing
	switch (type)
	{
	case primitive_type::quad_list:
		// Drawn as triangle strips (see emit_geometry)
	case primitive_type::triangle_strip:
	{
		primitives = GL_TRIANGLE_STRIP;
		break;
	}
	case primitive_type::line_list:
	{
		primitives = GL_LINES;
		break;
	}
	case primitive_type::line_strip:
	{
		primitives = GL_LINE_STRIP;
		break;
	}
	default:
	{
		fmt::throw_exception("Unexpected primitive type %d", static_cast<s32>(type));
	}
	}
}
|
|
||||||
|
|
||||||
void emit_geometry() override
|
|
||||||
{
|
|
||||||
if (m_current_primitive_type == rsx::overlays::primitive_type::quad_list)
|
|
||||||
{
|
|
||||||
// Emulate quads with disjointed triangle strips
|
|
||||||
int num_quads = num_drawable_elements / 4;
|
|
||||||
std::vector<GLint> firsts;
|
|
||||||
std::vector<GLsizei> counts;
|
|
||||||
|
|
||||||
firsts.resize(num_quads);
|
|
||||||
counts.resize(num_quads);
|
|
||||||
|
|
||||||
for (int n = 0; n < num_quads; ++n)
|
|
||||||
{
|
|
||||||
firsts[n] = (n * 4);
|
|
||||||
counts[n] = 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
int old_vao;
|
|
||||||
glGetIntegerv(GL_VERTEX_ARRAY_BINDING, &old_vao);
|
|
||||||
|
|
||||||
m_vao.bind();
|
|
||||||
glMultiDrawArrays(GL_TRIANGLE_STRIP, firsts.data(), counts.data(), num_quads);
|
|
||||||
|
|
||||||
glBindVertexArray(old_vao);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
overlay_pass::emit_geometry();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(const areau& viewport, GLuint target, rsx::overlays::overlay& ui)
|
|
||||||
{
|
|
||||||
program_handle.uniforms["viewport"] = color4f(static_cast<f32>(viewport.width()), static_cast<f32>(viewport.height()), static_cast<f32>(viewport.x1), static_cast<f32>(viewport.y1));
|
|
||||||
program_handle.uniforms["ui_scale"] = color4f(static_cast<f32>(ui.virtual_width), static_cast<f32>(ui.virtual_height), 1.f, 1.f);
|
|
||||||
program_handle.uniforms["time"] = static_cast<f32>(get_system_time() / 1000) * 0.005f;
|
|
||||||
|
|
||||||
saved_sampler_state save_30(30, m_sampler);
|
|
||||||
saved_sampler_state save_31(31, m_sampler);
|
|
||||||
|
|
||||||
for (auto &cmd : ui.get_compiled().draw_commands)
|
|
||||||
{
|
|
||||||
set_primitive_type(cmd.config.primitives);
|
|
||||||
upload_vertex_data(cmd.verts.data(), ::size32(cmd.verts));
|
|
||||||
num_drawable_elements = ::size32(cmd.verts);
|
|
||||||
GLint texture_read = GL_TRUE;
|
|
||||||
|
|
||||||
switch (cmd.config.texture_ref)
|
|
||||||
{
|
|
||||||
case rsx::overlays::image_resource_id::game_icon:
|
|
||||||
case rsx::overlays::image_resource_id::backbuffer:
|
|
||||||
//TODO
|
|
||||||
case rsx::overlays::image_resource_id::none:
|
|
||||||
{
|
|
||||||
texture_read = GL_FALSE;
|
|
||||||
glBindTexture(GL_TEXTURE_2D, GL_NONE);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case rsx::overlays::image_resource_id::raw_image:
|
|
||||||
{
|
|
||||||
glBindTexture(GL_TEXTURE_2D, find_temp_image(static_cast<rsx::overlays::image_info*>(cmd.config.external_data_ref), ui.uid)->id());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case rsx::overlays::image_resource_id::font_file:
|
|
||||||
{
|
|
||||||
texture_read = (GL_TRUE + 1);
|
|
||||||
glActiveTexture(GL_TEXTURE0 + 30);
|
|
||||||
glBindTexture(GL_TEXTURE_2D_ARRAY, find_font(cmd.config.font_ref)->id());
|
|
||||||
glActiveTexture(GL_TEXTURE0 + 31);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
{
|
|
||||||
glBindTexture(GL_TEXTURE_2D, view_cache[cmd.config.texture_ref - 1]->id());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
program_handle.uniforms["color"] = cmd.config.color;
|
|
||||||
program_handle.uniforms["sampler_mode"] = texture_read;
|
|
||||||
program_handle.uniforms["pulse_glow"] = static_cast<s32>(cmd.config.pulse_glow);
|
|
||||||
program_handle.uniforms["blur_strength"] = static_cast<s32>(cmd.config.blur_strength);
|
|
||||||
program_handle.uniforms["clip_region"] = static_cast<s32>(cmd.config.clip_region);
|
|
||||||
program_handle.uniforms["clip_bounds"] = cmd.config.clip_rect;
|
|
||||||
overlay_pass::run(viewport, target, false, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
ui.update();
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct video_out_calibration_pass : public overlay_pass
|
struct video_out_calibration_pass : public overlay_pass
|
||||||
{
|
{
|
||||||
video_out_calibration_pass()
|
video_out_calibration_pass();
|
||||||
{
|
|
||||||
vs_src =
|
|
||||||
"#version 420\n\n"
|
|
||||||
"layout(location=0) out vec2 tc0;\n"
|
|
||||||
"\n"
|
|
||||||
"void main()\n"
|
|
||||||
"{\n"
|
|
||||||
" vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n"
|
|
||||||
" vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n"
|
|
||||||
" tc0 = coords[gl_VertexID % 4];\n"
|
|
||||||
" vec2 pos = positions[gl_VertexID % 4];\n"
|
|
||||||
" gl_Position = vec4(pos, 0., 1.);\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
fs_src =
|
void run(const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d);
|
||||||
"#version 420\n\n"
|
|
||||||
"layout(binding=31) uniform sampler2D fs0;\n"
|
|
||||||
"layout(binding=30) uniform sampler2D fs1;\n"
|
|
||||||
"layout(location=0) in vec2 tc0;\n"
|
|
||||||
"layout(location=0) out vec4 ocol;\n"
|
|
||||||
"\n"
|
|
||||||
"uniform float gamma;\n"
|
|
||||||
"uniform int limit_range;\n"
|
|
||||||
"uniform int stereo;\n"
|
|
||||||
"uniform int stereo_image_count;\n"
|
|
||||||
"\n"
|
|
||||||
"vec4 read_source()\n"
|
|
||||||
"{\n"
|
|
||||||
" if (stereo == 0) return texture(fs0, tc0);\n"
|
|
||||||
"\n"
|
|
||||||
" vec4 left, right;\n"
|
|
||||||
" if (stereo_image_count == 2)\n"
|
|
||||||
" {\n"
|
|
||||||
" left = texture(fs0, tc0);\n"
|
|
||||||
" right = texture(fs1, tc0);\n"
|
|
||||||
" }\n"
|
|
||||||
" else\n"
|
|
||||||
" {\n"
|
|
||||||
" vec2 coord_left = tc0 * vec2(1.f, 0.4898f);\n"
|
|
||||||
" vec2 coord_right = coord_left + vec2(0.f, 0.510204f);\n"
|
|
||||||
" left = texture(fs0, coord_left);\n"
|
|
||||||
" right = texture(fs0, coord_right);\n"
|
|
||||||
" }\n"
|
|
||||||
"\n"
|
|
||||||
" return vec4(left.r, right.g, right.b, 1.);\n"
|
|
||||||
"}\n"
|
|
||||||
"\n"
|
|
||||||
"void main()\n"
|
|
||||||
"{\n"
|
|
||||||
" vec4 color = read_source();\n"
|
|
||||||
" color.rgb = pow(color.rgb, vec3(gamma));\n"
|
|
||||||
" if (limit_range > 0)\n"
|
|
||||||
" ocol = ((color * 220.) + 16.) / 255.;\n"
|
|
||||||
" else\n"
|
|
||||||
" ocol = color;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
input_filter = GL_LINEAR;
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d)
|
|
||||||
{
|
|
||||||
program_handle.uniforms["gamma"] = gamma;
|
|
||||||
program_handle.uniforms["limit_range"] = limited_rgb + 0;
|
|
||||||
program_handle.uniforms["stereo"] = _3d + 0;
|
|
||||||
program_handle.uniforms["stereo_image_count"] = (source[1] == GL_NONE? 1 : 2);
|
|
||||||
|
|
||||||
saved_sampler_state saved(31, m_sampler);
|
|
||||||
glBindTexture(GL_TEXTURE_2D, source[0]);
|
|
||||||
|
|
||||||
saved_sampler_state saved2(30, m_sampler);
|
|
||||||
glBindTexture(GL_TEXTURE_2D, source[1]);
|
|
||||||
|
|
||||||
overlay_pass::run(viewport, GL_NONE, false, false);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,6 +4,8 @@
|
||||||
#include "Utilities/geometry.h"
|
#include "Utilities/geometry.h"
|
||||||
#include "overlay_utils.h"
|
#include "overlay_utils.h"
|
||||||
|
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
namespace rsx
|
namespace rsx
|
||||||
{
|
{
|
||||||
namespace overlays
|
namespace overlays
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <pwd.h>
|
#include <pwd.h>
|
||||||
#include <libgen.h>
|
#include <libgen.h>
|
||||||
|
#include <limits.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
|
|
428
rpcs3/Emu/RSX/VK/VKCompute.cpp
Normal file
428
rpcs3/Emu/RSX/VK/VKCompute.cpp
Normal file
|
@ -0,0 +1,428 @@
|
||||||
|
#include "VKCompute.h"
|
||||||
|
#include "VKHelpers.h"
|
||||||
|
#include "VKRenderPass.h"
|
||||||
|
#include "vkutils/buffer_object.h"
|
||||||
|
|
||||||
|
#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame
|
||||||
|
|
||||||
|
namespace vk
|
||||||
|
{
|
||||||
|
std::vector<std::pair<VkDescriptorType, u8>> compute_task::get_descriptor_layout()
|
||||||
|
{
|
||||||
|
std::vector<std::pair<VkDescriptorType, u8>> result;
|
||||||
|
result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_task::init_descriptors()
|
||||||
|
{
|
||||||
|
std::vector<VkDescriptorPoolSize> descriptor_pool_sizes;
|
||||||
|
std::vector<VkDescriptorSetLayoutBinding> bindings;
|
||||||
|
|
||||||
|
const auto layout = get_descriptor_layout();
|
||||||
|
for (const auto &e : layout)
|
||||||
|
{
|
||||||
|
descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)});
|
||||||
|
|
||||||
|
for (unsigned n = 0; n < e.second; ++n)
|
||||||
|
{
|
||||||
|
bindings.push_back
|
||||||
|
({
|
||||||
|
u32(bindings.size()),
|
||||||
|
e.first,
|
||||||
|
1,
|
||||||
|
VK_SHADER_STAGE_COMPUTE_BIT,
|
||||||
|
nullptr
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reserve descriptor pools
|
||||||
|
m_descriptor_pool.create(*g_render_device, descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 3);
|
||||||
|
|
||||||
|
VkDescriptorSetLayoutCreateInfo infos = {};
|
||||||
|
infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
|
||||||
|
infos.pBindings = bindings.data();
|
||||||
|
infos.bindingCount = ::size32(bindings);
|
||||||
|
|
||||||
|
CHECK_RESULT(vkCreateDescriptorSetLayout(*g_render_device, &infos, nullptr, &m_descriptor_layout));
|
||||||
|
|
||||||
|
VkPipelineLayoutCreateInfo layout_info = {};
|
||||||
|
layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
|
||||||
|
layout_info.setLayoutCount = 1;
|
||||||
|
layout_info.pSetLayouts = &m_descriptor_layout;
|
||||||
|
|
||||||
|
VkPushConstantRange push_constants{};
|
||||||
|
if (use_push_constants)
|
||||||
|
{
|
||||||
|
push_constants.size = push_constants_size;
|
||||||
|
push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
|
||||||
|
|
||||||
|
layout_info.pushConstantRangeCount = 1;
|
||||||
|
layout_info.pPushConstantRanges = &push_constants;
|
||||||
|
}
|
||||||
|
|
||||||
|
CHECK_RESULT(vkCreatePipelineLayout(*g_render_device, &layout_info, nullptr, &m_pipeline_layout));
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_task::create()
|
||||||
|
{
|
||||||
|
if (!initialized)
|
||||||
|
{
|
||||||
|
init_descriptors();
|
||||||
|
|
||||||
|
switch (vk::get_driver_vendor())
|
||||||
|
{
|
||||||
|
case vk::driver_vendor::unknown:
|
||||||
|
case vk::driver_vendor::INTEL:
|
||||||
|
// Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256
|
||||||
|
// Based on intel's own OpenCL recommended settings
|
||||||
|
unroll_loops = true;
|
||||||
|
optimal_kernel_size = 1;
|
||||||
|
optimal_group_size = 128;
|
||||||
|
break;
|
||||||
|
case vk::driver_vendor::NVIDIA:
|
||||||
|
// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
|
||||||
|
unroll_loops = true;
|
||||||
|
optimal_group_size = 32;
|
||||||
|
optimal_kernel_size = 1;
|
||||||
|
break;
|
||||||
|
case vk::driver_vendor::AMD:
|
||||||
|
case vk::driver_vendor::RADV:
|
||||||
|
// Wavefronts are multiples of 64
|
||||||
|
unroll_loops = false;
|
||||||
|
optimal_kernel_size = 1;
|
||||||
|
optimal_group_size = 64;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto& gpu = vk::g_render_device->gpu();
|
||||||
|
max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
|
||||||
|
|
||||||
|
initialized = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_task::destroy()
|
||||||
|
{
|
||||||
|
if (initialized)
|
||||||
|
{
|
||||||
|
m_shader.destroy();
|
||||||
|
m_program.reset();
|
||||||
|
m_param_buffer.reset();
|
||||||
|
|
||||||
|
vkDestroyDescriptorSetLayout(*g_render_device, m_descriptor_layout, nullptr);
|
||||||
|
vkDestroyPipelineLayout(*g_render_device, m_pipeline_layout, nullptr);
|
||||||
|
m_descriptor_pool.destroy();
|
||||||
|
|
||||||
|
initialized = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_task::free_resources()
|
||||||
|
{
|
||||||
|
if (m_used_descriptors == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
m_descriptor_pool.reset(0);
|
||||||
|
m_used_descriptors = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_task::load_program(VkCommandBuffer cmd)
{
	// Lazily compile the shader and build the compute pipeline on first use
	if (!m_program)
	{
		m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);

		VkPipelineShaderStageCreateInfo stage_info{};
		stage_info.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
		stage_info.stage = VK_SHADER_STAGE_COMPUTE_BIT;
		stage_info.module = m_shader.compile();
		stage_info.pName = "main";

		VkComputePipelineCreateInfo pipeline_info{};
		pipeline_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
		pipeline_info.stage = stage_info;
		pipeline_info.layout = m_pipeline_layout;
		pipeline_info.basePipelineHandle = VK_NULL_HANDLE;
		pipeline_info.basePipelineIndex = -1;

		m_program = vk::get_pipe_compiler()->compile(pipeline_info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE);
		declare_inputs();
	}

	// The pool was sized for VK_MAX_COMPUTE_TASKS sets between resets
	ensure(m_used_descriptors < VK_MAX_COMPUTE_TASKS);

	// Allocate a fresh descriptor set for this dispatch
	VkDescriptorSetAllocateInfo set_alloc_info = {};
	set_alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
	set_alloc_info.descriptorPool = m_descriptor_pool;
	set_alloc_info.descriptorSetCount = 1;
	set_alloc_info.pSetLayouts = &m_descriptor_layout;

	CHECK_RESULT(vkAllocateDescriptorSets(*g_render_device, &set_alloc_info, &m_descriptor_set));
	m_used_descriptors++;

	// Let the derived task write its buffers into the new set
	bind_resources();

	vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline);
	vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr);
}
|
||||||
|
|
||||||
|
void compute_task::run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z)
{
	// CmdDispatch is outside renderpass scope only,
	// so close any renderpass still open on this command buffer
	const bool inside_renderpass = vk::is_renderpass_open(cmd);
	if (inside_renderpass)
	{
		vk::end_renderpass(cmd);
	}

	// Bind pipeline + descriptors, then dispatch the requested grid
	load_program(cmd);
	vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z);
}
|
||||||
|
|
||||||
|
void compute_task::run(VkCommandBuffer cmd, u32 num_invocations)
{
	// Dispatches a linear job of 'num_invocations' workgroups, folding it into
	// a 2D grid when the device cannot take that many groups in X alone.
	u32 invocations_x = num_invocations;
	u32 invocations_y = 1;

	if (num_invocations > max_invocations_x)
	{
		// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
		// Split the 1D job into 2 dimensions to accommodate this
		invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));

		// Rows needed to cover the whole job: ceil(num_invocations / invocations_x).
		// With x = floor(sqrt(n)) this can be x, x + 1 or x + 2, so it has to be
		// computed by division; bumping x by at most one leaves part of the job
		// undispatched (e.g. n = 15 -> x = 3 needs 5 rows, not 3).
		// invocations_x is at least 1 here because num_invocations is non-zero in this branch.
		invocations_y = num_invocations / invocations_x;
		if (num_invocations % invocations_x)
		{
			invocations_y++;
		}
	}

	run(cmd, invocations_x, invocations_y, 1);
}
|
||||||
|
|
||||||
|
cs_shuffle_base::cs_shuffle_base()
{
	// GLSL fragments that build() stitches together.

	// Closes main()
	suffix =
		"}\n";

	// Executed after each kernel iteration when KERNEL_SIZE > 1
	loop_advance =
		" index++;\n";

	// Default per-word operation; "%f" is replaced with the function name passed to build()
	work_kernel =
		" value = data[index];\n"
		" data[index] = %f(value);\n";
}
|
||||||
|
|
||||||
|
void cs_shuffle_base::build(const char* function_name, u32 _kernel_size)
|
||||||
|
{
|
||||||
|
// Initialize to allow detecting optimal settings
|
||||||
|
create();
|
||||||
|
|
||||||
|
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
||||||
|
|
||||||
|
m_src =
|
||||||
|
"#version 430\n"
|
||||||
|
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
|
||||||
|
"layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n"
|
||||||
|
"%ub"
|
||||||
|
"\n"
|
||||||
|
"#define KERNEL_SIZE %ks\n"
|
||||||
|
"\n"
|
||||||
|
"// Generic swap routines\n"
|
||||||
|
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
||||||
|
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
||||||
|
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
|
||||||
|
"\n"
|
||||||
|
"// Depth format conversions\n"
|
||||||
|
"#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n"
|
||||||
|
"#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n"
|
||||||
|
"#define d24f_to_f32(bits) (bits << 7)\n"
|
||||||
|
"#define f32_to_d24f(bits) (bits >> 7)\n"
|
||||||
|
"#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n"
|
||||||
|
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
|
||||||
|
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
|
||||||
|
"\n"
|
||||||
|
"%md"
|
||||||
|
"void main()\n"
|
||||||
|
"{\n"
|
||||||
|
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
|
||||||
|
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
|
||||||
|
" uint index = invocation_id * KERNEL_SIZE;\n"
|
||||||
|
" uint value;\n"
|
||||||
|
"%vars"
|
||||||
|
"\n";
|
||||||
|
|
||||||
|
const auto parameters_size = utils::align(push_constants_size, 16) / 16;
|
||||||
|
const std::pair<std::string, std::string> syntax_replace[] =
|
||||||
|
{
|
||||||
|
{ "%ws", std::to_string(optimal_group_size) },
|
||||||
|
{ "%ks", std::to_string(kernel_size) },
|
||||||
|
{ "%vars", variables },
|
||||||
|
{ "%f", function_name },
|
||||||
|
{ "%md", method_declarations },
|
||||||
|
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
|
||||||
|
};
|
||||||
|
|
||||||
|
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||||
|
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
||||||
|
|
||||||
|
if (kernel_size <= 1)
|
||||||
|
{
|
||||||
|
m_src += " {\n" + work_kernel + " }\n";
|
||||||
|
}
|
||||||
|
else if (unroll_loops)
|
||||||
|
{
|
||||||
|
work_kernel += loop_advance + "\n";
|
||||||
|
|
||||||
|
m_src += std::string
|
||||||
|
(
|
||||||
|
" //Unrolled loop\n"
|
||||||
|
" {\n"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Assemble body with manual loop unroll to try loweing GPR usage
|
||||||
|
for (u32 n = 0; n < kernel_size; ++n)
|
||||||
|
{
|
||||||
|
m_src += work_kernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
m_src += " }\n";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
||||||
|
m_src += " {\n";
|
||||||
|
m_src += work_kernel;
|
||||||
|
m_src += loop_advance;
|
||||||
|
m_src += " }\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
m_src += suffix;
|
||||||
|
}
|
||||||
|
|
||||||
|
void cs_shuffle_base::bind_resources()
|
||||||
|
{
|
||||||
|
m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cs_shuffle_base::set_parameters(VkCommandBuffer cmd, const u32* params, u8 count)
{
	// Uploads 'count' 32-bit words from 'params' as compute-stage push constants.
	// Only valid for tasks whose pipeline layout declares a push constant range.
	ensure(use_push_constants);

	const auto size_in_bytes = count * 4;
	vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, size_in_bytes, params);
}
|
||||||
|
|
||||||
|
void cs_shuffle_base::run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset)
{
	// Runs the shuffle kernel over 'data_length' bytes of 'data' starting at 'data_offset'.
	// The range is remembered so bind_resources() can bind it when the program is loaded.
	m_data = data;
	m_data_offset = data_offset;
	m_data_length = data_length;

	// Each workgroup handles optimal_group_size * kernel_size 32-bit words;
	// the job is rounded up to a whole number of workgroups.
	const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
	const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation);
	const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;

	if ((num_bytes_to_process + data_offset) > data->size())
	{
		// Technically robust buffer access should keep the driver from crashing in OOB situations
		// (the adjacent literals are concatenated, hence the explicit separating space)
		rsx_log.error("Inadequate buffer length submitted for a compute operation. "
			"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
	}

	compute_task::run(cmd, num_invocations);
}
|
||||||
|
|
||||||
|
cs_interleave_task::cs_interleave_task()
{
	// Parameters arrive through a single 16-byte push constant block (params[0])
	push_constants_size = 16;
	use_push_constants = true;

	// Locals injected at the top of main() via the "%vars" placeholder.
	// The three offsets are shifted right by 2 in the shader to turn the
	// pushed values into indices of the uint data[] array.
	variables =
		" uint block_length = params[0].x >> 2;\n"
		" uint z_offset = params[0].y >> 2;\n"
		" uint s_offset = params[0].z >> 2;\n"
		" uint depth;\n"
		" uint stencil;\n"
		" uint stencil_shift;\n"
		" uint stencil_offset;\n";
}
|
||||||
|
|
||||||
|
void cs_interleave_task::bind_resources()
|
||||||
|
{
|
||||||
|
m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cs_interleave_task::run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
{
	// Validate before any offset arithmetic: the subtractions below are unsigned
	// and would wrap if the stencil region did not come after the data region.
	// Checking first also keeps a bad range from being recorded into 'cmd'.
	ensure(stencil_offset > data_offset);

	// Values read by the shader from params[0]: block length, then the depth
	// and stencil offsets relative to the start of the bound range.
	u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
	set_parameters(cmd, parameters, 4);

	// Bound range ends (data_length / 4) bytes past the stencil offset
	m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
	cs_shuffle_base::run(cmd, data, data_length, data_offset);
}
|
||||||
|
|
||||||
|
cs_scatter_d24x8::cs_scatter_d24x8()
{
	// Kernel: for each source word, store (value >> 8) at z_offset and
	// OR the low byte into a packed word (four bytes per uint) at s_offset.
	// atomicOr is used because four invocations share one stencil word.
	work_kernel =
		" if (index >= block_length)\n"
		" return;\n"
		"\n"
		" value = data[index];\n"
		" data[index + z_offset] = (value >> 8);\n"
		" stencil_offset = (index / 4);\n"
		" stencil_shift = (index % 4) * 8;\n"
		" stencil = (value & 0xFF) << stencil_shift;\n"
		" atomicOr(data[stencil_offset + s_offset], stencil);\n";

	// The kernel contains no "%f" placeholder, so no function name is needed
	cs_shuffle_base::build("");
}
|
||||||
|
|
||||||
|
cs_aggregator::cs_aggregator()
{
	// Two storage buffers: source words (binding 0) and the result (binding 1).
	// Must be set before create(), which builds the descriptor layout.
	ssbo_count = 2;

	create();

	// Every invocation atomically adds one source word into 'result'
	m_src =
		"#version 450\n"
		"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
		"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
		"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"
		"void main()\n"
		"{\n"
		" if (gl_GlobalInvocationID.x < src.length())\n"
		" {\n"
		" atomicAdd(result, src[gl_GlobalInvocationID.x]);\n"
		" }\n"
		"}\n";

	// Workgroup size is only known after create() has picked optimal_group_size
	const std::pair<std::string, std::string> placeholders[] =
	{
		{ "%ws", std::to_string(optimal_group_size) },
	};

	m_src = fmt::replace_all(m_src, placeholders);
}
|
||||||
|
|
||||||
|
void cs_aggregator::bind_resources()
|
||||||
|
{
|
||||||
|
m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||||
|
m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cs_aggregator::run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
{
	// Remember the buffers and sizes; bind_resources() reads them when the program is loaded
	this->src = src;
	this->dst = dst;
	block_length = num_words * 4; // bytes
	word_count = num_words;

	// One invocation per source word, rounded up to whole workgroups
	const u32 workgroup_count = utils::aligned_div(word_count, optimal_group_size);
	compute_task::run(cmd, workgroup_count);
}
|
||||||
|
}
|
|
@ -1,18 +1,14 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
#include "VKPipelineCompiler.h"
|
||||||
#include "vkutils/descriptors.hpp"
|
#include "vkutils/descriptors.hpp"
|
||||||
#include "Utilities/StrUtil.h"
|
#include "vkutils/buffer_object.h"
|
||||||
|
|
||||||
#include "Emu/IdManager.h"
|
#include "Emu/IdManager.h"
|
||||||
|
|
||||||
#include "VKPipelineCompiler.h"
|
#include "Utilities/StrUtil.h"
|
||||||
#include "VKRenderPass.h"
|
|
||||||
#include "VKHelpers.h"
|
|
||||||
#include "vkutils/buffer_object.h"
|
|
||||||
#include "vkutils/device.h"
|
|
||||||
|
|
||||||
#include "util/asm.hpp"
|
#include "util/asm.hpp"
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame
|
#include <unordered_map>
|
||||||
|
|
||||||
namespace vk
|
namespace vk
|
||||||
{
|
{
|
||||||
|
@ -38,207 +34,22 @@ namespace vk
|
||||||
u32 optimal_kernel_size = 1;
|
u32 optimal_kernel_size = 1;
|
||||||
u32 max_invocations_x = 65535;
|
u32 max_invocations_x = 65535;
|
||||||
|
|
||||||
virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout()
|
virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout();
|
||||||
{
|
|
||||||
std::vector<std::pair<VkDescriptorType, u8>> result;
|
|
||||||
result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
void init_descriptors()
|
void init_descriptors();
|
||||||
{
|
|
||||||
std::vector<VkDescriptorPoolSize> descriptor_pool_sizes;
|
|
||||||
std::vector<VkDescriptorSetLayoutBinding> bindings;
|
|
||||||
|
|
||||||
const auto layout = get_descriptor_layout();
|
void create();
|
||||||
for (const auto &e : layout)
|
void destroy();
|
||||||
{
|
|
||||||
descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)});
|
|
||||||
|
|
||||||
for (unsigned n = 0; n < e.second; ++n)
|
void free_resources();
|
||||||
{
|
|
||||||
bindings.push_back
|
|
||||||
({
|
|
||||||
u32(bindings.size()),
|
|
||||||
e.first,
|
|
||||||
1,
|
|
||||||
VK_SHADER_STAGE_COMPUTE_BIT,
|
|
||||||
nullptr
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reserve descriptor pools
|
virtual void bind_resources() {}
|
||||||
m_descriptor_pool.create(*g_render_device, descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 3);
|
virtual void declare_inputs() {}
|
||||||
|
|
||||||
VkDescriptorSetLayoutCreateInfo infos = {};
|
void load_program(VkCommandBuffer cmd);
|
||||||
infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
|
|
||||||
infos.pBindings = bindings.data();
|
|
||||||
infos.bindingCount = ::size32(bindings);
|
|
||||||
|
|
||||||
CHECK_RESULT(vkCreateDescriptorSetLayout(*g_render_device, &infos, nullptr, &m_descriptor_layout));
|
void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z);
|
||||||
|
void run(VkCommandBuffer cmd, u32 num_invocations);
|
||||||
VkPipelineLayoutCreateInfo layout_info = {};
|
|
||||||
layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
|
|
||||||
layout_info.setLayoutCount = 1;
|
|
||||||
layout_info.pSetLayouts = &m_descriptor_layout;
|
|
||||||
|
|
||||||
VkPushConstantRange push_constants{};
|
|
||||||
if (use_push_constants)
|
|
||||||
{
|
|
||||||
push_constants.size = push_constants_size;
|
|
||||||
push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
|
|
||||||
|
|
||||||
layout_info.pushConstantRangeCount = 1;
|
|
||||||
layout_info.pPushConstantRanges = &push_constants;
|
|
||||||
}
|
|
||||||
|
|
||||||
CHECK_RESULT(vkCreatePipelineLayout(*g_render_device, &layout_info, nullptr, &m_pipeline_layout));
|
|
||||||
}
|
|
||||||
|
|
||||||
void create()
|
|
||||||
{
|
|
||||||
if (!initialized)
|
|
||||||
{
|
|
||||||
init_descriptors();
|
|
||||||
|
|
||||||
switch (vk::get_driver_vendor())
|
|
||||||
{
|
|
||||||
case vk::driver_vendor::unknown:
|
|
||||||
case vk::driver_vendor::INTEL:
|
|
||||||
// Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256
|
|
||||||
// Based on intel's own OpenCL recommended settings
|
|
||||||
unroll_loops = true;
|
|
||||||
optimal_kernel_size = 1;
|
|
||||||
optimal_group_size = 128;
|
|
||||||
break;
|
|
||||||
case vk::driver_vendor::NVIDIA:
|
|
||||||
// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
|
|
||||||
unroll_loops = true;
|
|
||||||
optimal_group_size = 32;
|
|
||||||
optimal_kernel_size = 1;
|
|
||||||
break;
|
|
||||||
case vk::driver_vendor::AMD:
|
|
||||||
case vk::driver_vendor::RADV:
|
|
||||||
// Wavefronts are multiples of 64
|
|
||||||
unroll_loops = false;
|
|
||||||
optimal_kernel_size = 1;
|
|
||||||
optimal_group_size = 64;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const auto& gpu = vk::g_render_device->gpu();
|
|
||||||
max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
|
|
||||||
|
|
||||||
initialized = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void destroy()
|
|
||||||
{
|
|
||||||
if (initialized)
|
|
||||||
{
|
|
||||||
m_shader.destroy();
|
|
||||||
m_program.reset();
|
|
||||||
m_param_buffer.reset();
|
|
||||||
|
|
||||||
vkDestroyDescriptorSetLayout(*g_render_device, m_descriptor_layout, nullptr);
|
|
||||||
vkDestroyPipelineLayout(*g_render_device, m_pipeline_layout, nullptr);
|
|
||||||
m_descriptor_pool.destroy();
|
|
||||||
|
|
||||||
initialized = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void free_resources()
|
|
||||||
{
|
|
||||||
if (m_used_descriptors == 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
m_descriptor_pool.reset(0);
|
|
||||||
m_used_descriptors = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void bind_resources()
|
|
||||||
{}
|
|
||||||
|
|
||||||
virtual void declare_inputs()
|
|
||||||
{}
|
|
||||||
|
|
||||||
void load_program(VkCommandBuffer cmd)
|
|
||||||
{
|
|
||||||
if (!m_program)
|
|
||||||
{
|
|
||||||
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
|
|
||||||
auto handle = m_shader.compile();
|
|
||||||
|
|
||||||
VkPipelineShaderStageCreateInfo shader_stage{};
|
|
||||||
shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
|
|
||||||
shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
|
|
||||||
shader_stage.module = handle;
|
|
||||||
shader_stage.pName = "main";
|
|
||||||
|
|
||||||
VkComputePipelineCreateInfo info{};
|
|
||||||
info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
|
|
||||||
info.stage = shader_stage;
|
|
||||||
info.layout = m_pipeline_layout;
|
|
||||||
info.basePipelineIndex = -1;
|
|
||||||
info.basePipelineHandle = VK_NULL_HANDLE;
|
|
||||||
|
|
||||||
auto compiler = vk::get_pipe_compiler();
|
|
||||||
m_program = compiler->compile(info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE);
|
|
||||||
declare_inputs();
|
|
||||||
}
|
|
||||||
|
|
||||||
ensure(m_used_descriptors < VK_MAX_COMPUTE_TASKS);
|
|
||||||
|
|
||||||
VkDescriptorSetAllocateInfo alloc_info = {};
|
|
||||||
alloc_info.descriptorPool = m_descriptor_pool;
|
|
||||||
alloc_info.descriptorSetCount = 1;
|
|
||||||
alloc_info.pSetLayouts = &m_descriptor_layout;
|
|
||||||
alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
|
|
||||||
|
|
||||||
CHECK_RESULT(vkAllocateDescriptorSets(*g_render_device, &alloc_info, &m_descriptor_set));
|
|
||||||
m_used_descriptors++;
|
|
||||||
|
|
||||||
bind_resources();
|
|
||||||
|
|
||||||
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline);
|
|
||||||
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z)
|
|
||||||
{
|
|
||||||
// CmdDispatch is outside renderpass scope only
|
|
||||||
if (vk::is_renderpass_open(cmd))
|
|
||||||
{
|
|
||||||
vk::end_renderpass(cmd);
|
|
||||||
}
|
|
||||||
|
|
||||||
load_program(cmd);
|
|
||||||
vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(VkCommandBuffer cmd, u32 num_invocations)
|
|
||||||
{
|
|
||||||
u32 invocations_x, invocations_y;
|
|
||||||
if (num_invocations > max_invocations_x)
|
|
||||||
{
|
|
||||||
// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
|
|
||||||
// Split the 1D job into 2 dimensions to accomodate this
|
|
||||||
invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));
|
|
||||||
invocations_y = invocations_x;
|
|
||||||
|
|
||||||
if (num_invocations % invocations_x) invocations_y++;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
invocations_x = num_invocations;
|
|
||||||
invocations_y = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
run(cmd, invocations_x, invocations_y, 1);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct cs_shuffle_base : compute_task
|
struct cs_shuffle_base : compute_task
|
||||||
|
@ -251,136 +62,15 @@ namespace vk
|
||||||
std::string variables, work_kernel, loop_advance, suffix;
|
std::string variables, work_kernel, loop_advance, suffix;
|
||||||
std::string method_declarations;
|
std::string method_declarations;
|
||||||
|
|
||||||
cs_shuffle_base()
|
cs_shuffle_base();
|
||||||
{
|
|
||||||
work_kernel =
|
|
||||||
" value = data[index];\n"
|
|
||||||
" data[index] = %f(value);\n";
|
|
||||||
|
|
||||||
loop_advance =
|
void build(const char* function_name, u32 _kernel_size = 0);
|
||||||
" index++;\n";
|
|
||||||
|
|
||||||
suffix =
|
void bind_resources() override;
|
||||||
"}\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
void build(const char* function_name, u32 _kernel_size = 0)
|
void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count);
|
||||||
{
|
|
||||||
// Initialize to allow detecting optimal settings
|
|
||||||
create();
|
|
||||||
|
|
||||||
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0);
|
||||||
|
|
||||||
m_src =
|
|
||||||
"#version 430\n"
|
|
||||||
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
|
|
||||||
"layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n"
|
|
||||||
"%ub"
|
|
||||||
"\n"
|
|
||||||
"#define KERNEL_SIZE %ks\n"
|
|
||||||
"\n"
|
|
||||||
"// Generic swap routines\n"
|
|
||||||
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
|
||||||
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
|
||||||
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
|
|
||||||
"\n"
|
|
||||||
"// Depth format conversions\n"
|
|
||||||
"#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n"
|
|
||||||
"#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n"
|
|
||||||
"#define d24f_to_f32(bits) (bits << 7)\n"
|
|
||||||
"#define f32_to_d24f(bits) (bits >> 7)\n"
|
|
||||||
"#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n"
|
|
||||||
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
|
|
||||||
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
|
|
||||||
"\n"
|
|
||||||
"%md"
|
|
||||||
"void main()\n"
|
|
||||||
"{\n"
|
|
||||||
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
|
|
||||||
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
|
|
||||||
" uint index = invocation_id * KERNEL_SIZE;\n"
|
|
||||||
" uint value;\n"
|
|
||||||
"%vars"
|
|
||||||
"\n";
|
|
||||||
|
|
||||||
const auto parameters_size = utils::align(push_constants_size, 16) / 16;
|
|
||||||
const std::pair<std::string, std::string> syntax_replace[] =
|
|
||||||
{
|
|
||||||
{ "%ws", std::to_string(optimal_group_size) },
|
|
||||||
{ "%ks", std::to_string(kernel_size) },
|
|
||||||
{ "%vars", variables },
|
|
||||||
{ "%f", function_name },
|
|
||||||
{ "%md", method_declarations },
|
|
||||||
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
|
|
||||||
};
|
|
||||||
|
|
||||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
|
||||||
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
|
||||||
|
|
||||||
if (kernel_size <= 1)
|
|
||||||
{
|
|
||||||
m_src += " {\n" + work_kernel + " }\n";
|
|
||||||
}
|
|
||||||
else if (unroll_loops)
|
|
||||||
{
|
|
||||||
work_kernel += loop_advance + "\n";
|
|
||||||
|
|
||||||
m_src += std::string
|
|
||||||
(
|
|
||||||
" //Unrolled loop\n"
|
|
||||||
" {\n"
|
|
||||||
);
|
|
||||||
|
|
||||||
// Assemble body with manual loop unroll to try loweing GPR usage
|
|
||||||
for (u32 n = 0; n < kernel_size; ++n)
|
|
||||||
{
|
|
||||||
m_src += work_kernel;
|
|
||||||
}
|
|
||||||
|
|
||||||
m_src += " }\n";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
|
||||||
m_src += " {\n";
|
|
||||||
m_src += work_kernel;
|
|
||||||
m_src += loop_advance;
|
|
||||||
m_src += " }\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
m_src += suffix;
|
|
||||||
}
|
|
||||||
|
|
||||||
void bind_resources() override
|
|
||||||
{
|
|
||||||
m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count)
|
|
||||||
{
|
|
||||||
ensure(use_push_constants);
|
|
||||||
vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0)
|
|
||||||
{
|
|
||||||
m_data = data;
|
|
||||||
m_data_offset = data_offset;
|
|
||||||
m_data_length = data_length;
|
|
||||||
|
|
||||||
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
|
|
||||||
const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation);
|
|
||||||
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
|
|
||||||
|
|
||||||
if ((num_bytes_to_process + data_offset) > data->size())
|
|
||||||
{
|
|
||||||
// Technically robust buffer access should keep the driver from crashing in OOB situations
|
|
||||||
rsx_log.error("Inadequate buffer length submitted for a compute operation."
|
|
||||||
"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
|
|
||||||
}
|
|
||||||
|
|
||||||
compute_task::run(cmd, num_invocations);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct cs_shuffle_16 : cs_shuffle_base
|
struct cs_shuffle_16 : cs_shuffle_base
|
||||||
|
@ -442,35 +132,11 @@ namespace vk
|
||||||
{
|
{
|
||||||
u32 m_ssbo_length = 0;
|
u32 m_ssbo_length = 0;
|
||||||
|
|
||||||
cs_interleave_task()
|
cs_interleave_task();
|
||||||
{
|
|
||||||
use_push_constants = true;
|
|
||||||
push_constants_size = 16;
|
|
||||||
|
|
||||||
variables =
|
void bind_resources() override;
|
||||||
" uint block_length = params[0].x >> 2;\n"
|
|
||||||
" uint z_offset = params[0].y >> 2;\n"
|
|
||||||
" uint s_offset = params[0].z >> 2;\n"
|
|
||||||
" uint depth;\n"
|
|
||||||
" uint stencil;\n"
|
|
||||||
" uint stencil_shift;\n"
|
|
||||||
" uint stencil_offset;\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
void bind_resources() override
|
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset);
|
||||||
{
|
|
||||||
m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
|
|
||||||
{
|
|
||||||
u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
|
|
||||||
set_parameters(cmd, parameters, 4);
|
|
||||||
|
|
||||||
ensure(stencil_offset > data_offset);
|
|
||||||
m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
|
|
||||||
cs_shuffle_base::run(cmd, data, data_length, data_offset);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<bool _SwapBytes = false>
|
template<bool _SwapBytes = false>
|
||||||
|
@ -549,21 +215,7 @@ namespace vk
|
||||||
|
|
||||||
struct cs_scatter_d24x8 : cs_interleave_task
|
struct cs_scatter_d24x8 : cs_interleave_task
|
||||||
{
|
{
|
||||||
cs_scatter_d24x8()
|
cs_scatter_d24x8();
|
||||||
{
|
|
||||||
work_kernel =
|
|
||||||
" if (index >= block_length)\n"
|
|
||||||
" return;\n"
|
|
||||||
"\n"
|
|
||||||
" value = data[index];\n"
|
|
||||||
" data[index + z_offset] = (value >> 8);\n"
|
|
||||||
" stencil_offset = (index / 4);\n"
|
|
||||||
" stencil_shift = (index % 4) * 8;\n"
|
|
||||||
" stencil = (value & 0xFF) << stencil_shift;\n"
|
|
||||||
" atomicOr(data[stencil_offset + s_offset], stencil);\n";
|
|
||||||
|
|
||||||
cs_shuffle_base::build("");
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<bool _DepthFloat = false>
|
template<bool _DepthFloat = false>
|
||||||
|
@ -962,51 +614,11 @@ namespace vk
|
||||||
u32 block_length = 0;
|
u32 block_length = 0;
|
||||||
u32 word_count = 0;
|
u32 word_count = 0;
|
||||||
|
|
||||||
cs_aggregator()
|
cs_aggregator();
|
||||||
{
|
|
||||||
ssbo_count = 2;
|
|
||||||
|
|
||||||
create();
|
void bind_resources() override;
|
||||||
|
|
||||||
m_src =
|
void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words);
|
||||||
"#version 450\n"
|
|
||||||
"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
|
|
||||||
|
|
||||||
"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
|
|
||||||
"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"
|
|
||||||
|
|
||||||
"void main()\n"
|
|
||||||
"{\n"
|
|
||||||
" if (gl_GlobalInvocationID.x < src.length())\n"
|
|
||||||
" {\n"
|
|
||||||
" atomicAdd(result, src[gl_GlobalInvocationID.x]);\n"
|
|
||||||
" }\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
const std::pair<std::string, std::string> syntax_replace[] =
|
|
||||||
{
|
|
||||||
{ "%ws", std::to_string(optimal_group_size) },
|
|
||||||
};
|
|
||||||
|
|
||||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
|
||||||
}
|
|
||||||
|
|
||||||
void bind_resources() override
|
|
||||||
{
|
|
||||||
m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
|
||||||
m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
|
||||||
}
|
|
||||||
|
|
||||||
void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
|
|
||||||
{
|
|
||||||
this->dst = dst;
|
|
||||||
this->src = src;
|
|
||||||
word_count = num_words;
|
|
||||||
block_length = num_words * 4;
|
|
||||||
|
|
||||||
const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size);
|
|
||||||
compute_task::run(cmd, linear_invocations);
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: Replace with a proper manager
|
// TODO: Replace with a proper manager
|
||||||
|
|
|
@ -10,7 +10,7 @@ namespace vk
|
||||||
{
|
{
|
||||||
std::unordered_map<u64, std::vector<std::unique_ptr<vk::framebuffer_holder>>> g_framebuffers_cache;
|
std::unordered_map<u64, std::vector<std::unique_ptr<vk::framebuffer_holder>>> g_framebuffers_cache;
|
||||||
|
|
||||||
vk::framebuffer_holder *get_framebuffer(VkDevice dev, u16 width, u16 height, VkRenderPass renderpass, const std::vector<vk::image*>& image_list)
|
vk::framebuffer_holder* get_framebuffer(VkDevice dev, u16 width, u16 height, VkRenderPass renderpass, const std::vector<vk::image*>& image_list)
|
||||||
{
|
{
|
||||||
u64 key = u64(width) | (u64(height) << 16);
|
u64 key = u64(width) | (u64(height) << 16);
|
||||||
auto &queue = g_framebuffers_cache[key];
|
auto &queue = g_framebuffers_cache[key];
|
||||||
|
|
1049
rpcs3/Emu/RSX/VK/VKOverlays.cpp
Normal file
1049
rpcs3/Emu/RSX/VK/VKOverlays.cpp
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,6 +1,7 @@
|
||||||
#include "stdafx.h"
|
#include "stdafx.h"
|
||||||
#include "VKGSRender.h"
|
#include "VKGSRender.h"
|
||||||
#include "vkutils/buffer_object.h"
|
#include "vkutils/buffer_object.h"
|
||||||
|
#include "Emu/RSX/Overlays/overlays.h"
|
||||||
#include "Emu/Cell/Modules/cellVideoOut.h"
|
#include "Emu/Cell/Modules/cellVideoOut.h"
|
||||||
|
|
||||||
#include "util/asm.hpp"
|
#include "util/asm.hpp"
|
||||||
|
|
|
@ -3,6 +3,8 @@
|
||||||
#include "VKCompute.h"
|
#include "VKCompute.h"
|
||||||
#include "VKOverlays.h"
|
#include "VKOverlays.h"
|
||||||
|
|
||||||
|
#include "vkutils/image.h"
|
||||||
|
|
||||||
namespace vk
|
namespace vk
|
||||||
{
|
{
|
||||||
struct cs_resolve_base : compute_task
|
struct cs_resolve_base : compute_task
|
||||||
|
|
|
@ -87,9 +87,11 @@
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="Emu\RSX\GL\GLCommonDecompiler.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLCommonDecompiler.cpp" />
|
||||||
|
<ClCompile Include="Emu\RSX\GL\GLCompute.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\GL\GLDraw.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLDraw.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\GL\GLFragmentProgram.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLFragmentProgram.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\GL\GLGSRender.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLGSRender.cpp" />
|
||||||
|
<ClCompile Include="Emu\RSX\GL\GLOverlays.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\GL\GLVertexProgram.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLVertexProgram.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\GL\GLHelpers.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLHelpers.cpp" />
|
||||||
|
|
|
@ -15,6 +15,8 @@
|
||||||
<ClCompile Include="Emu\RSX\GL\GLVertexBuffers.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLVertexBuffers.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLPipelineCompiler.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\GL\GLTextureCache.cpp" />
|
<ClCompile Include="Emu\RSX\GL\GLTextureCache.cpp" />
|
||||||
|
<ClCompile Include="Emu\RSX\GL\GLOverlays.cpp" />
|
||||||
|
<ClCompile Include="Emu\RSX\GL\GLCompute.cpp" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClInclude Include="Emu\RSX\GL\GLTexture.h" />
|
<ClInclude Include="Emu\RSX\GL\GLTexture.h" />
|
||||||
|
|
|
@ -66,6 +66,7 @@
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="Emu\RSX\VK\VKCommandStream.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKCommandStream.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\VK\VKCommonDecompiler.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKCommonDecompiler.cpp" />
|
||||||
|
<ClCompile Include="Emu\RSX\VK\VKCompute.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\VK\VKDMA.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKDMA.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\VK\VKDraw.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKDraw.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\VK\VKFormats.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKFormats.cpp" />
|
||||||
|
@ -73,6 +74,7 @@
|
||||||
<ClCompile Include="Emu\RSX\VK\VKFramebuffer.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKFramebuffer.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\VK\VKGSRender.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKGSRender.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\VK\VKHelpers.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKHelpers.cpp" />
|
||||||
|
<ClCompile Include="Emu\RSX\VK\VKOverlays.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\VK\VKPipelineCompiler.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKPipelineCompiler.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\VK\VKPresent.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKPresent.cpp" />
|
||||||
<ClCompile Include="Emu\RSX\VK\VKProgramPipeline.cpp" />
|
<ClCompile Include="Emu\RSX\VK\VKProgramPipeline.cpp" />
|
||||||
|
|
|
@ -62,6 +62,8 @@
|
||||||
<ClCompile Include="Emu\RSX\VK\vkutils\image_helpers.cpp">
|
<ClCompile Include="Emu\RSX\VK\vkutils\image_helpers.cpp">
|
||||||
<Filter>vkutils</Filter>
|
<Filter>vkutils</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
|
<ClCompile Include="Emu\RSX\VK\VKOverlays.cpp" />
|
||||||
|
<ClCompile Include="Emu\RSX\VK\VKCompute.cpp" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClInclude Include="Emu\RSX\VK\VKCommonDecompiler.h" />
|
<ClInclude Include="Emu\RSX\VK\VKCommonDecompiler.h" />
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue