rsx: Fixups

- Also fix visual corruption when using disjoint indexed draws

- Refactor draw call emit again (vk)

- Improve execution barrier resolve
  - Allow vertex/index rebase inside begin/end pair
  - Add ALPHA_TEST to list of excluded methods [TODO: defer raster state]

- gl bringup

- Simplify
  - Using simple_array instead of std::vector gets back a few more fps :)
This commit is contained in:
kd-11 2018-10-01 23:05:51 +03:00 committed by kd-11
parent e01d2f08c9
commit 677b16f5c6
19 changed files with 2242 additions and 565 deletions

View file

@ -175,8 +175,10 @@ namespace rsx
const u32 vertSize = get_vertex_type_size_on_host(info.type(), info.size());
const u32 vertStride = info.stride();
for (const auto& range : method_registers.current_draw_clause.draw_command_ranges)
method_registers.current_draw_clause.begin();
do
{
const auto& range = method_registers.current_draw_clause.get_range();
const u32 vertCount = range.count;
const size_t bufferSize = vertCount * vertStride + vertSize;
@ -189,6 +191,7 @@ namespace rsx
std::memcpy(block_data.data.data(), vm::base(addr + block.offset), bufferSize);
insert_mem_block_in_map(mem_changes, std::move(block), std::move(block_data));
}
while (method_registers.current_draw_clause.next());
}
}
// save index buffer if used
@ -211,8 +214,10 @@ namespace rsx
const bool is_primitive_restart_enabled = method_registers.restart_index_enabled();
const u32 primitive_restart_index = method_registers.restart_index();
for (const auto& range : method_registers.current_draw_clause.draw_command_ranges)
method_registers.current_draw_clause.begin();
do
{
const auto& range = method_registers.current_draw_clause.get_range();
const u32 idxFirst = range.first;
const u32 idxCount = range.count;
const u32 idxAddr = base_addr + (idxFirst * type_size);
@ -261,6 +266,7 @@ namespace rsx
}
}
}
while (method_registers.current_draw_clause.next());
if (min_index > max_index)
{

View file

@ -435,14 +435,11 @@ namespace
}
}
void write_vertex_array_data_to_buffer(gsl::span<gsl::byte> raw_dst_span, gsl::span<const gsl::byte> src_ptr, const std::vector<rsx::draw_range_t>& first_count_commands, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness)
void write_vertex_array_data_to_buffer(gsl::span<gsl::byte> raw_dst_span, gsl::span<const gsl::byte> src_ptr, u32 count, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness)
{
verify(HERE), (vector_element_count > 0);
const u32 src_read_stride = rsx::get_vertex_type_size_on_host(type, vector_element_count);
// HACK! This is a legacy routine only used by D3D12
const u32 count = first_count_commands.front().count;
bool use_stream_no_stride = false;
bool use_stream_with_stride = false;
@ -799,7 +796,7 @@ namespace
template<typename T>
std::tuple<u32, u32, u32> write_index_array_data_to_buffer_impl(gsl::span<u32> dst,
gsl::span<const be_t<T>> src,
rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index, const rsx::draw_range_t &range,
rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index,
u32 base_index, std::function<bool(rsx::primitive_type)> expands)
{
if (!expands(draw_mode)) return upload_untouched<T>(src, dst, restart_index_enabled, restart_index, base_index);
@ -809,7 +806,8 @@ namespace
case rsx::primitive_type::line_loop:
{
const auto &returnvalue = upload_untouched<T>(src, dst, restart_index_enabled, restart_index, base_index);
dst[range.count] = src[0];
const auto index_count = dst.size_bytes() / sizeof(T);
dst[index_count] = src[0];
return returnvalue;
}
case rsx::primitive_type::polygon:
@ -826,51 +824,23 @@ namespace
std::tuple<u32, u32, u32> write_index_array_data_to_buffer(gsl::span<gsl::byte> dst_ptr,
gsl::span<const gsl::byte> src_ptr,
rsx::index_array_type type, rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index,
const std::vector<rsx::draw_range_t> &first_count_arguments,
u32 base_index, std::function<bool(rsx::primitive_type)> expands)
{
u32 read = 0;
u32 written = 0;
u32 min_index = -1u;
u32 max_index = 0;
const u32 type_size = get_index_type_size(type);
for (const auto &range : first_count_arguments)
{
auto src = src_ptr.subspan(range.command_data_offset, range.count * type_size);
auto dst = dst_ptr.subspan(written * type_size);
switch (type)
{
case rsx::index_array_type::u16:
{
auto ret = write_index_array_data_to_buffer_impl<u16>(as_span_workaround<u32>(dst),
as_const_span<const be_t<u16>>(src), draw_mode, restart_index_enabled, restart_index, range, base_index, expands);
min_index = std::min<u32>(std::get<0>(ret), min_index);
max_index = std::min<u32>(std::get<1>(ret), max_index);
written += std::get<2>(ret);
break;
return write_index_array_data_to_buffer_impl<u16>(as_span_workaround<u32>(dst_ptr),
as_const_span<const be_t<u16>>(src_ptr), draw_mode, restart_index_enabled, restart_index, base_index, expands);
}
case rsx::index_array_type::u32:
{
auto ret = write_index_array_data_to_buffer_impl<u32>(as_span_workaround<u32>(dst),
as_const_span<const be_t<u32>>(src), draw_mode, restart_index_enabled, restart_index, range, base_index, expands);
min_index = std::min<u32>(std::get<0>(ret), min_index);
max_index = std::min<u32>(std::get<1>(ret), max_index);
written += std::get<2>(ret);
break;
return write_index_array_data_to_buffer_impl<u32>(as_span_workaround<u32>(dst_ptr),
as_const_span<const be_t<u32>>(src_ptr), draw_mode, restart_index_enabled, restart_index, base_index, expands);
}
default:
fmt::throw_exception("Unreachable" HERE);
}
read += range.count;
}
return std::make_tuple(min_index, max_index, written);
}
void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w)

View file

@ -10,7 +10,7 @@
* Write count vertex attributes from src_ptr.
* src_ptr array layout is deduced from the type, vector element count and src_stride arguments.
*/
void write_vertex_array_data_to_buffer(gsl::span<gsl::byte> raw_dst_span, gsl::span<const gsl::byte> src_ptr, const std::vector<rsx::draw_range_t>& first_count_commands, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness);
void write_vertex_array_data_to_buffer(gsl::span<gsl::byte> raw_dst_span, gsl::span<const gsl::byte> src_ptr, u32 count, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness);
/*
* If primitive mode is not supported and need to be emulated (using an index buffer) returns false.
@ -33,7 +33,7 @@ u32 get_index_type_size(rsx::index_array_type type);
* The function expands index buffer for non native primitive type if expands(draw_mode) return true.
*/
std::tuple<u32, u32, u32> write_index_array_data_to_buffer(gsl::span<gsl::byte> dst, gsl::span<const gsl::byte> src,
rsx::index_array_type, rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index, const std::vector<rsx::draw_range_t> &first_count_arguments,
rsx::index_array_type, rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index,
u32 base_index, std::function<bool(rsx::primitive_type)> expands);
/**

View file

@ -158,7 +158,7 @@ namespace
m_buffer_data.map<void>(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size));
gsl::span<gsl::byte> mapped_buffer_span = {
(gsl::byte*)mapped_buffer, gsl::narrow_cast<int>(buffer_size)};
write_vertex_array_data_to_buffer(mapped_buffer_span, vertex_array.data, rsx::method_registers.current_draw_clause.draw_command_ranges,
write_vertex_array_data_to_buffer(mapped_buffer_span, vertex_array.data, vertex_count,
vertex_array.type, vertex_array.attribute_size, vertex_array.stride, element_size, vertex_array.is_be);
m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size));
@ -211,12 +211,9 @@ namespace
};
std::tuple<D3D12_INDEX_BUFFER_VIEW, size_t> generate_index_buffer_for_emulated_primitives_array(
const std::vector<rsx::draw_range_t> & vertex_ranges, d3d12_data_heap& m_buffer_data)
u32 vertex_count, d3d12_data_heap& m_buffer_data)
{
size_t index_count = std::accumulate(
vertex_ranges.begin(), vertex_ranges.end(), 0ll, [](size_t acc, const auto& pair) {
return acc + get_index_count(rsx::method_registers.current_draw_clause.primitive, pair.count);
});
size_t index_count = get_index_count(rsx::method_registers.current_draw_clause.primitive, vertex_count);
// Alloc
size_t buffer_size = align(index_count * sizeof(u16), 64);
@ -226,10 +223,6 @@ namespace
void* mapped_buffer =
m_buffer_data.map<void>(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size));
u32 vertex_count = 0;
for (const auto& pair : vertex_ranges)
vertex_count += pair.count;
write_index_array_for_non_indexed_non_native_primitive_to_buffer((char *)mapped_buffer, rsx::method_registers.current_draw_clause.primitive, vertex_count);
m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size));
@ -249,9 +242,8 @@ namespace
* range, and whose second element is the number of vertex in this range.
*/
std::vector<D3D12_SHADER_RESOURCE_VIEW_DESC> upload_vertex_attributes(
std::vector<rsx::draw_range_t> vertex_ranges,
std::function<attribute_storage(std::vector<rsx::draw_range_t>)>
get_vertex_buffers,
u32 vertex_count,
std::function<attribute_storage()> get_vertex_buffers,
ID3D12Resource* m_vertex_buffer_data, d3d12_data_heap& m_buffer_data,
ID3D12GraphicsCommandList* command_list)
{
@ -259,13 +251,9 @@ namespace
&CD3DX12_RESOURCE_BARRIER::Transition(m_vertex_buffer_data,
D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER, D3D12_RESOURCE_STATE_COPY_DEST));
u32 vertex_count = 0;
for (const auto &range : vertex_ranges)
vertex_count += range.count;
vertex_buffer_visitor visitor(
vertex_count, command_list, m_vertex_buffer_data, m_buffer_data);
const auto& vertex_buffers = get_vertex_buffers(vertex_ranges);
const auto& vertex_buffers = get_vertex_buffers();
for (const auto& vbo : vertex_buffers) std::visit(visitor, vbo);
@ -348,7 +336,7 @@ namespace
{
draw_command_visitor(ID3D12GraphicsCommandList* cmd_list, d3d12_data_heap& buffer_data,
ID3D12Resource* vertex_buffer_data,
std::function<attribute_storage(const std::vector<rsx::draw_range_t>&)> get_vertex_info_lambda)
std::function<attribute_storage()> get_vertex_info_lambda)
: command_list(cmd_list), m_buffer_data(buffer_data),
m_vertex_buffer_data(vertex_buffer_data), get_vertex_buffers(get_vertex_info_lambda)
{
@ -357,10 +345,10 @@ namespace
std::tuple<bool, size_t, std::vector<D3D12_SHADER_RESOURCE_VIEW_DESC>> operator()(
const rsx::draw_array_command& command)
{
const auto vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
if (is_primitive_native(rsx::method_registers.current_draw_clause.primitive)) {
size_t vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
return std::make_tuple(false, vertex_count,
upload_vertex_attributes(rsx::method_registers.current_draw_clause.draw_command_ranges,
upload_vertex_attributes(vertex_count,
get_vertex_buffers,
m_vertex_buffer_data, m_buffer_data, command_list));
}
@ -369,10 +357,10 @@ namespace
size_t index_count;
std::tie(index_buffer_view, index_count) =
generate_index_buffer_for_emulated_primitives_array(
rsx::method_registers.current_draw_clause.draw_command_ranges, m_buffer_data);
vertex_count, m_buffer_data);
command_list->IASetIndexBuffer(&index_buffer_view);
return std::make_tuple(true, index_count,
upload_vertex_attributes(rsx::method_registers.current_draw_clause.draw_command_ranges,
upload_vertex_attributes(vertex_count,
get_vertex_buffers,
m_vertex_buffer_data, m_buffer_data, command_list));
}
@ -406,7 +394,7 @@ namespace
write_index_array_data_to_buffer(dst, command.raw_index_buffer, indexed_type,
rsx::method_registers.current_draw_clause.primitive,
rsx::method_registers.restart_index_enabled(),
rsx::method_registers.restart_index(), rsx::method_registers.current_draw_clause.draw_command_ranges,
rsx::method_registers.restart_index(),
rsx::method_registers.vertex_data_base_index(), [](auto prim) { return !is_primitive_native(prim); });
m_buffer_data.unmap(CD3DX12_RANGE(heap_offset, heap_offset + buffer_size));
@ -417,7 +405,7 @@ namespace
command_list->IASetIndexBuffer(&index_buffer_view);
return std::make_tuple(true, index_count,
upload_vertex_attributes({ {0, max_index + 1} }, get_vertex_buffers,
upload_vertex_attributes(max_index + 1, get_vertex_buffers,
m_vertex_buffer_data, m_buffer_data, command_list));
}
@ -439,7 +427,7 @@ namespace
size_t index_count;
std::tie(index_buffer_view, index_count) =
generate_index_buffer_for_emulated_primitives_array(
{{0, (u32)vertex_count}}, m_buffer_data);
vertex_count, m_buffer_data);
command_list->IASetIndexBuffer(&index_buffer_view);
return std::make_tuple(true, index_count, vertex_buffer_view);
}
@ -447,7 +435,7 @@ namespace
private:
ID3D12GraphicsCommandList* command_list;
d3d12_data_heap& m_buffer_data;
std::function<attribute_storage(const std::vector<rsx::draw_range_t>&)> get_vertex_buffers;
std::function<attribute_storage()> get_vertex_buffers;
ID3D12Resource* m_vertex_buffer_data;
};
} // End anonymous namespace
@ -457,7 +445,7 @@ D3D12GSRender::upload_and_set_vertex_index_data(ID3D12GraphicsCommandList* comma
{
return std::visit(
draw_command_visitor(command_list, m_buffer_data, m_vertex_buffer_data.Get(),
[this](const auto& list) { return get_vertex_buffers(rsx::method_registers, list, 0); }),
[this]() { return get_vertex_buffers(rsx::method_registers, 0); }),
get_draw_command(rsx::method_registers));
}

View file

@ -195,17 +195,6 @@ void GLGSRender::end()
std::chrono::time_point<steady_clock> state_check_end = steady_clock::now();
m_begin_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(state_check_end - state_check_start).count();
if (manually_flush_ring_buffers)
{
//Use approximations to reserve space. This path is mostly for debug purposes anyway
u32 approx_vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
u32 approx_working_buffer_size = approx_vertex_count * 256;
//Allocate 256K heap if we have no approximation at this time (inlined array)
m_attrib_ring_buffer->reserve_storage_on_heap(std::max(approx_working_buffer_size, 256 * 1024U));
m_index_ring_buffer->reserve_storage_on_heap(16 * 1024);
}
const auto do_heap_cleanup = [this]()
{
if (manually_flush_ring_buffers)
@ -220,17 +209,6 @@ void GLGSRender::end()
}
};
//Do vertex upload before RTT prep / texture lookups to give the driver time to push data
auto upload_info = set_vertex_buffer();
if (upload_info.vertex_draw_count == 0)
{
// Malformed vertex setup; abort
do_heap_cleanup();
rsx::thread::end();
return;
}
//Check if depth buffer is bound and valid
//If ds is not initialized clear it; it seems new depth textures should have depth cleared
auto copy_rtt_contents = [this](gl::render_target *surface, bool is_depth)
@ -407,15 +385,11 @@ void GLGSRender::end()
if (!load_program())
{
// Program is not ready, skip drawing this
do_heap_cleanup();
std::this_thread::yield();
rsx::thread::end();
return;
}
// Load program here since it is dependent on vertex state
load_program_env(upload_info);
std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
m_begin_time += (u32)std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
@ -490,102 +464,104 @@ void GLGSRender::end()
std::chrono::time_point<steady_clock> draw_start = steady_clock::now();
do_heap_cleanup();
if (g_cfg.video.debug_output)
{
m_program->validate();
}
const GLenum draw_mode = gl::draw_mode(rsx::method_registers.current_draw_clause.primitive);
const bool allow_multidraw = supports_multidraw && !g_cfg.video.disable_FIFO_reordering;
const bool single_draw = (!allow_multidraw ||
rsx::method_registers.current_draw_clause.draw_command_ranges.size() <= 1 ||
rsx::method_registers.current_draw_clause.is_disjoint_primitive);
if (upload_info.index_info)
rsx::method_registers.current_draw_clause.begin();
int subdraw = 0;
do
{
const GLenum index_type = std::get<0>(*upload_info.index_info);
const u32 index_offset = std::get<1>(*upload_info.index_info);
const bool restarts_valid = gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive) && !rsx::method_registers.current_draw_clause.is_disjoint_primitive;
if (gl_state.enable(restarts_valid && rsx::method_registers.restart_index_enabled(), GL_PRIMITIVE_RESTART))
if (!subdraw)
{
glPrimitiveRestartIndex((index_type == GL_UNSIGNED_SHORT)? 0xffff: 0xffffffff);
}
m_index_ring_buffer->bind();
if (single_draw)
m_vertex_layout = analyse_inputs_interleaved();
if (!m_vertex_layout.validate())
{
glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, (GLvoid *)(uintptr_t)index_offset);
}
else
{
const auto draw_count = rsx::method_registers.current_draw_clause.draw_command_ranges.size();
const u32 type_scale = (index_type == GL_UNSIGNED_SHORT) ? 1 : 2;
uintptr_t index_ptr = index_offset;
m_scratch_buffer.resize(draw_count * 16);
GLsizei *counts = (GLsizei*)m_scratch_buffer.data();
const GLvoid** offsets = (const GLvoid**)(counts + draw_count);
int dst_index = 0;
for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges)
{
const auto index_size = get_index_count(rsx::method_registers.current_draw_clause.primitive, range.count);
counts[dst_index] = index_size;
offsets[dst_index++] = (const GLvoid*)index_ptr;
index_ptr += (index_size << type_scale);
}
glMultiDrawElements(draw_mode, counts, index_type, offsets, (GLsizei)draw_count);
break;
}
}
else
{
if (single_draw)
if (rsx::method_registers.current_draw_clause.execute_pipeline_dependencies() & rsx::vertex_base_changed)
{
// Rebase vertex bases instead of
for (auto &info : m_vertex_layout.interleaved_blocks)
{
const auto vertex_base_offset = rsx::method_registers.vertex_data_base_offset();
info.real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(vertex_base_offset, info.base_offset), info.memory_location);
}
}
}
++subdraw;
if (manually_flush_ring_buffers)
{
//Use approximations to reserve space. This path is mostly for debug purposes anyway
u32 approx_vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
u32 approx_working_buffer_size = approx_vertex_count * 256;
//Allocate 256K heap if we have no approximation at this time (inlined array)
m_attrib_ring_buffer->reserve_storage_on_heap(std::max(approx_working_buffer_size, 256 * 1024U));
m_index_ring_buffer->reserve_storage_on_heap(16 * 1024);
}
//Do vertex upload before RTT prep / texture lookups to give the driver time to push data
auto upload_info = set_vertex_buffer();
do_heap_cleanup();
if (upload_info.vertex_draw_count == 0)
{
// Malformed vertex setup; abort
continue;
}
load_program_env(upload_info);
if (!upload_info.index_info)
{
if (rsx::method_registers.current_draw_clause.is_single_draw())
{
glDrawArrays(draw_mode, 0, upload_info.vertex_draw_count);
}
else
{
const u32 base_index = rsx::method_registers.current_draw_clause.draw_command_ranges.front().first;
bool use_draw_arrays_fallback = false;
const auto draw_count = rsx::method_registers.current_draw_clause.draw_command_ranges.size();
const auto subranges = rsx::method_registers.current_draw_clause.get_subranges();
const auto draw_count = subranges.size();
const auto driver_caps = gl::get_driver_caps();
bool use_draw_arrays_fallback = false;
m_scratch_buffer.resize(draw_count * 24);
GLint* firsts = (GLint*)m_scratch_buffer.data();
GLsizei* counts = (GLsizei*)(firsts + draw_count);
const GLvoid** offsets = (const GLvoid**)(counts + draw_count);
int dst_index = 0;
for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges)
u32 first = 0;
u32 dst_index = 0;
for (const auto &range : subranges)
{
const GLint first = range.first - base_index;
const GLsizei count = range.count;
firsts[dst_index] = first;
counts[dst_index] = count;
counts[dst_index] = range.count;
offsets[dst_index++] = (const GLvoid*)(first << 2);
if (driver_caps.vendor_AMD && (first + count) > (0x100000 >> 2))
if (driver_caps.vendor_AMD && (first + range.count) > (0x100000 >> 2))
{
//Unlikely, but added here in case the identity buffer is not large enough somehow
use_draw_arrays_fallback = true;
break;
}
first += range.count;
}
if (use_draw_arrays_fallback)
{
//MultiDrawArrays is broken on some primitive types using AMD. One known type is GL_TRIANGLE_STRIP but there could be more
for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges)
for (int n = 0; n < draw_count; ++n)
{
glDrawArrays(draw_mode, range.first - base_index, range.count);
glDrawArrays(draw_mode, firsts[n], counts[n]);
}
}
else if (driver_caps.vendor_AMD)
@ -601,6 +577,48 @@ void GLGSRender::end()
}
}
}
else
{
const GLenum index_type = std::get<0>(*upload_info.index_info);
const u32 index_offset = std::get<1>(*upload_info.index_info);
const bool restarts_valid = gl::is_primitive_native(rsx::method_registers.current_draw_clause.primitive) && !rsx::method_registers.current_draw_clause.is_disjoint_primitive;
if (gl_state.enable(restarts_valid && rsx::method_registers.restart_index_enabled(), GL_PRIMITIVE_RESTART))
{
glPrimitiveRestartIndex((index_type == GL_UNSIGNED_SHORT) ? 0xffff : 0xffffffff);
}
m_index_ring_buffer->bind();
if (rsx::method_registers.current_draw_clause.is_single_draw())
{
glDrawElements(draw_mode, upload_info.vertex_draw_count, index_type, (GLvoid *)(uintptr_t)index_offset);
}
else
{
const auto subranges = rsx::method_registers.current_draw_clause.get_subranges();
const auto draw_count = subranges.size();
const u32 type_scale = (index_type == GL_UNSIGNED_SHORT) ? 1 : 2;
uintptr_t index_ptr = index_offset;
m_scratch_buffer.resize(draw_count * 16);
GLsizei *counts = (GLsizei*)m_scratch_buffer.data();
const GLvoid** offsets = (const GLvoid**)(counts + draw_count);
int dst_index = 0;
for (const auto &range : subranges)
{
const auto index_size = get_index_count(rsx::method_registers.current_draw_clause.primitive, range.count);
counts[dst_index] = index_size;
offsets[dst_index++] = (const GLvoid*)index_ptr;
index_ptr += (index_size << type_scale);
}
glMultiDrawElements(draw_mode, counts, index_type, offsets, (GLsizei)draw_count);
}
}
} while (rsx::method_registers.current_draw_clause.next());
m_rtts.on_write();

View file

@ -20,19 +20,12 @@ namespace
namespace
{
// return vertex count if primitive type is not native (empty array otherwise)
std::tuple<u32, u32> get_index_array_for_emulated_non_indexed_draw(const std::vector<rsx::draw_range_t> &first_count_commands, rsx::primitive_type primitive_mode, gl::ring_buffer &dst)
std::tuple<u32, u32> get_index_array_for_emulated_non_indexed_draw(rsx::primitive_type primitive_mode, gl::ring_buffer &dst, u32 vertex_count)
{
// This is an emulated buffer, so our indices only range from 0->original_vertex_array_length
u32 vertex_count = 0;
u32 element_count = 0;
const auto element_count = get_index_count(primitive_mode, vertex_count);
verify(HERE), !gl::is_primitive_native(primitive_mode);
for (const auto &range : first_count_commands)
{
element_count += (u32)get_index_count(primitive_mode, range.count);
vertex_count += range.count;
}
auto mapping = dst.alloc_from_heap(element_count * sizeof(u16), 256);
char *mapped_buffer = (char *)mapping.first;
@ -40,7 +33,7 @@ namespace
return std::make_tuple(element_count, mapping.second);
}
std::tuple<u32, u32, u32> upload_index_buffer(gsl::span<const gsl::byte> raw_index_buffer, void *ptr, rsx::index_array_type type, rsx::primitive_type draw_mode, const std::vector<rsx::draw_range_t>& first_count_commands, u32 initial_vertex_count)
std::tuple<u32, u32, u32> upload_index_buffer(gsl::span<const gsl::byte> raw_index_buffer, void *ptr, rsx::index_array_type type, rsx::primitive_type draw_mode, u32 initial_vertex_count)
{
u32 min_index, max_index, vertex_draw_count = initial_vertex_count;
@ -51,7 +44,7 @@ namespace
gsl::span<gsl::byte> dst{ reinterpret_cast<gsl::byte*>(ptr), ::narrow<u32>(block_sz) };
std::tie(min_index, max_index, vertex_draw_count) = write_index_array_data_to_buffer(dst, raw_index_buffer,
type, draw_mode, rsx::method_registers.restart_index_enabled(), rsx::method_registers.restart_index(), first_count_commands,
type, draw_mode, rsx::method_registers.restart_index_enabled(), rsx::method_registers.restart_index(),
rsx::method_registers.vertex_data_base_index(), [](auto prim) { return !gl::is_primitive_native(prim); });
return std::make_tuple(min_index, max_index, vertex_draw_count);
@ -99,8 +92,8 @@ namespace
u32 index_count;
u32 offset_in_index_buffer;
std::tie(index_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw(
rsx::method_registers.current_draw_clause.draw_command_ranges,
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer);
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer,
rsx::method_registers.current_draw_clause.get_elements_count());
return{ index_count, vertex_count, min_index, 0, std::make_tuple(GL_UNSIGNED_SHORT, offset_in_index_buffer) };
}
@ -128,8 +121,7 @@ namespace
u32 offset_in_index_buffer = mapping.second;
std::tie(min_index, max_index, index_count) = upload_index_buffer(
command.raw_index_buffer, ptr, type, rsx::method_registers.current_draw_clause.primitive,
rsx::method_registers.current_draw_clause.draw_command_ranges, vertex_count);
command.raw_index_buffer, ptr, type, rsx::method_registers.current_draw_clause.primitive, vertex_count);
if (min_index >= max_index)
{
@ -163,8 +155,7 @@ namespace
u32 offset_in_index_buffer;
u32 index_count;
std::tie(index_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw(
{ { 0, 0, vertex_count } },
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer);
rsx::method_registers.current_draw_clause.primitive, m_index_ring_buffer, vertex_count);
return{ index_count, vertex_count, 0, 0, std::make_tuple(GL_UNSIGNED_SHORT, offset_in_index_buffer) };
}
@ -182,11 +173,6 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer()
{
std::chrono::time_point<steady_clock> then = steady_clock::now();
m_vertex_layout = analyse_inputs_interleaved();
if (!m_vertex_layout.validate())
return {};
//Write index buffers and count verts
auto result = std::visit(draw_command_visitor(*m_index_ring_buffer, m_vertex_layout), get_draw_command(rsx::method_registers));
@ -214,6 +200,8 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer()
storage_address = m_vertex_layout.interleaved_blocks[0].real_offset_address + vertex_base;
if (auto cached = m_vertex_cache->find_vertex_range(storage_address, GL_R8UI, required.first))
{
verify(HERE), cached->local_address == storage_address;
in_cache = true;
upload_info.persistent_mapping_offset = cached->offset_in_heap;
}

File diff suppressed because it is too large Load diff

View file

@ -2,8 +2,19 @@
#include <Utilities/types.h>
#include <Utilities/Atomic.h>
#include <Utilities/mutex.h>
#include <Utilities/thread.h>
#include "rsx_utils.h"
#include <vector>
#include <string>
#include <memory>
#include <unordered_map>
// Evaluates and discards an expression, silencing unused-variable/result warnings.
// NOTE(review): '__unused' begins with a double underscore, which is a reserved
// identifier in C++ and may collide with platform headers (e.g. BSD sys/cdefs.h
// defines __unused) — consider renaming to a non-reserved name.
#ifndef __unused
#define __unused(expression) do { (void)(expression); } while(0)
#endif
struct RsxDmaControl;
@ -17,8 +28,10 @@ namespace rsx
{
NOP = 0,
FIFO_EMPTY = 0xDEADF1F0,
FIFO_BUSY = 0xBABEF1F0,
FIFO_PACKET_BEGIN = 0xF1F0,
FIFO_DISABLED_COMMAND = 0xF1F4,
FIFO_DRAW_BARRIER = 0xF1F8,
};
struct register_pair
@ -26,21 +39,149 @@ namespace rsx
u32 reg;
u32 value;
u32 loc;
u32 reserved;
};
// Describes one contiguous stretch of decoded FIFO commands, used by the
// optimization passes and the prefetcher to characterize a command buffer.
struct fifo_buffer_info_t
{
u32 start_loc;       // FIFO address where this buffer segment begins
u32 length;          // length of the segment (presumably in bytes — TODO confirm against producer)
u32 num_draw_calls;  // number of draw calls observed inside the segment
u32 draw_call_distance_weight; // heuristic weight for draw-call spacing — consumed by the optimizer passes
};
// Entry of the FIFO branch-prediction table: records where a branch at
// 'branch_origin' has been seen to go, with a confidence weight.
struct branch_target_info_t
{
u32 branch_target;   // predicted destination address
u32 branch_origin;   // address of the branching command
s64 weight;          // signed confidence score (hits increase, misses decrease — see report_branch_hit/miss)
u64 checksum_16;     // checksum used to validate speculated data (see test_prefetcher_correctness)
u64 reserved;        // padding / reserved for future use
};
struct optimization_pass
{
virtual void optimize(std::vector<register_pair>& commands, const u32* registers) const = 0;
virtual void optimize(const fifo_buffer_info_t& info, simple_array<register_pair>& commands, const u32* registers) = 0;
};
struct flattening_pass : public optimization_pass
{
void optimize(std::vector<register_pair>& commands, const u32* registers) const override;
private:
std::array<bool, 0x10000 / 4> m_skippable_registers;
public:
flattening_pass();
void optimize(const fifo_buffer_info_t& info, simple_array<register_pair>& commands, const u32* registers) override;
};
struct reordering_pass : public optimization_pass
{
void optimize(std::vector<register_pair>& commands, const u32* registers) const override;
private:
// Deduplicating register-write recorder. Stores the latest value written to
// each register while remembering insertion order; when a register is written
// again, the earlier entry in the order list is tombstoned (high bit set) so
// replay emits each register once, at its last write position.
struct instruction_buffer_t
{
std::unordered_map<u32, u32> m_storage;        // reg -> latest value
simple_array<u32> m_insertion_order;           // write order; bit 31 set = superseded entry
instruction_buffer_t()
{
m_insertion_order.reserve(64);
}
// Record a write of 'value' to 'reg'. If 'reg' was already recorded, the
// previous (still-live) occurrence in the order list is disabled before the
// new occurrence is appended.
void add_cmd(u32 reg, u32 value)
{
const auto is_new = std::get<1>(m_storage.insert_or_assign(reg, value));
if (!is_new)
{
// Find the earlier live occurrence and tombstone it. Disabled
// entries have bit 31 set, so they can never match 'reg' again.
for (auto &loc : m_insertion_order)
{
if (loc == reg)
{
loc |= 0x80000000;
break;
}
}
}
m_insertion_order.push_back(reg);
}
void clear()
{
m_storage.clear();
m_insertion_order.clear();
}
void swap(instruction_buffer_t& other)
{
m_storage.swap(other.m_storage);
m_insertion_order.swap(other.m_insertion_order);
}
// Number of distinct registers recorded (not the raw insertion count).
auto size() const
{
return m_storage.size();
}
// Replay accessor: returns {reg, value} for the index-th insertion, or
// {FIFO_DISABLED_COMMAND, 0} if that slot was superseded by a later write.
inline std::pair<u32, u32> get(int index) const
{
const auto key = m_insertion_order[index];
if (key & 0x80000000)
{
// Disabled by a later write to the same register
// TODO: Track command type registers and avoid this
return { FIFO_DISABLED_COMMAND, 0 };
}
const auto value = m_storage.at(key);
return { key, value };
}
// Two buffers are equal when they record the same reg->value pairs;
// insertion order is deliberately ignored.
bool operator == (const instruction_buffer_t& other) const
{
if (size() == other.size())
{
for (const auto &e : other.m_storage)
{
const auto found = m_storage.find(e.first);
if (found == m_storage.end())
return false;
if (found->second != e.second)
return false;
}
return true;
}
return false;
}
};
// A binned draw call: the register setup (prologue) that preceded it and the
// command stream of the draw itself, used by the reordering pass to merge
// draws that share identical state.
struct draw_call
{
instruction_buffer_t prologue;      // register writes preceding the draw
std::vector<register_pair> draws;   // the draw commands themselves
bool write_prologue;                // whether the prologue must be re-emitted
u32 primitive_type;                 // NV4097_SET_BEGIN_END primitive
const register_pair* start_pos;     // position of this draw in the source stream
// Returns true when this bin can absorb a draw with primitive 'prim' whose
// preceding register setup is 'setup'.
// Takes 'setup' by const reference: the previous by-value signature copied
// an unordered_map and a simple_array on every candidate comparison.
bool matches(const instruction_buffer_t& setup, u32 prim) const
{
if (prim != primitive_type)
return false;
return prologue == setup;
}
};
instruction_buffer_t registers_changed;
std::vector<draw_call> bins;
std::unordered_multimap<u32, fifo_buffer_info_t> m_results_prediction_table;
public:
void optimize(const fifo_buffer_info_t& info, simple_array<register_pair>& commands, const u32* registers) override;
};
class FIFO_control
@ -48,28 +189,58 @@ namespace rsx
RsxDmaControl* m_ctrl = nullptr;
u32 m_internal_get = 0;
std::shared_ptr<thread_base> m_prefetcher_thread;
u32 m_prefetch_get = 0;
atomic_t<bool> m_prefetcher_busy{ false };
atomic_t<bool> m_fifo_busy{ false };
fifo_buffer_info_t m_prefetcher_info;
bool m_prefetcher_speculating;
std::vector<std::unique_ptr<optimization_pass>> m_optimization_passes;
std::vector<register_pair> m_queue;
simple_array<register_pair> m_queue;
simple_array<register_pair> m_prefetched_queue;
atomic_t<u32> m_command_index{ 0 };
bool is_blocking_cmd(u32 cmd);
bool is_sync_cmd(u32 cmd);
shared_mutex m_prefetch_mutex; // Guards prefetch queue
shared_mutex m_queue_mutex; // Guards primary queue
atomic_t<u64> m_ctrl_tag{ 0 }; // 'Guards' control registers
void read_ahead();
void optimize();
register_pair empty_cmd { FIFO_EMPTY };
register_pair busy_cmd { FIFO_BUSY };
u32 m_memwatch_addr = 0;
u32 m_memwatch_cmp = 0;
fifo_buffer_info_t m_fifo_info;
std::unordered_multimap<u32, branch_target_info_t> m_branch_prediction_table;
void read_ahead(fifo_buffer_info_t& info, simple_array<register_pair>& commands, u32& get_pointer);
void optimize(const fifo_buffer_info_t& info, simple_array<register_pair>& commands);
void clear_buffer();
u32 get_likely_target(u32 source);
void report_branch_miss(u32 source, u32 target, u32 actual);
void report_branch_hit(u32 source, u32 target);
bool test_prefetcher_correctness(u32 actual_target);
public:
FIFO_control(rsx::thread* pctrl);
~FIFO_control() {}
void set_get(u32 get);
void set_get(u32 get, bool spinning = false);
void set_put(u32 put);
register_pair read();
const register_pair& read();
inline const register_pair& read_unsafe();
void register_optimization_pass(optimization_pass* pass);
void finalize();
public:
static bool is_blocking_cmd(u32 cmd);
static bool is_sync_cmd(u32 cmd);
};
}
}

View file

@ -42,7 +42,135 @@ namespace rsx
std::function<bool(u32 addr, bool is_writing)> g_access_violation_handler;
thread* g_current_renderer = nullptr;
//TODO: Restore a working shaders cache
#pragma optimize("", off)
// Developer-only self-test for the FIFO optimization passes.
// The entire body is compiled out (#if 0); when enabled it feeds a synthetic
// command stream of repeated texture-setup + begin/draw/end sequences through
// the flattening and reordering passes and dumps the result for inspection.
void run_tests()
{
#if 0
	if (0)
	{
		// Resolve a raw FIFO register word to a printable method name,
		// special-casing the internal FIFO sentinel commands
		auto _get_method_name = [](u32 reg) -> std::string
		{
			if (reg == FIFO::FIFO_DISABLED_COMMAND)
			{
				return "COMMAND DISABLED";
			}
			if (reg == FIFO::FIFO_PACKET_BEGIN)
			{
				return "PACKET BEGIN";
			}
			return rsx::get_method_name(reg >> 2);
		};
		// Dump a command list to the log for manual before/after comparison
		auto _dump_commands = [&](const std::vector<FIFO::register_pair>& commands)
		{
			LOG_ERROR(RSX, "DUMP BEGINS--------------------------------");
			for (const auto &cmd : commands)
			{
				LOG_ERROR(RSX, "%s (0x%x)", _get_method_name(cmd.reg), cmd.value);
			}
			LOG_ERROR(RSX, "DUMP ENDS--------------------------------");
		};
		// Test input: register addresses are pre-shifted (<< 2) as they appear
		// in the FIFO; 0xffffffff entries act as stream separators
		std::vector<FIFO::register_pair> fake_commands =
		{
			{ NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xdeadbeef },
			{ NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000},
			{ NV4097_INVALIDATE_VERTEX_FILE << 2, 0 },
			{ NV4097_SET_BEGIN_END << 2, 5 },
			{ NV4097_DRAW_ARRAYS << 2, 0xff000000 },
			{ NV4097_SET_BEGIN_END << 2, 0},
			{ NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xcafebabe },
			{ NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000},
			{ NV4097_INVALIDATE_VERTEX_FILE << 2, 0 },
			{ NV4097_SET_BEGIN_END << 2, 5 },
			{ NV4097_DRAW_ARRAYS << 2, 0xff0000ff },
			{ NV4097_SET_BEGIN_END << 2, 0},
			{ NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xdeadbeef },
			{ NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000},
			{ NV4097_INVALIDATE_VERTEX_FILE << 2, 0 },
			{ NV4097_SET_BEGIN_END << 2, 5 },
			{ NV4097_DRAW_ARRAYS << 2, 0xff0001fe },
			{ NV4097_SET_BEGIN_END << 2, 0},
			{ 0xffffffff, 0 },
			{ NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xcafebabe },
			{ NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000},
			{ NV4097_INVALIDATE_VERTEX_FILE << 2, 0 },
			{ NV4097_SET_BEGIN_END << 2, 5 },
			{ NV4097_DRAW_ARRAYS << 2, 0xff0002fd },
			{ NV4097_SET_BEGIN_END << 2, 0},
			{ NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xdeadbeef },
			{ NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000},
			{ NV4097_INVALIDATE_VERTEX_FILE << 2, 0 },
			{ NV4097_SET_BEGIN_END << 2, 5 },
			{ NV4097_DRAW_ARRAYS << 2, 0xff0003fc },
			{ NV4097_SET_BEGIN_END << 2, 0},
			{ NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xcafebabe },
			{ NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000},
			{ NV4097_INVALIDATE_VERTEX_FILE << 2, 0 },
			{ NV4097_SET_BEGIN_END << 2, 5 },
			{ NV4097_DRAW_ARRAYS << 2, 0xff0004fb },
			{ NV4097_SET_BEGIN_END << 2, 0},
			{ NV4097_SET_TEXTURE_OFFSET << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 1) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 2) << 2, 0xdeadbeef },
			{ (NV4097_SET_TEXTURE_OFFSET + 3) << 2, 0xdeadbeef },
			{ NV4097_SET_TEXTURE_CONTROL3 << 2, 0x100000},
			{ NV4097_INVALIDATE_VERTEX_FILE << 2, 0 },
			{ NV4097_SET_BEGIN_END << 2, 5 },
			{ NV4097_DRAW_ARRAYS << 2, 0xff0005fa },
			{ NV4097_SET_BEGIN_END << 2, 0},
			{ 0xffffffff, 0xdead },
		};
		// Register file snapshot used by the passes (all zeroed)
		std::vector<u32> fake_registers(16384);
		std::fill(fake_registers.begin(), fake_registers.end(), 0u);
		FIFO::flattening_pass flattening_pass;
		FIFO::reordering_pass reordering_pass;
		FIFO::fifo_buffer_info_t info{ 0, fake_commands.size() * 4, /*7*/18, 0 };
		// Run flattening first, then reordering, dumping after each pass
		flattening_pass.optimize(info, fake_commands, fake_registers.data());
		_dump_commands(fake_commands);
		reordering_pass.optimize(info, fake_commands, fake_registers.data());
		_dump_commands(fake_commands);
		LOG_ERROR(RSX, "FINISHED TEST");
	}
#endif
}
#pragma optimize("", on)
u32 get_address(u32 offset, u32 location)
{
@ -97,9 +225,11 @@ namespace rsx
return get_current_renderer()->ctxt_addr + offset;
default:
{
fmt::throw_exception("Invalid location (offset=0x%x, location=0x%x)" HERE, offset, location);
}
}
}
u32 get_vertex_type_size_on_host(vertex_base_type type, u32 size)
{
@ -289,23 +419,7 @@ namespace rsx
conditional_render_test_address = 0;
}
rsx::method_registers.current_draw_clause.inline_vertex_array.resize(0);
in_begin_end = true;
switch (rsx::method_registers.current_draw_clause.primitive)
{
case rsx::primitive_type::line_loop:
case rsx::primitive_type::line_strip:
case rsx::primitive_type::polygon:
case rsx::primitive_type::quad_strip:
case rsx::primitive_type::triangle_fan:
case rsx::primitive_type::triangle_strip:
// Adjacency matters for these types
rsx::method_registers.current_draw_clause.is_disjoint_primitive = false;
break;
default:
rsx::method_registers.current_draw_clause.is_disjoint_primitive = true;
}
}
void thread::append_to_push_buffer(u32 attribute, u32 size, u32 subreg_index, vertex_base_type type, u32 value)
@ -348,15 +462,15 @@ namespace rsx
m_graphics_state |= rsx::pipeline_state::framebuffer_reads_dirty;
ROP_sync_timestamp = get_system_time();
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
for (auto & push_buf : vertex_push_buffers)
{
//Disabled, see https://github.com/RPCS3/rpcs3/issues/1932
//rsx::method_registers.register_vertex_info[index].size = 0;
vertex_push_buffers[index].clear();
push_buf.clear();
}
element_push_buffer.resize(0);
element_push_buffer.clear();
if (zcull_ctrl->active)
zcull_ctrl->on_draw();
@ -397,6 +511,7 @@ namespace rsx
void thread::on_task()
{
m_rsx_thread = std::this_thread::get_id();
run_tests();
if (supports_native_ui)
{
@ -430,8 +545,8 @@ namespace rsx
fifo_ctrl = std::make_unique<::rsx::FIFO::FIFO_control>(this);
fifo_ctrl->register_optimization_pass(new FIFO::flattening_pass());
//fifo_ctrl->register_optimization_pass(new FIFO::reordering_pass());
//fifo_ctrl->register_optimization_pass(new FIFO::flattening_pass());
//fifo_ctrl->register_optimization_pass(new FIFO::reordering_pass()); // R&C2 - Not working if flattening is also enabled!!!
//fifo_ctrl->register_optimization_pass(new FIFO::flattening_pass());
last_flip_time = get_system_time() - 1000000;
@ -539,6 +654,29 @@ namespace rsx
void thread::on_exit()
{
m_rsx_thread_exiting = true;
if (m_vblank_thread)
{
m_vblank_thread->join();
m_vblank_thread.reset();
}
if (m_decompiler_thread)
{
m_decompiler_thread->join();
m_decompiler_thread.reset();
}
if (fifo_ctrl)
{
fifo_ctrl->finalize();
}
}
std::string thread::get_name() const
{
return "rsx::thread";
>>>>>>> rsx: Fixups
}
void thread::fill_scale_offset_data(void *buffer, bool flip_y) const
@ -740,7 +878,7 @@ namespace rsx
return t + timestamp_subvalue;
}
gsl::span<const gsl::byte> thread::get_raw_index_array(const std::vector<draw_range_t>& draw_indexed_clause) const
gsl::span<const gsl::byte> thread::get_raw_index_array(const draw_clause& draw_indexed_clause) const
{
if (element_push_buffer.size())
{
@ -755,49 +893,29 @@ namespace rsx
bool is_primitive_restart_enabled = rsx::method_registers.restart_index_enabled();
u32 primitive_restart_index = rsx::method_registers.restart_index();
u32 min_index = UINT32_MAX;
u32 max_index = 0;
for (const auto &range : draw_indexed_clause)
{
const u32 root_index = (range.command_data_offset / type_size) + range.first;
min_index = std::min(root_index, min_index);
max_index = std::max(root_index + range.count, max_index);
}
const u32 first = min_index;
const u32 count = max_index - min_index;
const u32 first = draw_indexed_clause.min_index();
const u32 count = draw_indexed_clause.get_elements_count();
const gsl::byte* ptr = static_cast<const gsl::byte*>(vm::base(address));
return{ ptr + first * type_size, count * type_size };
}
gsl::span<const gsl::byte> thread::get_raw_vertex_buffer(const rsx::data_array_format_info& vertex_array_info, u32 base_offset, const std::vector<draw_range_t>& vertex_ranges) const
gsl::span<const gsl::byte> thread::get_raw_vertex_buffer(const rsx::data_array_format_info& vertex_array_info, u32 base_offset, const draw_clause& draw_array_clause) const
{
u32 offset = vertex_array_info.offset();
u32 address = rsx::get_address(rsx::get_vertex_offset_from_base(base_offset, offset & 0x7fffffff), offset >> 31);
u32 element_size = rsx::get_vertex_type_size_on_host(vertex_array_info.type(), vertex_array_info.size());
u32 min_index = UINT32_MAX;
u32 max_index = 0;
for (const auto &range : vertex_ranges)
{
const auto root_index = (range.command_data_offset / vertex_array_info.stride()) + range.first;
min_index = std::min(root_index, min_index);
max_index = std::max(root_index + range.count, max_index);
}
const u32 first = min_index;
const u32 count = max_index - min_index;
const u32 first = draw_array_clause.min_index();
const u32 count = draw_array_clause.get_elements_count();
const gsl::byte* ptr = gsl::narrow_cast<const gsl::byte*>(vm::base(address));
return {ptr + first * vertex_array_info.stride(), count * vertex_array_info.stride() + element_size};
}
std::vector<std::variant<vertex_array_buffer, vertex_array_register, empty_vertex_array>>
thread::get_vertex_buffers(const rsx::rsx_state& state, const std::vector<draw_range_t>& vertex_ranges, const u64 consumed_attrib_mask) const
thread::get_vertex_buffers(const rsx::rsx_state& state, const u64 consumed_attrib_mask) const
{
std::vector<std::variant<vertex_array_buffer, vertex_array_register, empty_vertex_array>> result;
result.reserve(rsx::limits::vertex_count);
@ -815,7 +933,7 @@ namespace rsx
{
const rsx::data_array_format_info& info = state.vertex_arrays_info[index];
result.push_back(vertex_array_buffer{info.type(), info.size(), info.stride(),
get_raw_vertex_buffer(info, state.vertex_data_base_offset(), vertex_ranges), index, true});
get_raw_vertex_buffer(info, state.vertex_data_base_offset(), state.current_draw_clause), index, true});
continue;
}
@ -854,7 +972,7 @@ namespace rsx
{
return draw_indexed_array_command
{
get_raw_index_array( rsx::method_registers.current_draw_clause.draw_command_ranges)
get_raw_index_array(state.current_draw_clause)
};
}
@ -1301,7 +1419,6 @@ namespace rsx
if (state.current_draw_clause.command == rsx::draw_command::inlined_array)
{
vertex_input_layout result = {};
result.interleaved_blocks.reserve(8);
interleaved_range_info info = {};
info.interleaved = true;
@ -1336,8 +1453,8 @@ namespace rsx
const u32 frequency_divider_mask = rsx::method_registers.frequency_divider_operation_mask();
vertex_input_layout result = {};
result.interleaved_blocks.reserve(8);
result.referenced_registers.reserve(4);
result.interleaved_blocks.reserve(16);
result.referenced_registers.reserve(16);
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
{
@ -1430,7 +1547,7 @@ namespace rsx
block.base_offset = base_address;
block.attribute_stride = info.stride();
block.memory_location = info.offset() >> 31;
block.locations.reserve(8);
block.locations.reserve(16);
block.locations.push_back(index);
block.min_divisor = info.frequency();
block.all_modulus = !!(frequency_divider_mask & (1 << index));

View file

@ -528,6 +528,8 @@ namespace rsx
virtual void on_decompiler_exit() {}
virtual bool on_decompiler_task() { return false; }
virtual void emit_geometry(u32) {}
void run_FIFO();
public:
@ -554,11 +556,11 @@ namespace rsx
void read_barrier(u32 memory_address, u32 memory_range);
virtual void sync_hint(FIFO_hint hint) {}
gsl::span<const gsl::byte> get_raw_index_array(const std::vector<draw_range_t>& draw_indexed_clause) const;
gsl::span<const gsl::byte> get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const std::vector<draw_range_t>& vertex_ranges) const;
gsl::span<const gsl::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;
gsl::span<const gsl::byte> get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const draw_clause& draw_array_clause) const;
std::vector<std::variant<vertex_array_buffer, vertex_array_register, empty_vertex_array>>
get_vertex_buffers(const rsx::rsx_state& state, const std::vector<draw_range_t>& vertex_ranges, const u64 consumed_attrib_mask) const;
get_vertex_buffers(const rsx::rsx_state& state, const u64 consumed_attrib_mask) const;
std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
get_draw_command(const rsx::rsx_state& state) const;

View file

@ -603,7 +603,7 @@ VKGSRender::VKGSRender() : GSRender()
std::tie(pipeline_layout, descriptor_layouts) = get_shared_pipeline_layout(*m_device);
//Occlusion
m_occlusion_query_pool.create((*m_device), DESCRIPTOR_MAX_DRAW_CALLS); //Enough for 4k draw calls per pass
m_occlusion_query_pool.create((*m_device), OCCLUSION_MAX_POOL_SIZE);
for (int n = 0; n < 128; ++n)
m_occlusion_query_data[n].driver_handle = n;
@ -619,7 +619,7 @@ VKGSRender::VKGSRender() : GSRender()
//VRAM allocation
m_attrib_ring_info.create(VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000, "attrib buffer", 0x400000);
m_uniform_buffer_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "uniform buffer");
m_uniform_buffer_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "uniform buffer");
m_transform_constants_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_TRANSFORM_CONSTANTS_BUFFER_SIZE_M * 0x100000, "transform constants buffer");
m_index_buffer_ring_info.create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, "index buffer");
m_texture_upload_buffer_ring_info.create(VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 32 * 0x100000);
@ -849,11 +849,15 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
m_flush_requests.post(sync_timestamp == 0ull);
has_queue_ref = true;
}
else
else if (!vk::is_uninterruptible())
{
//Flush primary cb queue to sync pending changes (e.g image transitions!)
flush_command_queue();
}
else
{
LOG_ERROR(RSX, "Fault in uninterruptible code!");
}
if (sync_timestamp > 0)
{
@ -1110,6 +1114,145 @@ void VKGSRender::close_render_pass()
render_pass_open = false;
}
// Emits the draw commands for one sub-range of the current draw clause.
// sub_index == 0 performs the full setup (vertex layout analysis, program
// environment load); later sub-indices only refresh the vertex fetch state,
// allowing a single begin/end pair to issue multiple draws cheaply.
void VKGSRender::emit_geometry(u32 sub_index)
{
	auto &draw_call = rsx::method_registers.current_draw_clause;
	//std::chrono::time_point<steady_clock> vertex_start = steady_clock::now();
	if (sub_index == 0)
	{
		// First sub-draw: analyse the active vertex attribute configuration once
		m_vertex_layout = analyse_inputs_interleaved();
	}
	if (!m_vertex_layout.validate())
	{
		// No vertex inputs enabled
		draw_call.end();
		return;
	}
	if (sub_index > 0 && draw_call.execute_pipeline_dependencies() & rsx::vertex_base_changed)
	{
		// Rebase vertex bases instead of re-analysing the whole input layout
		for (auto &info : m_vertex_layout.interleaved_blocks)
		{
			const auto vertex_base_offset = rsx::method_registers.vertex_data_base_offset();
			info.real_offset_address = rsx::get_address(rsx::get_vertex_offset_from_base(vertex_base_offset, info.base_offset), info.memory_location);
		}
	}
	// Snapshot the currently bound attribute buffer views so we can detect
	// whether the upload below moved them (which would require rebinding)
	const auto old_persistent_buffer = m_persistent_attribute_storage ? m_persistent_attribute_storage->value : null_buffer_view->value;
	const auto old_volatile_buffer = m_volatile_attribute_storage ? m_volatile_attribute_storage->value : null_buffer_view->value;
	// Programs data is dependent on vertex state
	auto upload_info = upload_vertex_data();
	if (!upload_info.vertex_draw_count)
	{
		// Malformed vertex setup; abort
		return;
	}
	//std::chrono::time_point<steady_clock> vertex_end = steady_clock::now();
	//m_vertex_upload_time += std::chrono::duration_cast<std::chrono::microseconds>(vertex_end - vertex_start).count();
	auto persistent_buffer = m_persistent_attribute_storage ? m_persistent_attribute_storage->value : null_buffer_view->value;
	auto volatile_buffer = m_volatile_attribute_storage ? m_volatile_attribute_storage->value : null_buffer_view->value;
	bool update_descriptors = false;
	if (sub_index == 0)
	{
		// Load program execution environment
		load_program_env(upload_info);
		update_descriptors = true;
	}
	else
	{
		// Update vertex fetch environment
		update_vertex_env(upload_info);
		if (persistent_buffer != old_persistent_buffer || volatile_buffer != old_volatile_buffer)
		{
			// TODO: the attribute storage moved mid-clause; a fresh descriptor
			// set would be needed here (copy path below is not implemented yet)
			/* VkDescriptorSetAllocateInfo alloc_info = {};
			alloc_info.descriptorPool = m_current_frame->descriptor_pool;
			alloc_info.descriptorSetCount = 1;
			alloc_info.pSetLayouts = &descriptor_layouts;
			alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
			VkDescriptorSet new_descriptor_set;
			CHECK_RESULT(vkAllocateDescriptorSets(*m_device, &alloc_info, &new_descriptor_set));
			VkCopyDescriptorSet copy = {};
			copy.sType = VK_STRUCTURE_TYPE_COPY_DESCRIPTOR_SET;
			copy
			m_current_frame->descriptor_set = new_descriptor_set;
			m_current_frame->used_descriptors++;
			update_descriptors = true;*/
		}
	}
	if (update_descriptors)
	{
		// Bind the attribute stream views and commit the descriptor set
		m_program->bind_uniform(persistent_buffer, "persistent_input_stream", m_current_frame->descriptor_set);
		m_program->bind_uniform(volatile_buffer, "volatile_input_stream", m_current_frame->descriptor_set);
		vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &m_current_frame->descriptor_set, 0, nullptr);
	}
	//std::chrono::time_point<steady_clock> draw_start = steady_clock::now();
	//m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(draw_start - vertex_end).count();
	begin_render_pass();
	if (!upload_info.index_info)
	{
		// Non-indexed path
		if (draw_call.is_single_draw())
		{
			vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0);
		}
		else
		{
			// Disjoint clause: vertices were uploaded back-to-back, so each
			// sub-range starts where the previous one ended
			u32 vertex_offset = 0;
			const auto subranges = draw_call.get_subranges();
			for (const auto &range : subranges)
			{
				vkCmdDraw(*m_current_command_buffer, range.count, 1, vertex_offset, 0);
				vertex_offset += range.count;
			}
		}
	}
	else
	{
		// Indexed path: bind the uploaded index buffer first
		const VkIndexType index_type = std::get<1>(*upload_info.index_info);
		const VkDeviceSize offset = std::get<0>(*upload_info.index_info);
		vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type);
		if (rsx::method_registers.current_draw_clause.is_single_draw())
		{
			const u32 index_count = upload_info.vertex_draw_count;
			vkCmdDrawIndexed(*m_current_command_buffer, index_count, 1, 0, 0, 0);
		}
		else
		{
			// Disjoint clause: index counts may be expanded for emulated
			// primitives, so advance by the expanded count per sub-range
			u32 vertex_offset = 0;
			const auto subranges = draw_call.get_subranges();
			for (const auto &range : subranges)
			{
				const auto count = get_index_count(draw_call.primitive, range.count);
				vkCmdDrawIndexed(*m_current_command_buffer, count, 1, vertex_offset, 0, 0);
				vertex_offset += count;
			}
		}
	}
	close_render_pass();
	//std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
	//m_draw_time += std::chrono::duration_cast<std::chrono::microseconds>(draw_end - draw_start).count();
}
void VKGSRender::end()
{
if (skip_frame || !framebuffer_status_valid || renderer_unavailable ||
@ -1363,31 +1506,6 @@ void VKGSRender::end()
std::chrono::time_point<steady_clock> program_end = steady_clock::now();
m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(program_end - program_start).count();
// Programs data is dependent on vertex state
std::chrono::time_point<steady_clock> vertex_start = program_end;
auto upload_info = upload_vertex_data();
std::chrono::time_point<steady_clock> vertex_end = steady_clock::now();
m_vertex_upload_time += std::chrono::duration_cast<std::chrono::microseconds>(vertex_end - vertex_start).count();
if (!upload_info.vertex_draw_count)
{
// Malformed vertex setup; abort
rsx::thread::end();
return;
}
// Load program execution environment
program_start = vertex_end;
load_program_env(upload_info);
VkBufferView persistent_buffer = m_persistent_attribute_storage ? m_persistent_attribute_storage->value : null_buffer_view->value;
VkBufferView volatile_buffer = m_volatile_attribute_storage ? m_volatile_attribute_storage->value : null_buffer_view->value;
m_program->bind_uniform(persistent_buffer, "persistent_input_stream", m_current_frame->descriptor_set);
m_program->bind_uniform(volatile_buffer, "volatile_input_stream", m_current_frame->descriptor_set);
program_end = steady_clock::now();
m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(program_end - program_start).count();
textures_start = program_end;
for (int i = 0; i < rsx::limits::fragment_textures_count; ++i)
@ -1453,10 +1571,6 @@ void VKGSRender::end()
textures_end = steady_clock::now();
m_textures_upload_time += std::chrono::duration_cast<std::chrono::microseconds>(textures_end - textures_start).count();
//While vertex upload is an interruptible process, if we made it this far, there's no need to sync anything that occurs past this point
//Only textures are synchronized tightly with the GPU and they have been read back above
vk::enter_uninterruptible();
u32 occlusion_id = 0;
if (m_occlusion_query_active)
{
@ -1475,21 +1589,9 @@ void VKGSRender::end()
}
}
vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline);
vkCmdBindDescriptorSets(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, 0, 1, &m_current_frame->descriptor_set, 0, nullptr);
update_draw_state();
begin_render_pass();
bool primitive_emulated = false;
vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, primitive_emulated);
const bool allow_multidraw = supports_multidraw && !g_cfg.video.disable_FIFO_reordering;
const bool single_draw = (!allow_multidraw ||
rsx::method_registers.current_draw_clause.draw_command_ranges.size() <= 1 ||
rsx::method_registers.current_draw_clause.is_disjoint_primitive);
if (m_occlusion_query_active && (occlusion_id != UINT32_MAX))
{
//Begin query
@ -1500,45 +1602,22 @@ void VKGSRender::end()
m_current_command_buffer->flags |= cb_has_occlusion_task;
}
if (!upload_info.index_info)
{
if (single_draw)
{
vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0);
}
else
{
const auto base_vertex = rsx::method_registers.current_draw_clause.draw_command_ranges.front().first;
for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges)
{
vkCmdDraw(*m_current_command_buffer, range.count, 1, range.first - base_vertex, 0);
}
}
}
else
{
VkIndexType index_type;
const u32 index_count = upload_info.vertex_draw_count;
VkDeviceSize offset;
// While vertex upload is an interruptible process, if we made it this far, there's no need to sync anything that occurs past this point
// Only textures are synchronized tightly with the GPU and they have been read back above
vk::enter_uninterruptible();
std::tie(offset, index_type) = *upload_info.index_info;
vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type);
vkCmdBindPipeline(*m_current_command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_program->pipeline);
update_draw_state();
if (single_draw)
u32 sub_index = 0;
rsx::method_registers.current_draw_clause.begin();
do
{
vkCmdDrawIndexed(*m_current_command_buffer, index_count, 1, 0, 0, 0);
}
else
{
u32 first_vertex = 0;
for (const auto &range : rsx::method_registers.current_draw_clause.draw_command_ranges)
{
const auto verts = get_index_count(rsx::method_registers.current_draw_clause.primitive, range.count);
vkCmdDrawIndexed(*m_current_command_buffer, verts, 1, first_vertex, 0, 0);
first_vertex += verts;
}
}
emit_geometry(sub_index++);
}
while (rsx::method_registers.current_draw_clause.next());
vk::leave_uninterruptible();
if (m_occlusion_query_active && (occlusion_id != UINT32_MAX))
{
@ -1546,15 +1625,9 @@ void VKGSRender::end()
m_occlusion_query_pool.end_query(*m_current_command_buffer, occlusion_id);
}
close_render_pass();
vk::leave_uninterruptible();
m_current_command_buffer->num_draws++;
m_rtts.on_write();
std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
m_draw_time += std::chrono::duration_cast<std::chrono::microseconds>(draw_end - textures_end).count();
m_draw_calls++;
rsx::thread::end();
@ -2479,29 +2552,38 @@ void VKGSRender::load_program_env(const vk::vertex_upload_info& vertex_info)
m_graphics_state &= ~handled_flags;
}
static const u32 mr_color_offset[rsx::limits::color_buffers_count] =
void VKGSRender::update_vertex_env(const vk::vertex_upload_info& vertex_info)
{
NV4097_SET_SURFACE_COLOR_AOFFSET,
NV4097_SET_SURFACE_COLOR_BOFFSET,
NV4097_SET_SURFACE_COLOR_COFFSET,
NV4097_SET_SURFACE_COLOR_DOFFSET
};
// Vertex base index = vertex_offset + 132
// Vertex layout = vertex_offset + 160
static const u32 mr_color_dma[rsx::limits::color_buffers_count] =
{
NV4097_SET_CONTEXT_DMA_COLOR_A,
NV4097_SET_CONTEXT_DMA_COLOR_B,
NV4097_SET_CONTEXT_DMA_COLOR_C,
NV4097_SET_CONTEXT_DMA_COLOR_D
};
std::array<s32, 16 * 4> vertex_layout;
fill_vertex_layout_state(m_vertex_layout, vertex_info.allocated_vertex_count, vertex_layout.data(),
vertex_info.persistent_window_offset, vertex_info.volatile_window_offset);
static const u32 mr_color_pitch[rsx::limits::color_buffers_count] =
vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_uniform_buffer_ring_info.heap->value, m_vertex_state_buffer_info.offset, 512,
VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
vkCmdUpdateBuffer(*m_current_command_buffer, m_uniform_buffer_ring_info.heap->value, m_vertex_state_buffer_info.offset + 132, 4, &vertex_info.vertex_index_base);
u32 write_offset = m_vertex_state_buffer_info.offset + 160;
s32 *src_ptr = vertex_layout.data();
for (const auto& placement : m_vertex_layout.attribute_placement)
{
NV4097_SET_SURFACE_PITCH_A,
NV4097_SET_SURFACE_PITCH_B,
NV4097_SET_SURFACE_PITCH_C,
NV4097_SET_SURFACE_PITCH_D
};
constexpr u32 data_len = 4 * sizeof(s32);
if (placement != rsx::attribute_buffer_placement::none)
{
vkCmdUpdateBuffer(*m_current_command_buffer, m_uniform_buffer_ring_info.heap->value, write_offset, data_len, src_ptr);
}
write_offset += data_len;
src_ptr += 4;
}
vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_uniform_buffer_ring_info.heap->value, m_vertex_state_buffer_info.offset, 512,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT);
}
void VKGSRender::init_buffers(rsx::framebuffer_creation_context context, bool skip_reading)
{
@ -3049,6 +3131,26 @@ void VKGSRender::flip(int buffer)
if (!image_to_flip)
{
// Read from cell
const auto range = utils::address_range::start_length(absolute_address, buffer_pitch * buffer_height);
const auto overlap = m_texture_cache.find_texture_from_range(range);
bool flush_queue = false;
for (const auto & section : overlap)
{
if (section->get_protection() == utils::protection::no)
{
section->copy_texture(false, *m_current_command_buffer, m_swapchain->get_graphics_queue());
flush_queue = true;
}
}
if (flush_queue)
{
// Submit for processing to lower hard fault penalty
flush_command_queue();
}
m_texture_cache.invalidate_range(range, rsx::invalidation_cause::read, *m_current_command_buffer, m_swapchain->get_graphics_queue());
image_to_flip = m_texture_cache.upload_image_simple(*m_current_command_buffer, absolute_address, buffer_width, buffer_height);
}
}

View file

@ -1,4 +1,4 @@
#pragma once
#pragma once
#include "Emu/RSX/GSRender.h"
#include "VKHelpers.h"
#include "VKTextureCache.h"
@ -403,9 +403,11 @@ private:
vk::vertex_upload_info upload_vertex_data();
public:
bool load_program();
void load_program_env(const vk::vertex_upload_info& vertex_info);
void load_program_env(const vk::vertex_upload_info& upload_info);
void update_vertex_env(const vk::vertex_upload_info& upload_info);
public:
void init_buffers(rsx::framebuffer_creation_context context, bool skip_reading = false);
void read_buffers();
void write_buffers();
@ -422,6 +424,7 @@ public:
protected:
void begin() override;
void end() override;
void emit_geometry(u32 sub_index) override;
void on_init_thread() override;
void on_exit() override;

View file

@ -32,6 +32,7 @@
#endif
#define DESCRIPTOR_MAX_DRAW_CALLS 4096
#define OCCLUSION_MAX_POOL_SIZE 8192
#define VERTEX_BUFFERS_FIRST_BIND_SLOT 3
#define FRAGMENT_CONSTANT_BUFFERS_BIND_SLOT 2
@ -652,7 +653,7 @@ namespace vk
VkImageTiling tiling,
VkImageUsageFlags usage,
VkImageCreateFlags image_flags)
: m_device(dev)
: m_device(dev), current_layout(initial_layout)
{
info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
info.imageType = image_type;
@ -1195,6 +1196,11 @@ namespace vk
return commands;
}
// Reports whether this command buffer is currently open for recording
// (reflects the is_open flag toggled by begin()/end())
bool is_recording() const
{
	return is_open;
}
void begin()
{
if (m_submit_fence && is_pending)
@ -2413,8 +2419,8 @@ public:
VkQueryPool query_pool = VK_NULL_HANDLE;
vk::render_device* owner = nullptr;
std::deque<u32> available_slots;
std::vector<bool> query_active_status;
public:
void create(vk::render_device &dev, u32 num_entries)
@ -2428,6 +2434,12 @@ public:
owner = &dev;
query_active_status.resize(num_entries, false);
available_slots.resize(num_entries);
for (u32 n = 0; n < num_entries; ++n)
{
available_slots[n] = n;
}
}
void destroy()
@ -2483,9 +2495,14 @@ public:
}
// Resets a single occlusion query slot and returns it to the free pool.
// A slot that was never begun is left untouched.
void reset_query(vk::command_buffer &cmd, u32 index)
{
	// Nothing to do unless this slot holds an active query
	if (!query_active_status[index])
		return;

	vkCmdResetQueryPool(cmd, query_pool, index, 1);

	// Mark the slot as reusable
	query_active_status[index] = false;
	available_slots.push_back(index);
}
void reset_queries(vk::command_buffer &cmd, std::vector<u32> &list)
@ -2505,13 +2522,16 @@ public:
u32 find_free_slot()
{
for (u32 n = 0; n < query_active_status.size(); n++)
if (available_slots.empty())
{
if (query_active_status[n] == false)
return n;
return -1u;
}
return UINT32_MAX;
u32 result = available_slots.front();
available_slots.pop_front();
verify(HERE), !query_active_status[result];
return result;
}
};

View file

@ -106,6 +106,12 @@ namespace
const u32 vertex_count = rsx::method_registers.current_draw_clause.get_elements_count();
const u32 min_index = rsx::method_registers.current_draw_clause.min_index();
//if (rsx::method_registers.current_draw_clause.draw_command_ranges.size() > 1)
//{
// TODO
//LOG_ERROR(RSX, "REEEEEEEEEEEEEEEEEEEEEEE (prims_emulated=%d)", primitives_emulated);
//}
if (primitives_emulated)
{
u32 index_count;
@ -165,7 +171,7 @@ namespace
command.raw_index_buffer, index_type,
rsx::method_registers.current_draw_clause.primitive,
rsx::method_registers.restart_index_enabled(),
rsx::method_registers.restart_index(), rsx::method_registers.current_draw_clause.draw_command_ranges,
rsx::method_registers.restart_index(),
rsx::method_registers.vertex_data_base_index(), [](auto prim) { return !vk::is_primitive_native(prim); });
if (min_index >= max_index)
@ -227,11 +233,6 @@ namespace
vk::vertex_upload_info VKGSRender::upload_vertex_data()
{
m_vertex_layout = analyse_inputs_interleaved();
if (!m_vertex_layout.validate())
return {};
draw_command_visitor visitor(m_index_buffer_ring_info, m_vertex_layout);
auto result = std::visit(visitor, get_draw_command(rsx::method_registers));
@ -258,6 +259,8 @@ vk::vertex_upload_info VKGSRender::upload_vertex_data()
storage_address = m_vertex_layout.interleaved_blocks[0].real_offset_address + vertex_base;
if (auto cached = m_vertex_cache->find_vertex_range(storage_address, VK_FORMAT_R8_UINT, required.first))
{
verify(HERE), cached->local_address == storage_address;
in_cache = true;
persistent_range_base = cached->offset_in_heap;
}

View file

@ -880,12 +880,42 @@ namespace rsx
storage_type* find_vertex_range(uintptr_t local_addr, upload_format fmt, u32 data_length) override
{
const auto data_end = local_addr + data_length;
for (auto &v : vertex_ranges[local_addr])
{
if (v.buffer_format == fmt && v.data_length == data_length)
if (v.buffer_format == fmt && v.data_length >= data_length)
return &v;
}
#if 0
for (const auto &range : vertex_ranges)
{
if (range.first > local_addr)
continue;
for (const auto &v : range.second)
{
if (v.buffer_format == fmt)
{
const auto entry_end = v.local_address + v.data_length;
if (data_end <= entry_end)
{
const u32 offset = (local_addr - v.local_address);
if (offset % 16)
continue; // TexelBuffer alignment rules
storage_type e = v;
e.data_length = data_length;
e.local_address = local_addr;
e.offset_in_heap += offset;
auto& ret = vertex_ranges[local_addr].emplace_back(e);
return &ret;
}
}
}
}
#endif
return nullptr;
}

View file

@ -423,9 +423,7 @@ namespace rsx
{
if (arg)
{
rsx::method_registers.current_draw_clause.draw_command_ranges.clear();
rsx::method_registers.current_draw_clause.command = draw_command::none;
rsx::method_registers.current_draw_clause.primitive = to_primitive_type(arg);
rsx::method_registers.current_draw_clause.reset(to_primitive_type(arg));
rsxthr->begin();
return;
}
@ -453,9 +451,9 @@ namespace rsx
else
rsx::method_registers.current_draw_clause.is_immediate_draw = false;
if (!(rsx::method_registers.current_draw_clause.draw_command_ranges.empty() &&
rsx::method_registers.current_draw_clause.inline_vertex_array.empty()))
if (!rsx::method_registers.current_draw_clause.empty())
{
rsx::method_registers.current_draw_clause.compile();
rsxthr->end();
}
}
@ -598,6 +596,30 @@ namespace rsx
rsx->m_rtts_dirty = true;
}
// NV4097_SET_VERTEX_DATA_BASE_OFFSET handler.
// If the write happens inside a begin/end pair it cannot be applied immediately
// (it would retroactively affect draw ranges already queued), so the register
// write is undone and replayed later via a draw-clause execution barrier.
void set_vertex_base_offset(thread* rsx, u32 reg, u32 arg)
{
if (rsx->in_begin_end)
{
// Revert change to queue later; the barrier re-applies it at draw time
method_registers.decode(reg, method_registers.register_previous_value);

// Insert base modifier barrier carrying the deferred value
method_registers.current_draw_clause.insert_command_barrier(vertex_base_modifier_barrier, arg);
}
}
// NV4097_SET_VERTEX_DATA_BASE_INDEX handler.
// Mirrors set_vertex_base_offset: inside a begin/end pair the write is
// reverted and deferred as an execution barrier so index rebase happens
// between the correct draw ranges.
void set_index_base_offset(thread* rsx, u32 reg, u32 arg)
{
if (rsx->in_begin_end)
{
// Revert change to queue later; the barrier re-applies it at draw time
method_registers.decode(reg, method_registers.register_previous_value);

// Insert base modifier barrier carrying the deferred value
method_registers.current_draw_clause.insert_command_barrier(index_base_modifier_barrier, arg);
}
}
template<u32 index>
struct set_texture_dirty_bit
{
@ -1156,6 +1178,13 @@ namespace rsx
};
}
namespace fifo
{
// FIFO_DRAW_BARRIER handler. Deliberately empty: the command only marks a
// split point in the FIFO stream; the state changes it guards are handled
// through draw_clause execution barriers instead. Bound so the method is
// recognized rather than reported as unknown.
void draw_barrier(thread* rsx, u32, u32)
{
}
}
void rsx_state::init()
{
// Special values set at initialization, these are not set by a context reset
@ -2122,6 +2151,34 @@ namespace rsx
return registers[reg] == value;
}
// Executes the deferred register writes (execution barriers) attached to the
// draw range currently selected by begin()/next().
// Returns a command_execution_flags bitmask describing which fetch state
// changed so the backend can rebase vertex/index data before emitting the draw.
u32 draw_clause::execute_pipeline_dependencies() const
{
u32 result = 0;
for (const auto &barrier : draw_command_barriers[current_range_index])
{
switch (barrier.type)
{
case primitive_restart_barrier:
// Rasterization-only split; consumed via get_subranges(), no register change
break;
case index_base_modifier_barrier:
// Change index base offset
method_registers.decode(NV4097_SET_VERTEX_DATA_BASE_INDEX, barrier.arg);
result |= index_base_changed;
break;
case vertex_base_modifier_barrier:
// Change vertex base offset
method_registers.decode(NV4097_SET_VERTEX_DATA_BASE_OFFSET, barrier.arg);
result |= vertex_base_changed;
break;
default:
fmt::throw_exception("Unreachable" HERE);
}
}

return result;
}
namespace method_detail
{
template<int Id, int Step, int Count, template<u32> class T, int Index = 0>
@ -2494,6 +2551,7 @@ namespace rsx
//Some custom GCM methods
methods[GCM_SET_DRIVER_OBJECT] = nullptr;
methods[FIFO::FIFO_DRAW_BARRIER] = nullptr;
bind_array<GCM_FLIP_HEAD, 1, 2, nullptr>();
bind_array<GCM_DRIVER_QUEUE, 1, 8, nullptr>();
@ -2600,6 +2658,8 @@ namespace rsx
bind<NV4097_SET_SHADER_PROGRAM, nv4097::set_shader_program_dirty>();
bind<NV4097_SET_TRANSFORM_PROGRAM_START, nv4097::set_transform_program_start>();
bind<NV4097_SET_VERTEX_ATTRIB_OUTPUT_MASK, nv4097::set_vertex_attribute_output_mask>();
bind<NV4097_SET_VERTEX_DATA_BASE_OFFSET, nv4097::set_vertex_base_offset>();
bind<NV4097_SET_VERTEX_DATA_BASE_INDEX, nv4097::set_index_base_offset>();
//NV308A
bind_range<NV308A_COLOR, 1, 256, nv308a::color>();
@ -2619,6 +2679,8 @@ namespace rsx
// custom methods
bind<GCM_FLIP_COMMAND, flip_command>();
// FIFO
bind<FIFO::FIFO_DRAW_BARRIER, fifo::draw_barrier>();
return true;
}();

View file

@ -3,16 +3,21 @@
#include <array>
#include <vector>
#include <numeric>
#include <deque>
#include <set>
#include "GCM.h"
#include "rsx_decode.h"
#include "RSXTexture.h"
#include "rsx_vertex_data.h"
#include "rsx_utils.h"
#include "Utilities/geometry.h"
#include <cereal/types/array.hpp>
#include <cereal/types/unordered_map.hpp>
extern u64 get_system_time();
namespace rsx
{
enum class draw_command
@ -23,6 +28,39 @@ namespace rsx
indexed,
};
// Kinds of barriers that can be interleaved with draw ranges inside a
// begin/end pair.
enum command_barrier_type : u32
{
	primitive_restart_barrier,    // rasterization split (adjacency-sensitive primitives)
	vertex_base_modifier_barrier, // deferred NV4097_SET_VERTEX_DATA_BASE_OFFSET write
	index_base_modifier_barrier   // deferred NV4097_SET_VERTEX_DATA_BASE_INDEX write
};

// Flags returned by draw_clause::execute_pipeline_dependencies() describing
// which fetch state was modified by the executed barriers.
enum command_execution_flags : u32
{
	vertex_base_changed = (1 << 0),
	index_base_changed = (1 << 1)
};

// A deferred event attached to a draw range.
// Rasterization barriers are keyed by the vertex 'address' at which the split
// occurs; execution barriers use address == -1u and are keyed by 'timestamp'.
struct barrier_t
{
	u64 timestamp;

	u32 address;
	u32 arg;
	u32 flags;
	command_barrier_type type;

	// Ordering predicate for std::set.
	// NOTE: both operands must be tested symmetrically, otherwise comparing an
	// address-keyed barrier against a timestamp-keyed one (address == -1u)
	// violates the strict weak ordering std::set requires (UB).
	bool operator < (const barrier_t& other) const
	{
		if (address != -1u && other.address != -1u)
		{
			// Both are rasterization barriers; order by split address
			return address < other.address;
		}

		// At least one is an execution barrier; fall back to submission order
		return timestamp < other.timestamp;
	}
};
struct draw_range_t
{
u32 command_data_offset = 0;
@ -30,55 +68,273 @@ namespace rsx
u32 count = 0;
};
struct draw_clause
class draw_clause
{
// Stores the first and count argument from draw/draw indexed parameters between begin/end clauses.
simple_array<draw_range_t> draw_command_ranges;
// Stores rasterization barriers for primitive types sensitive to adjacency
std::vector<std::set<barrier_t>> draw_command_barriers;
// Counter used to parse the commands in order
u32 current_range_index;
// Location of last execution barrier
u32 last_execution_barrier_index;
// Helper functions

// Append a new draw range to the end of the stream together with an empty
// barrier set, keeping both containers parallel
void append_draw_command(const draw_range_t& range)
{
draw_command_ranges.push_back(range);
draw_command_barriers.push_back({});
}

// Insert a new draw command within the others at position 'index', keeping
// the barrier container aligned with the range container
void insert_draw_command(int index, const draw_range_t& range)
{
auto range_It = draw_command_ranges.begin();
auto barrier_It = draw_command_barriers.begin();

// Because deque::insert fails with initializer list on MSVC
const std::set<barrier_t> new_barrier;

// Advance both iterators in lock-step to the insertion point
while (index--)
{
++range_It;
++barrier_It;
}

draw_command_ranges.insert(range_It, range);
draw_command_barriers.insert(barrier_It, new_barrier);

// Invariant: one barrier set per draw range
verify(HERE), draw_command_ranges.size() == draw_command_barriers.size();
}
public:
primitive_type primitive;
draw_command command;
bool is_immediate_draw;
bool is_disjoint_primitive;
std::vector<u32> inline_vertex_array;
simple_array<u32> inline_vertex_array;
// Queues a barrier against the draw clause.
// primitive_restart_barrier entries are rasterization splits keyed by the
// vertex address where the split occurs; they attach to the current range.
// Any other type is an execution dependency: it closes the current range by
// appending a fresh empty one and is keyed by submission timestamp instead.
void insert_command_barrier(command_barrier_type type, u32 arg)
{
verify(HERE), !draw_command_ranges.empty();

if (type == primitive_restart_barrier)
{
// Rasterization flow barrier; split point is the end of the accumulated range
const auto& last = draw_command_ranges.back();
const auto address = last.first + last.count;

const auto command_index = draw_command_ranges.size() - 1;
draw_command_barriers[command_index].insert({ 0, address, arg, 0, type });
}
else
{
// Execution dependency barrier; open a new empty range to attach it to
append_draw_command({});
const auto command_index = draw_command_ranges.size() - 1;
draw_command_barriers[command_index].insert({ get_system_time(), -1u, arg, 0, type });
last_execution_barrier_index = command_index;
}
}
/**
* Stores the first and count argument from draw/draw indexed parameters between begin/end clauses.
* Optimize commands for rendering
*/
std::vector<draw_range_t> draw_command_ranges;
void compile()
{
// TODO
}
/**
* Insert one command range (first/count pair from a draw or draw-indexed call).
* Consecutive ranges are merged where possible; when adjacency matters for the
* current primitive type a primitive_restart_barrier is recorded at the seam so
* the merged range can still be split during emission.
*/
void append(u32 first, u32 count)
{
if (!draw_command_ranges.empty())
{
auto& last = draw_command_ranges.back();

if (last.count == 0)
{
// Special case, usually indicates an execution barrier:
// the empty range opened by the barrier adopts this draw's parameters
last.first = first;
last.count = count;
return;
}

if (last.first + last.count == first)
{
// Contiguous with the previous range; merge
if (!is_disjoint_primitive)
{
// Insert barrier so the raster seam is preserved for emulated primitives
insert_command_barrier(primitive_restart_barrier, 0);
}

last.count += count;
return;
}

// Non-contiguous; keep ranges after the last execution barrier sorted by 'first'
for (int index = last_execution_barrier_index; index < draw_command_ranges.size(); ++index)
{
if (draw_command_ranges[index].first == first &&
draw_command_ranges[index].count == count)
{
// Duplicate entry? WTF!
return;
}

if (draw_command_ranges[index].first > first)
{
insert_draw_command(index, { 0, first, count });
return;
}
}
}

append_draw_command({ 0, first, count });
}
/**
* Returns how many vertex or index will be consumed by the draw clause.
*/
u32 get_elements_count() const
{
u32 count = 0;
for (const auto &draw : draw_command_ranges)
{
count += draw.count;
return get_range().count;
}
return count;
u32 min_index() const
{
return get_range().first;
}
bool is_single_draw() const
{
if (is_disjoint_primitive)
return true;
if (draw_command_ranges.empty())
{
verify(HERE), !inline_vertex_array.empty();
return true;
}
verify(HERE), current_range_index != -1u;
for (const auto &barrier : draw_command_barriers[current_range_index])
{
if (barrier.type == primitive_restart_barrier)
return false;
}
return true;
}
bool empty() const
{
return (draw_command_ranges.empty() && inline_vertex_array.empty());
}
// Clears all accumulated draw state and configures the clause for a new
// begin/end pair using the given primitive type
void reset(rsx::primitive_type type)
{
current_range_index = -1u;
last_execution_barrier_index = 0;

command = draw_command::none;
primitive = type;

draw_command_ranges.clear();
draw_command_barriers.clear();
inline_vertex_array.clear();

switch (primitive)
{
case rsx::primitive_type::line_loop:
case rsx::primitive_type::line_strip:
case rsx::primitive_type::polygon:
case rsx::primitive_type::quad_strip:
case rsx::primitive_type::triangle_fan:
case rsx::primitive_type::triangle_strip:
// Adjacency matters for these types
is_disjoint_primitive = false;
break;
default:
is_disjoint_primitive = true;
}
}

// Select the first draw range for iteration (begin() ... do { } while (next()))
void begin()
{
current_range_index = 0;
}

// Seek directly to the last draw range without visiting the ones in between
void end()
{
current_range_index = draw_command_ranges.size() - 1;
}

// Advance to the next draw range; returns false (and rewinds to the first
// range) once the stream is exhausted
bool next()
{
current_range_index++;
if (current_range_index >= draw_command_ranges.size())
{
current_range_index = 0;
return false;
}

// Empty ranges are only created by execution barriers and must have been
// filled by a subsequent append() before drawing
verify(HERE), draw_command_ranges[current_range_index].count != 0;
return true;
}
/**
* Optimize draw command stream for rendering
* Executes commands reqiured to make the current draw state valid
*/
void compile()
{
u32 execute_pipeline_dependencies() const;
// Returns the draw range currently selected by begin()/next()
const draw_range_t& get_range() const
{
verify(HERE), current_range_index < draw_command_ranges.size();
return draw_command_ranges[current_range_index];
}
/**
* Insert one command range
*/
void append(u32 first, u32 count)
simple_array<draw_range_t> get_subranges() const
{
verify(HERE), !is_single_draw();
const auto range = get_range();
const auto limit = range.first + range.count;
simple_array<draw_range_t> ret;
u32 previous_barrier = range.first;
u32 vertex_counter = 0;
for (const auto &barrier : draw_command_barriers[current_range_index])
{
if (barrier.type != primitive_restart_barrier)
continue;
if (barrier.address <= range.first)
continue;
if (barrier.address >= limit)
break;
const u32 count = barrier.address - previous_barrier;
ret.push_back({ 0, vertex_counter, count });
previous_barrier = (u32)barrier.address;
vertex_counter += count;
}
u32 min_index()
{
LOG_FATAL(RSX, "Unimplemented");
return 0;
verify(HERE), !ret.empty(), previous_barrier < limit;
ret.push_back({ 0, vertex_counter, limit - previous_barrier });
return ret;
}
};

View file

@ -663,4 +663,237 @@ namespace rsx
m_data.store(0);
}
};
/**
 * Lightweight, malloc-backed growable array used on hot paths in place of
 * std::vector. Growth, copy and insertion move elements with memcpy/memmove
 * and elements are never destructed, so Ty must be trivially copyable.
 */
template <typename Ty>
struct simple_array
{
public:
	using iterator = Ty*;
	// Iterator over immutable elements. (Previously 'Ty* const', which is a
	// const pointer to *mutable* data and let const access modify elements.)
	using const_iterator = const Ty*;

private:
	u32 _capacity = 0;
	u32 _size = 0;
	Ty* _data = nullptr;

	// Element index of an iterator into this container (0 when empty)
	inline u32 offset(const_iterator pos)
	{
		return (_data) ? u32(pos - _data) : 0;
	}

public:
	simple_array() {}

	// Construct with 'initial_size' copies of 'val'
	simple_array(u32 initial_size, const Ty val = {})
	{
		reserve(initial_size);
		_size = initial_size;

		for (u32 n = 0; n < initial_size; ++n)
		{
			_data[n] = val;
		}
	}

	simple_array(const std::initializer_list<Ty>& args)
	{
		reserve(u32(args.size()));

		for (const auto& arg : args)
		{
			push_back(arg);
		}
	}

	// Deep copy. Required: with only a user-declared destructor the implicit
	// copy shared the buffer (double free) and no implicit move existed, so
	// returning a simple_array by value could free the buffer still in use.
	simple_array(const simple_array<Ty>& other)
	{
		reserve(other._size);
		_size = other._size;

		if (_size)
		{
			memcpy(_data, other._data, sizeof(Ty) * _size);
		}
	}

	simple_array(simple_array<Ty>&& other) noexcept
	{
		swap(other);
	}

	simple_array<Ty>& operator=(const simple_array<Ty>& other)
	{
		if (this != &other)
		{
			// Copy-and-swap keeps assignment exception-agnostic and simple
			simple_array<Ty> tmp(other);
			swap(tmp);
		}

		return *this;
	}

	simple_array<Ty>& operator=(simple_array<Ty>&& other) noexcept
	{
		swap(other);
		return *this;
	}

	~simple_array()
	{
		if (_data)
		{
			free(_data);
			_data = nullptr;
			_size = _capacity = 0;
		}
	}

	void swap(simple_array<Ty>& other) noexcept
	{
		std::swap(_capacity, other._capacity);
		std::swap(_size, other._size);
		std::swap(_data, other._data);
	}

	// Grow the backing store to hold at least 'size' elements; never shrinks.
	// ('>=' avoids a pointless reallocation when capacity already matches.)
	void reserve(u32 size)
	{
		if (_capacity >= size)
			return;

		auto old_data = _data;
		auto old_size = _size;

		_data = (Ty*)malloc(sizeof(Ty) * size);
		_capacity = size;

		if (old_data)
		{
			memcpy(_data, old_data, sizeof(Ty) * old_size);
			free(old_data);
		}
	}

	void push_back(const Ty& val)
	{
		if (_size >= _capacity)
		{
			reserve(_capacity + 16);
		}

		_data[_size++] = val;
	}

	void push_back(Ty&& val)
	{
		if (_size >= _capacity)
		{
			reserve(_capacity + 16);
		}

		// Forward the rvalue instead of copying it
		_data[_size++] = std::move(val);
	}

	// Insert 'val' before 'pos'; returns an iterator to the inserted element.
	// 'pos' is revalidated if insertion triggers a reallocation.
	iterator insert(iterator pos, const Ty& val)
	{
		verify(HERE), pos >= _data;
		const auto _loc = offset(pos);

		if (_size >= _capacity)
		{
			reserve(_capacity + 16);
			pos = _data + _loc;
		}

		if (_loc >= _size)
		{
			// Inserting at end() degenerates to push_back
			_data[_size++] = val;
			return pos;
		}

		verify(HERE), _loc < _size;
		const u32 remaining = (_size - _loc);
		memmove(pos + 1, pos, remaining * sizeof(Ty));

		*pos = val;
		_size++;

		return pos;
	}

	iterator insert(iterator pos, Ty&& val)
	{
		verify(HERE), pos >= _data;
		const auto _loc = offset(pos);

		if (_size >= _capacity)
		{
			reserve(_capacity + 16);
			pos = _data + _loc;
		}

		if (_loc >= _size)
		{
			// Inserting at end() degenerates to push_back
			_data[_size++] = std::move(val);
			return pos;
		}

		verify(HERE), _loc < _size;
		const u32 remaining = (_size - _loc);
		memmove(pos + 1, pos, remaining * sizeof(Ty));

		*pos = std::move(val);
		_size++;

		return pos;
	}

	// Drops the elements but keeps the allocated capacity for reuse
	void clear()
	{
		_size = 0;
	}

	bool empty() const
	{
		return _size == 0;
	}

	u32 size() const
	{
		return _size;
	}

	u32 capacity() const
	{
		return _capacity;
	}

	Ty& operator[] (u32 index)
	{
		return _data[index];
	}

	const Ty& operator[] (u32 index) const
	{
		return _data[index];
	}

	Ty* data()
	{
		return _data;
	}

	const Ty* data() const
	{
		return _data;
	}

	Ty& back()
	{
		return _data[_size - 1];
	}

	const Ty& back() const
	{
		return _data[_size - 1];
	}

	Ty& front()
	{
		return _data[0];
	}

	const Ty& front() const
	{
		return _data[0];
	}

	iterator begin()
	{
		return _data;
	}

	iterator end()
	{
		return _data ? _data + _size : nullptr;
	}

	const_iterator begin() const
	{
		return _data;
	}

	const_iterator end() const
	{
		return _data ? _data + _size : nullptr;
	}
};
}

View file

@ -1,4 +1,4 @@
#pragma once
#pragma once
#include "GCM.h"
#include "Utilities/types.h"
@ -64,11 +64,14 @@ struct push_buffer_vertex_info
void clear()
{
data.resize(0);
if (size)
{
data.clear();
attribute_mask = ~0;
vertex_count = 0;
size = 0;
}
}
u8 get_vertex_size_in_dwords(vertex_base_type type)
{