mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-06 06:51:26 +12:00
rsx: Batch vertex program load methods
This commit is contained in:
parent
85c4321c24
commit
d97e9f7b4a
5 changed files with 80 additions and 29 deletions
|
@ -77,8 +77,10 @@ namespace
|
||||||
X = X << 5;
|
X = X << 5;
|
||||||
return{ X, Y, Z, 1 };
|
return{ X, Y, Z, 1 };
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
inline void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride)
|
template <bool unaligned>
|
||||||
|
void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride)
|
||||||
{
|
{
|
||||||
const __m128i mask = _mm_set_epi8(
|
const __m128i mask = _mm_set_epi8(
|
||||||
0xC, 0xD, 0xE, 0xF,
|
0xC, 0xD, 0xE, 0xF,
|
||||||
|
@ -99,7 +101,15 @@ namespace
|
||||||
{
|
{
|
||||||
const __m128i vector = _mm_loadu_si128(src_ptr);
|
const __m128i vector = _mm_loadu_si128(src_ptr);
|
||||||
const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask);
|
const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask);
|
||||||
|
|
||||||
|
if constexpr (!unaligned)
|
||||||
|
{
|
||||||
_mm_stream_si128(dst_ptr, shuffled_vector);
|
_mm_stream_si128(dst_ptr, shuffled_vector);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_mm_storeu_si128(dst_ptr, shuffled_vector);
|
||||||
|
}
|
||||||
|
|
||||||
src_ptr++;
|
src_ptr++;
|
||||||
dst_ptr++;
|
dst_ptr++;
|
||||||
|
@ -112,7 +122,15 @@ namespace
|
||||||
const __m128i vec0 = _mm_loadu_si128(src_ptr);
|
const __m128i vec0 = _mm_loadu_si128(src_ptr);
|
||||||
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||||
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
|
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
|
||||||
|
|
||||||
|
if constexpr (!unaligned)
|
||||||
|
{
|
||||||
_mm_stream_si128(dst_ptr, vec2);
|
_mm_stream_si128(dst_ptr, vec2);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_mm_storeu_si128(dst_ptr, vec2);
|
||||||
|
}
|
||||||
|
|
||||||
src_ptr++;
|
src_ptr++;
|
||||||
dst_ptr++;
|
dst_ptr++;
|
||||||
|
@ -129,6 +147,11 @@ namespace
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template void stream_data_to_memory_swapped_u32<false>(void *, const void *, u32, u8);
|
||||||
|
template void stream_data_to_memory_swapped_u32<true>(void*, const void*, u32, u8);
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride)
|
inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride)
|
||||||
{
|
{
|
||||||
const __m128i mask = _mm_set_epi8(
|
const __m128i mask = _mm_set_epi8(
|
||||||
|
|
|
@ -55,3 +55,11 @@ void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w);
|
||||||
* Stream a 128 bits vector from src to dst.
|
* Stream a 128 bits vector from src to dst.
|
||||||
*/
|
*/
|
||||||
void stream_vector_from_memory(void *dst, void *src);
|
void stream_vector_from_memory(void *dst, void *src);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stream and swap data in u32 units.
|
||||||
|
*/
|
||||||
|
template <bool unaligned = false>
|
||||||
|
void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride);
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -524,32 +524,44 @@ namespace rsx
|
||||||
rsx::frame_capture_data::replay_command replay_cmd;
|
rsx::frame_capture_data::replay_command replay_cmd;
|
||||||
replay_cmd.rsx_command = std::make_pair((reg << 2) | (1u << 18), value);
|
replay_cmd.rsx_command = std::make_pair((reg << 2) | (1u << 18), value);
|
||||||
|
|
||||||
frame_capture.replay_commands.push_back(replay_cmd);
|
auto& commands = frame_capture.replay_commands;
|
||||||
auto it = frame_capture.replay_commands.back();
|
commands.push_back(replay_cmd);
|
||||||
|
|
||||||
switch (reg)
|
switch (reg)
|
||||||
{
|
{
|
||||||
case NV3089_IMAGE_IN:
|
case NV3089_IMAGE_IN:
|
||||||
capture::capture_image_in(this, it);
|
capture::capture_image_in(this, commands.back());
|
||||||
break;
|
break;
|
||||||
case NV0039_BUFFER_NOTIFY:
|
case NV0039_BUFFER_NOTIFY:
|
||||||
capture::capture_buffer_notify(this, it);
|
capture::capture_buffer_notify(this, commands.back());
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
// Use legacy logic for NV308A_COLOR - enqueue leading command with count
|
static constexpr std::array<std::pair<u32, u32>, 2> ranges
|
||||||
|
{{
|
||||||
|
{NV308A_COLOR, 0x700},
|
||||||
|
{NV4097_SET_TRANSFORM_PROGRAM, 32}
|
||||||
|
}};
|
||||||
|
|
||||||
|
// Use legacy logic - enqueue leading command with count
|
||||||
// Then enqueue each command arg alone with a no-op command
|
// Then enqueue each command arg alone with a no-op command
|
||||||
if (reg >= NV308A_COLOR && reg < NV308A_COLOR + 0x700)
|
for (const auto& range : ranges)
|
||||||
{
|
{
|
||||||
const u32 remaining = std::min<u32>(fifo_ctrl->get_remaining_args_count(), (NV308A_COLOR + 0x700) - reg);
|
if (reg >= range.first && reg < range.first + range.second)
|
||||||
|
|
||||||
it.rsx_command.first = (fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) | (reg << 2) | (remaining << 18);
|
|
||||||
|
|
||||||
for (u32 i = 0; i < remaining && fifo_ctrl->get_pos() + (i + 1) * 4 != (ctrl->put & ~3); i++)
|
|
||||||
{
|
{
|
||||||
replay_cmd.rsx_command = std::make_pair(0, vm::read32(fifo_ctrl->get_current_arg_ptr() + (i + 1) * 4));
|
const u32 remaining = std::min<u32>(fifo_ctrl->get_remaining_args_count() + 1,
|
||||||
|
(fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) ? UINT32_MAX : (range.first + range.second) - reg);
|
||||||
|
|
||||||
frame_capture.replay_commands.push_back(replay_cmd);
|
commands.back().rsx_command.first = (fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) | (reg << 2) | (remaining << 18);
|
||||||
|
|
||||||
|
for (u32 i = 1; i < remaining && fifo_ctrl->get_pos() + (i - 1) * 4 != (ctrl->put & ~3); i++)
|
||||||
|
{
|
||||||
|
replay_cmd.rsx_command = std::make_pair(0, vm::read32(fifo_ctrl->get_current_arg_ptr() + (i * 4)));
|
||||||
|
|
||||||
|
commands.push_back(replay_cmd);
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
#include "rsx_decode.h"
|
#include "rsx_decode.h"
|
||||||
#include "Emu/Cell/PPUCallback.h"
|
#include "Emu/Cell/PPUCallback.h"
|
||||||
#include "Emu/Cell/lv2/sys_rsx.h"
|
#include "Emu/Cell/lv2/sys_rsx.h"
|
||||||
|
#include "Emu/RSX/Common/BufferUtils.h"
|
||||||
|
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
@ -450,17 +451,30 @@ namespace rsx
|
||||||
{
|
{
|
||||||
static void impl(thread* rsx, u32 _reg, u32 arg)
|
static void impl(thread* rsx, u32 _reg, u32 arg)
|
||||||
{
|
{
|
||||||
if (rsx::method_registers.transform_program_load() >= 512)
|
// Get real args count
|
||||||
|
const u32 count = std::min<u32>({rsx->fifo_ctrl->get_remaining_args_count() + 1,
|
||||||
|
static_cast<u32>(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos() - 4)) / 4), 32 - index});
|
||||||
|
|
||||||
|
const u32 load_pos = rsx::method_registers.transform_program_load();
|
||||||
|
|
||||||
|
u32 rcount = count;
|
||||||
|
|
||||||
|
if (const u32 max = load_pos * 4 + rcount + (index % 4);
|
||||||
|
max > 512 * 4)
|
||||||
{
|
{
|
||||||
// PS3 seems to allow exceeding the program buffer by upto 32 instructions before crashing
|
// PS3 seems to allow exceeding the program buffer by upto 32 instructions before crashing
|
||||||
// Discard the "excess" instructions to not overflow our transform program buffer
|
// Discard the "excess" instructions to not overflow our transform program buffer
|
||||||
// TODO: Check if the instructions in the overflow area are executed by PS3
|
// TODO: Check if the instructions in the overflow area are executed by PS3
|
||||||
rsx_log.warning("Program buffer overflow!");
|
rsx_log.warning("Program buffer overflow!");
|
||||||
return;
|
rcount -= max - (512 * 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
method_registers.commit_4_transform_program_instructions(index);
|
stream_data_to_memory_swapped_u32<true>(&rsx::method_registers.transform_program[load_pos * 4 + index % 4]
|
||||||
|
, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount, 4);
|
||||||
|
|
||||||
rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_dirty;
|
rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_dirty;
|
||||||
|
rsx::method_registers.transform_program_load_set(load_pos + ((rcount + index % 4) / 4));
|
||||||
|
rsx->fifo_ctrl->skip_methods(count - 1);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2994,7 +3008,7 @@ namespace rsx
|
||||||
bind_range<NV4097_SET_VERTEX_DATA2S_M, 1, 16, nv4097::set_vertex_data2s_m>();
|
bind_range<NV4097_SET_VERTEX_DATA2S_M, 1, 16, nv4097::set_vertex_data2s_m>();
|
||||||
bind_range<NV4097_SET_VERTEX_DATA4S_M, 1, 32, nv4097::set_vertex_data4s_m>();
|
bind_range<NV4097_SET_VERTEX_DATA4S_M, 1, 32, nv4097::set_vertex_data4s_m>();
|
||||||
bind_range<NV4097_SET_TRANSFORM_CONSTANT, 1, 32, nv4097::set_transform_constant>();
|
bind_range<NV4097_SET_TRANSFORM_CONSTANT, 1, 32, nv4097::set_transform_constant>();
|
||||||
bind_range<NV4097_SET_TRANSFORM_PROGRAM + 3, 4, 32 / 4, nv4097::set_transform_program>();
|
bind_range<NV4097_SET_TRANSFORM_PROGRAM, 1, 32, nv4097::set_transform_program>();
|
||||||
bind<NV4097_GET_REPORT, nv4097::get_report>();
|
bind<NV4097_GET_REPORT, nv4097::get_report>();
|
||||||
bind<NV4097_CLEAR_REPORT_VALUE, nv4097::clear_report_value>();
|
bind<NV4097_CLEAR_REPORT_VALUE, nv4097::clear_report_value>();
|
||||||
bind<NV4097_SET_SURFACE_CLIP_HORIZONTAL, nv4097::set_surface_dirty_bit>();
|
bind<NV4097_SET_SURFACE_CLIP_HORIZONTAL, nv4097::set_surface_dirty_bit>();
|
||||||
|
|
|
@ -1610,20 +1610,14 @@ namespace rsx
|
||||||
return u16(registers[NV308A_SIZE_OUT] & 0xFFFF);
|
return u16(registers[NV308A_SIZE_OUT] & 0xFFFF);
|
||||||
}
|
}
|
||||||
|
|
||||||
u32 transform_program_load()
|
u32 transform_program_load() const
|
||||||
{
|
{
|
||||||
return registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD];
|
return registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD];
|
||||||
}
|
}
|
||||||
|
|
||||||
void commit_4_transform_program_instructions(u32 index)
|
void transform_program_load_set(u32 value)
|
||||||
{
|
{
|
||||||
u32& load = registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD];
|
registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD] = value;
|
||||||
|
|
||||||
transform_program[load * 4] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4];
|
|
||||||
transform_program[load * 4 + 1] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4 + 1];
|
|
||||||
transform_program[load * 4 + 2] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4 + 2];
|
|
||||||
transform_program[load * 4 + 3] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4 + 3];
|
|
||||||
load++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
u32 transform_constant_load()
|
u32 transform_constant_load()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue