mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-14 18:58:36 +12:00
rsx: Optimize transform constants load methods (#7992)
This commit is contained in:
parent
e8f9fd5430
commit
36fd1d0f0d
3 changed files with 97 additions and 12 deletions
|
@ -139,17 +139,105 @@ namespace
|
||||||
|
|
||||||
if (remaining)
|
if (remaining)
|
||||||
{
|
{
|
||||||
auto src_ptr2 = reinterpret_cast<const u32*>(src_ptr);
|
const auto src_ptr2 = reinterpret_cast<const se_t<u32, true, 1>*>(src_ptr);
|
||||||
auto dst_ptr2 = reinterpret_cast<u32*>(dst_ptr);
|
const auto dst_ptr2 = reinterpret_cast<nse_t<u32, 1>*>(dst_ptr);
|
||||||
|
|
||||||
for (u32 i = 0; i < remaining; ++i)
|
for (u32 i = 0; i < remaining; ++i)
|
||||||
dst_ptr2[i] = se_storage<u32>::swap(src_ptr2[i]);
|
dst_ptr2[i] = src_ptr2[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template void stream_data_to_memory_swapped_u32<false>(void *, const void *, u32, u8);
|
template void stream_data_to_memory_swapped_u32<false>(void *, const void *, u32, u8);
|
||||||
template void stream_data_to_memory_swapped_u32<true>(void*, const void*, u32, u8);
|
template void stream_data_to_memory_swapped_u32<true>(void*, const void*, u32, u8);
|
||||||
|
|
||||||
|
template <bool unaligned>
|
||||||
|
bool stream_data_to_memory_swapped_and_compare_u32(void *dst, const void *src, u32 size)
|
||||||
|
{
|
||||||
|
const __m128i mask = _mm_set_epi8(
|
||||||
|
0xC, 0xD, 0xE, 0xF,
|
||||||
|
0x8, 0x9, 0xA, 0xB,
|
||||||
|
0x4, 0x5, 0x6, 0x7,
|
||||||
|
0x0, 0x1, 0x2, 0x3);
|
||||||
|
|
||||||
|
auto dst_ptr = static_cast<__m128i*>(dst);
|
||||||
|
auto src_ptr = static_cast<const __m128i*>(src);
|
||||||
|
|
||||||
|
const u32 dword_count = size >> 2;
|
||||||
|
const u32 iterations = dword_count >> 2;
|
||||||
|
|
||||||
|
v128 bits_diff{};
|
||||||
|
|
||||||
|
if (s_use_ssse3) [[likely]]
|
||||||
|
{
|
||||||
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
|
{
|
||||||
|
const __m128i vector = _mm_loadu_si128(src_ptr);
|
||||||
|
const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask);
|
||||||
|
|
||||||
|
if constexpr (!unaligned)
|
||||||
|
{
|
||||||
|
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), shuffled_vector));
|
||||||
|
_mm_stream_si128(dst_ptr, shuffled_vector);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), shuffled_vector));
|
||||||
|
_mm_storeu_si128(dst_ptr, shuffled_vector);
|
||||||
|
}
|
||||||
|
|
||||||
|
src_ptr++;
|
||||||
|
dst_ptr++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
|
{
|
||||||
|
const __m128i vec0 = _mm_loadu_si128(src_ptr);
|
||||||
|
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||||
|
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
|
||||||
|
|
||||||
|
if constexpr (!unaligned)
|
||||||
|
{
|
||||||
|
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_load_si128(dst_ptr), vec2));
|
||||||
|
_mm_stream_si128(dst_ptr, vec2);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
bits_diff = bits_diff | v128::fromV(_mm_xor_si128(_mm_loadu_si128(dst_ptr), vec2));
|
||||||
|
_mm_storeu_si128(dst_ptr, vec2);
|
||||||
|
}
|
||||||
|
|
||||||
|
src_ptr++;
|
||||||
|
dst_ptr++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const u32 remaining = dword_count % 4;
|
||||||
|
|
||||||
|
if (remaining)
|
||||||
|
{
|
||||||
|
const auto src_ptr2 = reinterpret_cast<const se_t<u32, true, 1>*>(src_ptr);
|
||||||
|
const auto dst_ptr2 = reinterpret_cast<nse_t<u32, 1>*>(dst_ptr);
|
||||||
|
|
||||||
|
for (u32 i = 0; i < remaining; ++i)
|
||||||
|
{
|
||||||
|
const u32 data = src_ptr2[i];
|
||||||
|
|
||||||
|
if (dst_ptr2[i] != data)
|
||||||
|
{
|
||||||
|
dst_ptr2[i] = data;
|
||||||
|
bits_diff._u32[0] = UINT32_MAX;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return bits_diff != v128{};
|
||||||
|
}
|
||||||
|
|
||||||
|
template bool stream_data_to_memory_swapped_and_compare_u32<false>(void *dst, const void *src, u32 size);
|
||||||
|
template bool stream_data_to_memory_swapped_and_compare_u32<true>(void *dst, const void *src, u32 size);
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride)
|
inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride)
|
||||||
|
@ -194,11 +282,11 @@ namespace
|
||||||
|
|
||||||
if (remaining)
|
if (remaining)
|
||||||
{
|
{
|
||||||
auto src_ptr2 = reinterpret_cast<const u16*>(src_ptr);
|
auto src_ptr2 = reinterpret_cast<const se_t<u16, true, 1>*>(src_ptr);
|
||||||
auto dst_ptr2 = reinterpret_cast<u16*>(dst_ptr);
|
auto dst_ptr2 = reinterpret_cast<nse_t<u16, 1>*>(dst_ptr);
|
||||||
|
|
||||||
for (u32 i = 0; i < remaining; ++i)
|
for (u32 i = 0; i < remaining; ++i)
|
||||||
dst_ptr2[i] = se_storage<u16>::swap(src_ptr2[i]);
|
dst_ptr2[i] = src_ptr2[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -61,5 +61,7 @@ void stream_vector_from_memory(void *dst, void *src);
|
||||||
*/
|
*/
|
||||||
template <bool unaligned = false>
|
template <bool unaligned = false>
|
||||||
void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride);
|
void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride);
|
||||||
|
template <bool unaligned = false>
|
||||||
|
bool stream_data_to_memory_swapped_and_compare_u32(void *dst, const void *src, u32 size);
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -422,8 +422,6 @@ namespace rsx
|
||||||
rcount -= max - (468 * 4);
|
rcount -= max - (468 * 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
alignas(64) u8 buffer[128];
|
|
||||||
|
|
||||||
const auto values = &rsx::method_registers.transform_constants[load + reg][subreg];
|
const auto values = &rsx::method_registers.transform_constants[load + reg][subreg];
|
||||||
|
|
||||||
if (rsx->m_graphics_state & rsx::pipeline_state::transform_constants_dirty)
|
if (rsx->m_graphics_state & rsx::pipeline_state::transform_constants_dirty)
|
||||||
|
@ -433,12 +431,9 @@ namespace rsx
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
stream_data_to_memory_swapped_u32(buffer, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount, 4);
|
if (stream_data_to_memory_swapped_and_compare_u32<true>(values, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount * 4))
|
||||||
|
|
||||||
if (std::memcmp(values, buffer, rcount * 4) != 0)
|
|
||||||
{
|
{
|
||||||
// Transform constants invalidation is expensive (~8k bytes per update)
|
// Transform constants invalidation is expensive (~8k bytes per update)
|
||||||
std::memcpy(values, buffer, rcount * 4);
|
|
||||||
rsx->m_graphics_state |= rsx::pipeline_state::transform_constants_dirty;
|
rsx->m_graphics_state |= rsx::pipeline_state::transform_constants_dirty;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue