mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-03 05:21:25 +12:00
RSX: remove SSSE3 dependency
This commit is contained in:
parent
de5dab35e0
commit
61de20a633
3 changed files with 103 additions and 38 deletions
|
@ -470,7 +470,7 @@ struct se_storage<T, 16, 16>
|
||||||
|
|
||||||
static inline v128 swap(const v128& src)
|
static inline v128 swap(const v128& src)
|
||||||
{
|
{
|
||||||
return v128::fromV(_mm_shuffle_epi8(src.vi, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)));
|
return v128::from64(se_storage<u64>::swap(src._u64[1]), se_storage<u64>::swap(src._u64[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline v128 to(const T& src)
|
static inline v128 to(const T& src)
|
||||||
|
|
|
@ -1,9 +1,20 @@
|
||||||
#include "stdafx.h"
|
#include "stdafx.h"
|
||||||
#include "BufferUtils.h"
|
#include "BufferUtils.h"
|
||||||
#include "../rsx_methods.h"
|
#include "../rsx_methods.h"
|
||||||
|
#include "Utilities/sysinfo.h"
|
||||||
|
|
||||||
#define DEBUG_VERTEX_STREAMING 0
|
#define DEBUG_VERTEX_STREAMING 0
|
||||||
|
|
||||||
|
const bool s_use_ssse3 =
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
utils::has_ssse3();
|
||||||
|
#elif __SSSE3__
|
||||||
|
true;
|
||||||
|
#else
|
||||||
|
false;
|
||||||
|
#define _mm_shuffle_epi8
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
// FIXME: GSL as_span break build if template parameter is non const with current revision.
|
// FIXME: GSL as_span break build if template parameter is non const with current revision.
|
||||||
|
@ -49,16 +60,30 @@ namespace
|
||||||
const u32 iterations = dword_count >> 2;
|
const u32 iterations = dword_count >> 2;
|
||||||
const u32 remaining = dword_count % 4;
|
const u32 remaining = dword_count % 4;
|
||||||
|
|
||||||
for (u32 i = 0; i < iterations; ++i)
|
if (LIKELY(s_use_ssse3))
|
||||||
{
|
{
|
||||||
u32 *src_words = (u32*)src_ptr;
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
u32 *dst_words = (u32*)dst_ptr;
|
{
|
||||||
const __m128i &vector = _mm_loadu_si128(src_ptr);
|
const __m128i vector = _mm_loadu_si128(src_ptr);
|
||||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||||
_mm_stream_si128(dst_ptr, shuffled_vector);
|
_mm_stream_si128(dst_ptr, shuffled_vector);
|
||||||
|
|
||||||
src_ptr++;
|
src_ptr++;
|
||||||
dst_ptr++;
|
dst_ptr++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
|
{
|
||||||
|
const __m128i vec0 = _mm_loadu_si128(src_ptr);
|
||||||
|
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||||
|
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
|
||||||
|
_mm_stream_si128(dst_ptr, vec2);
|
||||||
|
|
||||||
|
src_ptr++;
|
||||||
|
dst_ptr++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (remaining)
|
if (remaining)
|
||||||
|
@ -86,16 +111,29 @@ namespace
|
||||||
const u32 iterations = word_count >> 3;
|
const u32 iterations = word_count >> 3;
|
||||||
const u32 remaining = word_count % 8;
|
const u32 remaining = word_count % 8;
|
||||||
|
|
||||||
for (u32 i = 0; i < iterations; ++i)
|
if (LIKELY(s_use_ssse3))
|
||||||
{
|
{
|
||||||
u32 *src_words = (u32*)src_ptr;
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
u32 *dst_words = (u32*)dst_ptr;
|
{
|
||||||
const __m128i &vector = _mm_loadu_si128(src_ptr);
|
const __m128i vector = _mm_loadu_si128(src_ptr);
|
||||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||||
_mm_stream_si128(dst_ptr, shuffled_vector);
|
_mm_stream_si128(dst_ptr, shuffled_vector);
|
||||||
|
|
||||||
src_ptr++;
|
src_ptr++;
|
||||||
dst_ptr++;
|
dst_ptr++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
|
{
|
||||||
|
const __m128i vec0 = _mm_loadu_si128(src_ptr);
|
||||||
|
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||||
|
_mm_stream_si128(dst_ptr, vec1);
|
||||||
|
|
||||||
|
src_ptr++;
|
||||||
|
dst_ptr++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (remaining)
|
if (remaining)
|
||||||
|
@ -133,14 +171,30 @@ namespace
|
||||||
else
|
else
|
||||||
remainder = vertex_count;
|
remainder = vertex_count;
|
||||||
|
|
||||||
for (u32 i = 0; i < iterations; ++i)
|
if (LIKELY(s_use_ssse3))
|
||||||
{
|
{
|
||||||
const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
{
|
||||||
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
|
const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr);
|
||||||
|
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||||
|
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
|
||||||
|
|
||||||
src_ptr += src_stride;
|
src_ptr += src_stride;
|
||||||
dst_ptr += dst_stride;
|
dst_ptr += dst_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
|
{
|
||||||
|
const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr);
|
||||||
|
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||||
|
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
|
||||||
|
_mm_storeu_si128((__m128i*)dst_ptr, vec2);
|
||||||
|
|
||||||
|
src_ptr += src_stride;
|
||||||
|
dst_ptr += dst_stride;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (remainder)
|
if (remainder)
|
||||||
|
@ -181,14 +235,29 @@ namespace
|
||||||
else
|
else
|
||||||
remainder = vertex_count;
|
remainder = vertex_count;
|
||||||
|
|
||||||
for (u32 i = 0; i < iterations; ++i)
|
if (LIKELY(s_use_ssse3))
|
||||||
{
|
{
|
||||||
const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
{
|
||||||
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
|
const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr);
|
||||||
|
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
||||||
|
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
|
||||||
|
|
||||||
src_ptr += src_stride;
|
src_ptr += src_stride;
|
||||||
dst_ptr += dst_stride;
|
dst_ptr += dst_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (u32 i = 0; i < iterations; ++i)
|
||||||
|
{
|
||||||
|
const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr);
|
||||||
|
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
|
||||||
|
_mm_storeu_si128((__m128i*)dst_ptr, vec1);
|
||||||
|
|
||||||
|
src_ptr += src_stride;
|
||||||
|
dst_ptr += dst_stride;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (remainder)
|
if (remainder)
|
||||||
|
|
|
@ -310,24 +310,20 @@ public:
|
||||||
const auto I = m_fragment_shader_cache.find(fragment_program);
|
const auto I = m_fragment_shader_cache.find(fragment_program);
|
||||||
if (I == m_fragment_shader_cache.end())
|
if (I == m_fragment_shader_cache.end())
|
||||||
return;
|
return;
|
||||||
__m128i mask = _mm_set_epi8(0xE, 0xF, 0xC, 0xD,
|
|
||||||
0xA, 0xB, 0x8, 0x9,
|
|
||||||
0x6, 0x7, 0x4, 0x5,
|
|
||||||
0x2, 0x3, 0x0, 0x1);
|
|
||||||
|
|
||||||
verify(HERE), (dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16);
|
verify(HERE), (dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16);
|
||||||
|
|
||||||
f32* dst = dst_buffer.data();
|
f32* dst = dst_buffer.data();
|
||||||
f32 tmp[4];
|
alignas(16) f32 tmp[4];
|
||||||
for (size_t offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
|
for (size_t offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
|
||||||
{
|
{
|
||||||
void *data = (char*)fragment_program.addr + (u32)offset_in_fragment_program;
|
char* data = (char*)fragment_program.addr + (u32)offset_in_fragment_program;
|
||||||
const __m128i &vector = _mm_loadu_si128((__m128i*)data);
|
const __m128i vector = _mm_loadu_si128((__m128i*)data);
|
||||||
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
|
const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));
|
||||||
|
|
||||||
if (!patch_table.is_empty())
|
if (!patch_table.is_empty())
|
||||||
{
|
{
|
||||||
_mm_storeu_ps(tmp, (__m128&)shuffled_vector);
|
_mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector));
|
||||||
bool patched;
|
bool patched;
|
||||||
|
|
||||||
for (int i = 0; i < 4; ++i)
|
for (int i = 0; i < 4; ++i)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue