RSX: remove SSSE3 dependency

This commit is contained in:
Nekotekina 2017-12-05 23:48:01 +03:00
parent de5dab35e0
commit 61de20a633
3 changed files with 103 additions and 38 deletions

View file

@ -470,7 +470,7 @@ struct se_storage<T, 16, 16>
static inline v128 swap(const v128& src) static inline v128 swap(const v128& src)
{ {
return v128::fromV(_mm_shuffle_epi8(src.vi, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))); return v128::from64(se_storage<u64>::swap(src._u64[1]), se_storage<u64>::swap(src._u64[0]));
} }
static inline v128 to(const T& src) static inline v128 to(const T& src)

View file

@ -1,9 +1,20 @@
#include "stdafx.h" #include "stdafx.h"
#include "BufferUtils.h" #include "BufferUtils.h"
#include "../rsx_methods.h" #include "../rsx_methods.h"
#include "Utilities/sysinfo.h"
#define DEBUG_VERTEX_STREAMING 0 #define DEBUG_VERTEX_STREAMING 0
// s_use_ssse3 picks the byte-swap path taken by the copy loops in this file:
// true  -> the _mm_shuffle_epi8 based loops,
// false -> the shift/or fallback loops.
#if defined(_MSC_VER)
// Under MSVC the value is obtained from utils::has_ssse3()
// (presumably a runtime CPU feature query; confirm in Utilities/sysinfo.h).
const bool s_use_ssse3 = utils::has_ssse3();
#elif __SSSE3__
// The compiler is already targeting SSSE3, so the shuffle path is always usable.
const bool s_use_ssse3 = true;
#else
// SSSE3 is not enabled for this build: the flag is a constant false.
const bool s_use_ssse3 = false;
// Defining the intrinsic's name away keeps the guarded shuffle branches compilable:
// `_mm_shuffle_epi8(a, b)` then expands to the comma expression `(a, b)`.
// Those branches sit behind `if (s_use_ssse3)` and are therefore never executed.
#define _mm_shuffle_epi8
#endif
namespace namespace
{ {
// FIXME: GSL as_span breaks the build if the template parameter is non-const with the current revision. // FIXME: GSL as_span breaks the build if the template parameter is non-const with the current revision.
@ -49,16 +60,30 @@ namespace
const u32 iterations = dword_count >> 2; const u32 iterations = dword_count >> 2;
const u32 remaining = dword_count % 4; const u32 remaining = dword_count % 4;
for (u32 i = 0; i < iterations; ++i) if (LIKELY(s_use_ssse3))
{ {
u32 *src_words = (u32*)src_ptr; for (u32 i = 0; i < iterations; ++i)
u32 *dst_words = (u32*)dst_ptr; {
const __m128i &vector = _mm_loadu_si128(src_ptr); const __m128i vector = _mm_loadu_si128(src_ptr);
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_stream_si128(dst_ptr, shuffled_vector); _mm_stream_si128(dst_ptr, shuffled_vector);
src_ptr++; src_ptr++;
dst_ptr++; dst_ptr++;
}
}
else
{
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vec0 = _mm_loadu_si128(src_ptr);
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
_mm_stream_si128(dst_ptr, vec2);
src_ptr++;
dst_ptr++;
}
} }
if (remaining) if (remaining)
@ -86,16 +111,29 @@ namespace
const u32 iterations = word_count >> 3; const u32 iterations = word_count >> 3;
const u32 remaining = word_count % 8; const u32 remaining = word_count % 8;
for (u32 i = 0; i < iterations; ++i) if (LIKELY(s_use_ssse3))
{ {
u32 *src_words = (u32*)src_ptr; for (u32 i = 0; i < iterations; ++i)
u32 *dst_words = (u32*)dst_ptr; {
const __m128i &vector = _mm_loadu_si128(src_ptr); const __m128i vector = _mm_loadu_si128(src_ptr);
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_stream_si128(dst_ptr, shuffled_vector); _mm_stream_si128(dst_ptr, shuffled_vector);
src_ptr++; src_ptr++;
dst_ptr++; dst_ptr++;
}
}
else
{
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vec0 = _mm_loadu_si128(src_ptr);
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
_mm_stream_si128(dst_ptr, vec1);
src_ptr++;
dst_ptr++;
}
} }
if (remaining) if (remaining)
@ -133,14 +171,30 @@ namespace
else else
remainder = vertex_count; remainder = vertex_count;
for (u32 i = 0; i < iterations; ++i) if (LIKELY(s_use_ssse3))
{ {
const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr); for (u32 i = 0; i < iterations; ++i)
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); {
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector); const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
src_ptr += src_stride; src_ptr += src_stride;
dst_ptr += dst_stride; dst_ptr += dst_stride;
}
}
else
{
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
_mm_storeu_si128((__m128i*)dst_ptr, vec2);
src_ptr += src_stride;
dst_ptr += dst_stride;
}
} }
if (remainder) if (remainder)
@ -181,14 +235,29 @@ namespace
else else
remainder = vertex_count; remainder = vertex_count;
for (u32 i = 0; i < iterations; ++i) if (LIKELY(s_use_ssse3))
{ {
const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr); for (u32 i = 0; i < iterations; ++i)
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); {
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector); const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask);
_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
src_ptr += src_stride; src_ptr += src_stride;
dst_ptr += dst_stride; dst_ptr += dst_stride;
}
}
else
{
for (u32 i = 0; i < iterations; ++i)
{
const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr);
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
_mm_storeu_si128((__m128i*)dst_ptr, vec1);
src_ptr += src_stride;
dst_ptr += dst_stride;
}
} }
if (remainder) if (remainder)

View file

@ -310,24 +310,20 @@ public:
const auto I = m_fragment_shader_cache.find(fragment_program); const auto I = m_fragment_shader_cache.find(fragment_program);
if (I == m_fragment_shader_cache.end()) if (I == m_fragment_shader_cache.end())
return; return;
__m128i mask = _mm_set_epi8(0xE, 0xF, 0xC, 0xD,
0xA, 0xB, 0x8, 0x9,
0x6, 0x7, 0x4, 0x5,
0x2, 0x3, 0x0, 0x1);
verify(HERE), (dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16); verify(HERE), (dst_buffer.size_bytes() >= ::narrow<int>(I->second.FragmentConstantOffsetCache.size()) * 16);
f32* dst = dst_buffer.data(); f32* dst = dst_buffer.data();
f32 tmp[4]; alignas(16) f32 tmp[4];
for (size_t offset_in_fragment_program : I->second.FragmentConstantOffsetCache) for (size_t offset_in_fragment_program : I->second.FragmentConstantOffsetCache)
{ {
void *data = (char*)fragment_program.addr + (u32)offset_in_fragment_program; char* data = (char*)fragment_program.addr + (u32)offset_in_fragment_program;
const __m128i &vector = _mm_loadu_si128((__m128i*)data); const __m128i vector = _mm_loadu_si128((__m128i*)data);
const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));
if (!patch_table.is_empty()) if (!patch_table.is_empty())
{ {
_mm_storeu_ps(tmp, (__m128&)shuffled_vector); _mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector));
bool patched; bool patched;
for (int i = 0; i < 4; ++i) for (int i = 0; i < 4; ++i)