diff --git a/Utilities/BEType.h b/Utilities/BEType.h index 54caf465e5..6cbd544111 100644 --- a/Utilities/BEType.h +++ b/Utilities/BEType.h @@ -470,7 +470,7 @@ struct se_storage static inline v128 swap(const v128& src) { - return v128::fromV(_mm_shuffle_epi8(src.vi, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))); + return v128::from64(se_storage::swap(src._u64[1]), se_storage::swap(src._u64[0])); } static inline v128 to(const T& src) diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index fb08421931..707134fa65 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -1,9 +1,20 @@ #include "stdafx.h" #include "BufferUtils.h" #include "../rsx_methods.h" +#include "Utilities/sysinfo.h" #define DEBUG_VERTEX_STREAMING 0 +const bool s_use_ssse3 = +#ifdef _MSC_VER + utils::has_ssse3(); +#elif __SSSE3__ + true; +#else + false; +#define _mm_shuffle_epi8 +#endif + namespace { // FIXME: GSL as_span break build if template parameter is non const with current revision. @@ -49,16 +60,30 @@ namespace const u32 iterations = dword_count >> 2; const u32 remaining = dword_count % 4; - for (u32 i = 0; i < iterations; ++i) + if (LIKELY(s_use_ssse3)) { - u32 *src_words = (u32*)src_ptr; - u32 *dst_words = (u32*)dst_ptr; - const __m128i &vector = _mm_loadu_si128(src_ptr); - const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); - _mm_stream_si128(dst_ptr, shuffled_vector); + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vector = _mm_loadu_si128(src_ptr); + const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask); + _mm_stream_si128(dst_ptr, shuffled_vector); - src_ptr++; - dst_ptr++; + src_ptr++; + dst_ptr++; + } + } + else + { + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vec0 = _mm_loadu_si128(src_ptr); + const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8)); + const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16)); + _mm_stream_si128(dst_ptr, vec2); + + src_ptr++; + dst_ptr++; + } } if (remaining) @@ -86,16 +111,29 @@ namespace const u32 iterations = word_count >> 3; const u32 remaining = word_count % 8; - for (u32 i = 0; i < iterations; ++i) + if (LIKELY(s_use_ssse3)) { - u32 *src_words = (u32*)src_ptr; - u32 *dst_words = (u32*)dst_ptr; - const __m128i &vector = _mm_loadu_si128(src_ptr); - const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); - _mm_stream_si128(dst_ptr, shuffled_vector); + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vector = _mm_loadu_si128(src_ptr); + const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask); + _mm_stream_si128(dst_ptr, shuffled_vector); - src_ptr++; - dst_ptr++; + src_ptr++; + dst_ptr++; + } + } + else + { + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vec0 = _mm_loadu_si128(src_ptr); + const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8)); + _mm_stream_si128(dst_ptr, vec1); + + src_ptr++; + dst_ptr++; + } } if (remaining) @@ -133,14 +171,30 @@ namespace else remainder = vertex_count; - for (u32 i = 0; i < iterations; ++i) + if (LIKELY(s_use_ssse3)) { - const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr); - const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); - _mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector); + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr); + const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask); + _mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector); - src_ptr += src_stride; - dst_ptr += dst_stride; + src_ptr += src_stride; + dst_ptr += dst_stride; + } + } + else + { + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr); + const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8)); + const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16)); + _mm_storeu_si128((__m128i*)dst_ptr, vec2); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } } if (remainder) @@ -181,14 +235,29 @@ namespace else remainder = vertex_count; - for (u32 i = 0; i < iterations; ++i) + if (LIKELY(s_use_ssse3)) { - const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr); - const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); - _mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector); + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vector = _mm_loadu_si128((__m128i*)src_ptr); + const __m128i shuffled_vector = _mm_shuffle_epi8(vector, mask); + _mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector); - src_ptr += src_stride; - dst_ptr += dst_stride; + src_ptr += src_stride; + dst_ptr += dst_stride; + } + } + else + { + for (u32 i = 0; i < iterations; ++i) + { + const __m128i vec0 = _mm_loadu_si128((__m128i*)src_ptr); + const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8)); + _mm_storeu_si128((__m128i*)dst_ptr, vec1); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } } if (remainder) diff --git a/rpcs3/Emu/RSX/Common/ProgramStateCache.h b/rpcs3/Emu/RSX/Common/ProgramStateCache.h index ed8d33539d..9cb4077272 100644 --- a/rpcs3/Emu/RSX/Common/ProgramStateCache.h +++ b/rpcs3/Emu/RSX/Common/ProgramStateCache.h @@ -310,24 +310,20 @@ public: const auto I = m_fragment_shader_cache.find(fragment_program); if (I == m_fragment_shader_cache.end()) return; - __m128i mask = _mm_set_epi8(0xE, 0xF, 0xC, 0xD, - 0xA, 0xB, 0x8, 0x9, - 0x6, 0x7, 0x4, 0x5, - 0x2, 0x3, 0x0, 0x1); verify(HERE), (dst_buffer.size_bytes() >= ::narrow(I->second.FragmentConstantOffsetCache.size()) * 16); f32* dst = dst_buffer.data(); - f32 tmp[4]; + alignas(16) f32 tmp[4]; for (size_t offset_in_fragment_program : I->second.FragmentConstantOffsetCache) { - void *data = (char*)fragment_program.addr + (u32)offset_in_fragment_program; - const __m128i &vector = _mm_loadu_si128((__m128i*)data); - const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask); + char* data = (char*)fragment_program.addr + (u32)offset_in_fragment_program; + const __m128i vector = _mm_loadu_si128((__m128i*)data); + const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8)); if (!patch_table.is_empty()) { - _mm_storeu_ps(tmp, (__m128&)shuffled_vector); + _mm_store_ps(tmp, _mm_castsi128_ps(shuffled_vector)); bool patched; for (int i = 0; i < 4; ++i)