diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index bda8616edc..077c72807a 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -10,9 +10,12 @@ #if defined(_MSC_VER) #define __SSSE3__ 1 -#define __SSE4_1__ 1 +#define SSE4_1_FUNC +#define AVX2_FUNC #else #define __sse_intrin static FORCE_INLINE +#define SSE4_1_FUNC __attribute__((__target__("sse4.1"))) +#define AVX2_FUNC __attribute__((__target__("avx2"))) #endif // _MSC_VER // NOTE: Clang does not allow to redefine missing intrinsics @@ -26,38 +29,11 @@ __sse_intrin __m128i __mm_shuffle_epi8(__m128i opd, __m128i opa) #define __mm_shuffle_epi8 _mm_shuffle_epi8 #endif // __SSSE3__ -#ifndef __SSE4_1__ -__sse_intrin __m128i __mm_max_epu32(__m128i opd, __m128i opa) -{ - __asm__("pmaxud %1, %0" : "+x" (opd) : "xm" (opa)); - return opd; -} -__sse_intrin __m128i __mm_min_epu32(__m128i opd, __m128i opa) -{ - __asm__("pminud %1, %0" : "+x" (opd) : "xm" (opa)); - return opd; -} -__sse_intrin __m128i __mm_max_epu16(__m128i opd, __m128i opa) -{ - __asm__("pmaxuw %1, %0" : "+x" (opd) : "xm" (opa)); - return opd; -} -__sse_intrin __m128i __mm_min_epu16(__m128i opd, __m128i opa) -{ - __asm__("pminuw %1, %0" : "+x" (opd) : "xm" (opa)); - return opd; -} -#else -#define __mm_max_epu32 _mm_max_epu32 -#define __mm_min_epu32 _mm_min_epu32 -#define __mm_max_epu16 _mm_max_epu16 -#define __mm_min_epu16 _mm_min_epu16 -#endif // __SSE4_1__ - #undef __sse_intrin const bool s_use_ssse3 = utils::has_ssse3(); const bool s_use_sse4_1 = utils::has_sse41(); +const bool s_use_avx2 = utils::has_avx2(); namespace { @@ -602,8 +578,9 @@ namespace struct untouched_impl { + SSE4_1_FUNC static - std::tuple upload_u16_swapped(const void *src, void *dst, u32 count) + std::tuple upload_u16_swapped_sse4_1(const void *src, void *dst, u32 count) { const __m128i mask = _mm_set_epi8( 0xE, 0xF, 0xC, 0xD, @@ -621,9 +598,9 @@ namespace for (unsigned n = 0; n < iterations; ++n) { const __m128i raw = _mm_loadu_si128(src_stream++); - const __m128i value = __mm_shuffle_epi8(raw, mask); - max = __mm_max_epu16(max, value); - min = __mm_min_epu16(min, value); + const __m128i value = _mm_shuffle_epi8(raw, mask); + max = _mm_max_epu16(max, value); + min = _mm_min_epu16(min, value); _mm_storeu_si128(dst_stream++, value); } @@ -639,19 +616,19 @@ namespace 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x3, 0x2); - __m128i tmp = __mm_shuffle_epi8(min, mask_step1); - min = __mm_min_epu16(min, tmp); - tmp = __mm_shuffle_epi8(min, mask_step2); - min = __mm_min_epu16(min, tmp); - tmp = __mm_shuffle_epi8(min, mask_step3); - min = __mm_min_epu16(min, tmp); + __m128i tmp = _mm_shuffle_epi8(min, mask_step1); + min = _mm_min_epu16(min, tmp); + tmp = _mm_shuffle_epi8(min, mask_step2); + min = _mm_min_epu16(min, tmp); + tmp = _mm_shuffle_epi8(min, mask_step3); + min = _mm_min_epu16(min, tmp); - tmp = __mm_shuffle_epi8(max, mask_step1); - max = __mm_max_epu16(max, tmp); - tmp = __mm_shuffle_epi8(max, mask_step2); - max = __mm_max_epu16(max, tmp); - tmp = __mm_shuffle_epi8(max, mask_step3); - max = __mm_max_epu16(max, tmp); + tmp = _mm_shuffle_epi8(max, mask_step1); + max = _mm_max_epu16(max, tmp); + tmp = _mm_shuffle_epi8(max, mask_step2); + max = _mm_max_epu16(max, tmp); + tmp = _mm_shuffle_epi8(max, mask_step3); + max = _mm_max_epu16(max, tmp); const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF); const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF); @@ -659,8 +636,9 @@ namespace return std::make_tuple(min_index, max_index, count); } + SSE4_1_FUNC static - std::tuple upload_u32_swapped(const void *src, void *dst, u32 count) + std::tuple upload_u32_swapped_sse4_1(const void *src, void *dst, u32 count) { const __m128i mask = _mm_set_epi8( 0xC, 0xD, 0xE, 0xF, @@ -678,9 +656,9 @@ namespace for (unsigned n = 0; n < iterations; ++n) { const __m128i raw = _mm_loadu_si128(src_stream++); - const __m128i value = __mm_shuffle_epi8(raw, mask); - max = __mm_max_epu32(max, value); - min = __mm_min_epu32(min, value); + const __m128i value = _mm_shuffle_epi8(raw, mask); + max = _mm_max_epu32(max, value); + min = _mm_min_epu32(min, value); _mm_storeu_si128(dst_stream++, value); } @@ -693,15 +671,15 @@ namespace 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4); - __m128i tmp = __mm_shuffle_epi8(min, mask_step1); - min = __mm_min_epu32(min, tmp); - tmp = __mm_shuffle_epi8(min, mask_step2); - min = __mm_min_epu32(min, tmp); + __m128i tmp = _mm_shuffle_epi8(min, mask_step1); + min = _mm_min_epu32(min, tmp); + tmp = _mm_shuffle_epi8(min, mask_step2); + min = _mm_min_epu32(min, tmp); - tmp = __mm_shuffle_epi8(max, mask_step1); - max = __mm_max_epu32(max, tmp); - tmp = __mm_shuffle_epi8(max, mask_step2); - max = __mm_max_epu32(max, tmp); + tmp = _mm_shuffle_epi8(max, mask_step1); + max = _mm_max_epu32(max, tmp); + tmp = _mm_shuffle_epi8(max, mask_step2); + max = _mm_max_epu32(max, tmp); const u32 min_index = u32(_mm_cvtsi128_si32(min)); const u32 max_index = u32(_mm_cvtsi128_si32(max)); @@ -722,12 +700,12 @@ namespace if constexpr (std::is_same::value) { const auto count = (remaining & ~0x3); - std::tie(min_index, max_index, written) = upload_u32_swapped(src.data(), dst.data(), count); + std::tie(min_index, max_index, written) = upload_u32_swapped_sse4_1(src.data(), dst.data(), count); } else if constexpr (std::is_same::value) { const auto count = (remaining & ~0x7); - std::tie(min_index, max_index, written) = upload_u16_swapped(src.data(), dst.data(), count); + std::tie(min_index, max_index, written) = upload_u16_swapped_sse4_1(src.data(), dst.data(), count); } else { @@ -755,39 +733,228 @@ namespace struct primitive_restart_impl { + AVX2_FUNC + static + std::tuple upload_u16_swapped_avx2(const void *src, void *dst, u32 iterations, u16 restart_index) + { + const __m256i shuffle_mask = _mm256_set_epi8( + 0xE, 0xF, 0xC, 0xD, + 0xA, 0xB, 0x8, 0x9, + 0x6, 0x7, 0x4, 0x5, + 0x2, 0x3, 0x0, 0x1, + 0xE, 0xF, 0xC, 0xD, + 0xA, 0xB, 0x8, 0x9, + 0x6, 0x7, 0x4, 0x5, + 0x2, 0x3, 0x0, 0x1); + + auto src_stream = (const __m256i*)src; + auto dst_stream = (__m256i*)dst; + + __m256i restart = _mm256_set1_epi16(restart_index); + __m256i min = _mm256_set1_epi16(0xffff); + __m256i max = _mm256_set1_epi16(0); + + for (unsigned n = 0; n < iterations; ++n) + { + const __m256i raw = _mm256_loadu_si256(src_stream++); + const __m256i value = _mm256_shuffle_epi8(raw, shuffle_mask); + const __m256i mask = _mm256_cmpeq_epi16(restart, value); + const __m256i value_with_min_restart = _mm256_andnot_si256(mask, value); + const __m256i value_with_max_restart = _mm256_or_si256(mask, value); + max = _mm256_max_epu16(max, value_with_min_restart); + min = _mm256_min_epu16(min, value_with_max_restart); + _mm256_storeu_si256(dst_stream++, value_with_max_restart); + } + + __m128i tmp = _mm256_extracti128_si256(min, 1); + __m128i min2 = _mm256_castsi256_si128(min); + min2 = _mm_min_epu16(min2, tmp); + min2 = _mm_minpos_epu16(min2); + + tmp = _mm256_extracti128_si256(max, 1); + __m128i max2 = _mm256_castsi256_si128(max); + max2 = _mm_max_epu16(max2, tmp); + tmp = _mm_srli_si128(max2, 8); + max2 = _mm_max_epu16(max2, tmp); + tmp = _mm_srli_si128(max2, 4); + max2 = _mm_max_epu16(max2, tmp); + tmp = _mm_srli_si128(max2, 2); + max2 = _mm_max_epu16(max2, tmp); + + const u16 min_index = u16(_mm_cvtsi128_si32(min2) & 0xFFFF); + const u16 max_index = u16(_mm_cvtsi128_si32(max2) & 0xFFFF); + + return std::make_tuple(min_index, max_index); + } + + SSE4_1_FUNC + static + std::tuple upload_u16_swapped_sse4_1(const void *src, void *dst, u32 iterations, u16 restart_index) + { + const __m128i shuffle_mask = _mm_set_epi8( + 0xE, 0xF, 0xC, 0xD, + 0xA, 0xB, 0x8, 0x9, + 0x6, 0x7, 0x4, 0x5, + 0x2, 0x3, 0x0, 0x1); + + auto src_stream = (const __m128i*)src; + auto dst_stream = (__m128i*)dst; + + __m128i restart = _mm_set1_epi16(restart_index); + __m128i min = _mm_set1_epi16(0xffff); + __m128i max = _mm_set1_epi16(0); + + for (unsigned n = 0; n < iterations; ++n) + { + const __m128i raw = _mm_loadu_si128(src_stream++); + const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask); + const __m128i mask = _mm_cmpeq_epi16(restart, value); + const __m128i value_with_min_restart = _mm_andnot_si128(mask, value); + const __m128i value_with_max_restart = _mm_or_si128(mask, value); + max = _mm_max_epu16(max, value_with_min_restart); + min = _mm_min_epu16(min, value_with_max_restart); + _mm_storeu_si128(dst_stream++, value_with_max_restart); + } + + min = _mm_minpos_epu16(min); + + __m128i tmp = _mm_srli_si128(max, 8); + max = _mm_max_epu16(max, tmp); + tmp = _mm_srli_si128(max, 4); + max = _mm_max_epu16(max, tmp); + tmp = _mm_srli_si128(max, 2); + max = _mm_max_epu16(max, tmp); + + const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF); + const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF); + + return std::make_tuple(min_index, max_index); + } + + SSE4_1_FUNC + static + std::tuple upload_u32_swapped_sse4_1(const void *src, void *dst, u32 iterations, u32 restart_index) + { + const __m128i shuffle_mask = _mm_set_epi8( + 0xC, 0xD, 0xE, 0xF, + 0x8, 0x9, 0xA, 0xB, + 0x4, 0x5, 0x6, 0x7, + 0x0, 0x1, 0x2, 0x3); + + auto src_stream = (const __m128i*)src; + auto dst_stream = (__m128i*)dst; + + __m128i restart = _mm_set1_epi32(restart_index); + __m128i min = _mm_set1_epi32(0xffffffff); + __m128i max = _mm_set1_epi32(0); + + for (unsigned n = 0; n < iterations; ++n) + { + const __m128i raw = _mm_loadu_si128(src_stream++); + const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask); + const __m128i mask = _mm_cmpeq_epi32(restart, value); + const __m128i value_with_min_restart = _mm_andnot_si128(mask, value); + const __m128i value_with_max_restart = _mm_or_si128(mask, value); + max = _mm_max_epu32(max, value_with_min_restart); + min = _mm_min_epu32(min, value_with_max_restart); + _mm_storeu_si128(dst_stream++, value_with_max_restart); + } + + __m128i tmp = _mm_srli_si128(min, 8); + min = _mm_min_epu32(min, tmp); + tmp = _mm_srli_si128(min, 4); + min = _mm_min_epu32(min, tmp); + + tmp = _mm_srli_si128(max, 8); + max = _mm_max_epu32(max, tmp); + tmp = _mm_srli_si128(max, 4); + max = _mm_max_epu32(max, tmp); + + const u32 min_index = u32(_mm_cvtsi128_si32(min)); + const u32 max_index = u32(_mm_cvtsi128_si32(max)); + + return std::make_tuple(min_index, max_index); + } + template static - std::tuple upload_untouched(gsl::span> src, gsl::span dst, u32 restart_index, bool skip_restart) + std::tuple upload_untouched(gsl::span> src, gsl::span dst, T restart_index, bool skip_restart) { - T min_index = index_limit(), max_index = 0; - u32 dst_index = 0; + T min_index = index_limit(); + T max_index = 0; + u32 written = 0; + u32 length = src.size(); - for (const T index : src) + if (length >= 32 && !skip_restart) { - if (index == restart_index) + if constexpr (std::is_same::value) { - if (!skip_restart) + if (s_use_avx2) { - dst[dst_index++] = index_limit(); + u32 iterations = length >> 4; + written = length & ~0xF; + std::tie(min_index, max_index) = upload_u16_swapped_avx2(src.data(), dst.data(), iterations, restart_index); + } + else if (s_use_sse4_1) + { + u32 iterations = length >> 3; + written = length & ~0x7; + std::tie(min_index, max_index) = upload_u16_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index); + } + } + else if constexpr (std::is_same::value) + { + if (s_use_sse4_1) + { + u32 iterations = length >> 2; + written = length & ~0x3; + std::tie(min_index, max_index) = upload_u32_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index); } } else { - dst[dst_index++] = min_max(min_index, max_index, index); + fmt::throw_exception("Unreachable" HERE); } } - return std::make_tuple(min_index, max_index, dst_index); + for (u32 i = written; i < length; ++i) + { + T index = src[i]; + if (index == restart_index) + { + if (!skip_restart) + { + dst[written++] = index_limit(); + } + } + else + { + dst[written++] = min_max(min_index, max_index, index); + } + } + + return std::make_tuple(min_index, max_index, written); } }; template std::tuple upload_untouched(gsl::span> src, gsl::span dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index) { - if (LIKELY(!is_primitive_restart_enabled)) + if (!is_primitive_restart_enabled) { return untouched_impl::upload_untouched(src, dst); } + else if constexpr (std::is_same::value) + { + if (primitive_restart_index > 0xffff) + { + return untouched_impl::upload_untouched(src, dst); + } + else + { + return primitive_restart_impl::upload_untouched(src, dst, (u16)primitive_restart_index, is_primitive_disjointed(draw_mode)); + } + } else { return primitive_restart_impl::upload_untouched(src, dst, primitive_restart_index, is_primitive_disjointed(draw_mode));