#include "stdafx.h" #include "BufferUtils.h" #include "../rsx_methods.h" #include "../RSXThread.h" #include "util/to_endian.hpp" #include "util/sysinfo.hpp" #include "emmintrin.h" #include "immintrin.h" #define DEBUG_VERTEX_STREAMING 0 #if !defined(_MSC_VER) && defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif #if defined(_MSC_VER) #define SSSE3_FUNC #define SSE4_1_FUNC #define AVX2_FUNC #else #define SSSE3_FUNC __attribute__((__target__("ssse3"))) #define SSE4_1_FUNC __attribute__((__target__("sse4.1"))) #define AVX2_FUNC __attribute__((__target__("avx2"))) #endif // _MSC_VER SSSE3_FUNC static inline __m128i ssse3_shuffle_epi8(__m128i x, __m128i y) { return _mm_shuffle_epi8(x, y); } SSE4_1_FUNC static inline u16 sse41_hmin_epu16(__m128i x) { return _mm_cvtsi128_si32(_mm_minpos_epu16(x)); } SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x) { return ~_mm_cvtsi128_si32(_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1)))); } const bool s_use_ssse3 = utils::has_ssse3(); const bool s_use_sse4_1 = utils::has_sse41(); const bool s_use_avx2 = utils::has_avx2(); namespace { // FIXME: GSL as_span break build if template parameter is non const with current revision. // Replace with true as_span when fixed. template gsl::span as_span_workaround(gsl::span unformated_span) { return{ reinterpret_cast(unformated_span.data()), unformated_span.size_bytes() / sizeof(T) }; } template gsl::span as_const_span(gsl::span unformated_span) { return{ reinterpret_cast(unformated_span.data()), unformated_span.size_bytes() / sizeof(T) }; } } namespace { /** * Convert CMP vector to RGBA16. * A vector in CMP (compressed) format is stored as X11Y11Z10 and has a W component of 1. * X11 and Y11 channels are int between -1024 and 1023 interpreted as -1.f, 1.f * Z10 is int between -512 and 511 interpreted as -1.f, 1.f */ std::array decode_cmp_vector(u32 encoded_vector) { u16 Z = encoded_vector >> 22; Z = Z << 6; u16 Y = (encoded_vector >> 11) & 0x7FF; Y = Y << 5; u16 X = encoded_vector & 0x7FF; X = X << 5; return{ X, Y, Z, 1 }; } } template void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride) { const __m128i mask = _mm_set_epi8( 0xC, 0xD, 0xE, 0xF, 0x8, 0x9, 0xA, 0xB, 0x4, 0x5, 0x6, 0x7, 0x0, 0x1, 0x2, 0x3); auto dst_ptr = static_cast<__m128i*>(dst); auto src_ptr = static_cast(src); const u32 dword_count = (vertex_count * (stride >> 2)); const u32 iterations = dword_count >> 2; const u32 remaining = dword_count % 4; if (s_use_ssse3) [[likely]] { for (u32 i = 0; i < iterations; ++i) { const __m128i vector = _mm_loadu_si128(src_ptr); const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask); if constexpr (!unaligned) { _mm_stream_si128(dst_ptr, shuffled_vector); } else { _mm_storeu_si128(dst_ptr, shuffled_vector); } src_ptr++; dst_ptr++; } } else { for (u32 i = 0; i < iterations; ++i) { const __m128i vec0 = _mm_loadu_si128(src_ptr); const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8)); const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16)); if constexpr (!unaligned) { _mm_stream_si128(dst_ptr, vec2); } else { _mm_storeu_si128(dst_ptr, vec2); } src_ptr++; dst_ptr++; } } if (remaining) { const auto src_ptr2 = reinterpret_cast*>(src_ptr); const auto dst_ptr2 = reinterpret_cast*>(dst_ptr); for (u32 i = 0; i < remaining; ++i) dst_ptr2[i] = src_ptr2[i]; } } template void stream_data_to_memory_swapped_u32(void *, const void *, u32, u8); template void 
template <bool unaligned>
bool stream_data_to_memory_swapped_and_compare_u32(void *dst, const void *src, u32 size)
{
	const __m128i mask = _mm_set_epi8(
		0xC, 0xD, 0xE, 0xF,
		0x8, 0x9, 0xA, 0xB,
		0x4, 0x5, 0x6, 0x7,
		0x0, 0x1, 0x2, 0x3);

	auto dst_ptr = static_cast<__m128i*>(dst);
	auto src_ptr = static_cast<const __m128i*>(src);

	const u32 dword_count = size >> 2;
	const u32 iterations = dword_count >> 2;

	__m128i bits_diff = _mm_setzero_si128();

	if (s_use_ssse3) [[likely]]
	{
		for (u32 i = 0; i < iterations; ++i)
		{
			const __m128i vector = _mm_loadu_si128(src_ptr);
			const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask);

			if constexpr (!unaligned)
			{
				bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_load_si128(dst_ptr), shuffled_vector));
				_mm_stream_si128(dst_ptr, shuffled_vector);
			}
			else
			{
				bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_loadu_si128(dst_ptr), shuffled_vector));
				_mm_storeu_si128(dst_ptr, shuffled_vector);
			}

			src_ptr++;
			dst_ptr++;
		}
	}
	else
	{
		for (u32 i = 0; i < iterations; ++i)
		{
			const __m128i vec0 = _mm_loadu_si128(src_ptr);
			const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
			const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));

			if constexpr (!unaligned)
			{
				bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_load_si128(dst_ptr), vec2));
				_mm_stream_si128(dst_ptr, vec2);
			}
			else
			{
				bits_diff = _mm_or_si128(bits_diff, _mm_xor_si128(_mm_loadu_si128(dst_ptr), vec2));
				_mm_storeu_si128(dst_ptr, vec2);
			}

			src_ptr++;
			dst_ptr++;
		}
	}

	const u32 remaining = dword_count % 4;
	if (remaining)
	{
		const auto src_ptr2 = reinterpret_cast<const be_t<u32>*>(src_ptr);
		const auto dst_ptr2 = reinterpret_cast<u32*>(dst_ptr);

		for (u32 i = 0; i < remaining; ++i)
		{
			const u32 data = src_ptr2[i];
			if (dst_ptr2[i] != data)
			{
				dst_ptr2[i] = data;
				bits_diff = _mm_set1_epi64x(-1);
			}
		}
	}

	return _mm_cvtsi128_si64(_mm_packs_epi32(bits_diff, bits_diff)) != 0;
}

template bool stream_data_to_memory_swapped_and_compare_u32<false>(void *dst, const void *src, u32 size);
template bool stream_data_to_memory_swapped_and_compare_u32<true>(void *dst, const void *src, u32 size);
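// The "and_compare" variant above stays branch-free in the hot loop: instead of
// comparing per vector, it ORs the XOR of the freshly swapped data and the previous
// destination contents into bits_diff. The final reduction via _mm_packs_epi32 is
// exact because signed saturation maps a dword to zero only if it was already zero,
// so the result is nonzero iff any destination bit changed.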
namespace
{
	inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride)
	{
		const __m128i mask = _mm_set_epi8(
			0xE, 0xF, 0xC, 0xD,
			0xA, 0xB, 0x8, 0x9,
			0x6, 0x7, 0x4, 0x5,
			0x2, 0x3, 0x0, 0x1);

		auto dst_ptr = static_cast<__m128i*>(dst);
		auto src_ptr = static_cast<const __m128i*>(src);

		const u32 word_count = (vertex_count * (stride >> 1));
		const u32 iterations = word_count >> 3;
		const u32 remaining = word_count % 8;

		if (s_use_ssse3) [[likely]]
		{
			for (u32 i = 0; i < iterations; ++i)
			{
				const __m128i vector = _mm_loadu_si128(src_ptr);
				const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask);
				_mm_stream_si128(dst_ptr, shuffled_vector);

				src_ptr++;
				dst_ptr++;
			}
		}
		else
		{
			for (u32 i = 0; i < iterations; ++i)
			{
				const __m128i vec0 = _mm_loadu_si128(src_ptr);
				const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
				_mm_stream_si128(dst_ptr, vec1);

				src_ptr++;
				dst_ptr++;
			}
		}

		if (remaining)
		{
			auto src_ptr2 = reinterpret_cast<const be_t<u16>*>(src_ptr);
			auto dst_ptr2 = reinterpret_cast<u16*>(dst_ptr);

			for (u32 i = 0; i < remaining; ++i)
				dst_ptr2[i] = src_ptr2[i];
		}
	}

	inline void stream_data_to_memory_swapped_u32_non_continuous(void *dst, const void *src, u32 vertex_count, u8 dst_stride, u8 src_stride)
	{
		const __m128i mask = _mm_set_epi8(
			0xC, 0xD, 0xE, 0xF,
			0x8, 0x9, 0xA, 0xB,
			0x4, 0x5, 0x6, 0x7,
			0x0, 0x1, 0x2, 0x3);

		auto src_ptr = static_cast<const char*>(src);
		auto dst_ptr = static_cast<char*>(dst);

		//Count vertices to copy
		const bool is_128_aligned = !((dst_stride | src_stride) & 15);

		u32 min_block_size = std::min(src_stride, dst_stride);
		if (min_block_size == 0) min_block_size = dst_stride;

		u32 iterations = 0;
		u32 remainder = is_128_aligned ? 0 : 1 + ((16 - min_block_size) / min_block_size);

		if (vertex_count > remainder)
			iterations = vertex_count - remainder;
		else
			remainder = vertex_count;

		if (s_use_ssse3) [[likely]]
		{
			for (u32 i = 0; i < iterations; ++i)
			{
				const __m128i vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
				const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask);
				_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), shuffled_vector);

				src_ptr += src_stride;
				dst_ptr += dst_stride;
			}
		}
		else
		{
			for (u32 i = 0; i < iterations; ++i)
			{
				const __m128i vec0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
				const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
				const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
				_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), vec2);

				src_ptr += src_stride;
				dst_ptr += dst_stride;
			}
		}

		if (remainder)
		{
			const u8 attribute_sz = min_block_size >> 2;
			for (u32 n = 0; n < remainder; ++n)
			{
				auto src_ptr2 = reinterpret_cast<const be_t<u32>*>(src_ptr);
				auto dst_ptr2 = reinterpret_cast<u32*>(dst_ptr);

				for (u32 v = 0; v < attribute_sz; ++v)
					dst_ptr2[v] = src_ptr2[v];

				src_ptr += src_stride;
				dst_ptr += dst_stride;
			}
		}
	}
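	// The strided loops above always load and store 16 bytes per vertex, which can
	// touch bytes past the last attribute when the stride is below 16 and the strides
	// are not 128-bit aligned. remainder = 1 + ((16 - min_block_size) / min_block_size)
	// counts the trailing vertices whose 16-byte window could overrun the buffer
	// (e.g. a 4-byte stride leaves the last 4 vertices to the scalar tail).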
	inline void stream_data_to_memory_swapped_u16_non_continuous(void *dst, const void *src, u32 vertex_count, u8 dst_stride, u8 src_stride)
	{
		const __m128i mask = _mm_set_epi8(
			0xE, 0xF, 0xC, 0xD,
			0xA, 0xB, 0x8, 0x9,
			0x6, 0x7, 0x4, 0x5,
			0x2, 0x3, 0x0, 0x1);

		auto src_ptr = static_cast<const char*>(src);
		auto dst_ptr = static_cast<char*>(dst);

		const bool is_128_aligned = !((dst_stride | src_stride) & 15);

		u32 min_block_size = std::min(src_stride, dst_stride);
		if (min_block_size == 0) min_block_size = dst_stride;

		u32 iterations = 0;
		u32 remainder = is_128_aligned ? 0 : 1 + ((16 - min_block_size) / min_block_size);

		if (vertex_count > remainder)
			iterations = vertex_count - remainder;
		else
			remainder = vertex_count;

		if (s_use_ssse3) [[likely]]
		{
			for (u32 i = 0; i < iterations; ++i)
			{
				const __m128i vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
				const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask);
				_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), shuffled_vector);

				src_ptr += src_stride;
				dst_ptr += dst_stride;
			}
		}
		else
		{
			for (u32 i = 0; i < iterations; ++i)
			{
				const __m128i vec0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
				const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
				_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), vec1);

				src_ptr += src_stride;
				dst_ptr += dst_stride;
			}
		}

		if (remainder)
		{
			const u8 attribute_sz = min_block_size >> 1;
			for (u32 n = 0; n < remainder; ++n)
			{
				auto src_ptr2 = reinterpret_cast<const be_t<u16>*>(src_ptr);
				auto dst_ptr2 = reinterpret_cast<u16*>(dst_ptr);

				for (u32 v = 0; v < attribute_sz; ++v)
					dst_ptr2[v] = src_ptr2[v];

				src_ptr += src_stride;
				dst_ptr += dst_stride;
			}
		}
	}

	inline void stream_data_to_memory_u8_non_continuous(void *dst, const void *src, u32 vertex_count, u8 attribute_size, u8 dst_stride, u8 src_stride)
	{
		auto src_ptr = static_cast<const u8*>(src);
		auto dst_ptr = static_cast<u8*>(dst);

		switch (attribute_size)
		{
		case 4:
		{
			//Read one dword every iteration
			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
			{
				*reinterpret_cast<u32*>(dst_ptr) = *reinterpret_cast<const u32*>(src_ptr);

				dst_ptr += dst_stride;
				src_ptr += src_stride;
			}

			break;
		}
		case 3:
		{
			//Read one word and one byte
			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
			{
				*reinterpret_cast<u16*>(dst_ptr) = *reinterpret_cast<const u16*>(src_ptr);
				dst_ptr[2] = src_ptr[2];

				dst_ptr += dst_stride;
				src_ptr += src_stride;
			}

			break;
		}
		case 2:
		{
			//Copy u16 blocks
			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
			{
				*reinterpret_cast<u16*>(dst_ptr) = *reinterpret_cast<const u16*>(src_ptr);

				dst_ptr += dst_stride;
				src_ptr += src_stride;
			}

			break;
		}
		case 1:
		{
			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
			{
				dst_ptr[0] = src_ptr[0];

				dst_ptr += dst_stride;
				src_ptr += src_stride;
			}

			break;
		}
		}
	}

	template <typename T, typename U, u32 N>
	void copy_whole_attribute_array_impl(void* raw_dst, const void* raw_src, u8 dst_stride, u32 src_stride, u32 vertex_count)
	{
		auto src_ptr = static_cast<const char*>(raw_src);
		auto dst_ptr = static_cast<char*>(raw_dst);

		for (u32 vertex = 0; vertex < vertex_count; ++vertex)
		{
			auto typed_dst = reinterpret_cast<U*>(dst_ptr);
			auto typed_src = reinterpret_cast<const T*>(src_ptr);

			for (u32 i = 0; i < N; ++i)
			{
				typed_dst[i] = typed_src[i];
			}

			src_ptr += src_stride;
			dst_ptr += dst_stride;
		}
	}

	/*
	 * Copies a number of src vertices, repeated over and over to fill the dst,
	 * e.g. repeating 2 vertices over a range of 16 verts gives 8 repetitions.
	 */
	template <typename T, typename U, u32 N>
	void copy_whole_attribute_array_repeating_impl(void* raw_dst, const void* raw_src, const u8 dst_stride, const u32 src_stride, const u32 vertex_count, const u32 src_vertex_count)
	{
		auto src_ptr = static_cast<const char*>(raw_src);
		auto dst_ptr = static_cast<char*>(raw_dst);

		u32 src_offset = 0;
		u32 src_limit = src_stride * src_vertex_count;

		for (u32 vertex = 0; vertex < vertex_count; ++vertex)
		{
			auto typed_dst = reinterpret_cast<U*>(dst_ptr);
			auto typed_src = reinterpret_cast<const T*>(src_ptr + src_offset);

			for (u32 i = 0; i < N; ++i)
			{
				typed_dst[i] = typed_src[i];
			}

			// src_limit can be 0 when a single repeated vertex is requested with its
			// stride forced to 0; keep reading src[0] instead of dividing by zero
			src_offset = src_limit ? (src_offset + src_stride) % src_limit : 0;
			dst_ptr += dst_stride;
		}
	}
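	// Worked example for the repeating copy above: with src_vertex_count = 2,
	// src_offset cycles through 0, src_stride, 0, src_stride, ... so filling
	// vertex_count = 16 destination vertices writes each of the two source
	// vertices eight times.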
	template <typename T, typename U>
	void copy_whole_attribute_array(void* raw_dst, const void* raw_src, const u8 attribute_size, const u8 dst_stride, const u32 src_stride, const u32 vertex_count, const u32 src_vertex_count)
	{
		//Eliminate the inner loop by templating the inner loop counter N
		if (src_vertex_count == vertex_count)
		{
			switch (attribute_size)
			{
			case 1:
				copy_whole_attribute_array_impl<T, U, 1>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
				break;
			case 2:
				copy_whole_attribute_array_impl<T, U, 2>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
				break;
			case 3:
				copy_whole_attribute_array_impl<T, U, 3>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
				break;
			case 4:
				copy_whole_attribute_array_impl<T, U, 4>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
				break;
			}
		}
		else
		{
			switch (attribute_size)
			{
			case 1:
				copy_whole_attribute_array_repeating_impl<T, U, 1>(raw_dst, raw_src, dst_stride, src_stride, vertex_count, src_vertex_count);
				break;
			case 2:
				copy_whole_attribute_array_repeating_impl<T, U, 2>(raw_dst, raw_src, dst_stride, src_stride, vertex_count, src_vertex_count);
				break;
			case 3:
				copy_whole_attribute_array_repeating_impl<T, U, 3>(raw_dst, raw_src, dst_stride, src_stride, vertex_count, src_vertex_count);
				break;
			case 4:
				copy_whole_attribute_array_repeating_impl<T, U, 4>(raw_dst, raw_src, dst_stride, src_stride, vertex_count, src_vertex_count);
				break;
			}
		}
	}
}

void write_vertex_array_data_to_buffer(gsl::span<std::byte> raw_dst_span, gsl::span<const std::byte> src_ptr, u32 count, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness)
{
	ensure((vector_element_count > 0));

	const u32 src_read_stride = rsx::get_vertex_type_size_on_host(type, vector_element_count);

	bool use_stream_no_stride = false;
	bool use_stream_with_stride = false;

	//If stride is not defined, we have a packed array
	if (attribute_src_stride == 0) attribute_src_stride = src_read_stride;

	//Sometimes, we get a vertex attribute to be repeated. Just copy the supplied vertices only
	//TODO: Stop these requests from getting here in the first place!
	//TODO: Check if it is possible to have a repeating array with more than one attribute instance
	const u32 real_count = static_cast<u32>(src_ptr.size_bytes()) / attribute_src_stride;
	if (real_count == 1) attribute_src_stride = 0;	//Always fetch src[0]

	//TODO: Determine favourable vertex threshold where vector setup costs become negligible
	//Tests show that even with 4 vertices, using traditional bswap is significantly slower over a large number of calls
	const u64 src_address = reinterpret_cast<u64>(src_ptr.data());
	const bool sse_aligned = ((src_address & 15) == 0);
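	// Path selection: the SSE streaming copies are only used when the endianness
	// must be swapped and the source supplies at least `count` vertices (or a
	// single vertex to repeat). Matching source/destination strides allow the
	// fully contiguous variant; otherwise the strided variant is used, and all
	// remaining cases fall back to the generic per-attribute copy below.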
#if !DEBUG_VERTEX_STREAMING
	if (swap_endianness)
	{
		if (real_count >= count || real_count == 1)
		{
			if (attribute_src_stride == dst_stride && src_read_stride == dst_stride)
				use_stream_no_stride = true;
			else
				use_stream_with_stride = true;
		}
	}
#endif

	switch (type)
	{
	case rsx::vertex_base_type::ub:
	case rsx::vertex_base_type::ub256:
	{
		if (use_stream_no_stride)
			memcpy(raw_dst_span.data(), src_ptr.data(), count * dst_stride);
		else if (use_stream_with_stride)
			stream_data_to_memory_u8_non_continuous(raw_dst_span.data(), src_ptr.data(), count, vector_element_count, dst_stride, attribute_src_stride);
		else
			copy_whole_attribute_array<u8, u8>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);

		return;
	}
	case rsx::vertex_base_type::s1:
	case rsx::vertex_base_type::sf:
	case rsx::vertex_base_type::s32k:
	{
		if (use_stream_no_stride && sse_aligned)
			stream_data_to_memory_swapped_u16(raw_dst_span.data(), src_ptr.data(), count, attribute_src_stride);
		else if (use_stream_with_stride)
			stream_data_to_memory_swapped_u16_non_continuous(raw_dst_span.data(), src_ptr.data(), count, dst_stride, attribute_src_stride);
		else if (swap_endianness)
			copy_whole_attribute_array<be_t<u16>, u16>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);
		else
			copy_whole_attribute_array<u16, u16>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);

		return;
	}
	case rsx::vertex_base_type::f:
	{
		if (use_stream_no_stride && sse_aligned)
			stream_data_to_memory_swapped_u32(raw_dst_span.data(), src_ptr.data(), count, attribute_src_stride);
		else if (use_stream_with_stride)
			stream_data_to_memory_swapped_u32_non_continuous(raw_dst_span.data(), src_ptr.data(), count, dst_stride, attribute_src_stride);
		else if (swap_endianness)
			copy_whole_attribute_array<be_t<u32>, u32>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);
		else
			copy_whole_attribute_array<u32, u32>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);

		return;
	}
	case rsx::vertex_base_type::cmp:
	{
		gsl::span<u16> dst_span = as_span_workaround<u16>(raw_dst_span);
		for (u32 i = 0; i < count; ++i)
		{
			u32 src_value;
			memcpy(&src_value, src_ptr.subspan(attribute_src_stride * i).data(), sizeof(u32));

			if (swap_endianness) src_value = stx::se_storage<u32>::swap(src_value);

			const auto& decoded_vector = decode_cmp_vector(src_value);
			dst_span[i * dst_stride / sizeof(u16)] = decoded_vector[0];
			dst_span[i * dst_stride / sizeof(u16) + 1] = decoded_vector[1];
			dst_span[i * dst_stride / sizeof(u16) + 2] = decoded_vector[2];
			dst_span[i * dst_stride / sizeof(u16) + 3] = decoded_vector[3];
		}
		return;
	}
	}
}

namespace
{
	template <typename T>
	constexpr T index_limit()
	{
		return -1;
	}

	template <typename T>
	const T& min_max(T& min, T& max, const T& value)
	{
		if (value < min)
			min = value;

		if (value > max)
			max = value;

		return value;
	}
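	// Two index-upload strategies follow. untouched_impl copies indices verbatim
	// while tracking the smallest and largest index seen (the range is needed to
	// size the vertex upload); primitive_restart_impl additionally recognizes the
	// restart marker and excludes it from that range.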
	struct untouched_impl
	{
		SSE4_1_FUNC static std::tuple<u16, u16, u32> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 count)
		{
			const __m128i mask = _mm_set_epi8(
				0xE, 0xF, 0xC, 0xD,
				0xA, 0xB, 0x8, 0x9,
				0x6, 0x7, 0x4, 0x5,
				0x2, 0x3, 0x0, 0x1);

			auto src_stream = static_cast<const __m128i*>(src);
			auto dst_stream = static_cast<__m128i*>(dst);

			__m128i min = _mm_set1_epi16(-1);
			__m128i max = _mm_set1_epi16(0);

			const auto iterations = count / 8;
			for (unsigned n = 0; n < iterations; ++n)
			{
				const __m128i raw = _mm_loadu_si128(src_stream++);
				const __m128i value = _mm_shuffle_epi8(raw, mask);
				max = _mm_max_epu16(max, value);
				min = _mm_min_epu16(min, value);
				_mm_storeu_si128(dst_stream++, value);
			}

			const u16 min_index = sse41_hmin_epu16(min);
			const u16 max_index = sse41_hmax_epu16(max);

			return std::make_tuple(min_index, max_index, count);
		}

		SSE4_1_FUNC static std::tuple<u32, u32, u32> upload_u32_swapped_sse4_1(const void *src, void *dst, u32 count)
		{
			const __m128i mask = _mm_set_epi8(
				0xC, 0xD, 0xE, 0xF,
				0x8, 0x9, 0xA, 0xB,
				0x4, 0x5, 0x6, 0x7,
				0x0, 0x1, 0x2, 0x3);

			auto src_stream = static_cast<const __m128i*>(src);
			auto dst_stream = static_cast<__m128i*>(dst);

			__m128i min = _mm_set1_epi32(~0u);
			__m128i max = _mm_set1_epi32(0);

			const auto iterations = count / 4;
			for (unsigned n = 0; n < iterations; ++n)
			{
				const __m128i raw = _mm_loadu_si128(src_stream++);
				const __m128i value = _mm_shuffle_epi8(raw, mask);
				max = _mm_max_epu32(max, value);
				min = _mm_min_epu32(min, value);
				_mm_storeu_si128(dst_stream++, value);
			}

			__m128i tmp = _mm_srli_si128(min, 8);
			min = _mm_min_epu32(min, tmp);
			tmp = _mm_srli_si128(min, 4);
			min = _mm_min_epu32(min, tmp);

			tmp = _mm_srli_si128(max, 8);
			max = _mm_max_epu32(max, tmp);
			tmp = _mm_srli_si128(max, 4);
			max = _mm_max_epu32(max, tmp);

			const u32 min_index = _mm_cvtsi128_si32(min);
			const u32 max_index = _mm_cvtsi128_si32(max);

			return std::make_tuple(min_index, max_index, count);
		}

		template <typename T>
		static std::tuple<T, T, u32> upload_untouched(gsl::span<const be_t<T>> src, gsl::span<T> dst)
		{
			T min_index, max_index;
			u32 written;
			u32 remaining = ::size32(src);

			if (s_use_sse4_1 && remaining >= 32)
			{
				if constexpr (std::is_same<T, u32>::value)
				{
					const auto count = (remaining & ~0x3);
					std::tie(min_index, max_index, written) = upload_u32_swapped_sse4_1(src.data(), dst.data(), count);
				}
				else if constexpr (std::is_same<T, u16>::value)
				{
					const auto count = (remaining & ~0x7);
					std::tie(min_index, max_index, written) = upload_u16_swapped_sse4_1(src.data(), dst.data(), count);
				}
				else
				{
					fmt::throw_exception("Unreachable");
				}

				remaining -= written;
			}
			else
			{
				min_index = index_limit<T>();
				max_index = 0;
				written = 0;
			}

			while (remaining--)
			{
				T index = src[written];
				dst[written++] = min_max(min_index, max_index, index);
			}

			return std::make_tuple(min_index, max_index, written);
		}
	};
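	// Restart handling in the vector loops below is branch-free: a compare mask
	// selects the lanes holding the restart index, OR-ing the mask into the value
	// forces those lanes to all-ones so they never lower the running min, and
	// ANDNOT forces them to zero so they never raise the running max. The stored
	// output keeps the all-ones form, i.e. restart slots become index_limit().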
	struct primitive_restart_impl
	{
		AVX2_FUNC static std::tuple<u16, u16> upload_u16_swapped_avx2(const void *src, void *dst, u32 iterations, u16 restart_index)
		{
			const __m256i shuffle_mask = _mm256_set_epi8(
				0xE, 0xF, 0xC, 0xD,
				0xA, 0xB, 0x8, 0x9,
				0x6, 0x7, 0x4, 0x5,
				0x2, 0x3, 0x0, 0x1,
				0xE, 0xF, 0xC, 0xD,
				0xA, 0xB, 0x8, 0x9,
				0x6, 0x7, 0x4, 0x5,
				0x2, 0x3, 0x0, 0x1);

			auto src_stream = static_cast<const __m256i*>(src);
			auto dst_stream = static_cast<__m256i*>(dst);

			__m256i restart = _mm256_set1_epi16(restart_index);
			__m256i min = _mm256_set1_epi16(-1);
			__m256i max = _mm256_set1_epi16(0);

			for (unsigned n = 0; n < iterations; ++n)
			{
				const __m256i raw = _mm256_loadu_si256(src_stream++);
				const __m256i value = _mm256_shuffle_epi8(raw, shuffle_mask);
				const __m256i mask = _mm256_cmpeq_epi16(restart, value);
				const __m256i value_with_min_restart = _mm256_andnot_si256(mask, value);
				const __m256i value_with_max_restart = _mm256_or_si256(mask, value);
				max = _mm256_max_epu16(max, value_with_min_restart);
				min = _mm256_min_epu16(min, value_with_max_restart);
				_mm256_storeu_si256(dst_stream++, value_with_max_restart);
			}

			__m128i tmp = _mm256_extracti128_si256(min, 1);
			__m128i min2 = _mm256_castsi256_si128(min);
			min2 = _mm_min_epu16(min2, tmp);

			tmp = _mm256_extracti128_si256(max, 1);
			__m128i max2 = _mm256_castsi256_si128(max);
			max2 = _mm_max_epu16(max2, tmp);

			const u16 min_index = sse41_hmin_epu16(min2);
			const u16 max_index = sse41_hmax_epu16(max2);

			return std::make_tuple(min_index, max_index);
		}

		SSE4_1_FUNC static std::tuple<u16, u16> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 iterations, u16 restart_index)
		{
			const __m128i shuffle_mask = _mm_set_epi8(
				0xE, 0xF, 0xC, 0xD,
				0xA, 0xB, 0x8, 0x9,
				0x6, 0x7, 0x4, 0x5,
				0x2, 0x3, 0x0, 0x1);

			auto src_stream = static_cast<const __m128i*>(src);
			auto dst_stream = static_cast<__m128i*>(dst);

			__m128i restart = _mm_set1_epi16(restart_index);
			__m128i min = _mm_set1_epi16(-1);
			__m128i max = _mm_set1_epi16(0);

			for (unsigned n = 0; n < iterations; ++n)
			{
				const __m128i raw = _mm_loadu_si128(src_stream++);
				const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask);
				const __m128i mask = _mm_cmpeq_epi16(restart, value);
				const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
				const __m128i value_with_max_restart = _mm_or_si128(mask, value);
				max = _mm_max_epu16(max, value_with_min_restart);
				min = _mm_min_epu16(min, value_with_max_restart);
				_mm_storeu_si128(dst_stream++, value_with_max_restart);
			}

			const u16 min_index = sse41_hmin_epu16(min);
			const u16 max_index = sse41_hmax_epu16(max);

			return std::make_tuple(min_index, max_index);
		}

		SSE4_1_FUNC static std::tuple<u32, u32> upload_u32_swapped_sse4_1(const void *src, void *dst, u32 iterations, u32 restart_index)
		{
			const __m128i shuffle_mask = _mm_set_epi8(
				0xC, 0xD, 0xE, 0xF,
				0x8, 0x9, 0xA, 0xB,
				0x4, 0x5, 0x6, 0x7,
				0x0, 0x1, 0x2, 0x3);

			auto src_stream = static_cast<const __m128i*>(src);
			auto dst_stream = static_cast<__m128i*>(dst);

			__m128i restart = _mm_set1_epi32(restart_index);
			__m128i min = _mm_set1_epi32(0xffffffff);
			__m128i max = _mm_set1_epi32(0);

			for (unsigned n = 0; n < iterations; ++n)
			{
				const __m128i raw = _mm_loadu_si128(src_stream++);
				const __m128i value = _mm_shuffle_epi8(raw, shuffle_mask);
				const __m128i mask = _mm_cmpeq_epi32(restart, value);
				const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
				const __m128i value_with_max_restart = _mm_or_si128(mask, value);
				max = _mm_max_epu32(max, value_with_min_restart);
				min = _mm_min_epu32(min, value_with_max_restart);
				_mm_storeu_si128(dst_stream++, value_with_max_restart);
			}

			__m128i tmp = _mm_srli_si128(min, 8);
			min = _mm_min_epu32(min, tmp);
			tmp = _mm_srli_si128(min, 4);
			min = _mm_min_epu32(min, tmp);

			tmp = _mm_srli_si128(max, 8);
			max = _mm_max_epu32(max, tmp);
			tmp = _mm_srli_si128(max, 4);
			max = _mm_max_epu32(max, tmp);

			const u32 min_index = _mm_cvtsi128_si32(min);
			const u32 max_index = _mm_cvtsi128_si32(max);

			return std::make_tuple(min_index, max_index);
		}

		template <typename T>
		static std::tuple<T, T, u32> upload_untouched(gsl::span<const be_t<T>> src, gsl::span<T> dst, T restart_index, bool skip_restart)
		{
			T min_index = index_limit<T>();
			T max_index = 0;
			u32 written = 0;
			u32 length = ::size32(src);

			if (length >= 32 && !skip_restart)
			{
				if constexpr (std::is_same<T, u16>::value)
				{
					if (s_use_avx2)
					{
						u32 iterations = length >> 4;
						written = length & ~0xF;
						std::tie(min_index, max_index) = upload_u16_swapped_avx2(src.data(), dst.data(), iterations, restart_index);
					}
					else if (s_use_sse4_1)
					{
						u32 iterations = length >> 3;
						written = length & ~0x7;
						std::tie(min_index, max_index) = upload_u16_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index);
					}
				}
				else if constexpr (std::is_same<T, u32>::value)
				{
					if (s_use_sse4_1)
					{
						u32 iterations = length >> 2;
						written = length & ~0x3;
						std::tie(min_index, max_index) = upload_u32_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index);
					}
				}
				else
				{
					fmt::throw_exception("Unreachable");
				}
			}

			for (u32 i = written; i < length; ++i)
			{
				T index = src[i];
				if (index == restart_index)
				{
					if (!skip_restart)
					{
						dst[written++] = index_limit<T>();
					}
				}
				else
				{
					dst[written++] = min_max(min_index, max_index, index);
				}
			}

			return std::make_tuple(min_index, max_index, written);
		}
	};
written = length & ~0x7; std::tie(min_index, max_index) = upload_u16_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index); } } else if constexpr (std::is_same::value) { if (s_use_sse4_1) { u32 iterations = length >> 2; written = length & ~0x3; std::tie(min_index, max_index) = upload_u32_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index); } } else { fmt::throw_exception("Unreachable"); } } for (u32 i = written; i < length; ++i) { T index = src[i]; if (index == restart_index) { if (!skip_restart) { dst[written++] = index_limit(); } } else { dst[written++] = min_max(min_index, max_index, index); } } return std::make_tuple(min_index, max_index, written); } }; template std::tuple upload_untouched(gsl::span> src, gsl::span dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index) { if (!is_primitive_restart_enabled) { return untouched_impl::upload_untouched(src, dst); } else if constexpr (std::is_same::value) { if (primitive_restart_index > 0xffff) { return untouched_impl::upload_untouched(src, dst); } else { return primitive_restart_impl::upload_untouched(src, dst, static_cast(primitive_restart_index), is_primitive_disjointed(draw_mode)); } } else { return primitive_restart_impl::upload_untouched(src, dst, primitive_restart_index, is_primitive_disjointed(draw_mode)); } } template std::tuple expand_indexed_triangle_fan(gsl::span> src, gsl::span dst, bool is_primitive_restart_enabled, u32 primitive_restart_index) { const T invalid_index = index_limit(); T min_index = invalid_index; T max_index = 0; ensure((dst.size() >= 3 * (src.size() - 2))); u32 dst_idx = 0; u32 src_idx = 0; bool needs_anchor = true; T anchor = invalid_index; T last_index = invalid_index; for (const T index : src) { if (needs_anchor) { if (is_primitive_restart_enabled && index == primitive_restart_index) continue; anchor = min_max(min_index, max_index, index); needs_anchor = false; continue; } if (is_primitive_restart_enabled && index == primitive_restart_index) { needs_anchor = true; last_index = invalid_index; continue; } if (last_index == invalid_index) { //Need at least one anchor and one outer index to create a triangle last_index = min_max(min_index, max_index, index); continue; } dst[dst_idx++] = anchor; dst[dst_idx++] = last_index; dst[dst_idx++] = min_max(min_index, max_index, index); last_index = index; } return std::make_tuple(min_index, max_index, dst_idx); } template std::tuple expand_indexed_quads(gsl::span> src, gsl::span dst, bool is_primitive_restart_enabled, u32 primitive_restart_index) { T min_index = index_limit(); T max_index = 0; ensure((4 * dst.size_bytes() >= 6 * src.size_bytes())); u32 dst_idx = 0; u8 set_size = 0; T tmp_indices[4]; for (const T index : src) { if (is_primitive_restart_enabled && index == primitive_restart_index) { //empty temp buffer set_size = 0; continue; } tmp_indices[set_size++] = min_max(min_index, max_index, index); if (set_size == 4) { // First triangle dst[dst_idx++] = tmp_indices[0]; dst[dst_idx++] = tmp_indices[1]; dst[dst_idx++] = tmp_indices[2]; // Second triangle dst[dst_idx++] = tmp_indices[2]; dst[dst_idx++] = tmp_indices[3]; dst[dst_idx++] = tmp_indices[0]; set_size = 0; } } return std::make_tuple(min_index, max_index, dst_idx); } } // Only handle quads and triangle fan now bool is_primitive_native(rsx::primitive_type draw_mode) { switch (draw_mode) { case rsx::primitive_type::points: case rsx::primitive_type::lines: case rsx::primitive_type::line_strip: case rsx::primitive_type::triangles: case 
// Only handle quads and triangle fan now
bool is_primitive_native(rsx::primitive_type draw_mode)
{
	switch (draw_mode)
	{
	case rsx::primitive_type::points:
	case rsx::primitive_type::lines:
	case rsx::primitive_type::line_strip:
	case rsx::primitive_type::triangles:
	case rsx::primitive_type::triangle_strip:
	case rsx::primitive_type::quad_strip:
		return true;
	case rsx::primitive_type::line_loop:
	case rsx::primitive_type::polygon:
	case rsx::primitive_type::triangle_fan:
	case rsx::primitive_type::quads:
		return false;
	case rsx::primitive_type::invalid:
		break;
	}

	fmt::throw_exception("Wrong primitive type");
}

bool is_primitive_disjointed(rsx::primitive_type draw_mode)
{
	switch (draw_mode)
	{
	case rsx::primitive_type::line_loop:
	case rsx::primitive_type::line_strip:
	case rsx::primitive_type::polygon:
	case rsx::primitive_type::quad_strip:
	case rsx::primitive_type::triangle_fan:
	case rsx::primitive_type::triangle_strip:
		return false;
	default:
		return true;
	}
}

u32 get_index_count(rsx::primitive_type draw_mode, u32 initial_index_count)
{
	// Index count
	if (is_primitive_native(draw_mode))
		return initial_index_count;

	switch (draw_mode)
	{
	case rsx::primitive_type::line_loop:
		return initial_index_count + 1;
	case rsx::primitive_type::polygon:
	case rsx::primitive_type::triangle_fan:
		return (initial_index_count - 2) * 3;
	case rsx::primitive_type::quads:
		return (6 * initial_index_count) / 4;
	default:
		return 0;
	}
}

u32 get_index_type_size(rsx::index_array_type type)
{
	switch (type)
	{
	case rsx::index_array_type::u16: return sizeof(u16);
	case rsx::index_array_type::u32: return sizeof(u32);
	}

	fmt::throw_exception("Wrong index type");
}

void write_index_array_for_non_indexed_non_native_primitive_to_buffer(char* dst, rsx::primitive_type draw_mode, unsigned count)
{
	auto typedDst = reinterpret_cast<u16*>(dst);

	switch (draw_mode)
	{
	case rsx::primitive_type::line_loop:
		for (unsigned i = 0; i < count; ++i)
			typedDst[i] = i;
		typedDst[count] = 0;
		return;
	case rsx::primitive_type::triangle_fan:
	case rsx::primitive_type::polygon:
		for (unsigned i = 0; i < (count - 2); i++)
		{
			typedDst[3 * i] = 0;
			typedDst[3 * i + 1] = i + 2 - 1;
			typedDst[3 * i + 2] = i + 2;
		}
		return;
	case rsx::primitive_type::quads:
		for (unsigned i = 0; i < count / 4; i++)
		{
			// First triangle
			typedDst[6 * i] = 4 * i;
			typedDst[6 * i + 1] = 4 * i + 1;
			typedDst[6 * i + 2] = 4 * i + 2;
			// Second triangle
			typedDst[6 * i + 3] = 4 * i + 2;
			typedDst[6 * i + 4] = 4 * i + 3;
			typedDst[6 * i + 5] = 4 * i;
		}
		return;
	case rsx::primitive_type::quad_strip:
	case rsx::primitive_type::points:
	case rsx::primitive_type::lines:
	case rsx::primitive_type::line_strip:
	case rsx::primitive_type::triangles:
	case rsx::primitive_type::triangle_strip:
		fmt::throw_exception("Native primitive type doesn't require expansion");
	case rsx::primitive_type::invalid:
		break;
	}

	fmt::throw_exception("Tried to load invalid primitive type");
}
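// Worked example for the triangle_fan/polygon expansion above: count = 5 emits
// (0, 1, 2), (0, 2, 3), (0, 3, 4); every output triangle reuses vertex 0 as the
// anchor, giving (count - 2) triangles.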
namespace
{
	/**
	 * Get first index and index count from a draw indexed clause.
	 */
	std::tuple<u32, u32> get_first_count_from_draw_indexed_clause(const std::vector<std::pair<u32, u32>>& first_count_arguments)
	{
		u32 first = std::get<0>(first_count_arguments.front());
		u32 count = std::get<0>(first_count_arguments.back()) + std::get<1>(first_count_arguments.back()) - first;
		return std::make_tuple(first, count);
	}

	template <typename T>
	std::tuple<u32, u32, u32> write_index_array_data_to_buffer_impl(gsl::span<T> dst, gsl::span<const be_t<T>> src,
		rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index,
		const std::function<bool(rsx::primitive_type)>& expands)
	{
		if (!expands(draw_mode)) [[likely]]
		{
			return upload_untouched(src, dst, draw_mode, restart_index_enabled, restart_index);
		}

		switch (draw_mode)
		{
		case rsx::primitive_type::line_loop:
		{
			const auto &returnvalue = upload_untouched(src, dst, draw_mode, restart_index_enabled, restart_index);
			const auto index_count = dst.size_bytes() / sizeof(T);
			dst[index_count] = src[0];
			return returnvalue;
		}
		case rsx::primitive_type::polygon:
		case rsx::primitive_type::triangle_fan:
		{
			return expand_indexed_triangle_fan(src, dst, restart_index_enabled, restart_index);
		}
		case rsx::primitive_type::quads:
		{
			return expand_indexed_quads(src, dst, restart_index_enabled, restart_index);
		}
		default:
			fmt::throw_exception("Unknown draw mode (0x%x)", static_cast<u8>(draw_mode));
		}
	}
}

std::tuple<u32, u32, u32> write_index_array_data_to_buffer(gsl::span<std::byte> dst_ptr, gsl::span<const std::byte> src_ptr,
	rsx::index_array_type type, rsx::primitive_type draw_mode, bool restart_index_enabled, u32 restart_index,
	const std::function<bool(rsx::primitive_type)>& expands)
{
	switch (type)
	{
	case rsx::index_array_type::u16:
	{
		return write_index_array_data_to_buffer_impl<u16>(as_span_workaround<u16>(dst_ptr),
			as_const_span<const be_t<u16>>(src_ptr), draw_mode, restart_index_enabled, restart_index, expands);
	}
	case rsx::index_array_type::u32:
	{
		return write_index_array_data_to_buffer_impl<u32>(as_span_workaround<u32>(dst_ptr),
			as_const_span<const be_t<u32>>(src_ptr), draw_mode, restart_index_enabled, restart_index, expands);
	}
	default:
		fmt::throw_exception("Unreachable");
	}
}

void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w)
{
	const __m128i vector = _mm_set_epi32(w, z, y, x);
	_mm_stream_si128(reinterpret_cast<__m128i*>(dst), vector);
}

void stream_vector(void *dst, f32 x, f32 y, f32 z, f32 w)
{
	stream_vector(dst, std::bit_cast<u32>(x), std::bit_cast<u32>(y), std::bit_cast<u32>(z), std::bit_cast<u32>(w));
}

void stream_vector_from_memory(void *dst, void *src)
{
	const __m128i vector = _mm_loadu_si128(reinterpret_cast<__m128i*>(src));
	_mm_stream_si128(reinterpret_cast<__m128i*>(dst), vector);
}
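// Note: _mm_stream_si128 issues a non-temporal store that requires a 16-byte
// aligned address, so callers of stream_vector and stream_vector_from_memory must
// guarantee destination alignment.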