From e8b4d332eb6c61a7784c23919b29739c1ff401bb Mon Sep 17 00:00:00 2001
From: kd-11
Date: Wed, 29 Mar 2017 12:29:11 +0300
Subject: [PATCH] rsx: Use faster upload path when conditions allow

Fix aligned memory access (SSE)

rsx: BufferUtils; always use optimized paths
---
 rpcs3/Emu/RSX/Common/BufferUtils.cpp | 321 ++++++++++++++++++++++++++-
 rpcs3/Emu/RSX/GL/GLGSRender.cpp      |   4 +
 rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp |   4 +-
 rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp |   7 +-
 4 files changed, 324 insertions(+), 12 deletions(-)

diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index ba4c7a1338..1578e96115 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -32,9 +32,232 @@ namespace
 		return{ X, Y, Z, 1 };
 	}
 
-	template <typename T, typename U>
-	void copy_whole_attribute_array(gsl::span<T> dst, gsl::span<const gsl::byte> src_ptr, u8 attribute_size, u8 dst_stride, u32 src_stride, u32 vertex_count)
+	inline void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride)
 	{
+		const __m128i mask = _mm_set_epi8(
+			0xC, 0xD, 0xE, 0xF,
+			0x8, 0x9, 0xA, 0xB,
+			0x4, 0x5, 0x6, 0x7,
+			0x0, 0x1, 0x2, 0x3);
+
+		__m128i* dst_ptr = (__m128i*)dst;
+		__m128i* src_ptr = (__m128i*)src;
+
+		const u32 dword_count = (vertex_count * (stride >> 2));
+		const u32 iterations = dword_count >> 2;
+		const u32 remaining = dword_count % 4;
+
+		for (u32 i = 0; i < iterations; ++i)
+		{
+			u32 *src_words = (u32*)src_ptr;
+			u32 *dst_words = (u32*)dst_ptr;
+			const __m128i &vector = _mm_loadu_si128(src_ptr);
+			const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
+			_mm_stream_si128(dst_ptr, shuffled_vector);
+
+			src_ptr++;
+			dst_ptr++;
+		}
+
+		if (remaining)
+		{
+			u32 *src_ptr2 = (u32 *)src_ptr;
+			u32 *dst_ptr2 = (u32 *)dst_ptr;
+
+			for (u32 i = 0; i < remaining; ++i)
+				dst_ptr2[i] = se_storage<u32>::swap(src_ptr2[i]);
+		}
+	}
+
+	inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride)
+	{
+		const __m128i mask = _mm_set_epi8(
+			0xE, 0xF, 0xC, 0xD,
+			0xA, 0xB, 0x8, 0x9,
+			0x6, 0x7, 0x4, 0x5,
+			0x2, 0x3, 0x0, 0x1);
+
+		__m128i* dst_ptr = (__m128i*)dst;
+		__m128i* src_ptr = (__m128i*)src;
+
+		const u32 word_count = (vertex_count * (stride >> 1));
+		const u32 iterations = word_count >> 3;
+		const u32 remaining = word_count % 8;
+
+		for (u32 i = 0; i < iterations; ++i)
+		{
+			u32 *src_words = (u32*)src_ptr;
+			u32 *dst_words = (u32*)dst_ptr;
+			const __m128i &vector = _mm_loadu_si128(src_ptr);
+			const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
+			_mm_stream_si128(dst_ptr, shuffled_vector);
+
+			src_ptr++;
+			dst_ptr++;
+		}
+
+		if (remaining)
+		{
+			u16 *src_ptr2 = (u16 *)src_ptr;
+			u16 *dst_ptr2 = (u16 *)dst_ptr;
+
+			for (u32 i = 0; i < remaining; ++i)
+				dst_ptr2[i] = se_storage<u16>::swap(src_ptr2[i]);
+		}
+	}
+
+	inline void stream_data_to_memory_swapped_u32_non_continuous(void *dst, const void *src, u32 vertex_count, u8 dst_stride, u8 src_stride)
+	{
+		const __m128i mask = _mm_set_epi8(
+			0xC, 0xD, 0xE, 0xF,
+			0x8, 0x9, 0xA, 0xB,
+			0x4, 0x5, 0x6, 0x7,
+			0x0, 0x1, 0x2, 0x3);
+
+		char *src_ptr = (char *)src;
+		char *dst_ptr = (char *)dst;
+
+		//Count vertices to copy
+		const bool is_128_aligned = !((dst_stride | src_stride) & 15);
+
+		const u32 min_block_size = std::min(src_stride, dst_stride);
+		const u32 remainder = is_128_aligned? 0: (16 - min_block_size) / min_block_size;
+		const u32 iterations = is_128_aligned? vertex_count: vertex_count - remainder;
+
+		for (u32 i = 0; i < iterations; ++i)
+		{
+			const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
+			const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
+			_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
+
+			src_ptr += src_stride;
+			dst_ptr += dst_stride;
+		}
+
+		if (remainder)
+		{
+			const u8 attribute_sz = min_block_size >> 2;
+			for (u32 n = 0; n < remainder; ++n)
+			{
+				for (u32 v= 0; v < attribute_sz; ++v)
+					((u32*)dst_ptr)[v] = ((be_t<u32>*)src_ptr)[v];
+
+				src_ptr += src_stride;
+				dst_ptr += dst_stride;
+			}
+		}
+	}
+
+	inline void stream_data_to_memory_swapped_u16_non_continuous(void *dst, const void *src, u32 vertex_count, u8 dst_stride, u8 src_stride)
+	{
+		const __m128i mask = _mm_set_epi8(
+			0xE, 0xF, 0xC, 0xD,
+			0xA, 0xB, 0x8, 0x9,
+			0x6, 0x7, 0x4, 0x5,
+			0x2, 0x3, 0x0, 0x1);
+
+		char *src_ptr = (char *)src;
+		char *dst_ptr = (char *)dst;
+
+		const bool is_128_aligned = !((dst_stride | src_stride) & 15);
+
+		const u32 min_block_size = std::min(src_stride, dst_stride);
+		const u32 remainder = is_128_aligned ? 0 : (16 - min_block_size) / min_block_size;
+		const u32 iterations = is_128_aligned ? vertex_count : vertex_count - remainder;
+
+		for (u32 i = 0; i < iterations; ++i)
+		{
+			const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
+			const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
+			_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
+
+			src_ptr += src_stride;
+			dst_ptr += dst_stride;
+		}
+
+		if (remainder)
+		{
+			const u8 attribute_sz = min_block_size >> 1;
+			for (u32 n = 0; n < remainder; ++n)
+			{
+				for (u32 v = 0; v < attribute_sz; ++v)
+					((u16*)dst_ptr)[v] = ((be_t<u16>*)src_ptr)[v];
+
+				src_ptr += src_stride;
+				dst_ptr += dst_stride;
+			}
+		}
+	}
+
+	inline void stream_data_to_memory_u8_non_continous(void *dst, const void *src, u32 vertex_count, u8 attribute_size, u8 dst_stride, u8 src_stride)
+	{
+		char *src_ptr = (char *)src;
+		char *dst_ptr = (char *)dst;
+
+		switch (attribute_size)
+		{
+		case 4:
+		{
+			//Read one dword every iteration
+			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+			{
+				*(u32*)dst_ptr = *(u32*)src_ptr;
+
+				dst_ptr += dst_stride;
+				src_ptr += src_stride;
+			}
+
+			break;
+		}
+		case 3:
+		{
+			//Read one word and one byte
+			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+			{
+				*(u16*)dst_ptr = *(u16*)src_ptr;
+				dst_ptr[2] = src_ptr[2];
+
+				dst_ptr += dst_stride;
+				src_ptr += src_stride;
+			}
+
+			break;
+		}
+		case 2:
+		{
+			//Copy u16 blocks
+			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+			{
+				*(u32*)dst_ptr = *(u32*)src_ptr;
+
+				dst_ptr += dst_stride;
+				src_ptr += src_stride;
+			}
+
+			break;
+		}
+		case 1:
+		{
+			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+			{
+				dst_ptr[0] = src_ptr[0];
+
+				dst_ptr += dst_stride;
+				src_ptr += src_stride;
+			}
+
+			break;
+		}
+		}
+	}
+
+#ifdef _DEBUG
+	template <typename T, typename U>
+	void copy_whole_attribute_array(void *raw_dst, void *raw_src, u8 attribute_size, u8 dst_stride, u32 src_stride, u32 vertex_count)
+	{
+		gsl::span dst = gsl::as_span_workaround();
+		const gsl::span src = {raw_src, src_stride * vertex_count};
+
 		for (u32 vertex = 0; vertex < vertex_count; ++vertex)
 		{
 			gsl::span<const U> src = gsl::as_span<const U>(src_ptr.subspan(src_stride * vertex, attribute_size * sizeof(const U)));
@@ -44,33 +267,115 @@ namespace
 			}
 		}
 	}
+#else
+	template <typename T, typename U, int N>
+	void copy_whole_attribute_array_impl(void *raw_dst, void *raw_src, u8 dst_stride, u32 src_stride, u32 vertex_count)
+	{
+		char *src_ptr = (char *)raw_src;
+		char *dst_ptr = (char *)raw_dst;
+
+		for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+		{
+			T* typed_dst = (T*)dst_ptr;
+			U* typed_src = (U*)src_ptr;
+
+			for (u32 i = 0; i < N; ++i)
+			{
+				typed_dst[i] = typed_src[i];
+			}
+
+			src_ptr += src_stride;
+			dst_ptr += dst_stride;
+		}
+	}
+
+	template <typename T, typename U>
+	void copy_whole_attribute_array(void *raw_dst, void *raw_src, u8 attribute_size, u8 dst_stride, u32 src_stride, u32 vertex_count)
+	{
+		//Eliminate the inner loop by templating the inner loop counter N
+		switch (attribute_size)
+		{
+		case 1:
+			copy_whole_attribute_array_impl<T, U, 1>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
+			break;
+		case 2:
+			copy_whole_attribute_array_impl<T, U, 2>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
+			break;
+		case 3:
+			copy_whole_attribute_array_impl<T, U, 3>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
+			break;
+		case 4:
+			copy_whole_attribute_array_impl<T, U, 4>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
+			break;
+		}
+	}
+#endif
 }
 
 void write_vertex_array_data_to_buffer(gsl::span<gsl::byte> raw_dst_span, gsl::span<const gsl::byte> src_ptr, u32 count, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride)
 {
 	verify(HERE), (vector_element_count > 0);
+	const u32 src_read_stride = rsx::get_vertex_type_size_on_host(type, vector_element_count);
+
+	bool use_stream_no_stride = false;
+	bool use_stream_with_stride = false;
+
+	//If stride is not defined, we have a packed array
+	if (attribute_src_stride == 0)
+		attribute_src_stride = src_read_stride;
+
+	//TODO: Determine favourable vertex threshold where vector setup costs become negligible
+	//Tests show that even with 4 vertices, using traditional bswap is significantly slower over a large number of calls
+	//Tested with atelier, discriminating based on vertex count is measurably slower
+	//NOTE: src_read_stride is guaranteed to be less than dst_stride!
+
+	if (src_read_stride > dst_stride)
+		fmt::throw_exception("src_read is greater than dst write. Impossible situation.");
+
+	const u64 src_address = (u64)src_ptr.data();
+	const bool sse_aligned = ((src_address & 15) == 0);
+
+#ifndef _DEBUG
+	if (attribute_src_stride == dst_stride && src_read_stride == dst_stride)
+		use_stream_no_stride = true;
+	else
+		use_stream_with_stride = true;
+#endif
 
 	switch (type)
 	{
 	case rsx::vertex_base_type::ub:
 	case rsx::vertex_base_type::ub256:
 	{
-		gsl::span<u8> dst_span = as_span_workaround<u8>(raw_dst_span);
-		copy_whole_attribute_array<u8, u8>(dst_span, src_ptr, vector_element_count, dst_stride, attribute_src_stride, count);
+		if (use_stream_no_stride)
+			memcpy(raw_dst_span.data(), src_ptr.data(), count * dst_stride);
+		else
+			stream_data_to_memory_u8_non_continous(raw_dst_span.data(), src_ptr.data(), count, vector_element_count, dst_stride, attribute_src_stride);
+
 		return;
 	}
 	case rsx::vertex_base_type::s1:
 	case rsx::vertex_base_type::sf:
 	case rsx::vertex_base_type::s32k:
 	{
-		gsl::span<u16> dst_span = as_span_workaround<u16>(raw_dst_span);
-		copy_whole_attribute_array<u16, be_t<u16>>(dst_span, src_ptr, vector_element_count, dst_stride, attribute_src_stride, count);
+		if (use_stream_no_stride && sse_aligned)
+			stream_data_to_memory_swapped_u16(raw_dst_span.data(), src_ptr.data(), count, attribute_src_stride);
+		else if (use_stream_with_stride)
+			stream_data_to_memory_swapped_u16_non_continuous(raw_dst_span.data(), src_ptr.data(), count, dst_stride, attribute_src_stride);
+		else
+			copy_whole_attribute_array<be_t<u16>, u16>((void *)raw_dst_span.data(), (void *)src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count);
+
 		return;
 	}
 	case rsx::vertex_base_type::f:
 	{
-		gsl::span<u32> dst_span = as_span_workaround<u32>(raw_dst_span);
-		copy_whole_attribute_array<u32, be_t<u32>>(dst_span, src_ptr, vector_element_count, dst_stride, attribute_src_stride, count);
+		if (use_stream_no_stride && sse_aligned)
+			stream_data_to_memory_swapped_u32(raw_dst_span.data(), src_ptr.data(), count, attribute_src_stride);
+		else if (use_stream_with_stride)
+			stream_data_to_memory_swapped_u32_non_continuous(raw_dst_span.data(), src_ptr.data(), count, dst_stride, attribute_src_stride);
+		else
+			copy_whole_attribute_array<be_t<u32>, u32>((void *)raw_dst_span.data(), (void *)src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count);
+
 		return;
 	}
 	case rsx::vertex_base_type::cmp:
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index db185ee6dd..025b89c699 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -501,6 +501,10 @@ void GLGSRender::on_init_thread()
 	glGetIntegerv(GL_TEXTURE_BUFFER_OFFSET_ALIGNMENT, &m_min_texbuffer_alignment);
 	m_vao.create();
 
+	//Set min alignment to 16-bytes for SSE optimizations with aligned addresses to work
+	m_min_texbuffer_alignment = std::max(m_min_texbuffer_alignment, 16);
+	m_uniform_buffer_offset_align = std::max(m_uniform_buffer_offset_align, 16);
+
 	const u32 texture_index_offset = rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count;
 
 	for (int index = 0; index < rsx::limits::vertex_count; ++index)
diff --git a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
index 907e419b29..6a1d2798b7 100644
--- a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
+++ b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
@@ -231,10 +231,10 @@ namespace
 
 		buffer_offset = mapping.second;
 		gsl::span<gsl::byte> dest_span(dst, data_size);
 
-		prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
-
 		write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size));
 
+		prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
+
 		texture.copy_from(m_attrib_ring_info, gl_type, buffer_offset, data_size);
 	}
diff --git a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
index 6fc423c4f6..5973084ce0 100644
--- a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
+++ b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
@@ -269,11 +269,14 @@ namespace
 
 		VkDeviceSize offset_in_attrib_buffer = m_attrib_ring_info.alloc<256>(upload_size);
 		void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, upload_size);
 
-		vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
+
 		gsl::span<gsl::byte> dest_span(static_cast<gsl::byte*>(dst), upload_size);
-
 		write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, real_element_size);
+		//Padding the vertex buffer should be done after the writes have been done
+		//write_vertex_data function may 'dirty' unused sections of the buffer as optimization
+		vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
+
 		m_attrib_ring_info.unmap();
 
 		const VkFormat format = vk::get_suitable_vk_format(vertex_array.type, vertex_array.attribute_size);
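
A minimal standalone sketch (not part of the patch) of what the _mm_shuffle_epi8 mask used by the new stream_data_to_memory_swapped_u32 path does: it reverses the bytes of each 32-bit lane, byte-swapping four big-endian dwords per 16-byte vector. It assumes an SSSE3-capable x86 CPU; the helper name bswap_epi32 is illustrative only.

    // Standalone illustration of the per-lane byte swap performed by the
    // shuffle mask in stream_data_to_memory_swapped_u32 (SSSE3 required).
    #include <cstdint>
    #include <cstdio>
    #include <tmmintrin.h> // _mm_shuffle_epi8

    static inline __m128i bswap_epi32(__m128i v)
    {
        // Same mask as the patch: output bytes 0..3 of each lane are taken
        // from input bytes 3..0, i.e. each u32 has its byte order reversed.
        const __m128i mask = _mm_set_epi8(
            0xC, 0xD, 0xE, 0xF,
            0x8, 0x9, 0xA, 0xB,
            0x4, 0x5, 0x6, 0x7,
            0x0, 0x1, 0x2, 0x3);
        return _mm_shuffle_epi8(v, mask);
    }

    int main()
    {
        alignas(16) uint32_t in[4] = { 0x11223344, 0xAABBCCDD, 0x00000001, 0xDEADBEEF };
        alignas(16) uint32_t out[4];

        _mm_store_si128((__m128i*)out, bswap_epi32(_mm_load_si128((const __m128i*)in)));

        // Prints, e.g., 0x11223344 -> 0x44332211 for each element.
        for (int i = 0; i < 4; ++i)
            std::printf("0x%08X -> 0x%08X\n", (unsigned)in[i], (unsigned)out[i]);
        return 0;
    }

The packed paths in the patch pair this shuffle with _mm_stream_si128, presumably because the destination is a write-once upload buffer and the GL/VK changes raise its alignment to 16 bytes; the strided (non_continuous) variants fall back to _mm_storeu_si128 since their per-vertex destinations are not necessarily aligned.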