From e8b4d332eb6c61a7784c23919b29739c1ff401bb Mon Sep 17 00:00:00 2001
From: kd-11
Date: Wed, 29 Mar 2017 12:29:11 +0300
Subject: [PATCH] rsx: Use faster upload path when conditions allow

Fix aligned memory access (SSE)

rsx: BufferUtils; always use optimized paths
---
 rpcs3/Emu/RSX/Common/BufferUtils.cpp | 321 ++++++++++++++++++++++++++-
 rpcs3/Emu/RSX/GL/GLGSRender.cpp      |   4 +
 rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp |   4 +-
 rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp |   7 +-
 4 files changed, 324 insertions(+), 12 deletions(-)

diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index ba4c7a1338..1578e96115 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -32,9 +32,232 @@ namespace
 		return{ X, Y, Z, 1 };
 	}
 
-	template <typename T, typename U>
-	void copy_whole_attribute_array(gsl::span<T> dst, gsl::span<const gsl::byte> src_ptr, u8 attribute_size, u8 dst_stride, u32 src_stride, u32 vertex_count)
+	inline void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride)
 	{
+		const __m128i mask = _mm_set_epi8(
+			0xC, 0xD, 0xE, 0xF,
+			0x8, 0x9, 0xA, 0xB,
+			0x4, 0x5, 0x6, 0x7,
+			0x0, 0x1, 0x2, 0x3);
+
+		__m128i* dst_ptr = (__m128i*)dst;
+		__m128i* src_ptr = (__m128i*)src;
+
+		const u32 dword_count = (vertex_count * (stride >> 2));
+		const u32 iterations = dword_count >> 2;
+		const u32 remaining = dword_count % 4;
+
+		for (u32 i = 0; i < iterations; ++i)
+		{
+			u32 *src_words = (u32*)src_ptr;
+			u32 *dst_words = (u32*)dst_ptr;
+			const __m128i &vector = _mm_loadu_si128(src_ptr);
+			const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
+			_mm_stream_si128(dst_ptr, shuffled_vector);
+
+			src_ptr++;
+			dst_ptr++;
+		}
+
+		if (remaining)
+		{
+			u32 *src_ptr2 = (u32 *)src_ptr;
+			u32 *dst_ptr2 = (u32 *)dst_ptr;
+
+			for (u32 i = 0; i < remaining; ++i)
+				dst_ptr2[i] = se_storage<u32>::swap(src_ptr2[i]);
+		}
+	}
+
+	inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride)
+	{
+		const __m128i mask = _mm_set_epi8(
+			0xE, 0xF, 0xC, 0xD,
+			0xA, 0xB, 0x8, 0x9,
+			0x6, 0x7, 0x4, 0x5,
+			0x2, 0x3, 0x0, 0x1);
+
+		__m128i* dst_ptr = (__m128i*)dst;
+		__m128i* src_ptr = (__m128i*)src;
+
+		const u32 word_count = (vertex_count * (stride >> 1));
+		const u32 iterations = word_count >> 3;
+		const u32 remaining = word_count % 8;
+
+		for (u32 i = 0; i < iterations; ++i)
+		{
+			u32 *src_words = (u32*)src_ptr;
+			u32 *dst_words = (u32*)dst_ptr;
+			const __m128i &vector = _mm_loadu_si128(src_ptr);
+			const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
+			_mm_stream_si128(dst_ptr, shuffled_vector);
+
+			src_ptr++;
+			dst_ptr++;
+		}
+
+		if (remaining)
+		{
+			u16 *src_ptr2 = (u16 *)src_ptr;
+			u16 *dst_ptr2 = (u16 *)dst_ptr;
+
+			for (u32 i = 0; i < remaining; ++i)
+				dst_ptr2[i] = se_storage<u16>::swap(src_ptr2[i]);
+		}
+	}
+
+	inline void stream_data_to_memory_swapped_u32_non_continuous(void *dst, const void *src, u32 vertex_count, u8 dst_stride, u8 src_stride)
+	{
+		const __m128i mask = _mm_set_epi8(
+			0xC, 0xD, 0xE, 0xF,
+			0x8, 0x9, 0xA, 0xB,
+			0x4, 0x5, 0x6, 0x7,
+			0x0, 0x1, 0x2, 0x3);
+
+		char *src_ptr = (char *)src;
+		char *dst_ptr = (char *)dst;
+
+		//Count vertices to copy
+		const bool is_128_aligned = !((dst_stride | src_stride) & 15);
+
+		const u32 min_block_size = std::min(src_stride, dst_stride);
+		const u32 remainder = is_128_aligned? 0: (16 - min_block_size) / min_block_size;
+		const u32 iterations = is_128_aligned? vertex_count: vertex_count - remainder;
+
+		for (u32 i = 0; i < iterations; ++i)
+		{
+			const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
+			const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
+			_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
+
+			src_ptr += src_stride;
+			dst_ptr += dst_stride;
+		}
+
+		if (remainder)
+		{
+			const u8 attribute_sz = min_block_size >> 2;
+			for (u32 n = 0; n < remainder; ++n)
+			{
+				for (u32 v= 0; v < attribute_sz; ++v)
+					((u32*)dst_ptr)[v] = ((be_t<u32>*)src_ptr)[v];
+
+				src_ptr += src_stride;
+				dst_ptr += dst_stride;
+			}
+		}
+	}
+
+	inline void stream_data_to_memory_swapped_u16_non_continuous(void *dst, const void *src, u32 vertex_count, u8 dst_stride, u8 src_stride)
+	{
+		const __m128i mask = _mm_set_epi8(
+			0xE, 0xF, 0xC, 0xD,
+			0xA, 0xB, 0x8, 0x9,
+			0x6, 0x7, 0x4, 0x5,
+			0x2, 0x3, 0x0, 0x1);
+
+		char *src_ptr = (char *)src;
+		char *dst_ptr = (char *)dst;
+
+		const bool is_128_aligned = !((dst_stride | src_stride) & 15);
+
+		const u32 min_block_size = std::min(src_stride, dst_stride);
+		const u32 remainder = is_128_aligned ? 0 : (16 - min_block_size) / min_block_size;
+		const u32 iterations = is_128_aligned ? vertex_count : vertex_count - remainder;
+
+		for (u32 i = 0; i < iterations; ++i)
+		{
+			const __m128i &vector = _mm_loadu_si128((__m128i*)src_ptr);
+			const __m128i &shuffled_vector = _mm_shuffle_epi8(vector, mask);
+			_mm_storeu_si128((__m128i*)dst_ptr, shuffled_vector);
+
+			src_ptr += src_stride;
+			dst_ptr += dst_stride;
+		}
+
+		if (remainder)
+		{
+			const u8 attribute_sz = min_block_size >> 1;
+			for (u32 n = 0; n < remainder; ++n)
+			{
+				for (u32 v = 0; v < attribute_sz; ++v)
+					((u16*)dst_ptr)[v] = ((be_t<u16>*)src_ptr)[v];
+
+				src_ptr += src_stride;
+				dst_ptr += dst_stride;
+			}
+		}
+	}
+
+	inline void stream_data_to_memory_u8_non_continous(void *dst, const void *src, u32 vertex_count, u8 attribute_size, u8 dst_stride, u8 src_stride)
+	{
+		char *src_ptr = (char *)src;
+		char *dst_ptr = (char *)dst;
+
+		switch (attribute_size)
+		{
+		case 4:
+		{
+			//Read one dword every iteration
+			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+			{
+				*(u32*)dst_ptr = *(u32*)src_ptr;
+
+				dst_ptr += dst_stride;
+				src_ptr += src_stride;
+			}
+
+			break;
+		}
+		case 3:
+		{
+			//Read one word and one byte
+			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+			{
+				*(u16*)dst_ptr = *(u16*)src_ptr;
+				dst_ptr[2] = src_ptr[2];
+
+				dst_ptr += dst_stride;
+				src_ptr += src_stride;
+			}
+
+			break;
+		}
+		case 2:
+		{
+			//Copy u16 blocks
+			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+			{
+				*(u32*)dst_ptr = *(u32*)src_ptr;
+
+				dst_ptr += dst_stride;
+				src_ptr += src_stride;
+			}
+
+			break;
+		}
+		case 1:
+		{
+			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+			{
+				dst_ptr[0] = src_ptr[0];
+
+				dst_ptr += dst_stride;
+				src_ptr += src_stride;
+			}
+
+			break;
+		}
+		}
+	}
+
+#ifdef _DEBUG
+	template <typename T, typename U>
+	void copy_whole_attribute_array(void *raw_dst, void *raw_src, u8 attribute_size, u8 dst_stride, u32 src_stride, u32 vertex_count)
+	{
+		gsl::span dst = gsl::as_span_workaround();
+		const gsl::span src = {raw_src, src_stride * vertex_count};
+
 		for (u32 vertex = 0; vertex < vertex_count; ++vertex)
 		{
 			gsl::span<const U> src = gsl::as_span<const U>(src_ptr.subspan(src_stride * vertex, attribute_size * sizeof(const U)));
@@ -44,33 +267,115 @@ namespace
 			}
 		}
 	}
+#else
+	template <typename T, typename U, int N>
+	void copy_whole_attribute_array_impl(void *raw_dst, void *raw_src, u8 dst_stride, u32 src_stride, u32 vertex_count)
+	{
+		char *src_ptr = (char *)raw_src;
+		char *dst_ptr = (char *)raw_dst;
+
+		for (u32 vertex = 0; vertex < vertex_count; ++vertex)
+		{
+			T* typed_dst = (T*)dst_ptr;
+			U* typed_src = (U*)src_ptr;
+
+			for (u32 i = 0; i < N; ++i)
+			{
+				typed_dst[i] = typed_src[i];
+			}
+
+			src_ptr += src_stride;
+			dst_ptr += dst_stride;
+		}
+	}
+
+	template <typename T, typename U>
+	void copy_whole_attribute_array(void *raw_dst, void *raw_src, u8 attribute_size, u8 dst_stride, u32 src_stride, u32 vertex_count)
+	{
+		//Eliminate the inner loop by templating the inner loop counter N
+		switch (attribute_size)
+		{
+		case 1:
+			copy_whole_attribute_array_impl<T, U, 1>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
+			break;
+		case 2:
+			copy_whole_attribute_array_impl<T, U, 2>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
+			break;
+		case 3:
+			copy_whole_attribute_array_impl<T, U, 3>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
+			break;
+		case 4:
+			copy_whole_attribute_array_impl<T, U, 4>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
+			break;
+		}
+	}
+#endif
 }
 
 void write_vertex_array_data_to_buffer(gsl::span<gsl::byte> raw_dst_span, gsl::span<const gsl::byte> src_ptr, u32 count, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride)
 {
 	verify(HERE), (vector_element_count > 0);
+	const u32 src_read_stride = rsx::get_vertex_type_size_on_host(type, vector_element_count);
+
+	bool use_stream_no_stride = false;
+	bool use_stream_with_stride = false;
+
+	//If stride is not defined, we have a packed array
+	if (attribute_src_stride == 0)
+		attribute_src_stride = src_read_stride;
+
+	//TODO: Determine favourable vertex threshold where vector setup costs become negligible
+	//Tests show that even with 4 vertices, using traditional bswap is significantly slower over a large number of calls
+	//Tested with atelier, discriminating based on vertex count is measurably slower
+	//NOTE: src_read_stride is guaranteed to be less than dst_stride!
+
+	if (src_read_stride > dst_stride)
+		fmt::throw_exception("src_read is greater than dst write. Impossible situation.");
+
+	const u64 src_address = (u64)src_ptr.data();
+	const bool sse_aligned = ((src_address & 15) == 0);
+
+#ifndef _DEBUG
+	if (attribute_src_stride == dst_stride && src_read_stride == dst_stride)
+		use_stream_no_stride = true;
+	else
+		use_stream_with_stride = true;
+#endif
 
 	switch (type)
 	{
 	case rsx::vertex_base_type::ub:
 	case rsx::vertex_base_type::ub256:
 	{
-		gsl::span<u8> dst_span = as_span_workaround<u8>(raw_dst_span);
-		copy_whole_attribute_array<u8, u8>(dst_span, src_ptr, vector_element_count, dst_stride, attribute_src_stride, count);
+		if (use_stream_no_stride)
+			memcpy(raw_dst_span.data(), src_ptr.data(), count * dst_stride);
+		else
+			stream_data_to_memory_u8_non_continous(raw_dst_span.data(), src_ptr.data(), count, vector_element_count, dst_stride, attribute_src_stride);
+
 		return;
 	}
 	case rsx::vertex_base_type::s1:
 	case rsx::vertex_base_type::sf:
 	case rsx::vertex_base_type::s32k:
 	{
-		gsl::span<u16> dst_span = as_span_workaround<u16>(raw_dst_span);
-		copy_whole_attribute_array<u16, be_t<u16>>(dst_span, src_ptr, vector_element_count, dst_stride, attribute_src_stride, count);
+		if (use_stream_no_stride && sse_aligned)
+			stream_data_to_memory_swapped_u16(raw_dst_span.data(), src_ptr.data(), count, attribute_src_stride);
+		else if (use_stream_with_stride)
+			stream_data_to_memory_swapped_u16_non_continuous(raw_dst_span.data(), src_ptr.data(), count, dst_stride, attribute_src_stride);
+		else
+			copy_whole_attribute_array<be_t<u16>, u16>((void *)raw_dst_span.data(), (void *)src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count);
+
 		return;
 	}
 	case rsx::vertex_base_type::f:
 	{
-		gsl::span<u32> dst_span = as_span_workaround<u32>(raw_dst_span);
-		copy_whole_attribute_array<u32, be_t<u32>>(dst_span, src_ptr, vector_element_count, dst_stride, attribute_src_stride, count);
+		if (use_stream_no_stride && sse_aligned)
+			stream_data_to_memory_swapped_u32(raw_dst_span.data(), src_ptr.data(), count, attribute_src_stride);
+		else if (use_stream_with_stride)
+			stream_data_to_memory_swapped_u32_non_continuous(raw_dst_span.data(), src_ptr.data(), count, dst_stride, attribute_src_stride);
+		else
+			copy_whole_attribute_array<be_t<u32>, u32>((void *)raw_dst_span.data(), (void *)src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count);
+
 		return;
 	}
 	case rsx::vertex_base_type::cmp:
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index db185ee6dd..025b89c699 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -501,6 +501,10 @@ void GLGSRender::on_init_thread()
 	glGetIntegerv(GL_TEXTURE_BUFFER_OFFSET_ALIGNMENT, &m_min_texbuffer_alignment);
 	m_vao.create();
 
+	//Set min alignment to 16-bytes for SSE optimizations with aligned addresses to work
+	m_min_texbuffer_alignment = std::max(m_min_texbuffer_alignment, 16);
+	m_uniform_buffer_offset_align = std::max(m_uniform_buffer_offset_align, 16);
+
 	const u32 texture_index_offset = rsx::limits::fragment_textures_count + rsx::limits::vertex_textures_count;
 
 	for (int index = 0; index < rsx::limits::vertex_count; ++index)
diff --git a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
index 907e419b29..6a1d2798b7 100644
--- a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
+++ b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
@@ -231,10 +231,10 @@ namespace
 
 		buffer_offset = mapping.second;
 		gsl::span<gsl::byte> dest_span(dst, data_size);
 
-		prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
-
 		write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, rsx::get_vertex_type_size_on_host(vertex_array.type, vertex_array.attribute_size));
 
+		prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
+
 		texture.copy_from(m_attrib_ring_info, gl_type, buffer_offset, data_size);
 	}
diff --git a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
index 6fc423c4f6..5973084ce0 100644
--- a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
+++ b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
@@ -269,11 +269,14 @@ namespace
 
 		VkDeviceSize offset_in_attrib_buffer = m_attrib_ring_info.alloc<256>(upload_size);
 		void *dst = m_attrib_ring_info.map(offset_in_attrib_buffer, upload_size);
 
-		vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
+
 		gsl::span<gsl::byte> dest_span(static_cast<gsl::byte*>(dst), upload_size);
-
 		write_vertex_array_data_to_buffer(dest_span, vertex_array.data, vertex_count, vertex_array.type, vertex_array.attribute_size, vertex_array.stride, real_element_size);
+		//Padding the vertex buffer should be done after the writes have been done
+		//write_vertex_data function may 'dirty' unused sections of the buffer as optimization
+		vk::prepare_buffer_for_writing(dst, vertex_array.type, vertex_array.attribute_size, vertex_count);
+
 		m_attrib_ring_info.unmap();
 
 		const VkFormat format = vk::get_suitable_vk_format(vertex_array.type, vertex_array.attribute_size);
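
A minimal standalone sketch (not part of the patch) of what the _mm_shuffle_epi8 mask used by the new stream_data_to_memory_swapped_u32 path does: it reverses the bytes of each 32-bit lane, byte-swapping four big-endian dwords per 16-byte vector. It assumes an SSSE3-capable x86 CPU; the helper name bswap_epi32 is illustrative only.

    // Standalone illustration of the per-lane byte swap performed by the
    // shuffle mask in stream_data_to_memory_swapped_u32 (SSSE3 required).
    #include <cstdint>
    #include <cstdio>
    #include <tmmintrin.h> // _mm_shuffle_epi8

    static inline __m128i bswap_epi32(__m128i v)
    {
        // Same mask as the patch: output bytes 0..3 of each lane are taken
        // from input bytes 3..0, i.e. each u32 has its byte order reversed.
        const __m128i mask = _mm_set_epi8(
            0xC, 0xD, 0xE, 0xF,
            0x8, 0x9, 0xA, 0xB,
            0x4, 0x5, 0x6, 0x7,
            0x0, 0x1, 0x2, 0x3);
        return _mm_shuffle_epi8(v, mask);
    }

    int main()
    {
        alignas(16) uint32_t in[4] = { 0x11223344, 0xAABBCCDD, 0x00000001, 0xDEADBEEF };
        alignas(16) uint32_t out[4];

        _mm_store_si128((__m128i*)out, bswap_epi32(_mm_load_si128((const __m128i*)in)));

        // Prints, e.g., 0x11223344 -> 0x44332211 for each element.
        for (int i = 0; i < 4; ++i)
            std::printf("0x%08X -> 0x%08X\n", (unsigned)in[i], (unsigned)out[i]);
        return 0;
    }

The packed paths in the patch pair this shuffle with _mm_stream_si128, presumably because the destination is a write-once upload buffer and the GL/VK changes raise its alignment to 16 bytes; the strided (non_continuous) variants fall back to _mm_storeu_si128 since their per-vertex destinations are not necessarily aligned.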