From 3cd8891ab8affea4841aa1009afabf49e0a102d0 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Thu, 23 Dec 2021 18:03:48 +0300 Subject: [PATCH] Re-refactor copy_data_swap_u32 again Drop AVX2 path for now, since it usually operates on small data. Rely on automatic SSE vectorization on recent compilers. Side refactoring on JIT.h to workaround weird conflict issue. --- Utilities/JIT.cpp | 5 +- Utilities/JIT.h | 47 ++---- rpcs3/Emu/CPU/CPUTranslator.h | 1 + rpcs3/Emu/RSX/Common/BufferUtils.cpp | 235 ++++++--------------------- rpcs3/Emu/RSX/Common/BufferUtils.h | 5 +- 5 files changed, 69 insertions(+), 224 deletions(-) diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp index a58c29a8bb..6413053a0e 100644 --- a/Utilities/JIT.cpp +++ b/Utilities/JIT.cpp @@ -763,11 +763,12 @@ std::string jit_compiler::cpu(const std::string& _cpu) } jit_compiler::jit_compiler(const std::unordered_map& _link, const std::string& _cpu, u32 flags) - : m_cpu(cpu(_cpu)) + : m_context(new llvm::LLVMContext) + , m_cpu(cpu(_cpu)) { std::string result; - auto null_mod = std::make_unique ("null_", m_context); + auto null_mod = std::make_unique ("null_", *m_context); if (_link.empty()) { diff --git a/Utilities/JIT.h b/Utilities/JIT.h index e90b039a67..85454c07e1 100644 --- a/Utilities/JIT.h +++ b/Utilities/JIT.h @@ -1,5 +1,7 @@ #pragma once +#include "util/types.hpp" + // Include asmjit with warnings ignored #define ASMJIT_EMBED #define ASMJIT_DEBUG @@ -27,6 +29,10 @@ #include #include +#include +#include +#include +#include enum class jit_class { @@ -251,43 +257,18 @@ public: #ifdef LLVM_AVAILABLE -#include -#include -#include -#include - -#include "util/types.hpp" - -#ifdef _MSC_VER -#pragma warning(push, 0) -#else -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wall" -#pragma GCC diagnostic ignored "-Wextra" -#pragma GCC diagnostic ignored "-Wold-style-cast" -#pragma GCC diagnostic ignored "-Wsuggest-override" -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Weffc++" -#pragma GCC diagnostic ignored "-Wmissing-noreturn" -#ifdef __clang__ -#pragma clang diagnostic ignored "-Winconsistent-missing-override" -#endif -#endif -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" -#ifdef _MSC_VER -#pragma warning(pop) -#else -#pragma GCC diagnostic pop -#endif +namespace llvm +{ + class LLVMContext; + class ExecutionEngine; + class Module; +} // Temporary compiler interface class jit_compiler final { // Local LLVM context - llvm::LLVMContext m_context{}; + std::unique_ptr m_context{}; // Execution instance std::unique_ptr m_engine{}; @@ -302,7 +283,7 @@ public: // Get LLVM context auto& get_context() { - return m_context; + return *m_context; } auto& get_engine() const diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index b834fe8aed..9cd3d8cb30 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -15,6 +15,7 @@ #pragma GCC diagnostic ignored "-Wmissing-noreturn" #endif #include "llvm/IR/LLVMContext.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Target/TargetMachine.h" diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index 3cdabd12ea..8e2f2e89f4 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -18,13 +18,22 @@ #endif #if defined(_MSC_VER) +#define PLAIN_FUNC #define SSSE3_FUNC #define SSE4_1_FUNC #define AVX2_FUNC +#define AVX3_FUNC #else +#ifndef __clang__ +#define PLAIN_FUNC __attribute__((optimize("no-tree-vectorize"))) +#define SSSE3_FUNC __attribute__((__target__("ssse3"))) __attribute__((optimize("tree-vectorize"))) +#else +#define PLAIN_FUNC #define SSSE3_FUNC __attribute__((__target__("ssse3"))) +#endif #define SSE4_1_FUNC __attribute__((__target__("sse4.1"))) #define AVX2_FUNC __attribute__((__target__("avx2"))) +#define AVX3_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl"))) #ifndef __AVX2__ using __m256i = long long __attribute__((vector_size(32))); #endif @@ -45,22 +54,31 @@ SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x) return ~_mm_cvtsi128_si32(_mm_minpos_epu16(_mm_xor_si128(x, _mm_set1_epi32(-1)))); } -#if defined(__AVX2__) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512CD__) && defined(__AVX512BW__) constexpr bool s_use_ssse3 = true; constexpr bool s_use_sse4_1 = true; constexpr bool s_use_avx2 = true; +constexpr bool s_use_avx3 = true; +#elif defined(__AVX2__) +constexpr bool s_use_ssse3 = true; +constexpr bool s_use_sse4_1 = true; +constexpr bool s_use_avx2 = true; +constexpr bool s_use_avx3 = false; #elif defined(__SSE41__) constexpr bool s_use_ssse3 = true; constexpr bool s_use_sse4_1 = true; constexpr bool s_use_avx2 = false; +constexpr bool s_use_avx3 = false; #elif defined(__SSSE3__) constexpr bool s_use_ssse3 = true; constexpr bool s_use_sse4_1 = false; constexpr bool s_use_avx2 = false; +constexpr bool s_use_avx3 = false; #else const bool s_use_ssse3 = utils::has_ssse3(); const bool s_use_sse4_1 = utils::has_sse41(); const bool s_use_avx2 = utils::has_avx2(); +const bool s_use_avx3 = utils::has_avx512(); #endif const __m128i s_bswap_u32_mask = _mm_set_epi8( @@ -102,226 +120,69 @@ namespace X = X << 5; return{ X, Y, Z, 1 }; } -} -template -AVX2_FUNC inline bool copy_data_swap_u32_avx2(void*& dst, const void*& src, u32 count) -{ - const __m256i bswap_u32_mask = _mm256_set_m128i(s_bswap_u32_mask, s_bswap_u32_mask); - - __m128i diff0 = _mm_setzero_si128(); - __m256i diff = _mm256_setzero_si256(); - - if (uptr(dst) & 16 && count >= 4) + template + PLAIN_FUNC bool copy_data_swap_u32_naive(u32* dst, const u32* src, u32 count) { - const auto dst0 = static_cast<__m128i*>(dst); - const auto src0 = static_cast(src); - const auto data = _mm_shuffle_epi8(_mm_loadu_si128(src0), s_bswap_u32_mask); - - if (Compare) - { - diff0 = _mm_xor_si128(data, _mm_load_si128(dst0)); - } - - _mm_store_si128(dst0, data); - dst = dst0 + 1; - src = src0 + 1; - count -= 4; - } - - const u32 lane_count = count / 8; - - auto dst_ptr = static_cast<__m256i*>(dst); - auto src_ptr = static_cast(src); + u32 result = 0; #ifdef __clang__ -#pragma clang loop unroll(disable) + #pragma clang loop vectorize(disable) interleave(disable) unroll(disable) #endif - for (u32 i = 0; i < lane_count; ++i) - { - const __m256i vec0 = _mm256_loadu_si256(src_ptr + i); - const __m256i vec1 = _mm256_shuffle_epi8(vec0, bswap_u32_mask); - - if constexpr (Compare) + for (u32 i = 0; i < count; i++) { - diff = _mm256_or_si256(diff, _mm256_xor_si256(vec1, _mm256_load_si256(dst_ptr + i))); - } - - _mm256_store_si256(dst_ptr + i, vec1); - } - - dst = dst_ptr + lane_count; - src = src_ptr + lane_count; - - if (count & 4) - { - const auto dst0 = static_cast<__m128i*>(dst); - const auto src0 = static_cast(src); - const auto data = _mm_shuffle_epi8(_mm_loadu_si128(src0), s_bswap_u32_mask); - - if (Compare) - { - diff0 = _mm_or_si128(diff0, _mm_xor_si128(data, _mm_load_si128(dst0))); - } - - _mm_store_si128(dst0, data); - dst = dst0 + 1; - src = src0 + 1; - } - - if constexpr (Compare) - { - diff = _mm256_or_si256(diff, _mm256_set_m128i(_mm_setzero_si128(), diff0)); - return !_mm256_testz_si256(diff, diff); - } - else - { - return false; - } -} - -template -static auto copy_data_swap_u32(void* dst, const void* src, u32 count) -{ - bool result = false; - - if (uptr(dst) & 4) - { - const auto dst0 = static_cast(dst); - const auto src0 = static_cast(src); - const u32 data = stx::se_storage::swap(*src0); - - if (Compare && *dst0 != data) - { - result = true; - } - - *dst0 = data; - dst = dst0 + 1; - src = src0 + 1; - count--; - } - - if (uptr(dst) & 8 && count >= 2) - { - const auto dst0 = static_cast(dst); - const auto src0 = static_cast(src); - const u64 data = utils::rol64(stx::se_storage::swap(*src0), 32); - - if (Compare && *dst0 != data) - { - result = true; - } - - *dst0 = data; - dst = dst0 + 1; - src = src0 + 1; - count -= 2; - } - - const u32 lane_count = count / 4; - - if (s_use_avx2) [[likely]] - { - result |= copy_data_swap_u32_avx2(dst, src, count); - } - else if (s_use_ssse3) - { - __m128i diff = _mm_setzero_si128(); - - auto dst_ptr = static_cast<__m128i*>(dst); - auto src_ptr = static_cast(src); - - for (u32 i = 0; i < lane_count; ++i) - { - const __m128i vec0 = _mm_loadu_si128(src_ptr + i); - const __m128i vec1 = ssse3_shuffle_epi8(vec0, s_bswap_u32_mask); + const u32 data = stx::se_storage::swap(src[i]); if constexpr (Compare) { - diff = _mm_or_si128(diff, _mm_xor_si128(vec1, _mm_load_si128(dst_ptr + i))); + result |= data ^ dst[i]; } - _mm_store_si128(dst_ptr + i, vec1); + dst[i] = data; } - result |= _mm_cvtsi128_si64(_mm_packs_epi32(diff, diff)) != 0; - - dst = dst_ptr + lane_count; - src = src_ptr + lane_count; + return static_cast(result); } - else + + template + SSSE3_FUNC bool copy_data_swap_u32_ssse3(u32* dst, const u32* src, u32 count) { - __m128i diff = _mm_setzero_si128(); + u32 result = 0; - auto dst_ptr = static_cast<__m128i*>(dst); - auto src_ptr = static_cast(src); - - for (u32 i = 0; i < lane_count; ++i) +#ifdef __clang__ + #pragma clang loop vectorize(enable) interleave(disable) unroll(disable) +#endif + for (u32 i = 0; i < count; i++) { - const __m128i vec0 = _mm_loadu_si128(src_ptr + i); - const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8)); - const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16)); + const u32 data = stx::se_storage::swap(src[i]); if constexpr (Compare) { - diff = _mm_or_si128(diff, _mm_xor_si128(vec2, _mm_load_si128(dst_ptr + i))); + result |= data ^ dst[i]; } - _mm_store_si128(dst_ptr + i, vec2); + dst[i] = data; } - result |= _mm_cvtsi128_si64(_mm_packs_epi32(diff, diff)) != 0; - - dst = dst_ptr + lane_count; - src = src_ptr + lane_count; + return static_cast(result); } - if (count & 2) + template + void build_copy_data_swap_u32(asmjit::X86Assembler& c, std::array& args) { - const auto dst0 = static_cast(dst); - const auto src0 = static_cast(src); - const u64 data = utils::rol64(stx::se_storage::swap(*src0), 32); - - if (Compare && *dst0 != data) + if (utils::has_ssse3()) { - result = true; + c.jmp(asmjit::imm_ptr(©_data_swap_u32_ssse3)); + return; } - *dst0 = data; - dst = dst0 + 1; - src = src0 + 1; - } - - if (count & 1) - { - const auto dst0 = static_cast(dst); - const auto src0 = static_cast(src); - const u32 data = stx::se_storage::swap(*src0); - - if (Compare && *dst0 != data) - { - result = true; - } - - *dst0 = data; - } - - if constexpr (Compare) - { - return result; + c.jmp(asmjit::imm_ptr(©_data_swap_u32_naive)); } } -bool copy_data_swap_u32_cmp(void* dst, const void* src, u32 count) -{ - return copy_data_swap_u32(dst, src, count); -} +built_function copy_data_swap_u32(&build_copy_data_swap_u32); -void copy_data_swap_u32(void* dst, const void* src, u32 count) -{ - copy_data_swap_u32(dst, src, count); -} +built_function copy_data_swap_u32_cmp(&build_copy_data_swap_u32); namespace { diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.h b/rpcs3/Emu/RSX/Common/BufferUtils.h index 5cef7472cf..84684c33e4 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.h +++ b/rpcs3/Emu/RSX/Common/BufferUtils.h @@ -1,6 +1,7 @@ #pragma once #include "../gcm_enums.h" +#include "Utilities/JIT.h" #include @@ -56,7 +57,7 @@ void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w); void stream_vector_from_memory(void *dst, void *src); // Copy and swap data in 32-bit units -void copy_data_swap_u32(void* dst, const void* src, u32 count); +extern built_function copy_data_swap_u32; // Copy and swap data in 32-bit units, return true if changed -bool copy_data_swap_u32_cmp(void* dst, const void* src, u32 count); +extern built_function copy_data_swap_u32_cmp;