diff --git a/Utilities/types.h b/Utilities/types.h
index a040aa400c..cd99e4cb1a 100644
--- a/Utilities/types.h
+++ b/Utilities/types.h
@@ -304,6 +304,11 @@ struct alignas(16) u128
 	{
 	}
 
+	constexpr explicit operator bool() const noexcept
+	{
+		return !!(lo | hi);
+	}
+
 	friend u128 operator+(const u128& l, const u128& r)
 	{
 		u128 value;
@@ -384,6 +389,28 @@ struct alignas(16) u128
 		return value;
 	}
 
+	u128 operator<<(u128 shift_value)
+	{
+		const u64 v0 = lo << (shift_value.lo & 63);
+		const u64 v1 = __shiftleft128(lo, hi, shift_value.lo);
+
+		u128 value;
+		value.lo = (shift_value.lo & 64) ? 0 : v0;
+		value.hi = (shift_value.lo & 64) ? v0 : v1;
+		return value;
+	}
+
+	u128 operator>>(u128 shift_value)
+	{
+		const u64 v0 = hi >> (shift_value.lo & 63);
+		const u64 v1 = __shiftright128(lo, hi, shift_value.lo);
+
+		u128 value;
+		value.lo = (shift_value.lo & 64) ? v0 : v1;
+		value.hi = (shift_value.lo & 64) ? 0 : v0;
+		return value;
+	}
+
 	u128 operator~() const
 	{
 		u128 value;
diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt
index 8c77f9aaa4..d6d089b949 100644
--- a/rpcs3/Emu/CMakeLists.txt
+++ b/rpcs3/Emu/CMakeLists.txt
@@ -339,10 +339,6 @@ target_link_libraries(rpcs3_emu
 	PRIVATE
 		3rdparty::stblib 3rdparty::libpng)
 
-if(CMAKE_COMPILER_IS_GNUCXX)
-	target_link_libraries(rpcs3_emu PUBLIC -latomic)
-endif()
-
 # CPU
 target_sources(rpcs3_emu
 	PRIVATE
diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
index 51d759f02b..7a9ed9e56c 100644
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -1199,7 +1199,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
 		verify(HERE), cpu_thread::if_suspended<-1>(&ppu, [&]()
 		{
 			// Guaranteed success
-			ppu.rtime = vm::reservation_acquire(addr, sizeof(T)) & -128;
+			ppu.rtime = vm::reservation_acquire(addr, sizeof(T));
 			mov_rdata(ppu.rdata, *vm::get_super_ptr<spu_rdata_t>(addr & -128));
 		});
 
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 95753f9f2e..823fa8cdea 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -1573,7 +1573,10 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 	}
 
 	// Keep src point to const
-	auto [dst, src] = [&]() -> std::pair<u8*, const u8*>
+	u8* dst = nullptr;
+	const u8* src = nullptr;
+
+	std::tie(dst, src) = [&]() -> std::pair<u8*, const u8*>
 	{
 		u8* dst = vm::_ptr<u8>(eal);
 		u8* src = _ptr<u8>(lsa);
@@ -1596,6 +1599,15 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 
 	if ((!g_use_rtm && !is_get) || g_cfg.core.spu_accurate_dma) [[unlikely]]
 	{
+		perf_meter<"ADMA_GET"_u64> perf_get;
+		perf_meter<"ADMA_PUT"_u64> perf_put = perf_get;
+
+		if (!g_cfg.core.spu_accurate_dma) [[likely]]
+		{
+			perf_put.reset();
+			perf_get.reset();
+		}
+
 		for (u32 size = args.size, size0; is_get;
 			size -= size0, dst += size0, src += size0, eal += size0)
 		{
@@ -1613,7 +1625,9 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 				}
 				else
 				{
+					state += cpu_flag::wait + cpu_flag::temp;
 					std::this_thread::yield();
+					check_state();
 				}
 			}())
 			{
@@ -1728,8 +1742,102 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 					continue;
 				}
 
-				// Lock each cache line execlusively
-				auto [res, time0] = vm::reservation_lock(eal);
+				// Lock each cache line
+				auto& res = vm::reservation_acquire(eal, size0);
+
+				// Lock each bit corresponding to a byte being written, using some free space in reservation memory
+				auto* bits = reinterpret_cast<atomic_t<u128>*>(vm::g_reservations + (eal & 0xff80) / 2 + 16);
+
+				// Get writing mask
+				const u128 wmask = (~u128{} << (eal & 127)) & (~u128{} >> (127 - ((eal + size0 - 1) & 127)));
+				//const u64 start = (eal & 127) / 2;
+				//const u64 _end_ = ((eal + size0 - 1) & 127) / 2;
+				//const u64 wmask = (UINT64_MAX << start) & (UINT64_MAX >> (63 - _end_));
+
+				u128 old = 0;
+
+				for (u64 i = 0; i != umax; [&]()
+				{
+					if (state & cpu_flag::pause)
+					{
+						cpu_thread::if_suspended(this, [&]
+						{
+							std::memcpy(dst, src, size0);
+							res += 128;
+						});
+
+						// Exit loop and function
+						i = -1;
+						bits = nullptr;
+						return;
+					}
+					else if (++i < 10)
+					{
+						busy_wait(500);
+					}
+					else
+					{
+						// Wait
+						state += cpu_flag::wait + cpu_flag::temp;
+						bits->wait(old, wmask);
+						check_state();
+					}
+				}())
+				{
+					// Completed in suspend_all()
+					if (!bits)
+					{
+						break;
+					}
+
+					bool ok = false;
+
+					std::tie(old, ok) = bits->fetch_op([&](auto& v)
+					{
+						if (v & wmask)
+						{
+							return false;
+						}
+
+						v |= wmask;
+						return true;
+					});
+
+					if (ok) [[likely]]
+					{
+						break;
+					}
+				}
+
+				if (!bits)
+				{
+					if (size == size0)
+					{
+						break;
+					}
+
+					continue;
+				}
+
+				// Lock reservation (shared)
+				auto [_oldd, _ok] = res.fetch_op([&](u64& r)
+				{
+					if (r & vm::rsrv_unique_lock)
+					{
+						return false;
+					}
+
+					r += 1;
+					return true;
+				});
+
+				if (!_ok)
+				{
+					vm::reservation_shared_lock_internal(res);
+				}
+
+				// Obtain range lock as normal store
+				vm::range_lock(range_lock, eal, size0);
 
 				switch (size0)
 				{
@@ -1777,7 +1885,17 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 					}
 				}
 
-				res += 64;
+				range_lock->release(0);
+
+				res += 127;
+
+				// Release bits and notify
+				bits->atomic_op([&](auto& v)
+				{
+					v &= ~wmask;
+				});
+
+				bits->notify_all();
 
 				if (size == size0)
 				{
@@ -1785,10 +1903,12 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 			}
 		}
 
-		std::atomic_thread_fence(std::memory_order_seq_cst);
+		//std::atomic_thread_fence(std::memory_order_seq_cst);
 		return;
 	}
 
+	perf_meter<"DMA_PUT"_u64> perf2;
+
 	switch (u32 size = args.size)
 	{
 	case 1:
@@ -2587,7 +2707,7 @@ bool spu_thread::process_mfc_cmd()
 				continue;
 			}
 
-			if (g_use_rtm && i >= 15 && g_cfg.core.perf_report) [[unlikely]]
+			if (i >= 15 && g_cfg.core.perf_report) [[unlikely]]
 			{
 				perf_log.warning("GETLLAR: took too long: %u", i);
 			}
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index a8f3c93e7e..dacaad4456 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -510,7 +510,18 @@ namespace vm
 	{
 		for (u64 i = 0;; i++)
 		{
-			if (!(res & rsrv_unique_lock)) [[likely]]
+			auto [_oldd, _ok] = res.fetch_op([&](u64& r)
+			{
+				if (r & rsrv_unique_lock)
+				{
+					return false;
+				}
+
+				r += 1;
+				return true;
+			});
+
+			if (_ok) [[likely]]
 			{
 				return;
 			}
diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h
index 4c28571f3f..ff1b6579a5 100644
--- a/rpcs3/Emu/Memory/vm_reservation.h
+++ b/rpcs3/Emu/Memory/vm_reservation.h
@@ -87,11 +87,11 @@ namespace vm
 		// Use 128-byte aligned addr
 		const u32 addr = static_cast<u32>(ptr.addr()) & -128;
 
+		auto& res = vm::reservation_acquire(addr, 128);
+		_m_prefetchw(&res);
+
 		if (g_use_rtm)
 		{
-			auto& res = vm::reservation_acquire(addr, 128);
-			_m_prefetchw(&res);
-
 			// Stage 1: single optimistic transaction attempt
 			unsigned status = _XBEGIN_STARTED;
 			unsigned count = 0;
@@ -267,15 +267,15 @@ namespace vm
 			}
 		}
 
-		// Perform heavyweight lock
-		auto [res, rtime] = vm::reservation_lock(addr);
+		// Lock reservation and perform heavyweight lock
+		reservation_shared_lock_internal(res);
 
 		if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
 		{
 			{
 				vm::writer_lock lock(addr);
 				std::invoke(op, *sptr);
-				res += 64;
+				res += 127;
 			}
 
 			if constexpr (Ack)
@@ -290,11 +290,11 @@ namespace vm
 
 			if ((result = std::invoke(op, *sptr)))
 			{
-				res += 64;
+				res += 127;
 			}
 			else
 			{
-				res -= 64;
+				res -= 1;
 			}
 		}
 
@@ -314,9 +314,6 @@ namespace vm
 		// Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply
 		static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_peek: unsupported type");
 
-		// Use "super" pointer to prevent access violation handling during atomic op
-		const auto sptr = vm::get_super_ptr<T>(static_cast<u32>(ptr.addr()));
-
 		// Use 128-byte aligned addr
 		const u32 addr = static_cast<u32>(ptr.addr()) & -128;
 
@@ -340,7 +337,7 @@ namespace vm
 			// Observe data non-atomically and make sure no reservation updates were made
 			if constexpr (std::is_void_v<std::invoke_result_t<F, const T&>>)
 			{
-				std::invoke(op, *sptr);
+				std::invoke(op, *ptr);
 
 				if (rtime == vm::reservation_acquire(addr, 128))
 				{
@@ -349,7 +346,7 @@ namespace vm
 			}
 			else
 			{
-				auto res = std::invoke(op, *sptr);
+				auto res = std::invoke(op, *ptr);
 
 				if (rtime == vm::reservation_acquire(addr, 128))
 				{
@@ -371,7 +368,18 @@ namespace vm
 		// "Lock" reservation
 		auto& res = vm::reservation_acquire(addr, 128);
 
-		if (res.fetch_add(1) & vm::rsrv_unique_lock) [[unlikely]]
+		auto [_old, _ok] = res.fetch_op([&](u64& r)
+		{
+			if (r & vm::rsrv_unique_lock)
+			{
+				return false;
+			}
+
+			r += 1;
+			return true;
+		});
+
+		if (!_ok) [[unlikely]]
 		{
 			vm::reservation_shared_lock_internal(res);
 		}
diff --git a/rpcs3/util/atomic.hpp b/rpcs3/util/atomic.hpp
index a134716f15..f885b1e190 100644
--- a/rpcs3/util/atomic.hpp
+++ b/rpcs3/util/atomic.hpp
@@ -639,6 +639,13 @@ template <typename T>
 struct atomic_storage<T, 16> : atomic_storage<T, 0>
 {
 #ifdef _MSC_VER
+	static inline T load(const T& dest)
+	{
+		__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
+		std::atomic_thread_fence(std::memory_order_acquire);
+		return std::bit_cast<T>(val);
+	}
+
 	static inline bool compare_exchange(T& dest, T& comp, T exch)
 	{
 		struct alignas(16) llong2 { llong ll[2]; };
@@ -646,13 +653,6 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
 		return _InterlockedCompareExchange128(reinterpret_cast<volatile llong*>(&dest), _exch.ll[1], _exch.ll[0], reinterpret_cast<llong*>(&comp)) != 0;
 	}
 
-	static inline T load(const T& dest)
-	{
-		struct alignas(16) llong2 { llong ll[2]; } result{};
-		_InterlockedCompareExchange128(reinterpret_cast<volatile llong*>(&const_cast<T&>(dest)), result.ll[1], result.ll[0], result.ll);
-		return std::bit_cast<T>(result);
-	}
-
 	static inline T exchange(T& dest, T value)
 	{
 		struct alignas(16) llong2 { llong ll[2]; };
@@ -670,9 +670,78 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
 	}
 
 	static inline void release(T& dest, T value)
+	{
+		std::atomic_thread_fence(std::memory_order_release);
+		_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
+	}
+#else
+	static inline T load(const T& dest)
+	{
+		__m128i val = _mm_load_si128(reinterpret_cast<const __m128i*>(&dest));
+		__atomic_thread_fence(__ATOMIC_ACQUIRE);
+		return std::bit_cast<T>(val);
+	}
+
+	static inline bool compare_exchange(T& dest, T& comp, T exch)
+	{
+		bool result;
+		ullong cmp_lo = 0;
+		ullong cmp_hi = 0;
+		ullong exc_lo = 0;
+		ullong exc_hi = 0;
+
+		if constexpr (std::is_same_v<T, u128> || std::is_same_v<T, s128>)
+		{
+			cmp_lo = comp;
+			cmp_hi = comp >> 64;
+			exc_lo = exch;
+			exc_hi = exch >> 64;
+		}
+		else
+		{
+			std::memcpy(&cmp_lo, reinterpret_cast<char*>(&comp) + 0, 8);
+			std::memcpy(&cmp_hi, reinterpret_cast<char*>(&comp) + 8, 8);
+			std::memcpy(&exc_lo, reinterpret_cast<char*>(&exch) + 0, 8);
+			std::memcpy(&exc_hi, reinterpret_cast<char*>(&exch) + 8, 8);
+		}
+
+		__asm__ volatile("lock cmpxchg16b %1;"
+			: "=@ccz" (result)
+			, "+m" (dest)
+			, "+d" (cmp_hi)
+			, "+a" (cmp_lo)
+			: "c" (exc_hi)
+			, "b" (exc_lo)
+			: "cc");
+
+		if constexpr (std::is_same_v<T, u128> || std::is_same_v<T, s128>)
+		{
+			comp = T{cmp_hi} << 64 | cmp_lo;
+		}
+		else
+		{
+			std::memcpy(reinterpret_cast<char*>(&comp) + 0, &cmp_lo, 8);
+			std::memcpy(reinterpret_cast<char*>(&comp) + 8, &cmp_hi, 8);
+		}
+
+		return result;
+	}
+
+	static inline T exchange(T& dest, T value)
+	{
+		return std::bit_cast<T>(__sync_lock_test_and_set(reinterpret_cast<u128*>(&dest), std::bit_cast<u128>(value)));
+	}
+
+	static inline void store(T& dest, T value)
 	{
 		exchange(dest, value);
 	}
+
+	static inline void release(T& dest, T value)
+	{
+		__atomic_thread_fence(__ATOMIC_RELEASE);
+		_mm_store_si128(reinterpret_cast<__m128i*>(&dest), std::bit_cast<__m128i>(value));
+	}
 #endif
 
 	// TODO
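
Not part of the patch: the byte-granular write mask in the accurate-DMA path above packs one lock bit per byte of a 128-byte reservation line. Below is a minimal standalone sketch of that mask computation, assuming a GCC/Clang-style `unsigned __int128` as a stand-in for the emulator's `u128` type and assuming (as the surrounding loop guarantees by splitting transfers per cache line) that a write never crosses a 128-byte boundary. The helper name `write_mask` and the sample address/size are made up for this illustration.

```cpp
// Standalone illustration only; not emulator code.
#include <cstdint>
#include <cstdio>

using u128 = unsigned __int128; // stand-in for the emulator's u128

// Set one mask bit per byte in [eal & 127, (eal + size - 1) & 127] of the 128-byte line.
// Assumes 1 <= size and that the range stays within a single 128-byte line.
static u128 write_mask(std::uint32_t eal, std::uint32_t size)
{
    const u128 ones = ~u128{0};
    return (ones << (eal & 127)) & (ones >> (127 - ((eal + size - 1) & 127)));
}

int main()
{
    // A 16-byte store at offset 32 of its cache line should set mask bits 32..47.
    const u128 m = write_mask(0x10020, 16);

    for (int bit = 0; bit < 128; bit++)
    {
        if ((m >> bit) & 1)
        {
            std::printf("bit %d set\n", bit);
        }
    }
}
```

Two such masks from different SPU threads intersect exactly when their byte ranges overlap, which is what lets the `bits->fetch_op` loop above reject concurrent writers to the same bytes while still allowing disjoint writes within the same cache line to proceed in parallel.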