From cfda4d0ade2e7c1732aec00efe719d3b762ae690 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Fri, 13 Nov 2020 05:32:50 +0300 Subject: [PATCH] atomic.cpp: optimize raw_notify() for unspecified pointer mode Remove unnecessary optimization from cond_alloc(). Optimistic case was absolutely dominating anyway. Although the whole function is a dirty hack. Now scanning through all threads is faster. --- rpcs3/util/atomic.cpp | 56 +++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp index a3bb983507..8005c97f51 100644 --- a/rpcs3/util/atomic.cpp +++ b/rpcs3/util/atomic.cpp @@ -532,7 +532,10 @@ static atomic_wait::cond_handle s_cond_list[UINT16_MAX + 1]{}; static atomic_t s_cond_bits[(UINT16_MAX + 1) / 64]{}; // Allocation semaphore -static atomic_t s_cond_sema{0}; +static atomic_t s_cond_sema{0}; + +// Max possible search distance (max i in loop) +static atomic_t s_cond_max{0}; static u32 #ifdef _WIN32 @@ -548,16 +551,7 @@ cond_alloc(std::uintptr_t iptr, __m128i mask) return 0; } - // Diversify search start points to reduce contention and increase immediate success chance -#ifdef _WIN32 - const u32 start = GetCurrentProcessorNumber(); -#elif __linux__ - const u32 start = sched_getcpu(); -#else - const u32 start = __rdtsc(); -#endif - - for (u32 i = start;; i++) + for (u32 i = 0;; i++) { const u32 group = i % ::size32(s_cond_bits); @@ -588,6 +582,18 @@ cond_alloc(std::uintptr_t iptr, __m128i mask) s_cond_list[id].mask = mask; s_cond_list[id].init(iptr); + // Update some stats + s_cond_max.fetch_op([i](u32& val) + { + if (val < i) + { + val = i; + return true; + } + + return false; + }); + return id; } } @@ -1373,9 +1379,33 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id) // Special operation mode. Note that this is not atomic. if (!data) { - // Special path: search thread_id without pointer information - for (u32 i = 1; i <= UINT16_MAX; i++) + if (!s_cond_sema) { + return false; + } + + // Special path: search thread_id without pointer information + for (u32 i = 1; i < (s_cond_max + 1) * 64; i++) + { + if ((i & 63) == 0) + { + for (u64 bits = s_cond_bits[i / 64]; bits; bits &= bits - 1) + { + utils::prefetch_read(s_cond_list + i + std::countl_zero(bits)); + } + } + + if (!s_cond_bits[i / 64]) + { + i |= 63; + continue; + } + + if (~s_cond_bits[i / 64] & (1ull << i)) + { + continue; + } + const auto cond = s_cond_list + i; const auto [old, ok] = cond->ptr_ref.fetch_op([&](u64& val)