CPU: use u128 in bit counter

Apparently, u64 was not enough.
Nekotekina 2020-11-19 16:05:08 +03:00
parent d4d5dc99f3
commit 292af1e4cd

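In short: the suspend-thread bit allocator grows from 64 to 128 slots, so the mask type moves from u64 to u128, the backing arrays double in size, and the std::countr_one/std::countr_zero helpers from <bit> (which only accept built-in unsigned integer types) are replaced by utils::ctz128 from the newly included util/asm.hpp. A minimal sketch of what such a helper could look like, assuming u128 maps to a compiler-provided unsigned __int128 (the real utils::ctz128 may be implemented differently):

#include <bit>
#include <cstdint>

using u32  = std::uint32_t;
using u64  = std::uint64_t;
using u128 = unsigned __int128; // assumption: a native 128-bit type is available

// Count trailing zeros of a 128-bit value by splitting it into two
// 64-bit halves; returns 128 when the input is zero.
constexpr u32 ctz128_sketch(u128 x)
{
	if (const u64 lo = static_cast<u64>(x))
		return std::countr_zero(lo);

	return 64 + std::countr_zero(static_cast<u64>(x >> 64));
}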

@@ -10,6 +10,7 @@
 #include "Emu/Cell/SPUThread.h"
 #include "Emu/perf_meter.hpp"
+#include "util/asm.hpp"

 #include <thread>
 #include <unordered_map>
 #include <map>
@@ -254,7 +255,7 @@ thread_local cpu_thread* g_tls_current_cpu_thread = nullptr;
 static atomic_t<u64, 64> s_cpu_counter{0};

 // List of posted tasks for suspend_all
-static atomic_t<cpu_thread::suspend_work*> s_cpu_work[64]{};
+static atomic_t<cpu_thread::suspend_work*> s_cpu_work[128]{};

 // Linked list of pushed tasks for suspend_all
 static atomic_t<cpu_thread::suspend_work*> s_pushed{};
@@ -263,10 +264,10 @@ static atomic_t<cpu_thread::suspend_work*> s_pushed{};
 static shared_mutex s_cpu_lock;

 // Bit allocator for threads which need to be suspended
-static atomic_t<u64> s_cpu_bits{};
+static atomic_t<u128> s_cpu_bits{};

 // List of active threads which need to be suspended
-static atomic_t<cpu_thread*> s_cpu_list[64]{};
+static atomic_t<cpu_thread*> s_cpu_list[128]{};

 namespace cpu_counter
 {
@@ -278,7 +279,7 @@ namespace cpu_counter
 	for (u64 i = 0;; i++)
 	{
-		const auto [bits, ok] = s_cpu_bits.fetch_op([](u64& bits) -> u64
+		const auto [bits, ok] = s_cpu_bits.fetch_op([](u128& bits)
 		{
 			if (~bits) [[likely]]
 			{
@@ -293,7 +294,7 @@ namespace cpu_counter
 		if (ok) [[likely]]
 		{
 			// Get actual slot number
-			id = std::countr_one(bits);
+			id = utils::ctz128(~bits);

 			// Register thread
 			if (s_cpu_list[id].compare_and_swap_test(nullptr, _this)) [[likely]]
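The slot number is still derived from the mask value observed before the update; the rewrite relies on the identity countr_one(x) == countr_zero(~x): counting the trailing one bits of the allocation mask is the same as locating its lowest clear bit, which is what utils::ctz128(~bits) computes for the wider type. A self-contained check of that identity on the 64-bit type:

#include <bit>
#include <cstdint>

// countr_one(x) counts trailing set bits; countr_zero(~x) finds the index
// of the lowest clear bit. Both name the first free slot in the mask.
static_assert(std::countr_one(std::uint64_t{0b0111}) == 3);
static_assert(std::countr_zero(~std::uint64_t{0b0111}) == 3);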
@@ -318,6 +319,14 @@ namespace cpu_counter
 	s_tls_thread_slot = id;
 }

+static void remove_cpu_bit(u32 bit)
+{
+	s_cpu_bits.atomic_op([=](u128& val)
+	{
+		val &= ~(u128{1} << (bit % 128));
+	});
+}
+
 void remove(cpu_thread* _this) noexcept
 {
 	// Unregister and wait if necessary
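Clearing a slot also changes shape: the old code could apply `&=` directly to atomic_t<u64>, while the u128 variant goes through atomic_op in the new remove_cpu_bit() helper. A rough std::atomic equivalent of that helper, under the same unsigned __int128 assumption as above (on x86-64 this typically lowers to a cmpxchg16b retry loop):

#include <atomic>

using u128 = unsigned __int128; // assumption, as above

// Clear one bit of a 128-bit mask with a compare-exchange retry loop;
// rpcs3's atomic_op wraps the same pattern behind a lambda.
void clear_bit(std::atomic<u128>& mask, unsigned bit)
{
	u128 expected = mask.load();

	// Retry while other threads change the mask between load and store
	while (!mask.compare_exchange_weak(expected, expected & ~(u128{1} << (bit % 128))))
	{
	}
}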
@@ -339,31 +348,31 @@ namespace cpu_counter
 		return;
 	}

-	s_cpu_bits &= ~(1ull << (slot % 64));
+	remove_cpu_bit(slot);

 	s_tls_thread_slot = -1;
 }

 template <typename F>
-u64 for_all_cpu(/*mutable*/ u64 copy, F func) noexcept
+u128 for_all_cpu(/*mutable*/ u128 copy, F func) noexcept
 {
-	for (u64 bits = copy; bits; bits &= bits - 1)
+	for (u128 bits = copy; bits; bits &= bits - 1)
 	{
-		const u32 index = std::countr_zero(bits);
+		const u32 index = utils::ctz128(bits);

 		if (cpu_thread* cpu = s_cpu_list[index].load())
 		{
 			if constexpr (std::is_invocable_v<F, cpu_thread*, u32>)
 			{
 				if (!func(cpu, index))
-					copy &= ~(1ull << index);
+					copy &= ~(u128{1} << index);
 				continue;
 			}

 			if constexpr (std::is_invocable_v<F, cpu_thread*>)
 			{
 				if (!func(cpu))
-					copy &= ~(1ull << index);
+					copy &= ~(u128{1} << index);
 				continue;
 			}
@@ -371,7 +380,7 @@ namespace cpu_counter
 		}
 		else
 		{
-			copy &= ~(1ull << index);
+			copy &= ~(u128{1} << index);
 		}
 	}
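for_all_cpu() keeps the classic set-bit walk: utils::ctz128(bits) yields the lowest occupied slot, and `bits &= bits - 1` strips that bit before the next pass, so the loop visits exactly the registered entries and hands back the mask with rejected or vacated slots cleared. The pattern in isolation, on the 64-bit type for brevity:

#include <bit>
#include <cstdint>

// Visit the index of every set bit, lowest first: countr_zero finds the
// next index, and bits &= bits - 1 removes it before the next iteration.
template <typename F>
void for_each_set_bit(std::uint64_t bits, F func)
{
	for (; bits; bits &= bits - 1)
		func(std::countr_zero(bits));
}

// e.g. for_each_set_bit(0b101000, f) calls f(3), then f(5)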
@@ -847,7 +856,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
 	// First thread to push the work to the workload list pauses all threads and processes it
 	std::lock_guard lock(s_cpu_lock);

-	u64 copy = s_cpu_bits.load();
+	u128 copy = s_cpu_bits.load();

 	// Try to prefetch cpu->state earlier
 	copy = cpu_counter::for_all_cpu(copy, [&](cpu_thread* cpu)
@@ -865,7 +874,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept
 	g_suspend_counter += 2;

 	// Copy snapshot for finalization
-	u64 copy2 = copy;
+	u128 copy2 = copy;

 	copy = cpu_counter::for_all_cpu(copy, [&](cpu_thread* cpu, u32 index)
 	{