From 5693cc9eb33522bf717a6b259b7e616dfa58e573 Mon Sep 17 00:00:00 2001
From: Elad Ashkenazi <18193363+elad335@users.noreply.github.com>
Date: Fri, 24 May 2024 17:42:05 +0300
Subject: [PATCH] SPU: smart GETLLAR spin detection

---
 rpcs3/Emu/Cell/SPUThread.cpp | 145 ++++++++++++++++++++++++++++++++---
 rpcs3/Emu/Cell/SPUThread.h   |   5 ++
 rpcs3/Emu/system_config.h    |   2 +-
 3 files changed, 139 insertions(+), 13 deletions(-)
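Notes:

GETLLAR spinning is now validated against the effective address as well as the
PC and MFC command id, and the busy-wait/OS-sleep decision is predicted from
per-call-site history instead of a single random roll. Each 32-byte slot of
local storage keeps the lengths of its last four timed waits in
getllar_wait_time; consistently long waits produce an add_count bias toward
sleeping. Worked example: a history of {20, 15, 10, 5} gives total_wait = 50
with zero_count = 0, so add_count = (50 - 5) * 40 = 1800 and the test
((perf0.get() >> 8) % 100 + add_count < percent) fails even at the maximum
setting of 100, i.e. the thread sleeps; a history of {9, 0, 0, 0} stays below
the 40 threshold that applies at zero_count = 3, so add_count = 0 and the
configured percentage applies unchanged. A thread that chose busy waiting
re-evaluates after 400'000 ticks of perf0, the hidden config value 101 forces
busy waiting outright, and sleeping threads are counted in
g_reservation_waiters (bucketed by popcount of the address) so that a
reservation writer only bumps the reservation time and notifies when a waiter
may actually exist.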
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 6c960f0988..b98dd8b463 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -4422,7 +4422,10 @@ bool spu_thread::process_mfc_cmd()
 	else
 	{
 		// Check if we can reuse our existing reservation
-		if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data))
+		auto& res = vm::reservation_acquire(addr);
+		const u64 this_time = res;
+
+		if (this_time % 128 == 0 && cmp_rdata(rdata, data))
 		{
 			mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
 			ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
@@ -4430,27 +4433,105 @@ bool spu_thread::process_mfc_cmd()
 			// Need to check twice for it to be accurate, the code is before and not after this check for:
 			// 1. Reduce time between reservation accesses so TSX penalty would be lowered
 			// 2. Increase the chance of change detection: if GETLLAR has been called again new data is probably wanted
-			if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data))
+			if (this_time == res && cmp_rdata(rdata, data))
 			{
+				if (this_time != rtime)
+				{
+					// Reservation was lost but the data itself remains unchanged so try to ignore it
+					set_events(SPU_EVENT_LR);
+					rtime = this_time;
+				}
+
 				if ([&]() -> bool
 				{
 					// Validation that it is indeed GETLLAR spinning (large time window is intentional)
-					if (last_getllar != pc || mfc_cmd_id - 1 != last_getllar_id || perf0.get() - last_gtsc >= 10'000)
+					if (last_getllar_addr != addr || last_getllar != pc || mfc_cmd_id - 1 != last_getllar_id || perf0.get() - last_gtsc >= 15'000)
 					{
 						// Seemingly not
 						getllar_busy_waiting_switch = umax;
+						getllar_spin_count = 0;
 						return true;
 					}
 
-					getllar_spin_count++;
+					getllar_spin_count = std::min<u32>(getllar_spin_count + 1, u16{umax});
 
-					if (getllar_busy_waiting_switch == umax)
+					static atomic_t<u64> g_ok = 0, g_fail = 0;
+
+					if (getllar_busy_waiting_switch == umax && getllar_spin_count == 4)
 					{
-						// Evalute its value (shift-right to ensure its randomness with different CPUs)
-						getllar_busy_waiting_switch = ((perf0.get() >> 8) % 100 < g_cfg.core.spu_getllar_busy_waiting_percentage) ? 1 : 0;
+						const u32 percent = g_cfg.core.spu_getllar_busy_waiting_percentage;
+
+						// Hidden value to force busy waiting (100 to 1 are dynamically adjusted, 0 is not)
+						if (percent != 101)
+						{
+							// Predict whether or not to use operating system sleep based on history
+							auto& stats = getllar_wait_time[pc / 32];
+
+							const auto old_stats = stats;
+							std::array<u8, 4> new_stats{};
+
+							// Rotate history (prepare newest entry)
+							new_stats[0] = 0;
+							new_stats[1] = old_stats[0];
+							new_stats[2] = old_stats[1];
+							new_stats[3] = old_stats[2];
+
+							stats = new_stats;
+
+							u32 total_wait = 0;
+							u32 zero_count = 0; // Try to ignore major inconsistencies
+
+							for (u8 val : old_stats)
+							{
+								total_wait += val;
+								zero_count += (val == 0 ? 1 : 0);
+							}
+
+							// Add to chance if previous wait was long enough
+							const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40
+								: zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40
+								: zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40
+								: zero_count == 0 && total_wait >= 6 ? (total_wait - 5) * 40
+								: 0;
+
+							// Evaluate its value (shift-right to ensure its randomness with different CPUs)
+							getllar_busy_waiting_switch = ((perf0.get() >> 8) % 100 + add_count < percent) ? 1 : 0;
+
+							getllar_evaluate_time = perf0.get();
+
+							if (getllar_busy_waiting_switch)
+							{
+								g_fail++;
+							}
+							else
+							{
+								g_ok++;
+							}
+
+							if ((g_ok + g_fail) % 20 == 0 && !getllar_busy_waiting_switch)
+								spu_log.trace("SPU wait: count=%d, switch=%d, spin=%d, fail=%d, ok=%d, {%d, %d, %d, %d}", total_wait, getllar_busy_waiting_switch, getllar_spin_count, +g_fail, +g_ok, old_stats[0], old_stats[1], old_stats[2], old_stats[3]);
+						}
+						else
+						{
+							getllar_busy_waiting_switch = 1;
+						}
 					}
+					// Don't be stubborn, force operating system sleep if too much time has passed
+					else if (getllar_busy_waiting_switch == 1 && perf0.get() > getllar_evaluate_time && perf0.get() - getllar_evaluate_time >= 400'000)
+					{
+						const u32 percent = g_cfg.core.spu_getllar_busy_waiting_percentage;
+
+						// Hidden value to force busy waiting
+						if (percent != 101)
+						{
+							spu_log.trace("SPU wait for 0x%x", addr);
+							getllar_wait_time[pc / 32].front() = 1;
+							getllar_busy_waiting_switch = 0;
+						}
+					}
 
-					return !!getllar_busy_waiting_switch || getllar_spin_count < 3;
+					// Either 1 or umax
+					return getllar_busy_waiting_switch != 0;
 				}())
 				{
 					if (g_cfg.core.mfc_debug)
@@ -4468,6 +4549,8 @@ bool spu_thread::process_mfc_cmd()
 
 				if (getllar_busy_waiting_switch == 1)
 				{
+					getllar_wait_time[pc / 32].front() = 0;
+
 #if defined(ARCH_X64)
 					if (utils::has_um_wait())
 					{
@@ -4501,13 +4584,21 @@ bool spu_thread::process_mfc_cmd()
 
 				// Spinning, might as well yield cpu resources
 				state += cpu_flag::wait;
-				vm::reservation_notifier(addr).wait(rtime, atomic_wait_timeout{50'000});
+
+				// Storage-efficient method to distinguish different nearby addresses (which are likely)
+				g_reservation_waiters[std::popcount(addr)]++;
+
+				vm::reservation_notifier(addr).wait(this_time, atomic_wait_timeout{100'000});
+
+				g_reservation_waiters[std::popcount(addr)]--;
 
 				// Reset perf
 				perf0.restart();
 
 				// Quick check if there were reservation changes
-				if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data))
+				const u64 new_time = res;
+
+				if (new_time % 128 == 0 && cmp_rdata(rdata, data) && res == new_time && cmp_rdata(rdata, data))
 				{
 					if (g_cfg.core.mfc_debug)
 					{
@@ -4518,12 +4609,40 @@ bool spu_thread::process_mfc_cmd()
 						std::memcpy(dump.data, rdata, 128);
 					}
 
-					// Let the game recheck its state, maybe after a long period of time something else changed which satisfies its waiting condition
-					getllar_spin_count = 1;
+					if (new_time != rtime)
+					{
+						// Reservation was lost but the data itself remains unchanged so try to ignore it
+						set_events(SPU_EVENT_LR);
+						rtime = new_time;
+					}
+
+					u8& val = getllar_wait_time[pc / 32].front();
+					val = static_cast<u8>(std::min<u32>(val + 1, u8{umax}));
+
 					last_getllar_id = mfc_cmd_id;
 					last_gtsc = perf0.get();
 					return true;
 				}
+
+				if (new_time == this_time && res == this_time)
+				{
+					spu_log.trace("RTIME unchanged on address 0x%x", addr);
+
+					// Try to forcefully change in order to notify threads
+					if (g_reservation_waiters[std::popcount(addr)] && res.compare_and_swap_test(this_time, this_time + 128))
+					{
+						vm::reservation_notifier(addr).notify_all();
+					}
+				}
+			}
+		}
+
+		if (this_time == rtime)
+		{
+			// Try to forcefully change in order to notify threads
+			if (g_reservation_waiters[std::popcount(addr)] && res.compare_and_swap_test(this_time, this_time + 128))
+			{
+				vm::reservation_notifier(addr).notify_all();
+			}
+		}
@@ -4535,6 +4654,7 @@ bool spu_thread::process_mfc_cmd()
 
 	last_getllar_id = mfc_cmd_id;
 	last_getllar = pc;
+	last_getllar_addr = addr;
 	last_gtsc = perf0.get();
 	getllar_spin_count = 0;
 	getllar_busy_waiting_switch = umax;
@@ -6855,3 +6975,4 @@ void fmt_class_string::format(std::string& out, u64 arg)
 DECLARE(spu_thread::g_raw_spu_ctr){};
 DECLARE(spu_thread::g_raw_spu_id){};
 DECLARE(spu_thread::g_spu_work_count){};
+DECLARE(spu_thread::g_reservation_waiters){};
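The g_reservation_waiters bookkeeping above can be read in isolation. Instead
of hashing every reservation address, waiters are bucketed by
std::popcount(addr); nearby 128-byte lines usually differ in population count,
and a collision only costs a redundant notify. A minimal standalone sketch of
the idea (the names below are local to this sketch, not RPCS3 API):

#include <atomic>
#include <bit>
#include <cstdint>

// One counter per possible popcount of a 32-bit address (0..32).
// Reservation addresses are 128-byte aligned, so in practice at most
// 25 bits are set, which is why the patch's 32-entry table also suffices.
std::atomic<std::uint32_t> waiters_by_popcount[33]{};

void begin_wait(std::uint32_t addr) { waiters_by_popcount[std::popcount(addr)]++; }
void end_wait(std::uint32_t addr) { waiters_by_popcount[std::popcount(addr)]--; }

// A writer bumps the reservation and notifies only when this bucket is
// non-empty; a false positive merely causes one extra notification.
bool may_have_waiters(std::uint32_t addr)
{
	return waiters_by_popcount[std::popcount(addr)].load() != 0;
}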
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 539617f30d..9df636fc7e 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -798,8 +798,10 @@ public:
 	u64 last_gtsc = 0;
 	u32 last_getllar = umax; // LS address of last GETLLAR (if matches current GETLLAR we can let the thread rest)
 	u32 last_getllar_id = umax;
+	u32 last_getllar_addr = umax;
 	u32 getllar_spin_count = 0;
 	u32 getllar_busy_waiting_switch = umax; // umax means the test needs evaluation, otherwise it's a boolean
+	u64 getllar_evaluate_time = 0;
 
 	std::vector<mfc_cmd_dump> mfc_history;
 	u64 mfc_dump_idx = 0;
@@ -823,6 +825,8 @@ public:
 	u32 current_bp_pc = umax;
 	bool stop_flag_removal_protection = false;
 
+	std::array<std::array<u8, 4>, SPU_LS_SIZE / 32> getllar_wait_time{};
+
 	void push_snr(u32 number, u32 value);
 	static void do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* ls);
 	bool do_dma_check(const spu_mfc_cmd& args);
@@ -892,6 +896,7 @@ public:
 	static atomic_t<u64> g_raw_spu_ctr;
 	static atomic_t<u64> g_raw_spu_id[5];
 	static atomic_t<u32> g_spu_work_count;
+	static atomic_t<u32> g_reservation_waiters[32];
 
 	static u32 find_raw_spu(u32 id)
 	{
diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
index 25b82427ab..4d470363f8 100644
--- a/rpcs3/Emu/system_config.h
+++ b/rpcs3/Emu/system_config.h
@@ -33,7 +33,7 @@ struct cfg_root : cfg::node
 		cfg::_bool set_daz_and_ftz{ this, "Set DAZ and FTZ", false };
 		cfg::_enum<spu_decoder_type> spu_decoder{ this, "SPU Decoder", spu_decoder_type::llvm };
 		cfg::uint<0, 100> spu_reservation_busy_waiting_percentage{ this, "SPU Reservation Busy Waiting Percentage", 0, true };
-		cfg::uint<0, 100> spu_getllar_busy_waiting_percentage{ this, "SPU GETLLAR Busy Waiting Percentage", 100, true };
+		cfg::uint<0, 101> spu_getllar_busy_waiting_percentage{ this, "SPU GETLLAR Busy Waiting Percentage", 100, true };
 		cfg::_bool spu_debug{ this, "SPU Debug" };
 		cfg::_bool mfc_debug{ this, "MFC Debug" };
 		cfg::_int<0, 6> preferred_spu_threads{ this, "Preferred SPU Threads", 0, true }; // Number of hardware threads dedicated to heavy simultaneous spu tasks
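For reference, the history heuristic from the SPUThread.cpp hunks can also be
lifted into a self-contained sketch. This is an illustration under assumptions,
not RPCS3 code: predict_busy_wait and its random parameter (standing in for
(perf0.get() >> 8) % 100) are names local to this sketch.

#include <array>
#include <cstdint>
#include <cstdio>

using u8 = std::uint8_t;
using u32 = std::uint32_t;

// Returns true to keep busy waiting, false to sleep. "percent" mirrors the
// "SPU GETLLAR Busy Waiting Percentage" setting (0..100).
bool predict_busy_wait(std::array<u8, 4>& stats, u32 percent, u32 random)
{
	const auto old_stats = stats;
	stats = {0, old_stats[0], old_stats[1], old_stats[2]}; // rotate history

	u32 total_wait = 0;
	u32 zero_count = 0; // empty slots are treated as inconsistencies

	for (u8 val : old_stats)
	{
		total_wait += val;
		zero_count += (val == 0 ? 1 : 0);
	}

	// Same thresholds as the patch: the fewer empty history slots,
	// the smaller the total wait needed before sleeping wins
	const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40
		: zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40
		: zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40
		: zero_count == 0 && total_wait >= 6 ? (total_wait - 5) * 40
		: 0;

	return random + add_count < percent;
}

int main()
{
	std::array<u8, 4> history{20, 15, 10, 5}; // four long waits recorded earlier
	// add_count = (50 - 5) * 40 = 1800, so even percent = 100 chooses sleep
	std::printf("busy wait? %d\n", predict_busy_wait(history, 100, 50)); // prints 0
}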