SPU: smart GETLLAR spin detection

This commit is contained in:
Elad Ashkenazi 2024-05-24 17:42:05 +03:00
parent feff2ba09e
commit 5693cc9eb3
3 changed files with 139 additions and 13 deletions

View file

@ -4422,7 +4422,10 @@ bool spu_thread::process_mfc_cmd()
else else
{ {
// Check if we can reuse our existing reservation // Check if we can reuse our existing reservation
if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data)) auto& res = vm::reservation_acquire(addr);
const u64 this_time = res;
if (this_time % 128 == 0 && cmp_rdata(rdata, data))
{ {
mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata); mov_rdata(_ref<spu_rdata_t>(ch_mfc_cmd.lsa & 0x3ff80), rdata);
ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS); ch_atomic_stat.set_value(MFC_GETLLAR_SUCCESS);
@ -4430,27 +4433,105 @@ bool spu_thread::process_mfc_cmd()
// Need to check twice for it to be accurate, the code is before and not after this check for: // Need to check twice for it to be accurate, the code is before and not after this check for:
// 1. Reduce time between reservation accesses so TSX penalty would be lowered // 1. Reduce time between reservation accesses so TSX penalty would be lowered
// 2. Increase the chance of change detection: if GETLLAR has been called again new data is probably wanted // 2. Increase the chance of change detection: if GETLLAR has been called again new data is probably wanted
if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data)) if (this_time == res && cmp_rdata(rdata, data))
{ {
if (this_time != rtime)
{
// Reservation was lost but the data itself remains unchanged so try to ignore it
set_events(SPU_EVENT_LR);
rtime = this_time;
}
if ([&]() -> bool if ([&]() -> bool
{ {
// Validation that it is indeed GETLLAR spinning (large time window is intentional) // Validation that it is indeed GETLLAR spinning (large time window is intentional)
if (last_getllar != pc || mfc_cmd_id - 1 != last_getllar_id || perf0.get() - last_gtsc >= 10'000) if (last_getllar_addr != addr || last_getllar != pc || mfc_cmd_id - 1 != last_getllar_id || perf0.get() - last_gtsc >= 15'000)
{ {
// Seemingly not // Seemingly not
getllar_busy_waiting_switch = umax; getllar_busy_waiting_switch = umax;
getllar_spin_count = 0;
return true; return true;
} }
getllar_spin_count++; getllar_spin_count = std::min<u32>(getllar_spin_count + 1, u16{umax});
if (getllar_busy_waiting_switch == umax) static atomic_t<usz> g_ok = 0, g_fail = 0;
if (getllar_busy_waiting_switch == umax && getllar_spin_count == 4)
{ {
// Evaluate its value (shift-right to ensure its randomness with different CPUs) const u32 percent = g_cfg.core.spu_getllar_busy_waiting_percentage;
getllar_busy_waiting_switch = ((perf0.get() >> 8) % 100 < g_cfg.core.spu_getllar_busy_waiting_percentage) ? 1 : 0;
// Hidden value to force busy waiting (100 to 1 are dynamically adjusted, 0 is not)
if (percent != 101)
{
// Predict whether or not to use operating system sleep based on history
auto& stats = getllar_wait_time[pc / 32];
const auto old_stats = stats;
std::array<u8, 4> new_stats{};
// Rotate history (prepare newest entry)
new_stats[0] = 0;
new_stats[1] = old_stats[0];
new_stats[2] = old_stats[1];
new_stats[3] = old_stats[2];
stats = new_stats;
u32 total_wait = 0;
u32 zero_count = 0; // Try to ignore major inconsistencies
for (u8 val : old_stats)
{
total_wait += val;
zero_count += (val == 0 ? 1 : 0);
} }
return !!getllar_busy_waiting_switch || getllar_spin_count < 3; // Add to chance if previous wait was long enough
const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40
: zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40
: zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40
: zero_count == 0 && total_wait >= 6 ? (total_wait - 5) * 40
: 0;
// Evaluate its value (shift-right to ensure its randomness with different CPUs)
getllar_busy_waiting_switch = ((perf0.get() >> 8) % 100 + add_count < percent) ? 1 : 0;
getllar_evaluate_time = perf0.get();
if (getllar_busy_waiting_switch)
{
g_fail++;
}
else
{
g_ok++;
}
if ((g_ok + g_fail) % 20 == 0 && !getllar_busy_waiting_switch)
spu_log.trace("SPU wait: count=%d. switch=%d, spin=%d, fail=%d, ok=%d, {%d, %d, %d, %d}", total_wait, getllar_busy_waiting_switch, getllar_spin_count, +g_fail, +g_ok, old_stats[0], old_stats[1], old_stats[2], old_stats[3] );
}
else
{
getllar_busy_waiting_switch = 1;
}
}
// Don't be stubborn, force operating system sleep if too much time has passed
else if (getllar_busy_waiting_switch == 1 && perf0.get() > getllar_evaluate_time && perf0.get() - getllar_evaluate_time >= 400'000)
{
const u32 percent = g_cfg.core.spu_getllar_busy_waiting_percentage;
// Hidden value to force busy waiting
if (percent != 101)
{
spu_log.trace("SPU wait for 0x%x", addr);
getllar_wait_time[pc / 32].front() = 1;
getllar_busy_waiting_switch = 0;
}
}
// Either 1 or umax
return getllar_busy_waiting_switch != 0;
}()) }())
{ {
if (g_cfg.core.mfc_debug) if (g_cfg.core.mfc_debug)
@ -4468,6 +4549,8 @@ bool spu_thread::process_mfc_cmd()
if (getllar_busy_waiting_switch == 1) if (getllar_busy_waiting_switch == 1)
{ {
getllar_wait_time[pc / 32].front() = 0;
#if defined(ARCH_X64) #if defined(ARCH_X64)
if (utils::has_um_wait()) if (utils::has_um_wait())
{ {
@ -4501,13 +4584,21 @@ bool spu_thread::process_mfc_cmd()
// Spinning, might as well yield cpu resources // Spinning, might as well yield cpu resources
state += cpu_flag::wait; state += cpu_flag::wait;
vm::reservation_notifier(addr).wait(rtime, atomic_wait_timeout{50'000});
// Storage efficient method to distinguish different nearby addresses (which are likely)
g_reservation_waiters[std::popcount(addr)]++;
vm::reservation_notifier(addr).wait(this_time, atomic_wait_timeout{100'000});
g_reservation_waiters[std::popcount(addr)]--;
// Reset perf // Reset perf
perf0.restart(); perf0.restart();
// Quick check if there were reservation changes // Quick check if there were reservation changes
if (rtime == vm::reservation_acquire(addr) && cmp_rdata(rdata, data)) const u64 new_time = res;
if (new_time % 128 == 0 && cmp_rdata(rdata, data) && res == new_time && cmp_rdata(rdata, data))
{ {
if (g_cfg.core.mfc_debug) if (g_cfg.core.mfc_debug)
{ {
@ -4518,12 +4609,40 @@ bool spu_thread::process_mfc_cmd()
std::memcpy(dump.data, rdata, 128); std::memcpy(dump.data, rdata, 128);
} }
// Let the game recheck its state, maybe after a long period of time something else changed which satisfies its waiting condition if (new_time != rtime)
getllar_spin_count = 1; {
// Reservation was lost but the data itself remains unchanged so try to ignore it
set_events(SPU_EVENT_LR);
rtime = this_time;
}
u8& val = getllar_wait_time[pc / 32].front();
val = static_cast<u8>(std::min<u32>(val + 1, u8{umax}));
last_getllar_id = mfc_cmd_id; last_getllar_id = mfc_cmd_id;
last_gtsc = perf0.get(); last_gtsc = perf0.get();
return true; return true;
} }
if (new_time == this_time && res == this_time)
{
spu_log.trace("RTIME unchanged on address 0x%x", addr);
// Try to forcefully change in order to notify threads
if (g_reservation_waiters[std::popcount(addr)] && res.compare_and_swap_test(this_time, this_time + 128))
{
vm::reservation_notifier(addr).notify_all();
}
}
}
}
if (this_time == rtime)
{
// Try to forcefully change in order to notify threads
if (g_reservation_waiters[std::popcount(addr)] && res.compare_and_swap_test(this_time, this_time + 128))
{
vm::reservation_notifier(addr).notify_all();
} }
} }
@ -4535,6 +4654,7 @@ bool spu_thread::process_mfc_cmd()
last_getllar_id = mfc_cmd_id; last_getllar_id = mfc_cmd_id;
last_getllar = pc; last_getllar = pc;
last_getllar_addr = addr;
last_gtsc = perf0.get(); last_gtsc = perf0.get();
getllar_spin_count = 0; getllar_spin_count = 0;
getllar_busy_waiting_switch = umax; getllar_busy_waiting_switch = umax;
@ -6855,3 +6975,4 @@ void fmt_class_string<spu_channel_4_t>::format(std::string& out, u64 arg)
DECLARE(spu_thread::g_raw_spu_ctr){}; DECLARE(spu_thread::g_raw_spu_ctr){};
DECLARE(spu_thread::g_raw_spu_id){}; DECLARE(spu_thread::g_raw_spu_id){};
DECLARE(spu_thread::g_spu_work_count){}; DECLARE(spu_thread::g_spu_work_count){};
DECLARE(spu_thread::g_reservation_waiters){};

View file

@ -798,8 +798,10 @@ public:
u64 last_gtsc = 0; u64 last_gtsc = 0;
u32 last_getllar = umax; // LS address of last GETLLAR (if matches current GETLLAR we can let the thread rest) u32 last_getllar = umax; // LS address of last GETLLAR (if matches current GETLLAR we can let the thread rest)
u32 last_getllar_id = umax; u32 last_getllar_id = umax;
u32 last_getllar_addr = umax;
u32 getllar_spin_count = 0; u32 getllar_spin_count = 0;
u32 getllar_busy_waiting_switch = umax; // umax means the test needs evaluation, otherwise it's a boolean u32 getllar_busy_waiting_switch = umax; // umax means the test needs evaluation, otherwise it's a boolean
u64 getllar_evaluate_time = 0;
std::vector<mfc_cmd_dump> mfc_history; std::vector<mfc_cmd_dump> mfc_history;
u64 mfc_dump_idx = 0; u64 mfc_dump_idx = 0;
@ -823,6 +825,8 @@ public:
u32 current_bp_pc = umax; u32 current_bp_pc = umax;
bool stop_flag_removal_protection = false; bool stop_flag_removal_protection = false;
std::array<std::array<u8, 4>, SPU_LS_SIZE / 32> getllar_wait_time{};
void push_snr(u32 number, u32 value); void push_snr(u32 number, u32 value);
static void do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* ls); static void do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* ls);
bool do_dma_check(const spu_mfc_cmd& args); bool do_dma_check(const spu_mfc_cmd& args);
@ -892,6 +896,7 @@ public:
static atomic_t<u32> g_raw_spu_ctr; static atomic_t<u32> g_raw_spu_ctr;
static atomic_t<u32> g_raw_spu_id[5]; static atomic_t<u32> g_raw_spu_id[5];
static atomic_t<u32> g_spu_work_count; static atomic_t<u32> g_spu_work_count;
static atomic_t<u8> g_reservation_waiters[32];
static u32 find_raw_spu(u32 id) static u32 find_raw_spu(u32 id)
{ {

View file

@ -33,7 +33,7 @@ struct cfg_root : cfg::node
cfg::_bool set_daz_and_ftz{ this, "Set DAZ and FTZ", false }; cfg::_bool set_daz_and_ftz{ this, "Set DAZ and FTZ", false };
cfg::_enum<spu_decoder_type> spu_decoder{ this, "SPU Decoder", spu_decoder_type::llvm }; cfg::_enum<spu_decoder_type> spu_decoder{ this, "SPU Decoder", spu_decoder_type::llvm };
cfg::uint<0, 100> spu_reservation_busy_waiting_percentage{ this, "SPU Reservation Busy Waiting Percentage", 0, true }; cfg::uint<0, 100> spu_reservation_busy_waiting_percentage{ this, "SPU Reservation Busy Waiting Percentage", 0, true };
cfg::uint<0, 100> spu_getllar_busy_waiting_percentage{ this, "SPU GETLLAR Busy Waiting Percentage", 100, true }; cfg::uint<0, 101> spu_getllar_busy_waiting_percentage{ this, "SPU GETLLAR Busy Waiting Percentage", 100, true };
cfg::_bool spu_debug{ this, "SPU Debug" }; cfg::_bool spu_debug{ this, "SPU Debug" };
cfg::_bool mfc_debug{ this, "MFC Debug" }; cfg::_bool mfc_debug{ this, "MFC Debug" };
cfg::_int<0, 6> preferred_spu_threads{ this, "Preferred SPU Threads", 0, true }; // Number of hardware threads dedicated to heavy simultaneous spu tasks cfg::_int<0, 6> preferred_spu_threads{ this, "Preferred SPU Threads", 0, true }; // Number of hardware threads dedicated to heavy simultaneous spu tasks