diff --git a/rpcs3/Emu/Cell/MFC.cpp b/rpcs3/Emu/Cell/MFC.cpp
index 91b30ad0cc..d6c0b01cbc 100644
--- a/rpcs3/Emu/Cell/MFC.cpp
+++ b/rpcs3/Emu/Cell/MFC.cpp
@@ -142,107 +142,142 @@ void mfc_thread::cpu_task()
 		if (queue_size)
 		{
-			auto& cmd = spu.mfc_queue[0];
-
-			if ((cmd.cmd & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK)) == MFC_PUTQLLUC_CMD)
+			u32 fence_mask = 0; // Using this instead of stall_mask to avoid a possible race condition
+			u32 barrier_mask = 0;
+			bool first = true;
+			for (u32 i = 0; i < spu.mfc_queue.size(); i++, first = false)
 			{
-				auto& data = vm::ps3::_ref<decltype(spu.rdata)>(cmd.eal);
-				const auto to_write = spu._ref<decltype(spu.rdata)>(cmd.lsa & 0x3ffff);
+				auto& cmd = spu.mfc_queue[i];
 
-				cmd.size = 0;
-				no_updates = 0;
+				// This check all revolves around a potential 'stalled list' in the queue, as it's the one thing that can currently cause out-of-order MFC list execution
+				// A list with a barrier hard-blocks that tag until it's been dealt with,
+				// and a new command that has a fence can't be executed until the stalled list has been dealt with
+				if ((cmd.size != 0) && ((barrier_mask & (1u << cmd.tag)) || ((cmd.cmd & MFC_FENCE_MASK) && ((1 << cmd.tag) & fence_mask))))
+					continue;
 
-				vm::reservation_acquire(cmd.eal, 128);
-
-				// Store unconditionally
-				if (s_use_rtm && utils::transaction_enter())
+				if ((cmd.cmd & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK)) == MFC_PUTQLLUC_CMD)
 				{
-					if (!vm::reader_lock{vm::try_to_lock})
-					{
-						_xabort(0);
-					}
+					auto& data = vm::ps3::_ref<decltype(spu.rdata)>(cmd.eal);
+					const auto to_write = spu._ref<decltype(spu.rdata)>(cmd.lsa & 0x3ffff);
 
-					data = to_write;
-					vm::reservation_update(cmd.eal, 128);
-					vm::notify(cmd.eal, 128);
-					_xend();
-				}
-				else
-				{
-					vm::writer_lock lock(0);
-					data = to_write;
-					vm::reservation_update(cmd.eal, 128);
-					vm::notify(cmd.eal, 128);
-				}
-			}
-			else if (cmd.cmd & MFC_LIST_MASK)
-			{
-				struct list_element
-				{
-					be_t<u16> sb; // Stall-and-Notify bit (0x8000)
-					be_t<u16> ts; // List Transfer Size
-					be_t<u32> ea; // External Address Low
-				};
-
-				if (cmd.size && (spu.ch_stall_mask & (1u << cmd.tag)) == 0)
-				{
-					cmd.lsa &= 0x3fff0;
-
-					const list_element item = spu._ref<list_element>(cmd.eal & 0x3fff8);
-
-					const u32 size = item.ts;
-					const u32 addr = item.ea;
-
-					if (size)
-					{
-						spu_mfc_cmd transfer;
-						transfer.eal = addr;
-						transfer.eah = 0;
-						transfer.lsa = cmd.lsa | (addr & 0xf);
-						transfer.tag = cmd.tag;
-						transfer.cmd = MFC(cmd.cmd & ~MFC_LIST_MASK);
-						transfer.size = size;
-
-						spu.do_dma_transfer(transfer);
-						cmd.lsa += std::max<u32>(size, 16);
-					}
-
-					cmd.eal += 8;
-					cmd.size -= 8;
+					cmd.size = 0;
 					no_updates = 0;
 
-					if (item.sb & 0x8000)
+					vm::reservation_acquire(cmd.eal, 128);
+
+					// Store unconditionally
+					if (s_use_rtm && utils::transaction_enter())
 					{
-						spu.ch_stall_stat.push_or(spu, 1 << cmd.tag);
-
-						const u32 evt = spu.ch_event_stat.fetch_or(SPU_EVENT_SN);
-
-						if (evt & SPU_EVENT_WAITING)
+						if (!vm::reader_lock{ vm::try_to_lock })
 						{
-							spu.notify();
-						}
-						else if (evt & SPU_EVENT_INTR_ENABLED)
-						{
-							spu.state += cpu_flag::suspend;
+							_xabort(0);
 						}
+
+						data = to_write;
+						vm::reservation_update(cmd.eal, 128);
+						vm::notify(cmd.eal, 128);
+						_xend();
+					}
+					else
+					{
+						vm::writer_lock lock(0);
+						data = to_write;
+						vm::reservation_update(cmd.eal, 128);
+						vm::notify(cmd.eal, 128);
 					}
 				}
-			}
-			else if (LIKELY(cmd.size))
-			{
-				spu.do_dma_transfer(cmd);
-				cmd.size = 0;
-			}
-			else if (UNLIKELY((cmd.cmd & ~0xc) == MFC_BARRIER_CMD))
-			{
-				// TODO (MFC_BARRIER_CMD, MFC_EIEIO_CMD, MFC_SYNC_CMD)
-				_mm_mfence();
-			}
+				else if (cmd.cmd & MFC_LIST_MASK)
+				{
+					struct list_element
+					{
+						be_t<u16> sb; // Stall-and-Notify bit (0x8000)
+						be_t<u16> ts; // List Transfer Size
+						be_t<u32> ea; // External Address Low
+					};
 
-			if (!cmd.size)
-			{
-				spu.mfc_queue.end_pop();
-				no_updates = 0;
+					if (cmd.size && (spu.ch_stall_mask & (1u << cmd.tag)) == 0)
+					{
+						cmd.lsa &= 0x3fff0;
+
+						// Try to get the whole list done in one go
+						while (cmd.size != 0)
+						{
+							const list_element item = spu._ref<list_element>(cmd.eal & 0x3fff8);
+
+							const u32 size = item.ts;
+							const u32 addr = item.ea;
+
+							if (size)
+							{
+								spu_mfc_cmd transfer;
+								transfer.eal = addr;
+								transfer.eah = 0;
+								transfer.lsa = cmd.lsa | (addr & 0xf);
+								transfer.tag = cmd.tag;
+								transfer.cmd = MFC(cmd.cmd & ~MFC_LIST_MASK);
+								transfer.size = size;
+
+								spu.do_dma_transfer(transfer);
+								cmd.lsa += std::max<u32>(size, 16);
+							}
+
+							cmd.eal += 8;
+							cmd.size -= 8;
+							no_updates = 0;
+
+							// Don't stall for the last 'item' in the list
+							if ((item.sb & 0x8000) && (cmd.size != 0))
+							{
+								spu.ch_stall_mask |= (1 << cmd.tag);
+								spu.ch_stall_stat.push_or(spu, 1 << cmd.tag);
+
+								const u32 evt = spu.ch_event_stat.fetch_or(SPU_EVENT_SN);
+
+								if (evt & SPU_EVENT_WAITING)
+								{
+									spu.notify();
+								}
+								break;
+							}
+						}
+					}
+
+					if (cmd.size != 0 && (cmd.cmd & MFC_BARRIER_MASK))
+						barrier_mask |= (1 << cmd.tag);
+					else if (cmd.size != 0)
+						fence_mask |= (1 << cmd.tag);
+				}
+				else if (UNLIKELY((cmd.cmd & ~0xc) == MFC_BARRIER_CMD))
+				{
+					// Raw barrier / sync commands are tag-agnostic and hard-sync the MFC queue:
+					// need to guarantee everything ahead of this has been processed first
+					if (first)
+						cmd.size = 0;
+					else
+						break;
+				}
+				else if (LIKELY(cmd.size))
+				{
+					spu.do_dma_transfer(cmd);
+					cmd.size = 0;
+				}
+
+				if (!cmd.size && first)
+				{
+					spu.mfc_queue.end_pop();
+					no_updates = 0;
+					break;
+				}
+				else if (!cmd.size && i == 1)
+				{
+					// Nasty hack: shove the stalled list down one slot.
+					// This *works* on the basis that the only thing that could have been passed over in position 0 is a stalled list.
+					// TODO: this can still report the MFC queue as full when it actually isn't, which can cause a rough deadlock between the SPU and MFC:
+					// the SPU waits for the queue to open up but hasn't signaled the stall yet
+					spu.mfc_queue[1] = spu.mfc_queue[0];
+					spu.mfc_queue.end_pop();
+					no_updates = 0;
+					break;
+				}
 			}
 		}
@@ -252,26 +287,21 @@ void mfc_thread::cpu_task()
 		{
 			// Mask incomplete transfers
 			u32 completed = spu.ch_tag_mask;
-
-			for (u32 i = 0; i < spu.mfc_queue.size(); i++)
 			{
-				const auto& _cmd = spu.mfc_queue[i];
-
-				if (_cmd.size)
+				for (u32 i = 0; i < spu.mfc_queue.size(); i++)
 				{
-					if (spu.ch_tag_upd == 1)
-					{
+					const auto& _cmd = spu.mfc_queue[i];
+					if (_cmd.size)
 						completed &= ~(1u << _cmd.tag);
-					}
-					else
-					{
-						completed = 0;
-						break;
-					}
 				}
 			}
 
-			if (completed && spu.ch_tag_upd.exchange(0))
+			if (completed && spu.ch_tag_upd.compare_and_swap_test(1, 0))
+			{
+				spu.ch_tag_stat.push(spu, completed);
+				no_updates = 0;
+			}
+			else if (completed && spu.ch_tag_mask == completed && spu.ch_tag_upd.compare_and_swap_test(2, 0))
 			{
 				spu.ch_tag_stat.push(spu, completed);
 				no_updates = 0;
@@ -280,7 +310,6 @@ void mfc_thread::cpu_task()
 			test_state();
 		}
 
-
 		if (no_updates++)
 		{
 			if (no_updates >= 3)
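The heart of the MFC.cpp change is the two-mask queue walk: an incomplete command whose opcode carries the barrier bit parks every later command in its tag group, while the fence bit only orders a command behind earlier incomplete traffic in its own group. Below is a minimal standalone sketch of that skip rule. All names are hypothetical, the modifier bit values (barrier = 0x1, fence = 0x2) are assumed from the MFC command encoding, and unlike the patch (which records the masks only for list commands left incomplete, a stalled list being the only thing that can sit unfinished mid-queue) this model lets any command stay pending:

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr uint32_t BARRIER_BIT = 0x1; // assumed MFC barrier modifier bit
constexpr uint32_t FENCE_BIT = 0x2;   // assumed MFC fence modifier bit

struct cmd_t
{
    uint32_t cmd;  // opcode, including barrier/fence modifier bits
    uint32_t tag;  // tag group, 0..31
    uint32_t size; // remaining bytes; 0 means complete
    bool stalled;  // stand-in for a list waiting on stall-and-notify
};

// One scheduling pass: complete what we can, skip what barriers/fences block.
void run_pass(std::vector<cmd_t>& queue)
{
    uint32_t fence_mask = 0;   // tags with an incomplete command ahead
    uint32_t barrier_mask = 0; // tags hard-blocked by a barrier ahead

    for (cmd_t& c : queue)
    {
        if (c.size == 0)
            continue; // already done, waiting to be popped

        const uint32_t bit = 1u << c.tag;

        // A barrier ahead blocks its whole tag group; a fence blocks this
        // command only if its own tag group has incomplete work ahead.
        const bool blocked = (barrier_mask & bit) != 0 ||
            ((c.cmd & FENCE_BIT) != 0 && (fence_mask & bit) != 0);

        if (!blocked && !c.stalled)
            c.size = 0; // model "execute to completion"

        if (c.size != 0) // still incomplete: commands behind must respect it
        {
            if (c.cmd & BARRIER_BIT)
                barrier_mask |= bit;
            else
                fence_mask |= bit;
        }
    }
}

int main()
{
    std::vector<cmd_t> q = {
        {BARRIER_BIT, 1, 0x80, true}, // stalled barrier list on tag 1
        {FENCE_BIT, 1, 0x80, false},  // fenced command, same tag: must wait
        {0, 2, 0x80, false},          // unrelated tag: may run out of order
    };

    run_pass(q);

    for (size_t i = 0; i < q.size(); i++)
        std::printf("cmd %zu: %s\n", i, q[i].size ? "pending" : "done");
    // prints: cmd 0: pending, cmd 1: pending, cmd 2: done
}

The fenced tag-1 command stays queued behind the stalled barrier list while the tag-2 transfer completes ahead of both, which is exactly the reordering the new loop permits.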
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 2eca529ee6..ccc22b1ec5 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -815,7 +815,7 @@ void SPUThread::process_mfc_cmd()
 	case MFC_GETLB_CMD:
 	case MFC_GETLF_CMD:
 	{
-		if (ch_mfc_cmd.size <= max_imm_dma_size && mfc_queue.size() == 0 && (ch_stall_mask & (1u << ch_mfc_cmd.tag)) == 0)
+		if (ch_mfc_cmd.size <= max_imm_dma_size && mfc_queue.size() == 0)
 		{
 			vm::reader_lock lock(vm::try_to_lock);
@@ -890,7 +890,7 @@ void SPUThread::process_mfc_cmd()
 	case MFC_EIEIO_CMD:
 	case MFC_SYNC_CMD:
 	{
-		ch_mfc_cmd.size = 0;
+		ch_mfc_cmd.size = 1;
 
 		if (mfc_queue.size() == 0)
 		{
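The second MFC.cpp hunk above also changes how tag-status updates are acknowledged: the old unconditional exchange(0) becomes two compare-and-swap paths, one for "update on any tag" requests (value 1) and one for "update on all tags" (value 2), which the SPUThread.cpp change complements by keeping barrier/EIEIO/SYNC commands queued with a nonzero size until the MFC thread reaches them. A sketch of that signalling modeled with std::atomic follows; the request codes follow the MFC tag-update convention, and every name here is a stand-in, not the emulator's API:

#include <atomic>
#include <cstdint>

std::atomic<uint32_t> ch_tag_upd{0}; // stand-in for the SPU's tag-update channel

// Models atomic_t::compare_and_swap_test: true only if the value was
// `expected` and the swap to `desired` actually happened.
bool cas_test(std::atomic<uint32_t>& v, uint32_t expected, uint32_t desired)
{
    return v.compare_exchange_strong(expected, desired);
}

// completed: tag groups with no incomplete transfers left in the queue
// tag_mask:  tag groups the SPU asked about via its tag-mask channel
bool try_signal_tag_status(uint32_t completed, uint32_t tag_mask)
{
    // "any" mode: at least one requested tag group has finished
    if (completed && cas_test(ch_tag_upd, 1, 0))
        return true;

    // "all" mode: every requested tag group has finished
    if (completed && completed == tag_mask && cas_test(ch_tag_upd, 2, 0))
        return true;

    return false;
}

int main()
{
    ch_tag_upd = 2; // SPU requested "update on all tags"
    bool s1 = try_signal_tag_status(0b01, 0b11); // false: tag 1 still pending
    bool s2 = try_signal_tag_status(0b11, 0b11); // true: all done, reset to 0
    return (!s1 && s2) ? 0 : 1;
}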
diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h
index fa45db4970..05604dc542 100644
--- a/rpcs3/Emu/System.h
+++ b/rpcs3/Emu/System.h
@@ -283,7 +283,7 @@ struct cfg_root : cfg::node
 	cfg::_bool bind_spu_cores{this, "Bind SPU threads to secondary cores"};
 	cfg::_bool lower_spu_priority{this, "Lower SPU thread priority"};
 	cfg::_bool spu_debug{this, "SPU Debug"};
-	cfg::_int<32, 16384> max_spu_immediate_write_size{this, "Maximum immediate DMA write size", 16384}; // Maximum size that an SPU thread can write directly without posting to MFC
+	cfg::_int<0, 16384> max_spu_immediate_write_size{this, "Maximum immediate DMA write size", 16384}; // Maximum size that an SPU thread can write directly without posting to MFC
 	cfg::_int<0, 6> preferred_spu_threads{this, "Preferred SPU Threads", 0}; // Number of hardware threads dedicated to heavy simultaneous SPU tasks
 	cfg::_int<0, 16> spu_delay_penalty{this, "SPU delay penalty", 3}; // Number of milliseconds to block a thread if a virtual 'core' isn't free
 	cfg::_bool spu_loop_detection{this, "SPU loop detection", true}; // Try to detect wait loops and trigger thread yield
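Dropping the config floor from 32 to 0 makes "never write immediately" expressible: at 0, every transfer fails the fast-path size test and is posted to the MFC thread, exercising the new queue logic on every command. A hypothetical sketch of the clamped setting and the fast-path condition it feeds, mirroring the process_mfc_cmd check above (names assumed):

#include <cstdint>
#include <cstdio>

// Model of the widened bound: values clamp to [0, 16384] instead of
// [32, 16384], so 0 can now mean "always post to the MFC thread".
constexpr int64_t cfg_min = 0, cfg_max = 16384;

uint32_t clamp_setting(int64_t requested)
{
    if (requested < cfg_min) return static_cast<uint32_t>(cfg_min);
    if (requested > cfg_max) return static_cast<uint32_t>(cfg_max);
    return static_cast<uint32_t>(requested);
}

// Mirrors the fast-path condition in SPUThread::process_mfc_cmd above
bool use_immediate_dma(uint32_t size, uint32_t max_imm_dma_size, size_t queue_depth)
{
    return size <= max_imm_dma_size && queue_depth == 0;
}

int main()
{
    const uint32_t off = clamp_setting(0); // the old floor would have raised this to 32
    std::printf("immediate path taken: %d\n", use_immediate_dma(16, off, 0)); // prints 0
}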