mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-14 10:48:36 +12:00
rsx: Rework RSX offloading
- Use a lockless queue - Do not enqueue small transfers
This commit is contained in:
parent
c32c1b0a62
commit
b893a75002
6 changed files with 210 additions and 176 deletions
129
rpcs3/Emu/RSX/RSXOffload.cpp
Normal file
129
rpcs3/Emu/RSX/RSXOffload.cpp
Normal file
|
@ -0,0 +1,129 @@
|
|||
#include "stdafx.h"
|
||||
|
||||
#include "Common/BufferUtils.h"
|
||||
#include "Emu/System.h"
|
||||
#include "RSXOffload.h"
|
||||
|
||||
namespace rsx
|
||||
{
|
||||
// initialization
|
||||
void dma_manager::init()
|
||||
{
|
||||
m_worker_state = thread_state::created;
|
||||
m_worker_thread = std::thread([this]()
|
||||
{
|
||||
if (!g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
// Abort
|
||||
return;
|
||||
}
|
||||
|
||||
if (g_cfg.core.thread_scheduler_enabled)
|
||||
{
|
||||
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx));
|
||||
}
|
||||
|
||||
bool idle = false;
|
||||
while (m_worker_state != thread_state::finished)
|
||||
{
|
||||
if (m_jobs_count)
|
||||
{
|
||||
if (idle)
|
||||
{
|
||||
thread_ctrl::set_native_priority(0);
|
||||
idle = false;
|
||||
}
|
||||
|
||||
for (auto slice = m_work_queue.pop_all(); slice; slice.pop_front())
|
||||
{
|
||||
auto task = *slice;
|
||||
switch (task.type)
|
||||
{
|
||||
case raw_copy:
|
||||
memcpy(task.dst, task.src, task.length);
|
||||
break;
|
||||
case vector_copy:
|
||||
memcpy(task.dst, task.opt_storage.data(), task.length);
|
||||
break;
|
||||
case index_emulate:
|
||||
write_index_array_for_non_indexed_non_native_primitive_to_buffer(
|
||||
reinterpret_cast<char*>(task.dst),
|
||||
static_cast<rsx::primitive_type>(task.aux_param0),
|
||||
task.length);
|
||||
break;
|
||||
default:
|
||||
ASSUME(0);
|
||||
fmt::throw_exception("Unreachable" HERE);
|
||||
}
|
||||
|
||||
m_jobs_count--;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
idle = true;
|
||||
thread_ctrl::set_native_priority(-1);
|
||||
std::this_thread::yield();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// General transport
|
||||
void dma_manager::copy(void *dst, std::vector<u8>& src, u32 length)
|
||||
{
|
||||
if (length <= max_immediate_transfer_size || !g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
std::memcpy(dst, src.data(), length);
|
||||
}
|
||||
else
|
||||
{
|
||||
++m_jobs_count;
|
||||
m_work_queue.push(dst, src, length);
|
||||
}
|
||||
}
|
||||
|
||||
void dma_manager::copy(void *dst, void *src, u32 length)
|
||||
{
|
||||
if (length <= max_immediate_transfer_size || !g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
std::memcpy(dst, src, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
++m_jobs_count;
|
||||
m_work_queue.push(dst, src, length);
|
||||
}
|
||||
}
|
||||
|
||||
// Vertex utilities
|
||||
void dma_manager::emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count)
|
||||
{
|
||||
if (!g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
write_index_array_for_non_indexed_non_native_primitive_to_buffer(
|
||||
reinterpret_cast<char*>(dst), primitive, count);
|
||||
}
|
||||
else
|
||||
{
|
||||
++m_jobs_count;
|
||||
m_work_queue.push(dst, primitive, count);
|
||||
}
|
||||
}
|
||||
|
||||
// Synchronization
|
||||
void dma_manager::sync()
|
||||
{
|
||||
if (g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
while (m_jobs_count)
|
||||
_mm_lfence();
|
||||
}
|
||||
}
|
||||
|
||||
void dma_manager::join()
|
||||
{
|
||||
m_worker_state = thread_state::finished;
|
||||
m_worker_thread.join();
|
||||
}
|
||||
}
|
72
rpcs3/Emu/RSX/RSXOffload.h
Normal file
72
rpcs3/Emu/RSX/RSXOffload.h
Normal file
|
@ -0,0 +1,72 @@
|
|||
#pragma once
|
||||
|
||||
#include "Utilities/types.h"
|
||||
#include "Utilities/lockless.h"
|
||||
#include "Utilities/Thread.h"
|
||||
#include "gcm_enums.h"
|
||||
|
||||
#include <vector>
|
||||
#include <thread>
|
||||
|
||||
namespace rsx
|
||||
{
|
||||
class dma_manager
|
||||
{
|
||||
enum op
|
||||
{
|
||||
raw_copy = 0,
|
||||
vector_copy = 1,
|
||||
index_emulate = 2
|
||||
};
|
||||
|
||||
struct transport_packet
|
||||
{
|
||||
op type;
|
||||
std::vector<u8> opt_storage;
|
||||
void *src;
|
||||
void *dst;
|
||||
u32 length;
|
||||
u32 aux_param0;
|
||||
u32 aux_param1;
|
||||
|
||||
transport_packet(void *_dst, void *_src, u32 len)
|
||||
: src(_src), dst(_dst), length(len), type(op::raw_copy)
|
||||
{}
|
||||
|
||||
transport_packet(void *_dst, std::vector<u8>& _src, u32 len)
|
||||
: dst(_dst), opt_storage(std::move(_src)), length(len), type(op::vector_copy)
|
||||
{}
|
||||
|
||||
transport_packet(void *_dst, rsx::primitive_type prim, u32 len)
|
||||
: dst(_dst), aux_param0(static_cast<u8>(prim)), length(len), type(op::index_emulate)
|
||||
{}
|
||||
};
|
||||
|
||||
lf_queue<transport_packet> m_work_queue;
|
||||
atomic_t<int> m_jobs_count;
|
||||
std::thread m_worker_thread;
|
||||
thread_state m_worker_state;
|
||||
|
||||
// TODO: Improved benchmarks here; value determined by profiling on a Ryzen CPU, rounded to the nearest 512 bytes
|
||||
const u32 max_immediate_transfer_size = 3584;
|
||||
|
||||
public:
|
||||
dma_manager() = default;
|
||||
|
||||
// initialization
|
||||
void init();
|
||||
|
||||
// General tranport
|
||||
void copy(void *dst, std::vector<u8>& src, u32 length);
|
||||
void copy(void *dst, void *src, u32 length);
|
||||
|
||||
// Vertex utilities
|
||||
void emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count);
|
||||
|
||||
// Synchronization
|
||||
void sync();
|
||||
void join();
|
||||
};
|
||||
|
||||
extern dma_manager g_dma_manager;
|
||||
}
|
|
@ -251,126 +251,6 @@ namespace rsx
|
|||
}
|
||||
}
|
||||
|
||||
// initialization
|
||||
void dma_manager::init()
|
||||
{
|
||||
m_worker_state = thread_state::created;
|
||||
m_worker_thread = std::thread([this]()
|
||||
{
|
||||
if (!g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
// Abort
|
||||
return;
|
||||
}
|
||||
|
||||
if (g_cfg.core.thread_scheduler_enabled)
|
||||
{
|
||||
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx));
|
||||
}
|
||||
|
||||
bool idle = false;
|
||||
while (m_worker_state != thread_state::finished)
|
||||
{
|
||||
if (!m_work_queue.empty())
|
||||
{
|
||||
m_queue_mutex.lock();
|
||||
auto task = std::move(m_work_queue.front());
|
||||
m_work_queue.pop_front();
|
||||
m_queue_mutex.unlock();
|
||||
|
||||
if (idle)
|
||||
{
|
||||
thread_ctrl::set_native_priority(0);
|
||||
idle = false;
|
||||
}
|
||||
|
||||
switch (task.type)
|
||||
{
|
||||
case raw_copy:
|
||||
memcpy(task.dst, task.src, task.length);
|
||||
break;
|
||||
case vector_copy:
|
||||
memcpy(task.dst, task.opt_storage.data(), task.length);
|
||||
break;
|
||||
case index_emulate:
|
||||
write_index_array_for_non_indexed_non_native_primitive_to_buffer(
|
||||
reinterpret_cast<char*>(task.dst),
|
||||
static_cast<rsx::primitive_type>(task.aux_param0),
|
||||
task.length);
|
||||
break;
|
||||
default:
|
||||
ASSUME(0);
|
||||
fmt::throw_exception("Unreachable" HERE);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
idle = true;
|
||||
thread_ctrl::set_native_priority(-1);
|
||||
std::this_thread::yield();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// General transport
|
||||
void dma_manager::copy(void *dst, std::vector<u8>& src, u32 length)
|
||||
{
|
||||
if (!g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
std::memcpy(dst, src.data(), length);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::lock_guard lock(m_queue_mutex);
|
||||
m_work_queue.emplace_back(dst, src, length);
|
||||
}
|
||||
}
|
||||
|
||||
void dma_manager::copy(void *dst, void *src, u32 length)
|
||||
{
|
||||
if (!g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
std::memcpy(dst, src, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::lock_guard lock(m_queue_mutex);
|
||||
m_work_queue.emplace_back(dst, src, length);
|
||||
}
|
||||
}
|
||||
|
||||
// Vertex utilities
|
||||
void dma_manager::emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count)
|
||||
{
|
||||
if (!g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
write_index_array_for_non_indexed_non_native_primitive_to_buffer(
|
||||
reinterpret_cast<char*>(dst), primitive, count);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::lock_guard lock(m_queue_mutex);
|
||||
m_work_queue.emplace_back(dst, primitive, count);
|
||||
}
|
||||
}
|
||||
|
||||
// Synchronization
|
||||
void dma_manager::sync()
|
||||
{
|
||||
if (g_cfg.video.multithreaded_rsx)
|
||||
{
|
||||
while (!m_work_queue.empty())
|
||||
_mm_lfence();
|
||||
}
|
||||
}
|
||||
|
||||
void dma_manager::join()
|
||||
{
|
||||
m_worker_state = thread_state::finished;
|
||||
m_worker_thread.join();
|
||||
}
|
||||
|
||||
thread::thread()
|
||||
{
|
||||
g_current_renderer = this;
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#include "rsx_cache.h"
|
||||
#include "RSXFIFO.h"
|
||||
#include "RSXTexture.h"
|
||||
#include "RSXOffload.h"
|
||||
#include "RSXVertexProgram.h"
|
||||
#include "RSXFragmentProgram.h"
|
||||
#include "rsx_methods.h"
|
||||
|
@ -297,62 +298,6 @@ namespace rsx
|
|||
}
|
||||
};
|
||||
|
||||
class dma_manager
|
||||
{
|
||||
enum op
|
||||
{
|
||||
raw_copy = 0,
|
||||
vector_copy = 1,
|
||||
index_emulate = 2
|
||||
};
|
||||
|
||||
struct transport_packet
|
||||
{
|
||||
op type;
|
||||
std::vector<u8> opt_storage;
|
||||
void *src;
|
||||
void *dst;
|
||||
u32 length;
|
||||
u32 aux_param0;
|
||||
u32 aux_param1;
|
||||
|
||||
transport_packet(void *_dst, void *_src, u32 len)
|
||||
: src(_src), dst(_dst), length(len), type(op::raw_copy)
|
||||
{}
|
||||
|
||||
transport_packet(void *_dst, std::vector<u8>& _src, u32 len)
|
||||
: dst(_dst), opt_storage(std::move(_src)), length(len), type(op::vector_copy)
|
||||
{}
|
||||
|
||||
transport_packet(void *_dst, rsx::primitive_type prim, u32 len)
|
||||
: dst(_dst), aux_param0(static_cast<u8>(prim)), length(len), type(op::index_emulate)
|
||||
{}
|
||||
};
|
||||
|
||||
std::deque<transport_packet> m_work_queue;
|
||||
std::thread m_worker_thread;
|
||||
std::mutex m_queue_mutex;
|
||||
thread_state m_worker_state;
|
||||
|
||||
public:
|
||||
dma_manager() = default;
|
||||
|
||||
// initialization
|
||||
void init();
|
||||
|
||||
// General tranport
|
||||
void copy(void *dst, std::vector<u8>& src, u32 length);
|
||||
void copy(void *dst, void *src, u32 length);
|
||||
|
||||
// Vertex utilities
|
||||
void emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count);
|
||||
|
||||
// Synchronization
|
||||
void sync();
|
||||
void join();
|
||||
};
|
||||
extern dma_manager g_dma_manager;
|
||||
|
||||
struct framebuffer_layout
|
||||
{
|
||||
u16 width;
|
||||
|
|
|
@ -315,6 +315,7 @@
|
|||
<ClCompile Include="Emu\RSX\Overlays\overlay_shader_compile_notification.cpp" />
|
||||
<ClCompile Include="Emu\RSX\Overlays\overlay_trophy_notification.cpp" />
|
||||
<ClCompile Include="Emu\RSX\RSXFIFO.cpp" />
|
||||
<ClCompile Include="Emu\RSX\RSXOffload.cpp" />
|
||||
<ClCompile Include="Emu\RSX\rsx_methods.cpp" />
|
||||
<ClCompile Include="Emu\RSX\rsx_utils.cpp" />
|
||||
<ClCompile Include="Crypto\aes.cpp">
|
||||
|
@ -558,6 +559,7 @@
|
|||
<ClInclude Include="Emu\RSX\Overlays\overlays.h" />
|
||||
<ClInclude Include="Emu\RSX\Overlays\overlay_controls.h" />
|
||||
<ClInclude Include="Emu\RSX\RSXFIFO.h" />
|
||||
<ClInclude Include="Emu\RSX\RSXOffload.h" />
|
||||
<ClInclude Include="Emu\RSX\rsx_cache.h" />
|
||||
<ClInclude Include="Emu\RSX\rsx_decode.h" />
|
||||
<ClInclude Include="Emu\RSX\rsx_vertex_data.h" />
|
||||
|
|
|
@ -806,6 +806,9 @@
|
|||
<ClCompile Include="Emu\RSX\Overlays\overlay_progress_bar.cpp">
|
||||
<Filter>Emu\GPU\RSX\Overlays</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="Emu\RSX\RSXOffload.cpp">
|
||||
<Filter>Emu\GPU\RSX</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="Crypto\aes.h">
|
||||
|
@ -1522,5 +1525,8 @@
|
|||
<ClInclude Include="Emu\RSX\Common\surface_utils.h">
|
||||
<Filter>Emu\GPU\RSX\Common</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="Emu\RSX\RSXOffload.h">
|
||||
<Filter>Emu\GPU\RSX</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
</Project>
|
Loading…
Add table
Add a link
Reference in a new issue