rsx: Rework RSX offloading

- Use a lockless queue
- Do not enqueue small transfers
This commit is contained in:
kd-11 2019-06-18 19:47:23 +03:00 committed by kd-11
parent c32c1b0a62
commit b893a75002
6 changed files with 210 additions and 176 deletions

View file

@ -0,0 +1,129 @@
#include "stdafx.h"
#include "Common/BufferUtils.h"
#include "Emu/System.h"
#include "RSXOffload.h"
namespace rsx
{
// initialization
// Resets the worker state to `created` and launches the DMA worker thread.
//
// The thread returns immediately when multithreaded RSX is disabled (it is
// still joinable afterwards). Otherwise it optionally pins itself to the RSX
// affinity mask and then loops until `m_worker_state` becomes `finished`:
//  - while `m_jobs_count` is non-zero it drains the lock-free queue and
//    executes every packet, decrementing `m_jobs_count` once per packet;
//  - otherwise it drops its native priority to -1 and yields.
void dma_manager::init()
{
	m_worker_state = thread_state::created;
	m_worker_thread = std::thread([this]()
	{
		if (!g_cfg.video.multithreaded_rsx)
		{
			// Abort: every request is serviced inline by the caller in this mode
			return;
		}

		if (g_cfg.core.thread_scheduler_enabled)
		{
			thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx));
		}

		bool idle = false;

		while (m_worker_state != thread_state::finished)
		{
			if (m_jobs_count)
			{
				if (idle)
				{
					// Work arrived after an idle period; restore the priority
					thread_ctrl::set_native_priority(0);
					idle = false;
				}

				for (auto slice = m_work_queue.pop_all(); slice; slice.pop_front())
				{
					// Bind by reference: copying the packet would duplicate opt_storage,
					// which can hold an entire transfer payload
					const auto& task = *slice;

					switch (task.type)
					{
					case raw_copy:
						memcpy(task.dst, task.src, task.length);
						break;
					case vector_copy:
						memcpy(task.dst, task.opt_storage.data(), task.length);
						break;
					case index_emulate:
						write_index_array_for_non_indexed_non_native_primitive_to_buffer(
							reinterpret_cast<char*>(task.dst),
							static_cast<rsx::primitive_type>(task.aux_param0),
							task.length);
						break;
					default:
						ASSUME(0);
						fmt::throw_exception("Unreachable" HERE);
					}

					// Only mark the job done after its work has been carried out;
					// sync() spins on this counter
					m_jobs_count--;
				}
			}
			else
			{
				if (!idle)
				{
					// Lower the priority once per idle period instead of on every spin
					thread_ctrl::set_native_priority(-1);
					idle = true;
				}

				std::this_thread::yield();
			}
		}
	});
}
// General transport
// Transfers `length` bytes from the vector `src` to `dst`.
//
// Transfers larger than max_immediate_transfer_size are handed to the worker
// when multithreaded RSX is enabled; the packet constructor for this overload
// moves from `src`, so the caller's vector should be treated as consumed on
// that path. Everything else is copied synchronously and `src` is untouched.
void dma_manager::copy(void *dst, std::vector<u8>& src, u32 length)
{
	const bool offload = length > max_immediate_transfer_size && g_cfg.video.multithreaded_rsx;

	if (offload)
	{
		// Count the job before publishing it so sync() cannot miss it
		++m_jobs_count;
		m_work_queue.push(dst, src, length);
		return;
	}

	std::memcpy(dst, src.data(), length);
}
// Transfers `length` bytes from `src` to `dst`.
//
// Transfers larger than max_immediate_transfer_size are queued for the worker
// when multithreaded RSX is enabled. Only the raw pointers are stored in the
// packet, so both buffers have to stay valid until the worker has executed
// the job. Everything else is copied synchronously.
void dma_manager::copy(void *dst, void *src, u32 length)
{
	const bool offload = length > max_immediate_transfer_size && g_cfg.video.multithreaded_rsx;

	if (offload)
	{
		// Count the job before publishing it so sync() cannot miss it
		++m_jobs_count;
		m_work_queue.push(dst, src, length);
		return;
	}

	std::memcpy(dst, src, length);
}
// Vertex utilities
// Writes the emulated index array for `primitive` / `count` into `dst`.
//
// With multithreaded RSX enabled the request is always queued for the worker
// (no size threshold is applied here); otherwise the index array is generated
// on the calling thread before returning.
void dma_manager::emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count)
{
	if (g_cfg.video.multithreaded_rsx)
	{
		// Count the job before publishing it so sync() cannot miss it
		++m_jobs_count;
		m_work_queue.push(dst, primitive, count);
		return;
	}

	write_index_array_for_non_indexed_non_native_primitive_to_buffer(
		reinterpret_cast<char*>(dst), primitive, count);
}
// Synchronization
// Blocks the caller until the outstanding job counter reaches zero.
// Returns immediately when multithreaded RSX is disabled, since nothing is
// ever queued in that mode.
void dma_manager::sync()
{
	if (!g_cfg.video.multithreaded_rsx)
	{
		return;
	}

	// Busy-wait; the worker decrements the counter after finishing each job
	while (m_jobs_count)
	{
		_mm_lfence();
	}
}
// Asks the worker loop to stop by setting the state to `finished`, then waits
// for the worker thread to exit.
//
// Safe to call when no worker is running (init() never called, or join()
// already called): std::thread::join() throws std::system_error on a
// non-joinable thread, so that case is skipped.
void dma_manager::join()
{
	m_worker_state = thread_state::finished;

	if (m_worker_thread.joinable())
	{
		m_worker_thread.join();
	}
}
}

View file

@ -0,0 +1,72 @@
#pragma once
#include "Utilities/types.h"
#include "Utilities/lockless.h"
#include "Utilities/Thread.h"
#include "gcm_enums.h"
#include <vector>
#include <thread>
namespace rsx
{
// Offloads memory transfers and index-buffer emulation to a worker thread.
//
// Producers call copy() / emulate_as_indexed(); the worker started by init()
// drains m_work_queue. sync() waits for all queued jobs, join() stops the worker.
class dma_manager
{
	// Kind of work carried by a transport_packet
	enum op
	{
		raw_copy = 0,      // memcpy from src to dst
		vector_copy = 1,   // memcpy from opt_storage to dst
		index_emulate = 2  // generate an emulated index array into dst
	};

	// One queued job. Which fields are meaningful depends on `type`;
	// the remaining fields keep their default values.
	struct transport_packet
	{
		op type;
		std::vector<u8> opt_storage;   // owned payload (vector_copy only)
		void *src = nullptr;           // borrowed source (raw_copy only)
		void *dst = nullptr;           // destination for every job type
		u32 length = 0;                // byte count, or element count for index_emulate
		u32 aux_param0 = 0;            // primitive type (index_emulate only)
		u32 aux_param1 = 0;            // not set by any constructor here

		// Initializer lists follow declaration order, which is the order
		// members are actually initialized in.
		transport_packet(void *_dst, void *_src, u32 len)
			: type(op::raw_copy), src(_src), dst(_dst), length(len)
		{}

		// Takes over the contents of _src; the caller's vector is moved from
		transport_packet(void *_dst, std::vector<u8>& _src, u32 len)
			: type(op::vector_copy), opt_storage(std::move(_src)), dst(_dst), length(len)
		{}

		transport_packet(void *_dst, rsx::primitive_type prim, u32 len)
			: type(op::index_emulate), dst(_dst), length(len), aux_param0(static_cast<u8>(prim))
		{}
	};

	lf_queue<transport_packet> m_work_queue;

	// Jobs queued but not yet completed; incremented by producers, decremented by the worker
	atomic_t<int> m_jobs_count{0};

	std::thread m_worker_thread;

	// Written by init()/join() and polled by the worker thread, so it must be atomic
	atomic_t<thread_state> m_worker_state{thread_state::created};

	// TODO: Improved benchmarks here; value determined by profiling on a Ryzen CPU, rounded to the nearest 512 bytes
	static constexpr u32 max_immediate_transfer_size = 3584;

public:
	dma_manager() = default;

	// initialization
	void init();

	// General transport
	void copy(void *dst, std::vector<u8>& src, u32 length);
	void copy(void *dst, void *src, u32 length);

	// Vertex utilities
	void emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count);

	// Synchronization
	void sync();
	void join();
};
extern dma_manager g_dma_manager;
}

View file

@ -251,126 +251,6 @@ namespace rsx
}
}
// initialization
// Previous (mutex + deque) version of the worker start-up routine.
// Sets the state to `created` and launches a thread that pops one packet at a
// time from m_work_queue under m_queue_mutex and executes it.
void dma_manager::init()
{
m_worker_state = thread_state::created;
m_worker_thread = std::thread([this]()
{
if (!g_cfg.video.multithreaded_rsx)
{
// Abort
return;
}
if (g_cfg.core.thread_scheduler_enabled)
{
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx));
}
bool idle = false;
while (m_worker_state != thread_state::finished)
{
// NOTE(review): this emptiness check runs before the mutex is taken
if (!m_work_queue.empty())
{
m_queue_mutex.lock();
auto task = std::move(m_work_queue.front());
m_work_queue.pop_front();
m_queue_mutex.unlock();
// The packet has left the queue at this point but has not been executed yet
if (idle)
{
thread_ctrl::set_native_priority(0);
idle = false;
}
switch (task.type)
{
case raw_copy:
memcpy(task.dst, task.src, task.length);
break;
case vector_copy:
memcpy(task.dst, task.opt_storage.data(), task.length);
break;
case index_emulate:
write_index_array_for_non_indexed_non_native_primitive_to_buffer(
reinterpret_cast<char*>(task.dst),
static_cast<rsx::primitive_type>(task.aux_param0),
task.length);
break;
default:
ASSUME(0);
fmt::throw_exception("Unreachable" HERE);
}
}
else
{
// Nothing queued: lower the native priority and give up the time slice
idle = true;
thread_ctrl::set_native_priority(-1);
std::this_thread::yield();
}
}
});
}
// General transport
// Previous version: copies inline when multithreaded RSX is disabled,
// otherwise appends a packet to the deque under m_queue_mutex regardless of size.
void dma_manager::copy(void *dst, std::vector<u8>& src, u32 length)
{
if (!g_cfg.video.multithreaded_rsx)
{
std::memcpy(dst, src.data(), length);
}
else
{
std::lock_guard lock(m_queue_mutex);
m_work_queue.emplace_back(dst, src, length);
}
}
// Previous version: copies inline when multithreaded RSX is disabled,
// otherwise appends a raw-pointer packet to the deque under m_queue_mutex.
void dma_manager::copy(void *dst, void *src, u32 length)
{
if (!g_cfg.video.multithreaded_rsx)
{
std::memcpy(dst, src, length);
}
else
{
std::lock_guard lock(m_queue_mutex);
m_work_queue.emplace_back(dst, src, length);
}
}
// Vertex utilities
// Previous version: generates the emulated index array inline when
// multithreaded RSX is disabled, otherwise queues the request under m_queue_mutex.
void dma_manager::emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count)
{
if (!g_cfg.video.multithreaded_rsx)
{
write_index_array_for_non_indexed_non_native_primitive_to_buffer(
reinterpret_cast<char*>(dst), primitive, count);
}
else
{
std::lock_guard lock(m_queue_mutex);
m_work_queue.emplace_back(dst, primitive, count);
}
}
// Synchronization
// Previous version: spins until the deque is observed empty.
// NOTE(review): the worker pops a packet before executing it, so an empty
// queue here does not by itself show that the last job has finished; the
// check is also made without holding m_queue_mutex.
void dma_manager::sync()
{
if (g_cfg.video.multithreaded_rsx)
{
while (!m_work_queue.empty())
_mm_lfence();
}
}
// Previous version: flags the worker loop to stop, then waits for the thread to exit.
void dma_manager::join()
{
m_worker_state = thread_state::finished;
m_worker_thread.join();
}
thread::thread()
{
g_current_renderer = this;

View file

@ -6,6 +6,7 @@
#include "rsx_cache.h"
#include "RSXFIFO.h"
#include "RSXTexture.h"
#include "RSXOffload.h"
#include "RSXVertexProgram.h"
#include "RSXFragmentProgram.h"
#include "rsx_methods.h"
@ -297,62 +298,6 @@ namespace rsx
}
};
// Previous declaration of the DMA offload manager: a std::deque of packets
// guarded by a std::mutex and serviced by a single worker thread.
class dma_manager
{
// Kind of work carried by a transport_packet
enum op
{
raw_copy = 0,
vector_copy = 1,
index_emulate = 2
};
// One queued job; which fields are set depends on the constructor used
struct transport_packet
{
op type;
std::vector<u8> opt_storage;
void *src;
void *dst;
u32 length;
u32 aux_param0;
u32 aux_param1;
// Raw pointer-to-pointer copy
transport_packet(void *_dst, void *_src, u32 len)
: src(_src), dst(_dst), length(len), type(op::raw_copy)
{}
// Copy out of a vector; the vector's contents are moved into the packet
transport_packet(void *_dst, std::vector<u8>& _src, u32 len)
: dst(_dst), opt_storage(std::move(_src)), length(len), type(op::vector_copy)
{}
// Index emulation request; the primitive type is stored in aux_param0
transport_packet(void *_dst, rsx::primitive_type prim, u32 len)
: dst(_dst), aux_param0(static_cast<u8>(prim)), length(len), type(op::index_emulate)
{}
};
std::deque<transport_packet> m_work_queue;
std::thread m_worker_thread;
std::mutex m_queue_mutex;
thread_state m_worker_state;
public:
dma_manager() = default;
// initialization
void init();
// General transport
void copy(void *dst, std::vector<u8>& src, u32 length);
void copy(void *dst, void *src, u32 length);
// Vertex utilities
void emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count);
// Synchronization
void sync();
void join();
};
extern dma_manager g_dma_manager;
struct framebuffer_layout
{
u16 width;

View file

@ -315,6 +315,7 @@
<ClCompile Include="Emu\RSX\Overlays\overlay_shader_compile_notification.cpp" />
<ClCompile Include="Emu\RSX\Overlays\overlay_trophy_notification.cpp" />
<ClCompile Include="Emu\RSX\RSXFIFO.cpp" />
<ClCompile Include="Emu\RSX\RSXOffload.cpp" />
<ClCompile Include="Emu\RSX\rsx_methods.cpp" />
<ClCompile Include="Emu\RSX\rsx_utils.cpp" />
<ClCompile Include="Crypto\aes.cpp">
@ -558,6 +559,7 @@
<ClInclude Include="Emu\RSX\Overlays\overlays.h" />
<ClInclude Include="Emu\RSX\Overlays\overlay_controls.h" />
<ClInclude Include="Emu\RSX\RSXFIFO.h" />
<ClInclude Include="Emu\RSX\RSXOffload.h" />
<ClInclude Include="Emu\RSX\rsx_cache.h" />
<ClInclude Include="Emu\RSX\rsx_decode.h" />
<ClInclude Include="Emu\RSX\rsx_vertex_data.h" />

View file

@ -806,6 +806,9 @@
<ClCompile Include="Emu\RSX\Overlays\overlay_progress_bar.cpp">
<Filter>Emu\GPU\RSX\Overlays</Filter>
</ClCompile>
<ClCompile Include="Emu\RSX\RSXOffload.cpp">
<Filter>Emu\GPU\RSX</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Crypto\aes.h">
@ -1522,5 +1525,8 @@
<ClInclude Include="Emu\RSX\Common\surface_utils.h">
<Filter>Emu\GPU\RSX\Common</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\RSXOffload.h">
<Filter>Emu\GPU\RSX</Filter>
</ClInclude>
</ItemGroup>
</Project>