rsx: ZCULL synchronization fixes

- Track asynchronous operations in RSX core
- Add read barriers to force pending writes to finish.
  Fixes zcull delay flicker in all UE3 titles without forcing a hard stall
- Increase zcull latency as all writes should be synchronized now
This commit is contained in:
kd-11 2018-03-13 16:34:31 +03:00
parent 315798b1f4
commit 2dce55d036
6 changed files with 87 additions and 24 deletions

View file

@ -1574,6 +1574,11 @@ void GLGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info*
} }
} }
//Discards an occlusion query on the GL backend by calling glEndQuery on the GL_ANY_SAMPLES_PASSED target.
//The 'query' argument is not inspected by this implementation.
//NOTE(review): presumably glEndQuery requires a query to be active on this target - confirm callers only discard while one is open
void GLGSRender::discard_occlusion_query(rsx::reports::occlusion_query_info* query)
{
glEndQuery(GL_ANY_SAMPLES_PASSED);
}
void GLGSRender::shell_do_cleanup() void GLGSRender::shell_do_cleanup()
{ {
//TODO: Key cleanup requests with UID to identify resources to remove //TODO: Key cleanup requests with UID to identify resources to remove

View file

@ -367,6 +367,7 @@ public:
void end_occlusion_query(rsx::reports::occlusion_query_info* query) override; void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
bool check_occlusion_query_status(rsx::reports::occlusion_query_info* query) override; bool check_occlusion_query_status(rsx::reports::occlusion_query_info* query) override;
void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override; void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override;
void discard_occlusion_query(rsx::reports::occlusion_query_info* query) override;
protected: protected:
void begin() override; void begin() override;

View file

@ -2092,7 +2092,7 @@ namespace rsx
//Reset zcull ctrl //Reset zcull ctrl
zcull_ctrl->set_active(this, false); zcull_ctrl->set_active(this, false);
zcull_ctrl->clear(); zcull_ctrl->clear(this);
if (zcull_ctrl->has_pending()) if (zcull_ctrl->has_pending())
{ {
@ -2142,7 +2142,7 @@ namespace rsx
if (g_cfg.video.disable_zcull_queries) if (g_cfg.video.disable_zcull_queries)
return; return;
zcull_ctrl->clear(); zcull_ctrl->clear(this);
} }
void thread::get_zcull_stats(u32 type, vm::addr_t sink) void thread::get_zcull_stats(u32 type, vm::addr_t sink)
@ -2153,18 +2153,13 @@ namespace rsx
switch (type) switch (type)
{ {
case CELL_GCM_ZPASS_PIXEL_CNT: case CELL_GCM_ZPASS_PIXEL_CNT:
{
zcull_ctrl->read_report(this, sink, type);
return;
}
case CELL_GCM_ZCULL_STATS: case CELL_GCM_ZCULL_STATS:
case CELL_GCM_ZCULL_STATS1: case CELL_GCM_ZCULL_STATS1:
case CELL_GCM_ZCULL_STATS2: case CELL_GCM_ZCULL_STATS2:
case CELL_GCM_ZCULL_STATS3: case CELL_GCM_ZCULL_STATS3:
{ {
//TODO zcull_ctrl->read_report(this, sink, type);
value = (type != CELL_GCM_ZCULL_STATS3)? UINT16_MAX : 0; return;
break;
} }
default: default:
LOG_ERROR(RSX, "Unknown zcull stat type %d", type); LOG_ERROR(RSX, "Unknown zcull stat type %d", type);
@ -2181,6 +2176,14 @@ namespace rsx
//Synchronizes the zcull controller, then checks that the async task counter has dropped to zero
void thread::sync() void thread::sync()
{ {
zcull_ctrl->sync(this); zcull_ctrl->sync(this);
//Memory fence issued before the counter is read
_mm_mfence();
//async_tasks_pending is expected to be 0 once zcull has been synced (presumably 'verify' aborts otherwise - confirm macro semantics)
verify (HERE), async_tasks_pending.load() == 0;
}
//Forwards a read barrier request for the given address and range to the zcull controller.
//memory_address: start of the region about to be read
//memory_range: size of that region, passed through unchanged
void thread::read_barrier(u32 memory_address, u32 memory_range)
{
zcull_ctrl->read_barrier(this, memory_address, memory_range);
} }
void thread::notify_zcull_info_changed() void thread::notify_zcull_info_changed()
@ -2328,6 +2331,7 @@ namespace rsx
m_pending_writes.push_back({}); m_pending_writes.push_back({});
m_pending_writes.back().query = m_current_task; m_pending_writes.back().query = m_current_task;
ptimer->async_tasks_pending++;
} }
else else
{ {
@ -2342,7 +2346,7 @@ namespace rsx
void ZCULL_control::read_report(::rsx::thread* ptimer, vm::addr_t sink, u32 type) void ZCULL_control::read_report(::rsx::thread* ptimer, vm::addr_t sink, u32 type)
{ {
if (m_current_task) if (m_current_task && type == CELL_GCM_ZPASS_PIXEL_CNT)
{ {
m_current_task->owned = true; m_current_task->owned = true;
end_occlusion_query(m_current_task); end_occlusion_query(m_current_task);
@ -2384,6 +2388,8 @@ namespace rsx
break; break;
} }
ptimer->async_tasks_pending++;
} }
void ZCULL_control::allocate_new_query(::rsx::thread* ptimer) void ZCULL_control::allocate_new_query(::rsx::thread* ptimer)
@ -2436,7 +2442,7 @@ namespace rsx
} }
} }
void ZCULL_control::clear() void ZCULL_control::clear(class ::rsx::thread* ptimer)
{ {
if (!m_pending_writes.empty()) if (!m_pending_writes.empty())
{ {
@ -2449,6 +2455,7 @@ namespace rsx
discard_occlusion_query(It->query); discard_occlusion_query(It->query);
It->query->pending = false; It->query->pending = false;
valid_size--; valid_size--;
ptimer->async_tasks_pending--;
continue; continue;
} }
@ -2470,9 +2477,27 @@ namespace rsx
m_cycles_delay = max_zcull_cycles_delay; m_cycles_delay = max_zcull_cycles_delay;
} }
void ZCULL_control::write(vm::addr_t sink, u32 timestamp, u32 value) void ZCULL_control::write(vm::addr_t sink, u32 timestamp, u32 type, u32 value)
{ {
verify(HERE), sink; verify(HERE), sink;
switch (type)
{
case CELL_GCM_ZPASS_PIXEL_CNT:
value = value ? UINT16_MAX : 0;
break;
case CELL_GCM_ZCULL_STATS3:
value = value ? 0 : UINT16_MAX;
break;
case CELL_GCM_ZCULL_STATS2:
case CELL_GCM_ZCULL_STATS1:
case CELL_GCM_ZCULL_STATS:
default:
//Not implemented
value = UINT32_MAX;
break;
}
vm::ptr<CellGcmReportData> out = sink; vm::ptr<CellGcmReportData> out = sink;
out->value = value; out->value = value;
out->timer = timestamp; out->timer = timestamp;
@ -2520,7 +2545,7 @@ namespace rsx
if (!writer.forwarder) if (!writer.forwarder)
//No other queries in the chain, write result //No other queries in the chain, write result
write(writer.sink, ptimer->timestamp(), result ? UINT16_MAX : 0); write(writer.sink, ptimer->timestamp(), writer.type, result);
processed++; processed++;
} }
@ -2555,10 +2580,13 @@ namespace rsx
else else
It = m_statistics_map.erase(It); It = m_statistics_map.erase(It);
} }
//Decrement jobs counter
ptimer->async_tasks_pending -= processed;
} }
//Critical, since its likely a WAIT_FOR_IDLE type has been processed, all results are considered available //Critical, since its likely a WAIT_FOR_IDLE type has been processed, all results are considered available
m_cycles_delay = 2; m_cycles_delay = min_zcull_cycles_delay;
} }
void ZCULL_control::update(::rsx::thread* ptimer) void ZCULL_control::update(::rsx::thread* ptimer)
@ -2644,7 +2672,7 @@ namespace rsx
//only zpass supported right now //only zpass supported right now
if (!writer.forwarder) if (!writer.forwarder)
//No other queries in the chain, write result //No other queries in the chain, write result
write(writer.sink, ptimer->timestamp(), result ? UINT16_MAX : 0); write(writer.sink, ptimer->timestamp(), writer.type, result);
processed++; processed++;
} }
@ -2669,6 +2697,24 @@ namespace rsx
{ {
m_pending_writes.resize(0); m_pending_writes.resize(0);
} }
ptimer->async_tasks_pending -= processed;
}
}
//Conditionally syncs pending report writes before a memory region is read.
//If any queued write has its sink inside [memory_address, memory_address + memory_range), sync(ptimer) is called once.
//If nothing is queued, or no sink falls inside the region, this does nothing.
void ZCULL_control::read_barrier(::rsx::thread* ptimer, u32 memory_address, u32 memory_range)
{
//Nothing queued, so there is nothing to wait for
if (m_pending_writes.empty())
return;
//NOTE(review): the sum is computed from two u32 values and would wrap if the region extends past the 32-bit limit,
//making the upper-bound test below fail - confirm callers never pass such a range
const auto memory_end = memory_address + memory_range;
for (const auto &writer : m_pending_writes)
{
//Half-open interval test: start inclusive, end exclusive
if (writer.sink >= memory_address && writer.sink < memory_end)
{
//A single overlapping sink is enough to trigger the sync; stop scanning afterwards
sync(ptimer);
return;
}
} }
} }
} }

View file

@ -185,9 +185,8 @@ namespace rsx
struct ZCULL_control struct ZCULL_control
{ {
//Delay in 'cycles' before a report update operation is forced to retire //Delay in 'cycles' before a report update operation is forced to retire
//Larger values might give more performance but some engines (UE3) dont seem to wait for results and will flicker const u32 max_zcull_cycles_delay = 128;
//TODO: Determine the real max delay in real hardware const u32 min_zcull_cycles_delay = 16;
const u32 max_zcull_cycles_delay = 10;
//Number of occlusion query slots available. Real hardware actually has far fewer units before choking //Number of occlusion query slots available. Real hardware actually has far fewer units before choking
const u32 occlusion_query_count = 128; const u32 occlusion_query_count = 128;
@ -200,7 +199,7 @@ namespace rsx
occlusion_query_info* m_current_task = nullptr; occlusion_query_info* m_current_task = nullptr;
u32 m_statistics_tag_id = 0; u32 m_statistics_tag_id = 0;
u32 m_tsc = 0; u32 m_tsc = 0;
u32 m_cycles_delay = 10; u32 m_cycles_delay = max_zcull_cycles_delay;
std::vector<queued_report_write> m_pending_writes; std::vector<queued_report_write> m_pending_writes;
std::unordered_map<u32, u32> m_statistics_map; std::unordered_map<u32, u32> m_statistics_map;
@ -211,7 +210,7 @@ namespace rsx
void set_enabled(class ::rsx::thread* ptimer, bool enabled); void set_enabled(class ::rsx::thread* ptimer, bool enabled);
void set_active(class ::rsx::thread* ptimer, bool active); void set_active(class ::rsx::thread* ptimer, bool active);
void write(vm::addr_t sink, u32 timestamp, u32 value); void write(vm::addr_t sink, u32 timestamp, u32 type, u32 value);
//Read current zcull statistics into the address provided //Read current zcull statistics into the address provided
void read_report(class ::rsx::thread* ptimer, vm::addr_t sink, u32 type); void read_report(class ::rsx::thread* ptimer, vm::addr_t sink, u32 type);
@ -220,11 +219,14 @@ namespace rsx
void allocate_new_query(class ::rsx::thread* ptimer); void allocate_new_query(class ::rsx::thread* ptimer);
//clears current stat block and increments stat_tag_id //clears current stat block and increments stat_tag_id
void clear(); void clear(class ::rsx::thread* ptimer);
//forcefully flushes all //forcefully flushes all
void sync(class ::rsx::thread* ptimer); void sync(class ::rsx::thread* ptimer);
//conditionally sync any pending writes if range overlaps
void read_barrier(class ::rsx::thread* ptimer, u32 memory_address, u32 memory_range);
//call once every 'tick' to update //call once every 'tick' to update
void update(class ::rsx::thread* ptimer); void update(class ::rsx::thread* ptimer);
@ -367,6 +369,8 @@ namespace rsx
bool sync_point_request = false; bool sync_point_request = false;
bool in_begin_end = false; bool in_begin_end = false;
atomic_t<s32> async_tasks_pending{ 0 };
bool conditional_render_test_failed = false; bool conditional_render_test_failed = false;
bool conditional_render_enabled = false; bool conditional_render_enabled = false;
bool zcull_stats_enabled = false; bool zcull_stats_enabled = false;
@ -412,6 +416,7 @@ namespace rsx
//sync //sync
void sync(); void sync();
void read_barrier(u32 memory_address, u32 memory_range);
gsl::span<const gsl::byte> get_raw_index_array(const std::vector<std::pair<u32, u32> >& draw_indexed_clause) const; gsl::span<const gsl::byte> get_raw_index_array(const std::vector<std::pair<u32, u32> >& draw_indexed_clause) const;
gsl::span<const gsl::byte> get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const std::vector<std::pair<u32, u32>>& vertex_ranges) const; gsl::span<const gsl::byte> get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const std::vector<std::pair<u32, u32>>& vertex_ranges) const;

View file

@ -724,7 +724,7 @@ std::string rsx::get_method_name(const u32 id)
return std::string("CELL_GCM_") + found->second; return std::string("CELL_GCM_") + found->second;
} }
return fmt::format("Unknown/illegal method [0x%08x]", id); return fmt::format("Unknown/illegal method [0x%08x]", id << 2);
} }
// Various parameter pretty printing function // Various parameter pretty printing function

View file

@ -689,6 +689,9 @@ namespace rsx
in_pitch = in_bpp * in_w; in_pitch = in_bpp * in_w;
} }
const auto read_address = get_address(src_offset, src_dma);
rsx->read_barrier(read_address, in_pitch * in_h);
if (dst_color_format != rsx::blit_engine::transfer_destination_format::r5g6b5 && if (dst_color_format != rsx::blit_engine::transfer_destination_format::r5g6b5 &&
dst_color_format != rsx::blit_engine::transfer_destination_format::a8r8g8b8) dst_color_format != rsx::blit_engine::transfer_destination_format::a8r8g8b8)
{ {
@ -933,7 +936,7 @@ namespace rsx
namespace nv0039 namespace nv0039
{ {
void buffer_notify(thread*, u32, u32 arg) void buffer_notify(thread *rsx, u32, u32 arg)
{ {
s32 in_pitch = method_registers.nv0039_input_pitch(); s32 in_pitch = method_registers.nv0039_input_pitch();
s32 out_pitch = method_registers.nv0039_output_pitch(); s32 out_pitch = method_registers.nv0039_output_pitch();
@ -968,8 +971,11 @@ namespace rsx
u32 dst_offset = method_registers.nv0039_output_offset(); u32 dst_offset = method_registers.nv0039_output_offset();
u32 dst_dma = method_registers.nv0039_output_location(); u32 dst_dma = method_registers.nv0039_output_location();
const auto read_address = get_address(src_offset, src_dma);
rsx->read_barrier(read_address, in_pitch * line_count);
u8 *dst = (u8*)vm::base(get_address(dst_offset, dst_dma)); u8 *dst = (u8*)vm::base(get_address(dst_offset, dst_dma));
const u8 *src = (u8*)vm::base(get_address(src_offset, src_dma)); const u8 *src = (u8*)vm::base(read_address);
if (in_pitch == out_pitch && out_pitch == line_length) if (in_pitch == out_pitch && out_pitch == line_length)
{ {