rpcs3/rpcs3/Emu/RSX/GL/glutils/ring_buffer.cpp

#include "stdafx.h"
#include "ring_buffer.h"

namespace gl
{
	// Destroys any existing GL buffer owned by this object and allocates a new one of `size` bytes
	// that stays mapped for writing (write | persistent | coherent) for its whole lifetime.
	//
	// size: requested storage size in bytes; must fit in a u32 (checked by ::narrow).
	// data: initial contents forwarded to NamedBufferStorage.
	//
	// On return the allocation cursor is rewound to 0 and m_memory_mapping points at the mapping.
	void ring_buffer::recreate(GLsizeiptr size, const void* data)
	{
		if (m_id)
		{
			// Only wait when a fence is actually pending; alloc_from_heap() applies the same guard.
			// NOTE(review): wait_for_signal() is assumed to require a live sync object - confirm.
			if (!m_fence.is_empty())
			{
				m_fence.wait_for_signal();
			}

			remove();
		}

		buffer::create();
		save_binding_state save(current_target(), *this);

		// The access bits requested at mapping time must also be present on the storage itself
		const GLbitfield mapping_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;

		GLbitfield buffer_storage_flags = mapping_flags;
		if (gl::get_driver_caps().vendor_MESA) buffer_storage_flags |= GL_CLIENT_STORAGE_BIT;

		DSA_CALL2(NamedBufferStorage, m_id, size, data, buffer_storage_flags);
		m_memory_mapping = DSA_CALL2_RET(MapNamedBufferRange, m_id, 0, size, mapping_flags);

		ensure(m_memory_mapping != nullptr);
		m_data_loc = 0;
		m_size = ::narrow<u32>(size);
		m_memory_type = memory_type::host_visible;
	}

	// Records the bind target for this buffer and then allocates its storage via recreate().
	//
	// target_: target stored in m_target.
	// size:    storage size in bytes, forwarded to recreate().
	// data_:   initial contents, forwarded to recreate().
	void ring_buffer::create(target target_, GLsizeiptr size, const void* data_)
	{
		this->m_target = target_;
		this->recreate(size, data_);
	}

	// Carves `alloc_size` bytes out of the ring and returns { write pointer, byte offset in buffer }.
	//
	// alloc_size: number of bytes requested.
	// alignment:  alignment applied to the start offset (skipped while the cursor is at 0).
	//
	// When the request does not fit in the remaining space the cursor wraps to 0. Before wrapping,
	// the pending fence is waited on; if there is none, an error is logged and glFinish() is issued.
	std::pair<void*, u32> ring_buffer::alloc_from_heap(u32 alloc_size, u16 alignment)
	{
		u32 start = m_data_loc;
		if (start != 0)
		{
			start = utils::align(start, alignment);
		}

		const bool out_of_space = (start + alloc_size) > m_size;
		if (out_of_space)
		{
			if (m_fence.is_empty())
			{
				rsx_log.error("OOM Error: Ring buffer was likely being used without notify() being called");
				glFinish();
			}
			else
			{
				m_fence.wait_for_signal();
			}

			// Restart from the beginning of the buffer
			m_data_loc = 0;
			start = 0;
		}

		// Advance the cursor to the next 256-byte boundary; the slack acts as a guard region
		// so consecutive allocations do not trample each other
		m_data_loc = utils::align(start + alloc_size, 256);

		void* const write_ptr = static_cast<char*>(m_memory_mapping) + start;
		return { write_ptr, start };
	}

	// Releases the mapping (if any) and deletes the GL buffer object (if any).
	// Bookkeeping (mapping pointer, cursor, size) is only reset when a mapping was present.
	void ring_buffer::remove()
	{
		if (m_memory_mapping != nullptr)
		{
			buffer::unmap();

			m_size = 0;
			m_data_loc = 0;
			m_memory_mapping = nullptr;
		}

		// Nothing left to delete
		if (m_id == GL_NONE)
		{
			return;
		}

		glDeleteBuffers(1, &m_id);
		m_id = GL_NONE;
	}

	// Arms the fence once the allocation cursor has moved past a quarter of the buffer.
	// Does nothing while a fence is already pending.
	void ring_buffer::notify()
	{
		if (!m_fence.is_empty())
		{
			return;
		}

		// Threshold sits at 25% of the total size
		const auto quarter_mark = (m_size >> 2);
		if (m_data_loc > quarter_mark)
		{
			m_fence.reset();
		}
	}

	// Legacy ring buffer - used when ARB_buffer_storage is not available, OR when capturing with renderdoc
	// Drops any existing buffer and allocates a new one of `size` bytes with GL_DYNAMIC_DRAW usage.
	// No mapping is kept: the mapping pointer is cleared and the allocation cursor rewound to 0.
	//
	// size: storage size in bytes; must fit in a u32 (checked by ::narrow).
	// data: initial contents forwarded to buffer::data().
	void legacy_ring_buffer::recreate(GLsizeiptr size, const void* data)
	{
		if (m_id != GL_NONE)
		{
			remove();
		}

		buffer::create();
		buffer::data(size, data, GL_DYNAMIC_DRAW);

		// Reset bookkeeping for the fresh allocation
		m_memory_mapping = nullptr;
		m_data_loc = 0;
		m_memory_type = memory_type::host_visible;
		m_size = ::narrow<u32>(size);
	}

	// Records the bind target for this buffer and then allocates its storage via recreate().
	//
	// target_: target stored in m_target.
	// size:    storage size in bytes, forwarded to recreate().
	// data_:   initial contents, forwarded to recreate().
	void legacy_ring_buffer::create(target target_, GLsizeiptr size, const void* data_)
	{
		this->m_target = target_;
		this->recreate(size, data_);
	}

	// Maps a window of the buffer large enough to serve at least `alloc_size` bytes of upcoming
	// allocations. Must be called while nothing is mapped (enforced by ensure).
	//
	// alloc_size: minimum number of usable bytes the mapped window must provide.
	//
	// The window starts at the current cursor. If it would not fit in the remaining space, the
	// buffer storage is re-specified via buffer::data() and the cursor is rewound to 0.
	// If the driver hands back a pointer that is not 16-byte aligned, the base is bumped forward
	// to the next 16-byte boundary and the shift is recorded in m_alignment_offset.
	void legacy_ring_buffer::reserve_storage_on_heap(u32 alloc_size)
	{
		ensure(m_memory_mapping == nullptr);

		// Overallocate by 16 bytes in case the mapped base needs realigning, then round to 256
		const u32 block_size = utils::align(alloc_size + 16, 256);

		// Space check is done against the cursor rounded up to 256
		u32 aligned_loc = m_data_loc;
		if (aligned_loc != 0)
		{
			aligned_loc = utils::align(aligned_loc, 256);
		}

		const bool needs_wrap = (aligned_loc + block_size) > m_size;
		if (needs_wrap)
		{
			buffer::data(m_size, nullptr, GL_DYNAMIC_DRAW);
			m_data_loc = 0;
		}

		m_memory_mapping = DSA_CALL2_RET(MapNamedBufferRange, m_id, m_data_loc, block_size, GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
		m_alignment_offset = 0;
		m_mapping_offset = m_data_loc;
		m_mapped_bytes = block_size;

		// When using debugging tools, the mapped base might not be aligned as expected
		const u64 mapped_address_base = reinterpret_cast<u64>(m_memory_mapping);
		const u64 misalignment = (mapped_address_base & 0xF);

		if (misalignment != 0)
		{
			// Step forward to the next 16-byte boundary. The bytes skipped here are lost,
			// but the 16 byte overallocation above makes up for it
			const u64 diff_bytes = 16 - misalignment;
			const u32 shift = ::narrow<u32>(diff_bytes);

			m_memory_mapping = reinterpret_cast<void*>(mapped_address_base + diff_bytes);
			m_mapped_bytes -= shift;
			m_alignment_offset = shift;
		}

		ensure(m_mapped_bytes >= alloc_size);
	}

	// Hands out `alloc_size` bytes from the currently mapped window and returns
	// { write pointer, byte offset in buffer }.
	//
	// alloc_size: number of bytes requested.
	// alignment:  alignment applied to the start offset (skipped while the cursor is at 0) and to
	//             the consumed size, so the cursor stays aligned on exit.
	//
	// If the mapped window is too small, it is unmapped and a new window of at least 4096 bytes
	// is reserved before retrying the computation.
	std::pair<void*, u32> legacy_ring_buffer::alloc_from_heap(u32 alloc_size, u16 alignment)
	{
		u32 offset = 0;
		u32 real_size = 0;

		// Derives the aligned start and the total bytes consumed (padding included) from the cursor
		const auto compute_span = [&]()
		{
			offset = m_data_loc;
			if (offset != 0)
			{
				offset = utils::align(offset, alignment);
			}

			const u32 padding = (offset - m_data_loc);
			real_size = utils::align(padding + alloc_size, alignment);
		};

		compute_span();

		if (real_size > m_mapped_bytes)
		{
			// Missed allocation. We take a performance hit on doing this.
			// Overallocate slightly for the next allocation if requested size is too small
			unmap();
			reserve_storage_on_heap(std::max(real_size, 4096U));

			// The cursor may have moved; recompute against the new window
			compute_span();
		}

		m_data_loc = offset + real_size;
		m_mapped_bytes -= real_size;

		// Pointer is relative to the window base; the returned offset also carries the base realignment shift
		const u32 local_offset = (offset - m_mapping_offset);
		char* const write_ptr = static_cast<char*>(m_memory_mapping) + local_offset;
		return std::make_pair(write_ptr, offset + m_alignment_offset);
	}

	void legacy_ring_buffer::remove()
	{
		ring_buffer::remove();
		m_mapped_bytes = 0;
	}

	// Unmaps the buffer and clears the window bookkeeping (pointer, remaining bytes, window offset).
	void legacy_ring_buffer::unmap()
	{
		buffer::unmap();

		m_mapping_offset = 0;
		m_mapped_bytes = 0;
		m_memory_mapping = nullptr;
	}

	// AMD persistent mapping workaround for driver-assisted flushing
	// Maps `length` bytes of the buffer starting at `offset` for unsynchronized writing and
	// returns the mapped pointer. Any mapping still outstanding is flushed first, and the
	// buffer is marked dirty so the next flush() unmaps it.
	void* transient_ring_buffer::map_internal(u32 offset, u32 length)
	{
		// Release the previous mapping, if one is outstanding
		flush();

		// A mapping is about to become live
		dirty = true;

		const GLbitfield access_flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT;
		return DSA_CALL2_RET(MapNamedBufferRange, m_id, offset, length, access_flags);
	}

	void transient_ring_buffer::bind()
	{
		flush();
		buffer::bind();
	}

	// Destroys any existing GL buffer owned by this object and allocates new storage of `size`
	// bytes that permits write mapping. Nothing is mapped on return; the cursor is rewound to 0.
	//
	// size: requested storage size in bytes; must fit in a u32 (checked by ::narrow).
	// data: initial contents forwarded to NamedBufferStorage.
	void transient_ring_buffer::recreate(GLsizeiptr size, const void* data)
	{
		if (m_id)
		{
			// Release any outstanding transient mapping while the old buffer still exists.
			// Otherwise `dirty` would stay set and the next flush() would try to unmap the
			// new buffer, which was never mapped.
			flush();

			// Only wait when a fence is actually pending; ring_buffer::alloc_from_heap() applies the same guard.
			// NOTE(review): wait_for_signal() is assumed to require a live sync object - confirm.
			if (!m_fence.is_empty())
			{
				m_fence.wait_for_signal();
			}

			remove();
		}

		buffer::create();
		save_binding_state save(current_target(), *this);
		DSA_CALL2(NamedBufferStorage, m_id, size, data, GL_MAP_WRITE_BIT);

		m_data_loc = 0;
		m_size = ::narrow<u32>(size);
		m_memory_type = memory_type::host_visible;
	}

	// Reserves `alloc_size` bytes through ring_buffer::alloc_from_heap() and maps exactly that
	// range on demand. Returns { freshly mapped pointer, byte offset in buffer }.
	//
	// alloc_size: number of bytes requested.
	// alignment:  alignment forwarded to the base allocator.
	//
	// Only the offset from the base allocator is used; its pointer result is discarded because
	// this class keeps no persistent mapping (enforced by the ensure below).
	std::pair<void*, u32> transient_ring_buffer::alloc_from_heap(u32 alloc_size, u16 alignment)
	{
		ensure(m_memory_mapping == nullptr);

		const u32 heap_offset = ring_buffer::alloc_from_heap(alloc_size, alignment).second;
		void* const mapped_ptr = map_internal(heap_offset, alloc_size);

		return { mapped_ptr, heap_offset };
	}

	// Unmaps the buffer if a transient mapping is outstanding and clears the dirty flag.
	// Does nothing when the buffer is not dirty.
	void transient_ring_buffer::flush()
	{
		if (!dirty)
		{
			return;
		}

		buffer::unmap();
		dirty = false;
	}

	void transient_ring_buffer::unmap()
	{
		flush();
	}
}