Optimizations (#1680)

* Optimizations

1) Some headers simplified for better compilation time
2) Some templates simplified for smaller executable size
3) Eliminate std::future to fix compilation for mingw64
4) PKG installation can be cancelled now
5) cellGame fixes
6) XAudio2 fix for mingw64
7) PPUInterpreter bug fixed (Clang)

* any_pod<> implemented

Aliases: any16, any32, any64
rsx::make_command fixed
This commit is contained in:
Ivan 2016-04-25 13:49:12 +03:00
parent 75fe95eeb1
commit da7472fe81
96 changed files with 2086 additions and 1772 deletions

View file

@ -976,4 +976,17 @@ public:
{
return atomic_op(atomic_test_and_complement<type, T2>{}, rhs);
}
// Minimal pointer support (TODO: must forward operator ->())
// NOTE(review): returns a copy of the stored pointer obtained by an atomic
// load; the caller dereferences the copy, not the atomic storage itself.
type operator ->() const
{
	return load();
}
// Minimal array support
// Atomically loads the stored value, then applies operator[] to the copy;
// the trailing return type mirrors what `const type`'s operator[] yields.
template<typename I = std::size_t>
auto operator [](const I& index) const -> decltype(std::declval<const type>()[std::declval<I>()])
{
	return load()[index];
}
};

150
Utilities/AtomicPtr.h Normal file
View file

@ -0,0 +1,150 @@
#pragma once
#include "Atomic.h"
#include <memory>
// Unfinished. Only std::default_delete will work as expected.
// Base class for atomic_ptr<>: holds the pointer in an atomic_t and inherits
// the deleter D so an empty deleter adds no size (checked by a static_assert
// in atomic_ptr<>).
template<typename T, typename D>
class atomic_ptr_base : D
{
protected:
	// Atomically accessed owned pointer (may be null)
	atomic_t<T*> m_ptr;

	constexpr atomic_ptr_base(T* ptr)
		: m_ptr(ptr)
	{
	}

public:
	// Deletes the owned object (if any) by invoking the inherited deleter
	~atomic_ptr_base()
	{
		if (m_ptr)
		{
			(*this)(m_ptr.load());
		}
	}

	// The deleter is this object itself, since D is a base class
	D& get_deleter()
	{
		return *this;
	}

	const D& get_deleter() const
	{
		return *this;
	}
};
// Simple atomic pointer with unique ownership. Draft, unfinished.
// Simple atomic pointer with unique ownership. Draft, unfinished.
// Owns a single object of type T; all pointer replacement is done with atomic
// exchange/CAS on the stored pointer, and displaced objects are destroyed with
// the deleter D.
template<typename T, typename D = std::default_delete<T>>
class atomic_ptr final : atomic_ptr_base<T, D>
{
	using base = atomic_ptr_base<T, D>;

	static_assert(sizeof(T*) == sizeof(base), "atomic_ptr<> error: invalid deleter (empty class expected)");

public:
	constexpr atomic_ptr()
		: base(nullptr)
	{
	}

	constexpr atomic_ptr(std::nullptr_t)
		: base(nullptr)
	{
	}

	// Take ownership of a raw pointer
	explicit atomic_ptr(T* ptr)
		: base(ptr)
	{
	}

	// Take ownership from a unique_ptr.
	// Fixed: constrain on pointer convertibility (T2* -> T*), not value
	// convertibility (T2 -> T), which is the correct test for pointer upcasts.
	template<typename T2, typename = std::enable_if_t<std::is_convertible<T2*, T*>::value>>
	atomic_ptr(std::unique_ptr<T2, D>&& ptr)
		: base(ptr.release())
	{
	}

	// Release and delete the owned object, if any
	atomic_ptr& operator =(std::nullptr_t)
	{
		if (T* old = base::m_ptr.exchange(nullptr))
		{
			this->get_deleter()(old);
		}

		return *this;
	}

	// Replace the owned object with one taken from a unique_ptr, deleting the old one
	// (same T2* -> T* constraint fix as the converting constructor)
	template<typename T2, typename = std::enable_if_t<std::is_convertible<T2*, T*>::value>>
	atomic_ptr& operator =(std::unique_ptr<T2, D>&& ptr)
	{
		if (T* old = base::m_ptr.exchange(ptr.release()))
		{
			this->get_deleter()(old);
		}

		return *this;
	}

	// Atomically exchange the owned pointer with the one held by `ptr`
	void swap(std::unique_ptr<T, D>& ptr)
	{
		ptr.reset(base::m_ptr.exchange(ptr.release()));
	}

	std::add_lvalue_reference_t<T> operator *() const
	{
		return *base::m_ptr;
	}

	T* operator ->() const
	{
		return base::m_ptr;
	}

	// Get the owned pointer without releasing ownership
	T* get() const
	{
		return base::m_ptr;
	}

	explicit operator bool() const
	{
		return base::m_ptr != nullptr;
	}

	// Give up ownership and return the owned pointer.
	// Fixed: this mutates the stored pointer, so it must not be const
	// (the original was `T* release() const`); also exchange with nullptr, not 0.
	T* release()
	{
		return base::m_ptr.exchange(nullptr);
	}

	// Replace the owned object, deleting the old one
	void reset(T* ptr = nullptr)
	{
		if (T* old = base::m_ptr.exchange(ptr))
		{
			this->get_deleter()(old);
		}
	}

	// Steal the pointer from `ptr`, convert old value to unique_ptr
	std::unique_ptr<T, D> exchange(std::unique_ptr<T, D>&& ptr)
	{
		return std::unique_ptr<T, D>(base::m_ptr.exchange(ptr.release()));
	}

	// If pointer is null, steal it from `ptr`
	bool test_and_swap(std::unique_ptr<T, D>&& ptr)
	{
		if (base::m_ptr.compare_and_swap_test(nullptr, ptr.get()))
		{
			ptr.release();
			return true;
		}

		return false;
	}
};
// Array specialization — declared but not implemented yet
template<typename T, typename D>
class atomic_ptr<T[], D> final : atomic_ptr_base<T[], D>
{
	// TODO
};

View file

@ -910,20 +910,15 @@ template<typename T> using atomic_be_t = atomic_t<be_t<T>>;
template<typename T> using atomic_le_t = atomic_t<le_t<T>>;
#endif
namespace fmt
// Formatting for BE/LE data
template<typename T, bool Se>
struct unveil<se_t<T, Se>, void>
{
// Formatting for BE/LE data
template<typename T, bool Se>
struct unveil<se_t<T, Se>, void>
static inline auto get(const se_t<T, Se>& arg)
{
using result_type = typename unveil<T>::result_type;
static inline result_type get_value(const se_t<T, Se>& arg)
{
return unveil<T>::get_value(arg);
}
};
}
return unveil<T>::get(arg);
}
};
#undef IS_BINARY_COMPARABLE
#undef IS_INTEGER

View file

@ -1,8 +1,17 @@
#pragma once
#include "Utilities/types.h"
#include "Utilities/Atomic.h"
#include <initializer_list>
#include <exception>
#include <utility>
#include <string>
#include <vector>
#include <set>
#include <unordered_map>
#include <map>
#include <mutex>
namespace cfg
{

View file

@ -2,13 +2,13 @@
#include "StrFmt.h"
#include "Macro.h"
#include "SharedMutex.h"
#include <unordered_map>
#include <algorithm>
#ifdef _WIN32
#include <cwchar>
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0601
#include <Windows.h>
static std::unique_ptr<wchar_t[]> to_wchar(const std::string& source)
@ -97,7 +97,7 @@ namespace fs
std::unordered_map<std::string, std::shared_ptr<device_base>> m_map;
public:
std::shared_ptr<device_base> get_device(const std::string& name);
std::shared_ptr<device_base> get_device(const std::string& path);
std::shared_ptr<device_base> set_device(const std::string& name, const std::shared_ptr<device_base>&);
};
@ -109,11 +109,11 @@ namespace fs
}
}
safe_buffers std::shared_ptr<fs::device_base> fs::device_manager::get_device(const std::string& name)
std::shared_ptr<fs::device_base> fs::device_manager::get_device(const std::string& path)
{
reader_lock lock(m_mutex);
const auto found = m_map.find(name);
const auto found = m_map.find(path.substr(0, path.find_first_of('/', 2)));
if (found == m_map.end())
{
@ -123,29 +123,27 @@ safe_buffers std::shared_ptr<fs::device_base> fs::device_manager::get_device(con
return found->second;
}
safe_buffers std::shared_ptr<fs::device_base> fs::device_manager::set_device(const std::string& name, const std::shared_ptr<device_base>& device)
std::shared_ptr<fs::device_base> fs::device_manager::set_device(const std::string& name, const std::shared_ptr<device_base>& device)
{
std::lock_guard<shared_mutex> lock(m_mutex);
writer_lock lock(m_mutex);
return m_map[name] = device;
}
safe_buffers std::shared_ptr<fs::device_base> fs::get_virtual_device(const std::string& path)
std::shared_ptr<fs::device_base> fs::get_virtual_device(const std::string& path)
{
// Every virtual device path must have "//" at the beginning
if (path.size() > 2 && reinterpret_cast<const u16&>(path.front()) == '//')
if (path.size() > 2 && reinterpret_cast<const u16&>(path.front()) == "//"_u16)
{
return get_device_manager().get_device(path.substr(0, path.find_first_of('/', 2)));
return get_device_manager().get_device(path);
}
return nullptr;
}
safe_buffers std::shared_ptr<fs::device_base> fs::set_virtual_device(const std::string& name, const std::shared_ptr<device_base>& device)
std::shared_ptr<fs::device_base> fs::set_virtual_device(const std::string& name, const std::shared_ptr<device_base>& device)
{
Expects(name.size() > 2);
Expects(name[0] == '/');
Expects(name[1] == '/');
Expects(name.size() > 2 && name[0] == '/' && name[1] == '/' && name.find('/', 2) == -1);
return get_device_manager().set_device(name, device);
}
@ -1005,17 +1003,17 @@ bool fs::file::open(const std::string& path, mset<open_mode> mode)
fs::file::file(const void* ptr, std::size_t size)
{
class memory_stream final : public file_base
class memory_stream : public file_base
{
u64 m_pos = 0;
u64 m_size;
u64 m_pos{}; // TODO: read/seek could modify m_pos atomically
const char* const m_ptr;
const u64 m_size;
public:
const char* const ptr;
memory_stream(const void* ptr, std::size_t size)
: m_size(size)
, ptr(static_cast<const char*>(ptr))
memory_stream(const void* ptr, u64 size)
: m_ptr(static_cast<const char*>(ptr))
, m_size(size)
{
}
@ -1034,7 +1032,7 @@ fs::file::file(const void* ptr, std::size_t size)
const u64 start = m_pos;
const u64 end = seek(count, fs::seek_cur);
const u64 read_size = end >= start ? end - start : throw std::logic_error("memory_stream::read(): overflow");
std::memcpy(buffer, ptr + start, read_size);
std::memcpy(buffer, m_ptr + start, read_size);
return read_size;
}
@ -1045,10 +1043,10 @@ fs::file::file(const void* ptr, std::size_t size)
u64 seek(s64 offset, fs::seek_mode whence) override
{
return m_pos =
whence == fs::seek_set ? std::min<u64>(offset, m_size) :
whence == fs::seek_cur ? std::min<u64>(offset + m_pos, m_size) :
whence == fs::seek_end ? std::min<u64>(offset + m_size, m_size) :
return
whence == fs::seek_set ? m_pos = std::min<u64>(offset, m_size) :
whence == fs::seek_cur ? m_pos = std::min<u64>(offset + m_pos, m_size) :
whence == fs::seek_end ? m_pos = std::min<u64>(offset + m_size, m_size) :
throw std::logic_error("memory_stream::seek(): invalid whence");
}
@ -1061,93 +1059,6 @@ fs::file::file(const void* ptr, std::size_t size)
m_file = std::make_unique<memory_stream>(ptr, size);
}
// Construct a file backed by a caller-owned std::vector<char>.
// The vector must outlive the file handle; truncation resizes the vector.
fs::file::file(std::vector<char>& vec)
{
	// Read-only stream adapter over the referenced vector
	class vector_stream final : public file_base
	{
		u64 m_pos = 0; // Current stream position

	public:
		std::vector<char>& vec;

		vector_stream(std::vector<char>& vec)
			: vec(vec)
		{
		}

		fs::stat_t stat() override
		{
			throw std::logic_error("vector_stream doesn't support stat()");
		}

		// Truncation/extension is implemented by resizing the vector
		bool trunc(u64 length) override
		{
			vec.resize(length);
			return true;
		}

		// Reads up to `count` bytes; seek() clamps the end position, so the
		// actual read size is end - start (throws on arithmetic overflow)
		u64 read(void* buffer, u64 count) override
		{
			const u64 start = m_pos;
			const u64 end = seek(count, fs::seek_cur);
			const u64 read_size = end >= start ? end - start : throw std::logic_error("vector_stream::read(): overflow");
			std::memcpy(buffer, vec.data() + start, read_size);
			return read_size;
		}

		u64 write(const void* buffer, u64 count) override
		{
			throw std::logic_error("TODO: vector_stream doesn't support write()");
		}

		// Clamps the resulting position to [0, vec.size()]
		u64 seek(s64 offset, fs::seek_mode whence) override
		{
			return m_pos =
				whence == fs::seek_set ? std::min<u64>(offset, vec.size()) :
				whence == fs::seek_cur ? std::min<u64>(offset + m_pos, vec.size()) :
				whence == fs::seek_end ? std::min<u64>(offset + vec.size(), vec.size()) :
				throw std::logic_error("vector_stream::seek(): invalid whence");
		}

		u64 size() override
		{
			return vec.size();
		}
	};

	m_file = std::make_unique<vector_stream>(vec);
}
//void fs::file_read_map::reset(const file& f)
//{
// reset();
//
// if (f)
// {
//#ifdef _WIN32
// const HANDLE handle = ::CreateFileMapping((HANDLE)f.m_fd, NULL, PAGE_READONLY, 0, 0, NULL);
// m_ptr = (char*)::MapViewOfFile(handle, FILE_MAP_READ, 0, 0, 0);
// m_size = f.size();
// ::CloseHandle(handle);
//#else
// m_ptr = (char*)::mmap(nullptr, m_size = f.size(), PROT_READ, MAP_SHARED, f.m_fd, 0);
// if (m_ptr == (void*)-1) m_ptr = nullptr;
//#endif
// }
//}
//
//void fs::file_read_map::reset()
//{
// if (m_ptr)
// {
//#ifdef _WIN32
// ::UnmapViewOfFile(m_ptr);
//#else
// ::munmap(m_ptr, m_size);
//#endif
// }
//}
void fs::dir::xnull() const
{
throw std::logic_error("fs::dir is null");

View file

@ -161,9 +161,6 @@ namespace fs
// Open memory for read
explicit file(const void* ptr, std::size_t size);
// Open vector
explicit file(std::vector<char>& vec);
// Check whether the handle is valid (opened file)
explicit operator bool() const
{
@ -214,7 +211,7 @@ namespace fs
return m_file->write(buffer, count);
}
// Change current position, returns previous position
// Change current position, returns resulting position
u64 seek(s64 offset, seek_mode whence = seek_set) const
{
if (!m_file) xnull();

View file

@ -1,4 +1,5 @@
#include "Log.h"
#include <cstdarg>
namespace _log
{
@ -23,9 +24,12 @@ namespace _log
thread_local std::string(*g_tls_make_prefix)(const channel&, level, const std::string&) = nullptr;
}
void _log::broadcast(const _log::channel& ch, _log::level sev, const std::string& text)
// Format the printf-style message and send it to the global logger instance.
// va_start/va_end must bracket the use of `args` exactly as below.
void _log::channel::broadcast(const _log::channel& ch, _log::level sev, const char* fmt...)
{
	va_list args;
	va_start(args, fmt);
	get_logger().log(ch, sev, fmt::_vformat(fmt, args));
	va_end(args);
}
[[noreturn]] extern void catch_all_exceptions();

View file

@ -22,9 +22,6 @@ namespace _log
struct channel;
struct listener;
// Send log message to global logger instance
void broadcast(const channel& ch, level sev, const std::string& text);
// Log channel
struct channel
{
@ -42,23 +39,23 @@ namespace _log
}
// Log without formatting
force_inline void log(level sev, const std::string& text) const
void log(level sev, const std::string& text) const
{
if (sev <= enabled)
broadcast(*this, sev, text);
broadcast(*this, sev, "%s", text.c_str());
}
// Log with formatting
template<typename... Args>
force_inline safe_buffers void format(level sev, const char* fmt, const Args&... args) const
void format(level sev, const char* fmt, const Args&... args) const
{
if (sev <= enabled)
broadcast(*this, sev, fmt::format(fmt, fmt::do_unveil(args)...));
broadcast(*this, sev, fmt, ::unveil<Args>::get(args)...);
}
#define GEN_LOG_METHOD(_sev)\
template<typename... Args>\
force_inline void _sev(const char* fmt, const Args&... args) const\
void _sev(const char* fmt, const Args&... args) const\
{\
return format<Args...>(level::_sev, fmt, args...);\
}
@ -72,6 +69,10 @@ namespace _log
GEN_LOG_METHOD(trace)
#undef GEN_LOG_METHOD
private:
// Send log message to global logger instance
static void broadcast(const channel& ch, level sev, const char* fmt...);
};
// Log listener (destination)

View file

@ -13,25 +13,28 @@
#include <x86intrin.h>
#endif
#ifdef _MSC_VER
#define ASSUME(cond) __assume(cond)
#define LIKELY(cond) (cond)
#define UNLIKELY(cond) (cond)
#else
#define ASSUME(cond) do { if (!(cond)) __builtin_unreachable(); } while (0)
#define LIKELY(cond) __builtin_expect(!!(cond), 1)
#define UNLIKELY(cond) __builtin_expect(!!(cond), 0)
#endif
// Some platforms don't support thread_local well yet.
#ifndef _MSC_VER
#define thread_local __thread
#define __assume(cond) do { if (!(cond)) __builtin_unreachable(); } while (0)
#endif
#if defined(_MSC_VER)
#define safe_buffers __declspec(safebuffers)
#else
#define safe_buffers
#endif
#if defined(_MSC_VER)
#ifdef _MSC_VER
#define never_inline __declspec(noinline)
#else
#define never_inline __attribute__((noinline))
#endif
#if defined(_MSC_VER)
#ifdef _MSC_VER
#define force_inline __forceinline
#else
#define force_inline __attribute__((always_inline)) inline

View file

@ -1,5 +1,8 @@
#pragma once
#include "types.h"
#include "Atomic.h"
class semaphore_t
{
// semaphore mutex
@ -34,4 +37,4 @@ public:
void wait();
bool post_and_wait();
};
};

157
Utilities/SharedMutex.cpp Normal file
View file

@ -0,0 +1,157 @@
#include "SharedMutex.h"
#include <mutex>
#include <condition_variable>
// Lazily allocated slow-path state: queues and condition variables used only
// when lock contention forces threads to block.
struct shared_mutex::internal
{
	std::mutex mutex;

	std::size_t rq_size{}; // Reader queue size (threads waiting on rcv)
	std::size_t wq_size{}; // Writer queue size (threads waiting on wcv and ocv)

	std::condition_variable rcv; // Reader queue
	std::condition_variable wcv; // Writer queue
	std::condition_variable ocv; // For current exclusive owner
};
shared_mutex::~shared_mutex()
{
	// Free the lazily allocated internal data (null if never contended)
	delete m_data;
}
// Allocate the internal slow-path data on first use; safe to call concurrently
void shared_mutex::initialize_once()
{
	if (!m_data)
	{
		auto ptr = new shared_mutex::internal;

		// Another thread may have won the allocation race: discard ours then
		if (!m_data.compare_and_swap_test(nullptr, ptr))
		{
			delete ptr;
		}
	}
}
// Slow path of lock_shared(): entered when the optimistic reader increment
// observed a writer, the waiters flag, or the reader limit.
void shared_mutex::lock_shared_hard()
{
	initialize_once();

	std::unique_lock<std::mutex> lock(m_data->mutex);

	// Validate
	if ((m_ctrl & SM_INVALID_BIT) != 0) throw std::runtime_error("shared_mutex::lock_shared(): Invalid bit");
	if ((m_ctrl & SM_READER_MASK) == 0) throw std::runtime_error("shared_mutex::lock_shared(): No readers");

	// Notify non-zero reader queue size
	m_ctrl |= SM_WAITERS_BIT, m_data->rq_size++;

	// Fix excess reader count (lock_shared() incremented it unconditionally)
	if ((--m_ctrl & SM_READER_MASK) == 0 && m_data->wq_size)
	{
		// Notify exclusive owner
		m_data->ocv.notify_one();
	}

	// Obtain the reader lock
	while (true)
	{
		const auto ctrl = m_ctrl.load();

		// Check writers and reader limit
		if (m_data->wq_size || (ctrl & ~SM_WAITERS_BIT) >= SM_READER_MAX)
		{
			m_data->rcv.wait(lock);
			continue;
		}

		if (m_ctrl.compare_and_swap_test(ctrl, ctrl + 1))
		{
			break;
		}
	}

	// Clear the waiters flag once both queues are empty
	if (!--m_data->rq_size && !m_data->wq_size)
	{
		m_ctrl &= ~SM_WAITERS_BIT;
	}
}
// Slow path of unlock_shared(): wake an appropriate waiter after a reader left
void shared_mutex::unlock_shared_notify()
{
	initialize_once();

	// Mutex is locked for reliable notification because m_ctrl has been changed outside
	std::lock_guard<std::mutex> lock(m_data->mutex);

	if ((m_ctrl & SM_READER_MASK) == 0 && m_data->wq_size)
	{
		// Notify exclusive owner
		m_data->ocv.notify_one();
	}
	else if (m_data->rq_size)
	{
		// Notify other readers
		m_data->rcv.notify_one();
	}
}
// Slow path of lock(): entered when the optimistic CAS for exclusive mode failed
void shared_mutex::lock_hard()
{
	initialize_once();

	std::unique_lock<std::mutex> lock(m_data->mutex);

	// Validate
	if ((m_ctrl & SM_INVALID_BIT) != 0) throw std::runtime_error("shared_mutex::lock(): Invalid bit");

	// Notify non-zero writer queue size
	m_ctrl |= SM_WAITERS_BIT, m_data->wq_size++;

	// Obtain the writer lock
	while (true)
	{
		const auto ctrl = m_ctrl.load();

		if (ctrl & SM_WRITER_LOCK)
		{
			// Another writer holds or is acquiring the lock
			m_data->wcv.wait(lock);
			continue;
		}

		if (m_ctrl.compare_and_swap_test(ctrl, ctrl | SM_WRITER_LOCK))
		{
			break;
		}
	}

	// Wait for remaining readers
	while ((m_ctrl & SM_READER_MASK) != 0)
	{
		m_data->ocv.wait(lock);
	}

	// Clear the waiters flag once both queues are empty
	if (!--m_data->wq_size && !m_data->rq_size)
	{
		m_ctrl &= ~SM_WAITERS_BIT;
	}
}
// Slow path of unlock(): wake the next waiter after an exclusive release
void shared_mutex::unlock_notify()
{
	initialize_once();

	// Mutex is locked for reliable notification because m_ctrl has been changed outside
	std::lock_guard<std::mutex> lock(m_data->mutex);

	if (m_data->wq_size)
	{
		// Notify next exclusive owner
		m_data->wcv.notify_one();
	}
	else if (m_data->rq_size)
	{
		// Notify all readers
		m_data->rcv.notify_all();
	}
}

View file

@ -1,22 +1,16 @@
#pragma once
#include <cstdint>
#include <exception>
#include <thread>
#include <mutex>
#include <condition_variable>
#include "types.h"
#include "Atomic.h"
#include "Platform.h"
//! An attempt to create effective implementation of "shared mutex", lock-free in optimistic case.
//! All locking and unlocking may be done by single LOCK XADD or LOCK CMPXCHG instructions.
//! All locking and unlocking may be done by a single LOCK XADD or LOCK CMPXCHG instruction.
//! MSVC implementation of std::shared_timed_mutex seems suboptimal.
//! std::shared_mutex is not available until C++17.
class shared_mutex final
{
using ctrl_type = u32;
enum : ctrl_type
enum : u32
{
SM_WRITER_LOCK = 1u << 31, // Exclusive lock flag, must be MSB
SM_WAITERS_BIT = 1u << 30, // Flag set if m_wq_size or m_rq_size is non-zero
@ -26,186 +20,79 @@ class shared_mutex final
SM_READER_MAX = 1u << 24, // Max reader count
};
atomic_t<ctrl_type> m_ctrl{}; // Control atomic variable: reader count | SM_* flags
atomic_t<u32> m_ctrl{}; // Control variable: reader count | SM_* flags
std::mutex m_mutex;
struct internal;
std::size_t m_rq_size{}; // Reader queue size (threads waiting on m_rcv)
std::size_t m_wq_size{}; // Writer queue size (threads waiting on m_wcv and m_ocv)
std::condition_variable m_rcv; // Reader queue
std::condition_variable m_wcv; // Writer queue
std::condition_variable m_ocv; // For current exclusive owner
atomic_t<internal*> m_data{}; // Internal data
void lock_shared_hard()
{
std::unique_lock<std::mutex> lock(m_mutex);
void lock_shared_hard();
void unlock_shared_notify();
// Validate
if ((m_ctrl & SM_INVALID_BIT) != 0) throw std::runtime_error("shared_mutex::lock_shared(): Invalid bit");
if ((m_ctrl & SM_READER_MASK) == 0) throw std::runtime_error("shared_mutex::lock_shared(): No readers");
// Notify non-zero reader queue size
m_ctrl |= SM_WAITERS_BIT, m_rq_size++;
// Fix excess reader count
if ((--m_ctrl & SM_READER_MASK) == 0 && m_wq_size)
{
// Notify exclusive owner
m_ocv.notify_one();
}
// Obtain the reader lock
while (true)
{
const auto ctrl = m_ctrl.load();
// Check writers and reader limit
if (m_wq_size || (ctrl & ~SM_WAITERS_BIT) >= SM_READER_MAX)
{
m_rcv.wait(lock);
continue;
}
if (m_ctrl.compare_and_swap_test(ctrl, ctrl + 1))
{
break;
}
}
if (!--m_rq_size && !m_wq_size)
{
m_ctrl &= ~SM_WAITERS_BIT;
}
}
void unlock_shared_notify()
{
// Mutex is locked for reliable notification because m_ctrl has been changed outside
std::lock_guard<std::mutex> lock(m_mutex);
if ((m_ctrl & SM_READER_MASK) == 0 && m_wq_size)
{
// Notify exclusive owner
m_ocv.notify_one();
}
else if (m_rq_size)
{
// Notify other readers
m_rcv.notify_one();
}
}
void lock_hard()
{
std::unique_lock<std::mutex> lock(m_mutex);
// Validate
if ((m_ctrl & SM_INVALID_BIT) != 0) throw std::runtime_error("shared_mutex::lock(): Invalid bit");
// Notify non-zero writer queue size
m_ctrl |= SM_WAITERS_BIT, m_wq_size++;
// Obtain the writer lock
while (true)
{
const auto ctrl = m_ctrl.load();
if (ctrl & SM_WRITER_LOCK)
{
m_wcv.wait(lock);
continue;
}
if (m_ctrl.compare_and_swap_test(ctrl, ctrl | SM_WRITER_LOCK))
{
break;
}
}
// Wait for remaining readers
while ((m_ctrl & SM_READER_MASK) != 0)
{
m_ocv.wait(lock);
}
if (!--m_wq_size && !m_rq_size)
{
m_ctrl &= ~SM_WAITERS_BIT;
}
}
void unlock_notify()
{
// Mutex is locked for reliable notification because m_ctrl has been changed outside
std::lock_guard<std::mutex> lock(m_mutex);
if (m_wq_size)
{
// Notify next exclusive owner
m_wcv.notify_one();
}
else if (m_rq_size)
{
// Notify all readers
m_rcv.notify_all();
}
}
void lock_hard();
void unlock_notify();
public:
shared_mutex() = default;
constexpr shared_mutex() = default;
~shared_mutex();
// Initialize internal data
void initialize_once();
bool try_lock_shared()
{
auto ctrl = m_ctrl.load();
if (UNLIKELY(ctrl >= SM_READER_MAX))
{
ctrl = 0;
}
// Weak attempt
return LIKELY(m_ctrl.compare_and_swap_test(ctrl, ctrl + 1));
}
// Lock in shared mode
void lock_shared()
{
if (m_ctrl++ >= SM_READER_MAX)
// Optimization: unconditional increment, compensated later
if (UNLIKELY(m_ctrl++ >= SM_READER_MAX))
{
lock_shared_hard();
}
}
// Try to lock in shared mode
bool try_lock_shared()
{
auto ctrl = m_ctrl.load();
return ctrl < SM_READER_MAX && m_ctrl.compare_and_swap_test(ctrl, ctrl + 1);
}
// Unlock in shared mode
void unlock_shared()
{
if (m_ctrl-- >= SM_READER_MAX)
if (UNLIKELY(m_ctrl-- >= SM_READER_MAX))
{
unlock_shared_notify();
}
}
// Try to lock exclusively
bool try_lock()
{
return m_ctrl.compare_and_swap_test(0, SM_WRITER_LOCK);
return LIKELY(m_ctrl.compare_and_swap_test(0, SM_WRITER_LOCK));
}
// Lock exclusively
void lock()
{
if (m_ctrl.compare_and_swap_test(0, SM_WRITER_LOCK)) return;
lock_hard();
if (UNLIKELY(!try_lock()))
{
lock_hard();
}
}
// Unlock exclusively
void unlock()
{
if (m_ctrl.fetch_sub(SM_WRITER_LOCK) != SM_WRITER_LOCK)
if (UNLIKELY(m_ctrl.fetch_sub(SM_WRITER_LOCK) != SM_WRITER_LOCK))
{
unlock_notify();
}
}
};
//! Simplified shared (reader) lock implementation, similar to std::lock_guard.
//! Simplified shared (reader) lock implementation.
//! std::shared_lock may be used instead if necessary.
class reader_lock final
{
@ -225,3 +112,24 @@ public:
m_mutex.unlock_shared();
}
};
//! Simplified exclusive (writer) lock implementation.
//! std::lock_guard or std::unique_lock may be used instead if necessary.
// Scoped exclusive (writer) lock: acquires the mutex in the constructor and
// releases it in the destructor. Non-copyable.
class writer_lock final
{
	shared_mutex& m_mutex;

public:
	writer_lock(const writer_lock&) = delete;

	writer_lock(shared_mutex& mutex)
		: m_mutex(mutex)
	{
		m_mutex.lock();
	}

	~writer_lock()
	{
		m_mutex.unlock();
	}
};

View file

@ -1,6 +1,10 @@
#include "StrFmt.h"
#include "BEType.h"
#include <cassert>
#include <array>
#include <memory>
std::string v128::to_hex() const
{
return fmt::format("%016llx%016llx", _u64[1], _u64[0]);
@ -74,6 +78,59 @@ std::string fmt::to_sdec(s64 svalue)
return std::string(&res[first], sizeof(res) - first);
}
// Format a printf-style string with a va_list.
// Tries a 4 KiB stack buffer first; on overflow, allocates the exact required
// size and formats again. Returns an empty string on encoding/format error.
std::string fmt::_vformat(const char* fmt, va_list _args) noexcept
{
	// Fixed stack buffer for the first attempt
	std::array<char, 4096> fixed_buf;

	// Possibly dynamically allocated buffer for the second attempt
	std::unique_ptr<char[]> buf;

	// Pointer to the current buffer
	char* buf_addr = fixed_buf.data();

	for (std::size_t buf_size = fixed_buf.size();;)
	{
		// va_list may be consumed by vsnprintf, so format a fresh copy each pass
		va_list args;
		va_copy(args, _args);

#ifndef _MSC_VER
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-security"
#endif
		const int len = std::vsnprintf(buf_addr, buf_size, fmt, args);
#ifndef _MSC_VER
#pragma GCC diagnostic pop
#endif

		va_end(args);

		// Fixed: vsnprintf returns a negative value on error; the original
		// stored it in std::size_t, where it became a huge length guarded only
		// by assert() (a no-op in release builds), leading to a bogus allocation.
		if (len < 0)
		{
			return {};
		}

		if (static_cast<std::size_t>(len) < buf_size)
		{
			return{ buf_addr, static_cast<std::size_t>(len) };
		}

		// Not enough space: allocate the exact required size and retry once
		buf.reset(buf_addr = new char[buf_size = static_cast<std::size_t>(len) + 1]);
	}
}
// Variadic front-end for _vformat(): collects the arguments and formats them
std::string fmt::_format(const char* fmt...) noexcept
{
	va_list va;
	va_start(va, fmt);
	std::string str = fmt::_vformat(fmt, va);
	va_end(va);
	return str;
}
// Format the message before std::runtime_error is constructed: the comma
// expression runs va_start inside the base initializer because m_args is the
// only available storage for the va_list (see the header comment on m_args).
fmt::exception_base::exception_base(const char* fmt...)
	: std::runtime_error((va_start(m_args, fmt), _vformat(fmt, m_args)))
{
	va_end(m_args);
}
std::string fmt::replace_first(const std::string& src, const std::string& from, const std::string& to)
{
auto pos = src.find(from);

View file

@ -1,21 +1,16 @@
#pragma once
#include <array>
#include <cstdarg>
#include <string>
#include <vector>
#include <functional>
#include <memory>
#include "Platform.h"
#include "types.h"
#if defined(_MSC_VER) && _MSC_VER <= 1800
#define snprintf _snprintf
#endif
// Copy null-terminated string from std::string to char array with truncation
template<std::size_t N>
inline void strcpy_trunc(char(&dst)[N], const std::string& src)
force_inline void strcpy_trunc(char(&dst)[N], const std::string& src)
{
const std::size_t count = src.size() >= N ? N - 1 : src.size();
std::memcpy(dst, src.c_str(), count);
@ -24,13 +19,33 @@ inline void strcpy_trunc(char(&dst)[N], const std::string& src)
// Copy null-terminated string from char array to another char array with truncation
template<std::size_t N, std::size_t N2>
inline void strcpy_trunc(char(&dst)[N], const char(&src)[N2])
force_inline void strcpy_trunc(char(&dst)[N], const char(&src)[N2])
{
const std::size_t count = N2 >= N ? N - 1 : N2;
std::memcpy(dst, src, count);
dst[count] = '\0';
}
// Formatting helper, type-specific preprocessing for improving safety and functionality
// Primary template: pass the argument through unchanged.
// NOTE(review): the unnamed second parameter appears to be defaulted (to void)
// in a forward declaration elsewhere — confirm against types.h.
template<typename T, typename>
struct unveil
{
	// TODO
	static inline const T& get(const T& arg)
	{
		return arg;
	}
};
// Pass std::string arguments to printf-style formatting as C strings
template<>
struct unveil<std::string, void>
{
	static const char* get(const std::string& arg)
	{
		return arg.c_str();
	}
};
namespace fmt
{
std::string replace_first(const std::string& src, const std::string& from, const std::string& to);
@ -87,125 +102,45 @@ namespace fmt
std::string to_hex(u64 value, u64 count = 1);
std::string to_udec(u64 value);
std::string to_sdec(s64 value);
template<typename T, typename>
struct unveil
{
using result_type = T;
force_inline static result_type get_value(const T& arg)
{
return arg;
}
};
template<>
struct unveil<const char*, void>
{
using result_type = const char* const;
static result_type get_value(const char* const& arg)
{
return arg;
}
};
template<std::size_t N>
struct unveil<char[N], void>
{
using result_type = const char* const;
static result_type get_value(const char(&arg)[N])
{
return arg;
}
};
template<>
struct unveil<std::string, void>
{
using result_type = const char*;
static result_type get_value(const std::string& arg)
{
return arg.c_str();
}
};
template<typename T>
struct unveil<T, std::enable_if_t<std::is_enum<T>::value>>
{
using result_type = std::underlying_type_t<T>;
force_inline static result_type get_value(const T& arg)
{
return static_cast<result_type>(arg);
}
};
template<typename T>
force_inline typename unveil<T>::result_type do_unveil(const T& arg)
{
return unveil<T>::get_value(arg);
}
std::string _format(const char* fmt...) noexcept;
std::string _vformat(const char*, va_list) noexcept;
// Formatting function with special functionality (fmt::unveil)
template<typename... Args>
safe_buffers std::string format(const char* fmt, const Args&... args)
force_inline std::string format(const char* fmt, const Args&... args) noexcept
{
// fixed stack buffer for the first attempt
std::array<char, 4096> fixed_buf;
return _format(fmt, ::unveil<Args>::get(args)...);
}
// possibly dynamically allocated buffer for the second attempt
std::unique_ptr<char[]> buf;
// Helper class
class exception_base : public std::runtime_error
{
// Helper (there is no other room)
va_list m_args;
// pointer to the current buffer
char* buf_addr = fixed_buf.data();
protected:
// Internal formatting constructor
exception_base(const char* fmt...);
};
for (std::size_t buf_size = fixed_buf.size();;)
// Exception type derived from std::runtime_error with formatting constructor
class exception : public exception_base
{
public:
// Formatting constructor
template<typename... Args>
exception(const char* fmt, const Args&... args)
: exception_base(fmt, ::unveil<Args>::get(args)...)
{
#ifndef _MSC_VER
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-security"
#endif
const std::size_t len = std::snprintf(buf_addr, buf_size, fmt, do_unveil(args)...);
#ifndef _MSC_VER
#pragma GCC diagnostic pop
#endif
if (len > INT_MAX)
{
throw std::runtime_error("std::snprintf() failed");
}
if (len < buf_size)
{
return{ buf_addr, len };
}
buf.reset(buf_addr = new char[buf_size = len + 1]);
}
}
// Create exception of type T (std::runtime_error by default) with formatting
template<typename T = std::runtime_error, typename... Args>
never_inline safe_buffers T exception(const char* fmt, const Args&... args) noexcept(noexcept(T{ fmt }))
{
return T{ format(fmt, do_unveil(args)...).c_str() };
}
// Create exception of type T (std::runtime_error by default) without formatting
template<typename T = std::runtime_error>
safe_buffers T exception(const char* msg) noexcept(noexcept(T{ msg }))
{
return T{ msg };
}
};
// Narrow cast (similar to gsl::narrow) with exception message formatting
template<typename To, typename From, typename... Args>
inline auto narrow(const char* format_str, const From& value, const Args&... args) -> decltype(static_cast<To>(static_cast<From>(std::declval<To>())))
{
const auto result = static_cast<To>(value);
if (static_cast<From>(result) != value) throw fmt::exception(format_str, fmt::do_unveil(value), fmt::do_unveil(args)...);
if (static_cast<From>(result) != value) throw fmt::exception(format_str, value, args...);
return result;
}

View file

@ -52,38 +52,6 @@ static void report_fatal_error(const std::string& msg)
std::abort();
}
// Set the current thread's name as shown in the Visual Studio debugger
// (no-op on other compilers/platforms)
void SetCurrentThreadDebugName(const char* threadName)
{
#if defined(_MSC_VER) // this is VS-specific way to set thread names for the debugger

#pragma pack(push,8)

	struct THREADNAME_INFO
	{
		DWORD dwType;
		LPCSTR szName;
		DWORD dwThreadID;
		DWORD dwFlags;
	} info;

#pragma pack(pop)

	info.dwType = 0x1000;
	info.szName = threadName;
	info.dwThreadID = -1; // -1 = current thread
	info.dwFlags = 0;

	__try
	{
		// Magic exception number recognized by the VS debugger
		RaiseException(0x406D1388, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info);
	}
	__except (EXCEPTION_EXECUTE_HANDLER)
	{
		// Swallow the exception (expected when no debugger handles it)
	}
#endif
}
enum x64_reg_t : u32
{
X64R_RAX = 0,
@ -1295,34 +1263,204 @@ const bool s_self_test = []() -> bool
return true;
}();
#include <mutex>
#include <condition_variable>
#include <exception>
thread_local DECLARE(thread_ctrl::g_tls_this_thread) = nullptr;
// Lazily allocated synchronization state of a thread_ctrl
struct thread_ctrl::internal
{
	std::mutex mutex;

	std::condition_variable cond;
	std::condition_variable join; // Allows simultaneous joining

	task_stack atexit;

	std::exception_ptr exception; // Caught exception
};
// Temporarily until better interface is implemented
// Expose the current thread's internal condition variable
extern std::condition_variable& get_current_thread_cv()
{
	return thread_ctrl::get_current()->get_data()->cond;
}
// Expose the current thread's internal mutex (interim interface, see above)
extern std::mutex& get_current_thread_mutex()
{
	return thread_ctrl::get_current()->get_data()->mutex;
}
// TODO
atomic_t<u32> g_thread_count{ 0 };
extern atomic_t<u32> g_thread_count(0);
void thread_ctrl::initialize()
{
SetCurrentThreadDebugName(g_tls_this_thread->m_name.c_str());
// Initialize TLS variable
g_tls_this_thread = this;
#if defined(_MSC_VER)
struct THREADNAME_INFO
{
DWORD dwType;
LPCSTR szName;
DWORD dwThreadID;
DWORD dwFlags;
};
// Set thread name for VS debugger
if (IsDebuggerPresent())
{
THREADNAME_INFO info;
info.dwType = 0x1000;
info.szName = m_name.c_str();
info.dwThreadID = -1;
info.dwFlags = 0;
__try
{
RaiseException(0x406D1388, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info);
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
}
}
#endif
_log::g_tls_make_prefix = [](const auto&, auto, const auto&)
{
return g_tls_this_thread->m_name;
};
// TODO
++g_thread_count;
}
// Store the currently handled exception so join() can rethrow it later
void thread_ctrl::set_exception() noexcept
{
	initialize_once();
	m_data->exception = std::current_exception();
}
// Per-thread teardown: runs in the thread after the task returns or throws (see spawn()).
void thread_ctrl::finalize() noexcept
{
	// TODO
	vm::reservation_free();

	// TODO
	// Call atexit functions (current path: stack stored in lazily-created internal data)
	if (m_data) m_data->atexit.exec();

	--g_thread_count;

	// NOTE(review): atexit work appears to run twice — this `m_atexit` member path looks
	// like the pre-refactor version left behind by the diff; confirm which one belongs.
	g_tls_this_thread->m_atexit.exec();

#ifdef _MSC_VER
	// Report total CPU cycles consumed by this thread (Windows only)
	ULONG64 time;
	QueryThreadCycleTime(m_thread.native_handle(), &time);
	LOG_NOTICE(GENERAL, "Thread time: %f Gc", time / 1000000000.);
#endif
}
// Access the atexit task stack, creating the internal data block on first use
task_stack& thread_ctrl::get_atexit() const
{
	initialize_once();
	return m_data.load()->atexit;
}
// Destruction never blocks: a still-running thread is detached (the thread itself
// keeps the shared_ptr captured in spawn() alive until it finishes), then the
// lazily-allocated internal data is freed.
thread_ctrl::~thread_ctrl()
{
	if (m_thread.joinable()) m_thread.detach();

	delete m_data;
}
// Lazily allocate the internal data block (lock-free, safe to call concurrently).
// Exactly one competing allocation wins the CAS publish; losers free theirs.
void thread_ctrl::initialize_once() const
{
	if (!m_data)
	{
		auto ptr = new thread_ctrl::internal;

		// Publish our block; if another thread installed one first, discard ours
		if (!m_data.compare_and_swap_test(nullptr, ptr))
		{
			delete ptr;
		}
	}
}
// Join the thread; multiple concurrent callers are allowed.
// Only the first caller (m_joining was 0) performs the real std::thread::join();
// later callers block on the `join` condition variable until joinable() turns
// false. Finally, any exception captured via set_exception() is rethrown.
void thread_ctrl::join()
{
	if (m_thread.joinable())
	{
		// Increase contention counter (post-increment: zero means we won the race)
		if (m_joining++)
		{
			// Hard way: wait until the winner's join() completes
			initialize_once();
			std::unique_lock<std::mutex> lock(m_data->mutex);
			m_data->join.wait(lock, WRAP_EXPR(!m_thread.joinable()));
		}
		else
		{
			// Winner joins the thread
			m_thread.join();

			// Notify others if necessary
			if (m_joining > 1)
			{
				initialize_once();

				// Serialize for reliable notification: after this lock/unlock,
				// every waiter is either before wait() (re-checks the predicate)
				// or fully inside it, so notify_all() cannot be lost
				m_data->mutex.lock();
				m_data->mutex.unlock();
				m_data->join.notify_all();
			}
		}
	}

	// Propagate the thread's stored exception, if any
	if (m_data && m_data->exception)
	{
		std::rethrow_exception(m_data->exception);
	}
}
// Wake this thread after its wait condition was changed locklessly.
// The empty lock/unlock pair guarantees the target thread is either not yet
// waiting (and will re-check the condition) or fully parked inside cond.wait(),
// so the notify_one() below cannot be lost.
void thread_ctrl::lock_notify() const
{
	// Notifying ourselves is pointless: this thread is clearly not waiting
	if (UNLIKELY(g_tls_this_thread == this))
	{
		return;
	}

	initialize_once();

	// Serialize for reliable notification, condition is assumed to be changed externally
	m_data->mutex.lock();
	m_data->mutex.unlock();
	m_data->cond.notify_one();
}
// Wake the thread without the lock/unlock handshake; the caller is responsible
// for making the condition change visible (otherwise use lock_notify())
void thread_ctrl::notify() const
{
	initialize_once();
	m_data.load()->cond.notify_one();
}
// Expose the internal data block, creating it on first access
thread_ctrl::internal* thread_ctrl::get_data() const
{
	initialize_once();
	return m_data.load();
}
// Default constructor, defined out of line
named_thread::named_thread()
{
}
named_thread::~named_thread()
{
	// Trace object destruction for debugging
	LOG_TRACE(GENERAL, "%s", __func__);
}
std::string named_thread::get_name() const
@ -1332,8 +1470,6 @@ std::string named_thread::get_name() const
void named_thread::start()
{
Expects(!m_thread);
// Get shared_ptr instance (will throw if called from the constructor or the object has been created incorrectly)
auto&& ptr = shared_from_this();
@ -1359,19 +1495,3 @@ void named_thread::start()
thread->on_exit();
});
}
// Join the owned thread_ctrl and release it afterwards; the reference is
// dropped even when join() rethrows the thread's stored exception.
void named_thread::join()
{
	Expects(m_thread);

	try
	{
		m_thread->join();
	}
	catch (...)
	{
		m_thread.reset();
		throw;
	}

	m_thread.reset();
}

View file

@ -1,13 +1,11 @@
#pragma once
#include <exception>
#include <string>
#include <memory>
#include <thread>
#include <mutex>
#include <condition_variable>
#include "Platform.h"
#include "Atomic.h"
// Will report exception and call std::abort() if put in catch(...)
[[noreturn]] void catch_all_exceptions();
@ -32,12 +30,6 @@ class task_stack
std::unique_ptr<task_base> m_stack;
never_inline void push(std::unique_ptr<task_base> task)
{
m_stack.swap(task->next);
m_stack.swap(task);
}
public:
template<typename F>
void push(F&& func)
@ -58,7 +50,16 @@ public:
}
};
return push(std::unique_ptr<task_base>{ new task_t(std::forward<F>(func)) });
auto _top = new task_t(std::forward<F>(func));
auto _next = m_stack.release();
m_stack.reset(_top);
#ifndef _MSC_VER
_top->next.reset(_next);
#else
auto& next = _top->next;
next.release();
next.reset(_next);
#endif
}
void reset()
@ -78,25 +79,33 @@ public:
// Thread control class
class thread_ctrl final
{
struct internal;
static thread_local thread_ctrl* g_tls_this_thread;
// Thread handle
std::thread m_thread;
// Thread join contention counter
atomic_t<uint> m_joining{};
// Fixed name
std::string m_name;
// Thread handle (be careful)
std::thread m_thread;
// Thread result (exception)
std::exception_ptr m_exception;
// Functions scheduled at thread exit
task_stack m_atexit;
// Thread internals
mutable atomic_t<internal*> m_data{};
// Called at the thread start
static void initialize();
void initialize();
// Set std::current_exception
void set_exception() noexcept;
// Called at the thread end
static void finalize() noexcept;
void finalize() noexcept;
// Get atexit function
task_stack& get_atexit() const;
public:
template<typename N>
@ -108,13 +117,7 @@ public:
// Disable copy/move constructors and operators
thread_ctrl(const thread_ctrl&) = delete;
~thread_ctrl()
{
if (m_thread.joinable())
{
m_thread.detach();
}
}
~thread_ctrl();
// Get thread name
const std::string& get_name() const
@ -122,19 +125,20 @@ public:
return m_name;
}
// Get thread result (may throw)
void join()
{
if (m_thread.joinable())
{
m_thread.join();
}
// Initialize internal data
void initialize_once() const;
if (auto&& e = std::move(m_exception))
{
std::rethrow_exception(e);
}
}
// Get thread result (may throw, simultaneous joining allowed)
void join();
// Lock, unlock, notify the thread (required if the condition changed locklessly)
void lock_notify() const;
// Notify the thread, beware the condition change
void notify() const;
//
internal* get_data() const;
// Get current thread (may be nullptr)
static const thread_ctrl* get_current()
@ -146,34 +150,30 @@ public:
template<typename F>
static inline void at_exit(F&& func)
{
return g_tls_this_thread->m_atexit.push(std::forward<F>(func));
return g_tls_this_thread->get_atexit().push(std::forward<F>(func));
}
// Named thread factory
template<typename N, typename F>
static inline std::shared_ptr<thread_ctrl> spawn(N&& name, F&& func)
template<typename N, typename F, typename... Args>
static inline std::shared_ptr<thread_ctrl> spawn(N&& name, F&& func, Args&&... args)
{
auto ctrl = std::make_shared<thread_ctrl>(std::forward<N>(name));
ctrl->m_thread = std::thread([ctrl, task = std::forward<F>(func)]()
ctrl->m_thread = std::thread([ctrl, task = std::forward<F>(func)](Args&&... args)
{
// Initialize TLS variable
g_tls_this_thread = ctrl.get();
try
{
initialize();
task();
finalize();
ctrl->initialize();
task(std::forward<Args>(args)...);
}
catch (...)
{
finalize();
// Set exception
ctrl->m_exception = std::current_exception();
ctrl->set_exception();
}
});
ctrl->finalize();
}, std::forward<Args>(args)...);
return ctrl;
}
@ -185,21 +185,27 @@ class named_thread : public std::enable_shared_from_this<named_thread>
std::shared_ptr<thread_ctrl> m_thread;
public:
// Thread condition variable for external use (this thread waits on it, other threads may notify)
std::condition_variable cv;
named_thread();
// Thread mutex for external use (can be used with `cv`)
std::mutex mutex;
virtual ~named_thread();
// Lock mutex, notify condition variable
void safe_notify()
{
// Lock for reliable notification, condition is assumed to be changed externally
std::unique_lock<std::mutex> lock(mutex);
// Deleted copy/move constructors + copy/move operators
named_thread(const named_thread&) = delete;
cv.notify_one();
}
// Get thread name
virtual std::string get_name() const;
protected:
// Start thread (cannot be called from the constructor: should throw bad_weak_ptr in such case)
void start();
// Thread task (called in the thread)
virtual void on_task() = 0;
// Thread finalization (called after on_task)
virtual void on_exit() {}
public:
// ID initialization
virtual void on_init()
{
@ -209,43 +215,25 @@ public:
// ID finalization
	// ID finalization: blocks until the worker thread has finished
	virtual void on_stop()
	{
		// NOTE(review): two join calls appear here — `join()` (old named_thread
		// member) and `m_thread->join()` (new thread_ctrl API) look like
		// overlapping pre/post-refactor diff versions; confirm only one belongs.
		join();
		m_thread->join();
	}
protected:
// Thread task (called in the thread)
virtual void on_task() = 0;
// Thread finalization (called after on_task)
virtual void on_exit() {}
public:
named_thread() = default;
virtual ~named_thread() = default;
// Deleted copy/move constructors + copy/move operators
named_thread(const named_thread&) = delete;
// Get thread name
virtual std::string get_name() const;
// Start thread (cannot be called from the constructor: should throw bad_weak_ptr in such case)
void start();
// Join thread (get thread result)
void join();
// Get thread_ctrl
const thread_ctrl* get_thread_ctrl() const
const thread_ctrl* operator->() const
{
return m_thread.get();
}
// Compare with the current thread
bool is_current() const
// Lock mutex, notify condition variable
void lock_notify() const
{
return m_thread && thread_ctrl::get_current() == m_thread.get();
m_thread->lock_notify();
}
	// Notify condition variable: forwards to the underlying thread_ctrl
	// (no lock/unlock handshake; beware the condition change visibility)
	void notify() const
	{
		m_thread->notify();
	}
};

View file

@ -79,11 +79,8 @@ struct atomic_test_and_complement;
template<typename T>
class atomic_t;
namespace fmt
{
template<typename T, typename = void>
struct unveil;
}
template<typename T, typename = void>
struct unveil;
// TODO: replace with std::void_t when available
namespace void_details
@ -409,6 +406,38 @@ struct ignore
}
};
// Contains value of any POD type with fixed size and alignment. TT<> is the type converter applied.
// For example, `simple_t` may be used to remove endianness.
template<template<typename> class TT, std::size_t S, std::size_t A = S>
struct alignas(A) any_pod
{
	// Raw storage; a distinct scoped-enum byte type avoids accidental arithmetic on it
	enum class byte : char {} data[S];

	// Leaves the storage uninitialized (POD semantics)
	any_pod() = default;

	// Implicit construction from any value whose converted type TT<T> is POD
	// and exactly fills the storage (sizeof == S, alignof <= A).
	// NOTE(review): if TT<T> contains padding, those bytes stay uninitialized —
	// confirm callers never compare the raw storage bytewise.
	template<typename T, typename T2 = TT<T>, typename = std::enable_if_t<std::is_pod<T2>::value && sizeof(T2) == S && alignof(T2) <= A>>
	any_pod(const T& value)
	{
		reinterpret_cast<T2&>(data) = value;
	}

	// Reinterpret the storage as a mutable TT<T>& (same size/alignment constraints)
	template<typename T, typename T2 = TT<T>, typename = std::enable_if_t<std::is_pod<T2>::value && sizeof(T2) == S && alignof(T2) <= A>>
	T2& as()
	{
		return reinterpret_cast<T2&>(data);
	}

	// Reinterpret the storage as a const TT<T>&
	template<typename T, typename T2 = TT<T>, typename = std::enable_if_t<std::is_pod<T2>::value && sizeof(T2) == S && alignof(T2) <= A>>
	const T2& as() const
	{
		return reinterpret_cast<const T2&>(data);
	}
};

// Fixed-size carriers for 16/32/64-bit POD values (simple_t applied as the converter)
using any16 = any_pod<simple_t, sizeof(u16)>;
using any32 = any_pod<simple_t, sizeof(u32)>;
using any64 = any_pod<simple_t, sizeof(u64)>;
// Allows to define integer convertible to multiple enum types
template<typename T = void, typename... Ts>
struct multicast : multicast<Ts...>