atomic.cpp: shrink and simplify main hashtable

Reduce collision detection to 1 or 0 for now.
I think it should be offloaded to notifiers.
This commit is contained in:
Nekotekina 2020-11-26 07:35:25 +03:00
parent 8b6d615aa6
commit bd90e3e37f
2 changed files with 41 additions and 97 deletions

View file

@@ -89,7 +89,7 @@ namespace
namespace atomic_wait namespace atomic_wait
{ {
extern void parse_hashtable(bool(*cb)(u64 id, u16 refs, u32 ptr, u32 stats)); extern void parse_hashtable(bool(*cb)(u64 id, u32 refs, u64 ptr, u32 stats));
} }
template<> template<>
@@ -1919,19 +1919,13 @@ void Emulator::Stop(bool restart)
aw_colc = 0; aw_colc = 0;
aw_used = 0; aw_used = 0;
atomic_wait::parse_hashtable([](u64 id, u16 refs, u32 ptr, u32 stats) -> bool atomic_wait::parse_hashtable([](u64 id, u32 refs, u64 ptr, u32 maxc) -> bool
{ {
aw_refs += refs; aw_refs += refs != 0;
aw_used += ptr != 0; aw_used += ptr != 0;
stats = (stats & 0xaaaaaaaa) / 2 + (stats & 0x55555555); aw_colm = std::max<u64>(aw_colm, maxc);
stats = (stats & 0xcccccccc) / 4 + (stats & 0x33333333); aw_colc += maxc != 0;
stats = (stats & 0xf0f0f0f0) / 16 + (stats & 0xf0f0f0f);
stats = (stats & 0xff00ff00) / 256 + (stats & 0xff00ff);
stats = (stats >> 16) + (stats & 0xffff);
aw_colm = std::max<u64>(aw_colm, stats);
aw_colc += stats != 0;
return false; return false;
}); });

View file

@@ -21,7 +21,7 @@
#include "endian.hpp" #include "endian.hpp"
// Total number of entries, should be a power of 2. // Total number of entries, should be a power of 2.
static constexpr std::size_t s_hashtable_size = 1u << 17; static constexpr std::size_t s_hashtable_size = 1u << 16;
// Reference counter combined with shifted pointer (which is assumed to be 47 bit) // Reference counter combined with shifted pointer (which is assumed to be 47 bit)
static constexpr std::uintptr_t s_ref_mask = (1u << 17) - 1; static constexpr std::uintptr_t s_ref_mask = (1u << 17) - 1;
@@ -778,16 +778,19 @@ namespace
{ {
struct alignas(16) slot_allocator struct alignas(16) slot_allocator
{ {
u64 ref : 16; u64 ref : 16; // Ref counter
u64 low : 48; u64 bits: 24; // Allocated bits
u64 high; u64 prio: 24; // Reserved
u64 maxc: 17; // Collision counter
u64 iptr: 47; // First pointer to use slot (to count used slots)
}; };
// Need to spare 16 bits for ref counter // Need to spare 16 bits for ref counter
static constexpr u64 max_threads = 112; static constexpr u64 max_threads = 24;
// (Arbitrary, not justified) Can only allow extended allocations go as far as this (about 585) // (Arbitrary, not justified) Can only allow extended allocations go as far as this
static constexpr u64 max_distance = UINT16_MAX / max_threads; static constexpr u64 max_distance = 500;
// Thread list // Thread list
struct alignas(64) root_info struct alignas(64) root_info
@@ -798,12 +801,6 @@ namespace
// Allocation pool, pointers to allocated semaphores // Allocation pool, pointers to allocated semaphores
atomic_t<u16> slots[max_threads]; atomic_t<u16> slots[max_threads];
// For collision statistics (32 middle bits)
atomic_t<u32> first_ptr;
// For collision statistics (bit difference stick flags)
atomic_t<u32> diff_lz, diff_tz, diff_pop;
static atomic_t<u16>* slot_alloc(std::uintptr_t ptr) noexcept; static atomic_t<u16>* slot_alloc(std::uintptr_t ptr) noexcept;
static void slot_free(std::uintptr_t ptr, atomic_t<u16>* slot, u32 tls_slot) noexcept; static void slot_free(std::uintptr_t ptr, atomic_t<u16>* slot, u32 tls_slot) noexcept;
@@ -811,10 +808,11 @@ namespace
template <typename F> template <typename F>
static auto slot_search(std::uintptr_t iptr, u32 size, u64 thread_id, __m128i mask, F func) noexcept; static auto slot_search(std::uintptr_t iptr, u32 size, u64 thread_id, __m128i mask, F func) noexcept;
void register_collisions(std::uintptr_t ptr); // Somehow update information about collisions (TODO)
void register_collisions(std::uintptr_t ptr, u64 max_coll);
}; };
static_assert(sizeof(root_info) == 256); static_assert(sizeof(root_info) == 64);
} }
// Main hashtable for atomic wait. // Main hashtable for atomic wait.
@@ -887,27 +885,23 @@ atomic_t<u16>* root_info::slot_alloc(std::uintptr_t ptr) noexcept
return nullptr; return nullptr;
} }
if (bits.iptr == 0)
bits.iptr = ptr;
if (bits.maxc == 0 && bits.iptr != ptr && bits.ref)
bits.maxc = 1;
bits.ref++; bits.ref++;
if (~bits.high) if (bits.bits != (1ull << max_threads) - 1)
{ {
const u32 id = std::countr_one(bits.high); const u32 id = std::countr_one(bits.bits);
bits.high |= bits.high + 1; bits.bits |= bits.bits + 1;
return _this->slots + id; return _this->slots + id;
} }
if (~bits.low << 16)
{
const u32 id = std::countr_one(bits.low);
bits.low |= bits.low + 1;
return _this->slots + 64 + id;
}
return nullptr; return nullptr;
}); });
_this->register_collisions(ptr);
if (slot) if (slot)
{ {
break; break;
@@ -918,7 +912,7 @@ atomic_t<u16>* root_info::slot_alloc(std::uintptr_t ptr) noexcept
if (limit == max_distance) [[unlikely]] if (limit == max_distance) [[unlikely]]
{ {
fmt::raw_error("Distance limit (585) exceeded for the atomic wait hashtable."); fmt::raw_error("Distance limit (500) exceeded for the atomic wait hashtable.");
return nullptr; return nullptr;
} }
} }
@@ -926,44 +920,17 @@ atomic_t<u16>* root_info::slot_alloc(std::uintptr_t ptr) noexcept
return slot; return slot;
} }
void root_info::register_collisions(std::uintptr_t ptr) void root_info::register_collisions(std::uintptr_t ptr, u64 max_coll)
{ {
u32 ptr32 = static_cast<u32>(ptr >> 16); bits.atomic_op([&](slot_allocator& bits)
u32 first = first_ptr.load();
if (!first && first != ptr32)
{ {
// Register first used pointer if (bits.iptr == 0)
first = first_ptr.compare_and_swap(0, ptr32); bits.iptr = ptr;
} if (bits.maxc == 0 && bits.iptr != ptr)
bits.maxc = 1;
if (first && first != ptr32) if (bits.maxc < max_coll)
{ bits.maxc = max_coll;
// Difference bits between pointers });
u32 diff = first ^ ptr32;
// The most significant different bit
u32 diff1 = std::countl_zero(diff);
if (diff1 < 32)
{
diff_lz |= 1u << diff1;
}
u32 diff2 = std::countr_zero(diff);
if (diff2 < 32)
{
diff_tz |= 1u << diff2;
}
diff = (diff & 0xaaaaaaaa) / 2 + (diff & 0x55555555);
diff = (diff & 0xcccccccc) / 4 + (diff & 0x33333333);
diff = (diff & 0xf0f0f0f0) / 16 + (diff & 0x0f0f0f0f);
diff = (diff & 0xff00ff00) / 256 + (diff & 0x00ff00ff);
diff_pop |= 1u << static_cast<u8>((diff >> 16) + diff - 1);
}
} }
void root_info::slot_free(std::uintptr_t iptr, atomic_t<u16>* slot, u32 tls_slot) noexcept void root_info::slot_free(std::uintptr_t iptr, atomic_t<u16>* slot, u32 tls_slot) noexcept
@@ -1008,14 +975,7 @@ void root_info::slot_free(std::uintptr_t iptr, atomic_t<u16>* slot, u32 tls_slot
if (_this == curr.current) if (_this == curr.current)
{ {
if (diff < 64) bits.bits &= ~(1ull << diff);
{
bits.high &= ~(1ull << diff);
}
else
{
bits.low &= ~(1ull << (diff - 64));
}
} }
}); });
@@ -1044,19 +1004,9 @@ FORCE_INLINE auto root_info::slot_search(std::uintptr_t iptr, u32 size, u64 thre
u16 cond_ids[max_threads]; u16 cond_ids[max_threads];
u32 cond_count = 0; u32 cond_count = 0;
u64 high_val = bits.high; u64 bits_val = bits.bits;
u64 low_val = bits.low;
for (u64 bits = high_val; bits; bits &= bits - 1) for (u64 bits = bits_val; bits; bits &= bits - 1)
{
if (u16 cond_id = _this->slots[std::countr_zero(bits)])
{
utils::prefetch_read(s_cond_list + cond_id);
cond_ids[cond_count++] = cond_id;
}
}
for (u64 bits = low_val; bits; bits &= bits - 1)
{ {
if (u16 cond_id = _this->slots[std::countr_zero(bits)]) if (u16 cond_id = _this->slots[std::countr_zero(bits)])
{ {
@@ -1651,14 +1601,14 @@ atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask)
namespace atomic_wait namespace atomic_wait
{ {
extern void parse_hashtable(bool(*cb)(u64 id, u16 refs, u32 ptr, u32 stats)) extern void parse_hashtable(bool(*cb)(u64 id, u32 refs, u64 ptr, u32 max_coll))
{ {
for (u64 i = 0; i < s_hashtable_size; i++) for (u64 i = 0; i < s_hashtable_size; i++)
{ {
const auto root = &s_hashtable[i]; const auto root = &s_hashtable[i];
const auto slot = root->bits.load(); const auto slot = root->bits.load();
if (cb(i, static_cast<u16>(slot.ref), root->first_ptr.load(), root->diff_lz | root->diff_tz | root->diff_pop)) if (cb(i, static_cast<u32>(slot.ref), slot.iptr, static_cast<u32>(slot.maxc)))
{ {
break; break;
} }