atomic.cpp: minor notification improvement

Check the mask for overlap before locking the reference.
This can help avoid disturbing non-eligible waiters.
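
For illustration, a minimal standalone sketch of the eligibility test this change moves ahead of the reference lock. The helper name and parameters are hypothetical; the intrinsics mirror the packed-saturate overlap test added to cond_id_lock in the diff below.

#include <emmintrin.h> // SSE2 intrinsics, as in atomic.cpp

// Hypothetical helper: true if the notifier's mask shares any bits with
// the mask the waiter registered. A zero result lets the notifier skip
// that waiter entirely, without touching its reference counter.
static bool mask_overlaps(__m128i waiter_mask, __m128i notify_mask)
{
	const __m128i both = _mm_and_si128(waiter_mask, notify_mask);

	// Saturating 16->8 pack folds all eight 16-bit lanes into the low
	// 64 bits; the result is zero only if every lane of 'both' is zero.
	return _mm_cvtsi128_si64(_mm_packs_epi16(both, both)) != 0;
}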
Nekotekina 2020-11-11 11:32:52 +03:00
parent 4b823469f7
commit d391133524


@@ -207,7 +207,7 @@ ptr_cmp(const void* data, u32 _size, __m128i old128, __m128i mask128, atomic_wai
 }
 
 // Returns true if mask overlaps, or the argument is invalid
-static u32
+static bool
 #ifdef _WIN32
 __vectorcall
 #endif
@@ -219,7 +219,7 @@ cmp_mask(u32 size1, __m128i mask1, __m128i val1, u32 size2, __m128i mask2, __m12
 		// Simple mask overlap
 		const auto v0 = _mm_and_si128(mask1, mask2);
 		const auto v1 = _mm_packs_epi16(v0, v0);
-		return _mm_cvtsi128_si64(v1) ? 1 : 0;
+		return !!_mm_cvtsi128_si64(v1);
 	}
 
 	// Generate masked value inequality bits
@@ -243,14 +243,14 @@ cmp_mask(u32 size1, __m128i mask1, __m128i val1, u32 size2, __m128i mask2, __m12
 		if (!(_mm_cvtsi128_si64(v0) & mask))
 		{
-			return flag & op_flag::inverse ? 2 : 0;
+			return !!(flag & op_flag::inverse);
 		}
 	}
 	else if (size == 16)
 	{
 		if (!_mm_cvtsi128_si64(_mm_packs_epi16(v0, v0)))
 		{
-			return flag & op_flag::inverse ? 2 : 0;
+			return !!(flag & op_flag::inverse);
 		}
 	}
 	else
@@ -259,8 +259,7 @@ cmp_mask(u32 size1, __m128i mask1, __m128i val1, u32 size2, __m128i mask2, __m12
 		std::abort();
 	}
 
-	// Use force wake-up
-	return flag & op_flag::inverse ? 0 : 2;
+	return !(flag & op_flag::inverse);
 }
 
 static atomic_t<u64> s_min_tsc{0};
@@ -367,8 +366,8 @@ namespace atomic_wait
 			mtx.init(mtx);
 #endif
 
-			// Initialize first reference
-			verify(HERE), !refs++;
+			verify(HERE), !refs;
+			refs.release(1);
 		}
 
 		void destroy()
@@ -379,7 +378,9 @@ namespace atomic_wait
 			link = 0;
 			size = 0;
 			flag = 0;
-			sync = 0;
+			sync.release(0);
+			mask = _mm_setzero_si128();
+			oldv = _mm_setzero_si128();
 
 #ifdef USE_STD
 			mtx.destroy();
@@ -391,7 +392,7 @@ namespace atomic_wait
 		{
 			const auto [_old, ok] = sync.fetch_op([](u32& val)
 			{
-				if (val == 1 || val == 2)
+				if (val - 1 <= 1)
 				{
 					val = 3;
 					return true;
@@ -408,7 +409,18 @@ namespace atomic_wait
 		{
 			if (cmp_res == 1) [[likely]]
 			{
-				return sync == 1 && sync.compare_and_swap_test(1, 2);
+				const auto [_old, ok] = sync.fetch_op([](u32& val)
+				{
+					if (val == 1)
+					{
+						val = 2;
+						return true;
+					}
+
+					return false;
+				});
+
+				return ok;
 			}
 
 			if (cmp_res > 1) [[unlikely]]
@@ -421,6 +433,22 @@ namespace atomic_wait
 			return false;
 		}
 
+		bool set_sleep()
+		{
+			const auto [_old, ok] = sync.fetch_op([](u32& val)
+			{
+				if (val == 2)
+				{
+					val = 1;
+					return true;
+				}
+
+				return false;
+			});
+
+			return ok;
+		}
+
 		void alert_native()
 		{
 #ifdef USE_FUTEX
@@ -493,7 +521,11 @@ static atomic_t<u64, 64> s_cond_bits[(UINT16_MAX + 1) / 64]{};
 // Allocation semaphore
 static atomic_t<u32, 64> s_cond_sema{0};
 
-static u32 cond_alloc(const void* data)
+static u32
+#ifdef _WIN32
+__vectorcall
+#endif
+cond_alloc(const void* data, __m128i mask)
 {
 	// Determine whether there is a free slot or not
 	if (!s_cond_sema.try_inc(UINT16_MAX + 1))
@@ -540,6 +572,7 @@ static u32 cond_alloc(const void* data)
 	}
 
 	// Initialize new "semaphore"
+	s_cond_list[id].mask = mask;
 	s_cond_list[id].init(data);
 
 	return id;
@@ -577,7 +610,11 @@ static void cond_free(u32 cond_id)
 	verify(HERE), s_cond_sema--;
 }
 
-static atomic_wait::cond_handle* cond_id_lock(u32 cond_id, u64 thread_id = 0, const void* data = nullptr)
+static atomic_wait::cond_handle*
+#ifdef _WIN32
+__vectorcall
+#endif
+cond_id_lock(u32 cond_id, __m128i mask, u64 thread_id = 0, const void* data = nullptr)
 {
 	if (cond_id - 1 < u32{UINT16_MAX})
 	{
@@ -598,6 +635,8 @@ static atomic_wait::cond_handle* cond_id_lock(u32 cond_id, u64 thread_id = 0, co
 				return false;
 			}
 
+			const __m128i mask12 = _mm_and_si128(mask, _mm_load_si128(&cond->mask));
+
 			if (thread_id)
 			{
 				if (atomic_storage<u64>::load(cond->tid) != thread_id)
@@ -614,6 +653,11 @@ static atomic_wait::cond_handle* cond_id_lock(u32 cond_id, u64 thread_id = 0, co
 				}
 			}
 
+			if (_mm_cvtsi128_si64(_mm_packs_epi16(mask12, mask12)) == 0)
+			{
+				return false;
+			}
+
 			ref++;
 			return true;
 		});
@@ -625,7 +669,7 @@ static atomic_wait::cond_handle* cond_id_lock(u32 cond_id, u64 thread_id = 0, co
 
 		if (old == UINT16_MAX)
 		{
-			fmt::raw_error("Thread limit " STRINGIZE(UINT16_MAX) " reached in an atomic notifier.");
+			fmt::raw_error("Reference count limit " STRINGIZE(UINT16_MAX) " reached in an atomic notifier.");
 		}
 	}
@@ -664,7 +708,7 @@ namespace atomic_wait
 		root_info* slot_free(atomic_t<u16>* slot) noexcept;
 
 		template <typename F>
-		auto slot_search(const void* data, u64 thread_id, F func) noexcept;
+		auto slot_search(const void* data, u64 thread_id, __m128i mask, F func) noexcept;
 	};
 
 	static_assert(sizeof(root_info) == 128);
@@ -861,7 +905,7 @@ atomic_wait::root_info* atomic_wait::root_info::slot_free(atomic_t<u16>* slot) n
 }
 
 template <typename F>
-auto atomic_wait::root_info::slot_search(const void* data, u64 thread_id, F func) noexcept
+FORCE_INLINE auto atomic_wait::root_info::slot_search(const void* data, u64 thread_id, __m128i mask, F func) noexcept
 {
 	const u64 bits_val = this->bits.load();
 	const u64 max_order = bits_val >> max_threads;
@@ -869,7 +913,6 @@ auto atomic_wait::root_info::slot_search(const void* data, u64 thread_id, F func
 
 	auto* _this = this;
 	u32 order = 0;
-	u32 count = 0;
 
 	u64 new_val = bits_val & thread_mask;
@@ -889,7 +932,7 @@ auto atomic_wait::root_info::slot_search(const void* data, u64 thread_id, F func
 
 		for (u32 i = 0; i < cond_max; i++)
 		{
-			if (cond_id_lock(cond_ids[i], thread_id, data))
+			if (cond_id_lock(cond_ids[i], mask, thread_id, data))
 			{
 				if (func(cond_ids[i]))
 				{
@@ -956,13 +999,13 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
 		}
 	}
 
-	const u32 cond_id = cond_alloc(data);
+	const u32 cond_id = cond_alloc(data, mask);
 
 	u32 cond_id_ext[atomic_wait::max_list - 1]{};
 
 	for (u32 i = 0; i < ext_size; i++)
 	{
-		cond_id_ext[i] = cond_alloc(ext[i].data);
+		cond_id_ext[i] = cond_alloc(ext[i].data, ext[i].mask);
 	}
 
 	const auto slot = root->slot_alloc(iptr);
@@ -987,17 +1030,17 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
 	cond->link = 0;
 	cond->size = static_cast<u8>(size);
 	cond->flag = static_cast<u8>(size >> 8);
-	cond->mask = mask;
 	cond->oldv = old_value;
 	cond->tsc0 = stamp0;
+	cond->sync.release(1);
 
 	for (u32 i = 0; i < ext_size; i++)
 	{
 		// Extensions point to original cond_id, copy remaining info
 		cond_ext[i]->link = cond_id;
 		cond_ext[i]->size = static_cast<u8>(ext[i].size);
 		cond_ext[i]->flag = static_cast<u8>(ext[i].size >> 8);
-		cond_ext[i]->mask = ext[i].mask;
 		cond_ext[i]->oldv = ext[i].old;
 		cond_ext[i]->tsc0 = stamp0;
@@ -1005,19 +1048,20 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
 		cond_ext[i]->sync.release(4);
 	}
 
-	cond->sync.release(1);
+	// Final deployment
+	slot->store(static_cast<u16>(cond_id));
 
 	for (u32 i = 0; i < ext_size; i++)
 	{
 		slot_ext[i]->release(static_cast<u16>(cond_id_ext[i]));
 	}
 
-	// Final deployment
-	slot->store(static_cast<u16>(cond_id));
-
 #ifdef USE_STD
 	// Lock mutex
 	std::unique_lock lock(*cond->mtx.get());
+#else
+	if (ext_size)
+		_mm_mfence();
 #endif
 
 	// Can skip unqueue process if true
@@ -1034,22 +1078,24 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
 		ts.tv_sec = timeout / 1'000'000'000;
 		ts.tv_nsec = timeout % 1'000'000'000;
 
-		if (cond->sync.load() > 1) [[unlikely]]
+		const u32 val = cond->sync;
+
+		if (val > 1) [[unlikely]]
 		{
 			// Signaled prematurely
-			if (cond->sync.load() == 3 || !cond->sync.compare_and_swap_test(2, 1))
+			if (!cond->set_sleep())
 			{
 				break;
 			}
 		}
 		else
 		{
-			futex(&cond->sync, FUTEX_WAIT_PRIVATE, 1, timeout + 1 ? &ts : nullptr);
+			futex(&cond->sync, FUTEX_WAIT_PRIVATE, val, timeout + 1 ? &ts : nullptr);
 		}
 #elif defined(USE_STD)
-		if (cond->sync.load() > 1) [[unlikely]]
+		if (cond->sync > 1) [[unlikely]]
 		{
-			if (cond->sync.load() == 3 || !cond->sync.compare_and_swap_test(2, 1))
+			if (!cond->set_sleep())
 			{
 				break;
 			}
@@ -1075,7 +1121,7 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
 
 		if (fallback) [[unlikely]]
 		{
-			if (cond->sync.load() == 3 || !cond->sync.compare_and_swap_test(2, 1))
+			if (!cond->set_sleep())
 			{
 				fallback = false;
 				break;
@@ -1119,7 +1165,7 @@ atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 time
 #if defined(_WIN32)
 		static LARGE_INTEGER instant{};
 
-		if (cond->sync.compare_and_swap_test(1, 2))
+		if (cond->wakeup(1))
 		{
 			// Succeeded in self-notifying
 			break;
@@ -1176,11 +1222,11 @@ alert_sema(u32 cond_id, const void* data, u64 tid, u32 size, __m128i mask, __m12
 
 	u32 ok = 0;
 
-	if (cond->sync && (!size ? (!tid || cond->tid == tid) : (cond->ptr == data && cmp_mask(size, mask, new_value, cond->size | (cond->flag << 8), cond->mask, cond->oldv))))
+	if (!size ? (!tid || cond->tid == tid) : (cond->ptr == data && cmp_mask(size, mask, new_value, cond->size | (cond->flag << 8), cond->mask, cond->oldv)))
 	{
 		// Redirect if necessary
 		const auto _old = cond;
-		const auto _new = _old->link ? cond_id_lock(_old->link) : _old;
+		const auto _new = _old->link ? cond_id_lock(_old->link, _mm_set1_epi64x(-1)) : _old;
 
 		if (_new && _new->tsc0 == _old->tsc0)
 		{
@@ -1278,7 +1324,7 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id)
 			if (thread_id)
 			{
 				// Check thread if provided
-				if (s_cond_list[i].tid != thread_id)
+				if (atomic_storage<u64>::load(cond->tid) != thread_id)
 				{
 					return false;
 				}
@@ -1329,7 +1375,7 @@ bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id)
 
 	u64 progress = 0;
 
-	root->slot_search(data, thread_id, [&](u32 cond_id)
+	root->slot_search(data, thread_id, _mm_set1_epi64x(-1), [&](u32 cond_id)
 	{
 		// Forced notification
 		if (alert_sema(cond_id, data, thread_id, 0, _mm_setzero_si128(), _mm_setzero_si128()))
@@ -1367,7 +1413,7 @@ atomic_wait_engine::notify_one(const void* data, u32 size, __m128i mask, __m128i
 
 	u64 progress = 0;
 
-	root->slot_search(data, 0, [&](u32 cond_id)
+	root->slot_search(data, 0, mask, [&](u32 cond_id)
 	{
 		if (alert_sema(cond_id, data, -1, size, mask, new_value))
 		{
@@ -1401,7 +1447,7 @@ atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask, __m128i
 	// Array itself.
 	u16 cond_ids[UINT16_MAX + 1];
 
-	root->slot_search(data, 0, [&](u32 cond_id)
+	root->slot_search(data, 0, mask, [&](u32 cond_id)
 	{
 		u32 res = alert_sema<true>(cond_id, data, -1, size, mask, new_value);