#651, for 64 bit type on x86 __iso_volatile_store64 #694

Merged: 50 commits, Jun 30, 2020
Commits
618ec58
#651, for 64 bit type on x86 __iso_volatile_store64
AlexGuteniev Apr 8, 2020
c13a661
Comment fall through
AlexGuteniev Apr 8, 2020
1d23034
Removing macros, now I have ICE
AlexGuteniev Apr 10, 2020
60e0046
more macro removal
AlexGuteniev Apr 10, 2020
5a20d71
clang format
AlexGuteniev Apr 10, 2020
ec56c8e
ARM build fix
AlexGuteniev Apr 10, 2020
9d1f185
ARM build fix
AlexGuteniev Apr 10, 2020
0351eb6
ARM build fix
AlexGuteniev Apr 10, 2020
7a8520c
ARM build fix
AlexGuteniev Apr 10, 2020
458dba8
correct type for load 16
AlexGuteniev Apr 10, 2020
0343b90
correct type for load 16
AlexGuteniev Apr 10, 2020
cdda967
Acquire/release for internal spinlock
AlexGuteniev Apr 12, 2020
0e1953b
Acquire/release for shared_ptr spinlock
AlexGuteniev Apr 12, 2020
ab6b378
clang format
AlexGuteniev Apr 12, 2020
e463c38
Optimize `store` a bit more for `seq_cst`
AlexGuteniev Apr 16, 2020
ec09985
undo shared_ptr change, can go separately
AlexGuteniev Apr 16, 2020
a82bbeb
add pure case
AlexGuteniev Apr 16, 2020
12369f5
Undo weaker memory order for internal spin lock
AlexGuteniev Apr 17, 2020
3b3d25a
Undo weaker memory order for internal spin lock
AlexGuteniev Apr 19, 2020
1b1593a
Merge remote-tracking branch 'upstream/master' into atomic_x86_store
AlexGuteniev Apr 23, 2020
9d5bfa5
avoid loop
AlexGuteniev Apr 26, 2020
040443b
revert irrelevant part, this is separate optimization
AlexGuteniev Apr 26, 2020
1068f08
Order!
AlexGuteniev Apr 26, 2020
bfd77c4
No, actually __both__
AlexGuteniev Apr 26, 2020
f7e37c5
spelling
AlexGuteniev Apr 26, 2020
9e590a5
better fence
AlexGuteniev Apr 26, 2020
eee4786
Merge remote-tracking branch 'upstream/master' into atomic_x86_store
AlexGuteniev Apr 30, 2020
5adca5f
move the fence to top and re-use it
AlexGuteniev Apr 30, 2020
47ad727
Merge remote-tracking branch 'upstream/master' into atomic_x86_store
AlexGuteniev May 1, 2020
443e168
DevCom-986061 preprocessor wrapper
AlexGuteniev May 3, 2020
105c1c3
ifdef for consistency with previous code
AlexGuteniev May 3, 2020
1383d7e
formatting
AlexGuteniev May 3, 2020
68f8f62
Merge remote-tracking branch 'upstream/master' into atomic_x86_store
AlexGuteniev May 13, 2020
518e4df
Merge remote-tracking branch 'upstream/master' into atomic_x86_store
AlexGuteniev May 15, 2020
fa396c7
Unify load
AlexGuteniev May 15, 2020
e47e59b
Merge remote-tracking branch 'upstream/master' into atomic_x86_store
AlexGuteniev May 23, 2020
13f1865
Merge remote-tracking branch 'upstream/master' into atomic_x86_store
AlexGuteniev May 30, 2020
9166bda
Remove DevCom-986061 workaround
AlexGuteniev Jun 4, 2020
31bf5d1
Merge branch 'master' into atomic_x86_store
StephanTLavavej Jun 20, 2020
6276c6f
Update stl/inc/atomic
AlexGuteniev Jun 25, 2020
d24ab46
Update stl/inc/atomic
AlexGuteniev Jun 25, 2020
cb5bf7d
Unsupport unknown hardware
AlexGuteniev Jun 25, 2020
4ade86d
Merge branch 'atomic_x86_store' of https://github.com/AlexGuteniev/ST…
AlexGuteniev Jun 25, 2020
5876a8e
whitespace
AlexGuteniev Jun 25, 2020
6018567
Update stl/inc/atomic
AlexGuteniev Jun 26, 2020
c492c8d
Merge remote-tracking branch 'origin/master' into atomic_x86_store
BillyONeal Jun 29, 2020
a122f3c
Remove hardware test from kill_dependency
BillyONeal Jun 29, 2020
a8af518
Call the intrinsic directly rather than _Atomic_compare_exchange_stro…
BillyONeal Jun 29, 2020
e339636
Suppress windows.h interlocked /analyze warning.
BillyONeal Jun 29, 2020
b1bcc3f
Apply STL CR comments.
BillyONeal Jun 30, 2020
164 changes: 72 additions & 92 deletions stl/inc/atomic
@@ -30,25 +30,9 @@ _STL_DISABLE_CLANG_WARNINGS
#if defined(_M_ARM) || defined(_M_ARM64)
#define _Memory_barrier() __dmb(0xB) // inner shared data memory barrier
#define _Compiler_or_memory_barrier() _Memory_barrier()

#define _ISO_VOLATILE_STORE8(_Storage, _Value) __iso_volatile_store8(_Atomic_address_as<char>(_Storage), _Value)
#define _ISO_VOLATILE_STORE16(_Storage, _Value) __iso_volatile_store16(_Atomic_address_as<short>(_Storage), _Value)
#define _ISO_VOLATILE_STORE32(_Storage, _Value) __iso_volatile_store32(_Atomic_address_as<int>(_Storage), _Value)
#define _ISO_VOLATILE_STORE64(_Storage, _Value) __iso_volatile_store64(_Atomic_address_as<long long>(_Storage), _Value)
#define _ISO_VOLATILE_LOAD8(_Storage) __iso_volatile_load8(_Atomic_address_as<const char>(_Storage))
#define _ISO_VOLATILE_LOAD16(_Storage) __iso_volatile_load16(_Atomic_address_as<const short>(_Storage))

#elif defined(_M_IX86) || defined(_M_X64)
// x86/x64 hardware only emits memory barriers inside _Interlocked intrinsics
#define _Compiler_or_memory_barrier() _Compiler_barrier()

#define _ISO_VOLATILE_STORE8(_Storage, _Value) (*_Atomic_address_as<char>(_Storage) = _Value)
#define _ISO_VOLATILE_STORE16(_Storage, _Value) (*_Atomic_address_as<short>(_Storage) = _Value)
#define _ISO_VOLATILE_STORE32(_Storage, _Value) (*_Atomic_address_as<long>(_Storage) = _Value)
#define _ISO_VOLATILE_STORE64(_Storage, _Value) (*_Atomic_address_as<long long>(_Storage) = _Value)
#define _ISO_VOLATILE_LOAD8(_Storage) (*_Atomic_address_as<const char>(_Storage))
#define _ISO_VOLATILE_LOAD16(_Storage) (*_Atomic_address_as<const short>(_Storage))

#else // ^^^ x86/x64 / unsupported hardware vvv
#error Unsupported hardware
#endif // hardware
@@ -133,6 +117,39 @@ _NODISCARD extern "C" bool __cdecl __std_atomic_has_cmpxchg16b() noexcept;

_STD_BEGIN

// FENCES
extern "C" inline void atomic_thread_fence(const memory_order _Order) noexcept {
if (_Order == memory_order_relaxed) {
return;
}

#if defined(_M_IX86) || defined(_M_X64)
_Compiler_barrier();
if (_Order == memory_order_seq_cst) {
volatile long _Guard; // Not initialized to avoid an unnecessary operation; the value does not matter

// _mm_mfence could have been used, but it is not supported on older x86 CPUs and is slower on some recent CPUs.
// The memory fence provided by interlocked operations has some exceptions, but this is fine:
// std::atomic_thread_fence works with respect to other atomics only; it may not be a full fence for all ops.
#pragma warning(suppress : 6001) // "Using uninitialized memory '_Guard'"
#pragma warning(suppress : 28113) // "Accessing a local variable _Guard via an Interlocked function: This is an unusual
// usage which could be reconsidered."
(void) _InterlockedIncrement(&_Guard);
_Compiler_barrier();
}
#elif defined(_M_ARM) || defined(_M_ARM64)
_Memory_barrier();
#else // ^^^ ARM32/ARM64 / unsupported hardware vvv
#error Unsupported hardware
#endif // unsupported hardware
}

extern "C" inline void atomic_signal_fence(const memory_order _Order) noexcept {
if (_Order != memory_order_relaxed) {
_Compiler_barrier();
}
}
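
// Context, not part of the diff: a minimal self-contained sketch (all names hypothetical) of the
// pairing these hoisted fences support. The fences order the surrounding relaxed atomic operations,
// which is why atomic_thread_fence only needs to be a fence with respect to other atomics, as the
// comment above notes.
#include <atomic>

std::atomic<int> payload{0};
std::atomic<bool> published{false};

void producer() noexcept {
    payload.store(42, std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_release); // orders the payload store before the flag store
    published.store(true, std::memory_order_relaxed);
}

int consumer() noexcept {
    while (!published.load(std::memory_order_relaxed)) { // spin until the flag is observed
    }
    std::atomic_thread_fence(std::memory_order_acquire); // synchronizes with the release fence above
    return payload.load(std::memory_order_relaxed); // guaranteed to read 42
}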

// FUNCTION TEMPLATE kill_dependency
template <class _Ty>
_Ty kill_dependency(_Ty _Arg) noexcept { // "magic" template that kills dependency ordering when called
@@ -417,14 +434,15 @@ struct _Atomic_storage<_Ty, 1> { // lock-free using 1-byte intrinsics
}

void store(const _Ty _Value, const memory_order _Order) noexcept { // store with given memory order
const auto _Mem = _Atomic_address_as<char>(_Storage);
const char _As_bytes = _Atomic_reinterpret_as<char>(_Value);
switch (_Order) {
case memory_order_relaxed:
_ISO_VOLATILE_STORE8(_Storage, _As_bytes);
__iso_volatile_store8(_Mem, _As_bytes);
return;
case memory_order_release:
_Compiler_or_memory_barrier();
_ISO_VOLATILE_STORE8(_Storage, _As_bytes);
__iso_volatile_store8(_Mem, _As_bytes);
return;
default:
case memory_order_consume:
@@ -439,13 +457,15 @@ struct _Atomic_storage<_Ty, 1> { // lock-free using 1-byte intrinsics
}

_NODISCARD _Ty load() const noexcept { // load with sequential consistency
char _As_bytes = _ISO_VOLATILE_LOAD8(_Storage);
const auto _Mem = _Atomic_address_as<char>(_Storage);
char _As_bytes = __iso_volatile_load8(_Mem);
_Compiler_or_memory_barrier();
return reinterpret_cast<_Ty&>(_As_bytes);
}

_NODISCARD _Ty load(const memory_order _Order) const noexcept { // load with given memory order
char _As_bytes = _ISO_VOLATILE_LOAD8(_Storage);
const auto _Mem = _Atomic_address_as<char>(_Storage);
char _As_bytes = __iso_volatile_load8(_Mem);
_Load_barrier(_Order);
return reinterpret_cast<_Ty&>(_As_bytes);
}
@@ -496,14 +516,15 @@ struct _Atomic_storage<_Ty, 2> { // lock-free using 2-byte intrinsics
}

void store(const _Ty _Value, const memory_order _Order) noexcept { // store with given memory order
const auto _Mem = _Atomic_address_as<short>(_Storage);
const short _As_bytes = _Atomic_reinterpret_as<short>(_Value);
switch (_Order) {
case memory_order_relaxed:
_ISO_VOLATILE_STORE16(_Storage, _As_bytes);
__iso_volatile_store16(_Mem, _As_bytes);
return;
case memory_order_release:
_Compiler_or_memory_barrier();
_ISO_VOLATILE_STORE16(_Storage, _As_bytes);
__iso_volatile_store16(_Mem, _As_bytes);
return;
default:
case memory_order_consume:
@@ -518,13 +539,15 @@ struct _Atomic_storage<_Ty, 2> { // lock-free using 2-byte intrinsics
}

_NODISCARD _Ty load() const noexcept { // load with sequential consistency
short _As_bytes = _ISO_VOLATILE_LOAD16(_Storage);
const auto _Mem = _Atomic_address_as<short>(_Storage);
short _As_bytes = __iso_volatile_load16(_Mem);
_Compiler_or_memory_barrier();
return reinterpret_cast<_Ty&>(_As_bytes);
}

_NODISCARD _Ty load(const memory_order _Order) const noexcept { // load with given memory order
short _As_bytes = _ISO_VOLATILE_LOAD16(_Storage);
const auto _Mem = _Atomic_address_as<short>(_Storage);
short _As_bytes = __iso_volatile_load16(_Mem);
_Load_barrier(_Order);
return reinterpret_cast<_Ty&>(_As_bytes);
}
@@ -565,22 +588,23 @@ struct _Atomic_storage<_Ty, 4> { // lock-free using 4-byte intrinsics
void store(const _Ty _Value) noexcept { // store with sequential consistency
#if defined(_M_ARM) || defined(_M_ARM64)
_Memory_barrier();
_ISO_VOLATILE_STORE32(_Storage, _Atomic_reinterpret_as<int>(_Value));
__iso_volatile_store32(_Atomic_address_as<int>(_Storage), _Atomic_reinterpret_as<int>(_Value));
_Memory_barrier();
#else // ^^^ ARM32/ARM64 hardware / x86/x64 hardware vvv
(void) _InterlockedExchange(_Atomic_address_as<long>(_Storage), _Atomic_reinterpret_as<long>(_Value));
#endif // hardware
}

void store(const _Ty _Value, const memory_order _Order) noexcept { // store with given memory order
const auto _Mem = _Atomic_address_as<int>(_Storage);
const int _As_bytes = _Atomic_reinterpret_as<int>(_Value);
switch (_Order) {
case memory_order_relaxed:
_ISO_VOLATILE_STORE32(_Storage, _As_bytes);
__iso_volatile_store32(_Mem, _As_bytes);
return;
case memory_order_release:
_Compiler_or_memory_barrier();
_ISO_VOLATILE_STORE32(_Storage, _As_bytes);
__iso_volatile_store32(_Mem, _As_bytes);
return;
default:
case memory_order_consume:
@@ -595,13 +619,15 @@ struct _Atomic_storage<_Ty, 4> { // lock-free using 4-byte intrinsics
}

_NODISCARD _Ty load() const noexcept { // load with sequential consistency
auto _As_bytes = _ISO_VOLATILE_LOAD32(_Storage);
const auto _Mem = _Atomic_address_as<int>(_Storage);
auto _As_bytes = __iso_volatile_load32(_Mem);
_Compiler_or_memory_barrier();
return reinterpret_cast<_Ty&>(_As_bytes);
}

_NODISCARD _Ty load(const memory_order _Order) const noexcept { // load with given memory order
auto _As_bytes = _ISO_VOLATILE_LOAD32(_Storage);
const auto _Mem = _Atomic_address_as<int>(_Storage);
auto _As_bytes = __iso_volatile_load32(_Mem);
_Load_barrier(_Order);
return reinterpret_cast<_Ty&>(_As_bytes);
}
@@ -639,18 +665,14 @@ struct _Atomic_storage<_Ty, 8> { // lock-free using 8-byte intrinsics
// non-atomically initialize this atomic
}

#ifdef _M_IX86
void store(const _Ty _Value, const memory_order _Order = memory_order_seq_cst) noexcept {
// store with (effectively) sequential consistency
_Check_store_memory_order(_Order);
(void) exchange(_Value, _Order);
}
#else // ^^^ _M_IX86 / !_M_IX86 vvv

void store(const _Ty _Value) noexcept { // store with sequential consistency
const auto _Mem = _Atomic_address_as<long long>(_Storage);
const long long _As_bytes = _Atomic_reinterpret_as<long long>(_Value);
#ifdef _M_ARM64
#if defined(_M_IX86)
_Compiler_barrier();
__iso_volatile_store64(_Mem, _As_bytes);
_STD atomic_thread_fence(memory_order_seq_cst);
#elif defined(_M_ARM64)
_Memory_barrier();
__iso_volatile_store64(_Mem, _As_bytes);
_Memory_barrier();
@@ -660,14 +682,15 @@ struct _Atomic_storage<_Ty, 8> { // lock-free using 8-byte intrinsics
}

void store(const _Ty _Value, const memory_order _Order) noexcept { // store with given memory order
const auto _Mem = _Atomic_address_as<long long>(_Storage);
const long long _As_bytes = _Atomic_reinterpret_as<long long>(_Value);
switch (_Order) {
case memory_order_relaxed:
_ISO_VOLATILE_STORE64(_Storage, _As_bytes);
__iso_volatile_store64(_Mem, _As_bytes);
return;
case memory_order_release:
_Compiler_or_memory_barrier();
_ISO_VOLATILE_STORE64(_Storage, _As_bytes);
__iso_volatile_store64(_Mem, _As_bytes);
return;
default:
case memory_order_consume:
@@ -680,33 +703,27 @@ struct _Atomic_storage<_Ty, 8> { // lock-free using 8-byte intrinsics
return;
}
}
#endif // _M_IX86
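
// Illustration, not part of the diff: a standalone sketch (hypothetical names) of the sequentially
// consistent 64-bit store shape introduced above. It assumes a toolset in which __iso_volatile_store64
// is available when targeting 32-bit x86, which is what this change relies on: one plain 8-byte store
// plus a seq_cst fence, instead of routing every store through an interlocked exchange loop.
#include <atomic>
#include <intrin.h>

alignas(8) long long g_slot; // aligned 64-bit cell shared between threads

void store64_seq_cst(const long long desired) noexcept {
#if defined(_M_IX86)
    std::atomic_signal_fence(std::memory_order_seq_cst); // compiler-only barrier, like _Compiler_barrier()
    __iso_volatile_store64(&g_slot, desired); // emitted as a single, untorn 8-byte store
    std::atomic_thread_fence(std::memory_order_seq_cst); // upgrade the plain store to sequential consistency
#else
    (void) _InterlockedExchange64(&g_slot, desired); // on x64/ARM64 the interlocked exchange is already a full barrier
#endif
}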

_NODISCARD _Ty load() const noexcept { // load with sequential consistency
const auto _Mem = _Atomic_address_as<const long long>(_Storage);
const auto _Mem = _Atomic_address_as<long long>(_Storage);
long long _As_bytes;
#if defined(_M_ARM)
#ifdef _M_ARM
_As_bytes = __ldrexd(_Mem);
_Memory_barrier();
#elif defined(_M_IX86) || defined(_M_ARM64)
#else
_As_bytes = __iso_volatile_load64(_Mem);
_Compiler_or_memory_barrier();
#else // _M_X64
_As_bytes = *_Mem;
_Compiler_barrier();
#endif // hardware
#endif
return reinterpret_cast<_Ty&>(_As_bytes);
}

_NODISCARD _Ty load(const memory_order _Order) const noexcept { // load with given memory order
const auto _Mem = _Atomic_address_as<const long long>(_Storage);
#if defined(_M_ARM)
const auto _Mem = _Atomic_address_as<long long>(_Storage);
#ifdef _M_ARM
long long _As_bytes = __ldrexd(_Mem);
#elif defined(_M_IX86) || defined(_M_ARM64)
#else
long long _As_bytes = __iso_volatile_load64(_Mem);
#else // _M_X64
long long _As_bytes = *_Mem;
#endif // hardware
#endif
_Load_barrier(_Order);
return reinterpret_cast<_Ty&>(_As_bytes);
}
@@ -1929,37 +1946,6 @@ _Ty atomic_fetch_xor_explicit(
return _Mem->fetch_xor(_Value, _Order);
}

// FENCES
extern "C" inline void atomic_thread_fence(const memory_order _Order) noexcept {
if (_Order == memory_order_relaxed) {
return;
}

#if defined(_M_ARM) || defined(_M_ARM64)
_Memory_barrier();
#else // ^^^ ARM32/ARM64 hardware / x86/x64 hardware vvv
_Compiler_barrier();
if (_Order == memory_order_seq_cst) {
volatile long _Guard; // Not initialized to avoid an unnecessary operation; the value does not matter

// _mm_mfence could have been used, but it is not supported on older x86 CPUs and is slower on some recent CPUs.
// The memory fence provided by interlocked operations has some exceptions, but this is fine:
// std::atomic_thread_fence works with respect to other atomics only; it may not be a full fence for all ops.
#pragma warning(suppress : 6001) // "Using uninitialized memory '_Guard'"
#pragma warning(suppress : 28113) // "Accessing a local variable _Guard via an Interlocked function: This is an unusual
// usage which could be reconsidered."
(void) _InterlockedIncrement(&_Guard);
_Compiler_barrier();
}
#endif // hardware
}

extern "C" inline void atomic_signal_fence(const memory_order _Order) noexcept {
if (_Order != memory_order_relaxed) {
_Compiler_barrier();
}
}

// ATOMIC TYPEDEFS
using atomic_bool = atomic<bool>;

Expand Down Expand Up @@ -2119,13 +2105,7 @@ _STD_END

#undef _ATOMIC_CHOOSE_INTRINSIC
#undef _ATOMIC_HAS_DCAS
#undef _ISO_VOLATILE_LOAD8
#undef _ISO_VOLATILE_LOAD16
// #undef _ISO_VOLATILE_LOAD32 // Used in <memory>
#undef _ISO_VOLATILE_STORE8
#undef _ISO_VOLATILE_STORE16
#undef _ISO_VOLATILE_STORE32
#undef _ISO_VOLATILE_STORE64

#undef _STD_COMPARE_EXCHANGE_128
#undef _INVALID_MEMORY_ORDER
#undef _Compiler_or_memory_barrier
10 changes: 7 additions & 3 deletions stl/inc/chrono
@@ -604,6 +604,9 @@ namespace chrono {
using time_point = chrono::time_point<steady_clock>;
static constexpr bool is_steady = true;

#pragma warning(push)
#pragma warning(disable : 28112) // A variable which is accessed via an Interlocked function must
// always be accessed via an Interlocked function.
_NODISCARD static time_point now() noexcept { // get current time
#if (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)) && !defined(_M_CEE_PURE)
// Implement atomics avoiding <atomic> header dependency
@@ -623,10 +626,10 @@ namespace chrono {
const long long _Freq = _Query_perf_frequency();
const long long _Ctr = _Query_perf_counter();
const long long _Result = _Scale_large_counter(_Ctr, _Freq);
if (_Atomic_compare_exchange_strong_ll_seq_cst(&_Cached_freq, _Freq, LLONG_MAX)) {
if (_InterlockedCompareExchange64(&_Cached_freq, _Freq, LLONG_MAX) == LLONG_MAX) {
// This is the first result, save current result as base for fast path
_Atomic_compare_exchange_strong_ll_seq_cst(&_Cached_ctr_base, _Ctr, LLONG_MAX);
_Atomic_compare_exchange_strong_ll_seq_cst(&_Cached_result_base, _Result, LLONG_MAX);
_InterlockedCompareExchange64(&_Cached_ctr_base, _Ctr, LLONG_MAX);
_InterlockedCompareExchange64(&_Cached_result_base, _Result, LLONG_MAX);
}
// if _Result is not saved as first, it is still compatible with fast result
return time_point(duration(_Result));
Expand All @@ -636,6 +639,7 @@ namespace chrono {
return time_point(duration(_Scale_large_counter(_Ctr, _Freq)));
#endif // (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)) && !defined(_M_CEE_PURE)
}
#pragma warning(pop)

private:
_NODISCARD static long long _Scale_large_counter(const long long _Ctr, const long long _Freq) noexcept {
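
// Reference only, not part of the diff: the now() hunk above replaces the
// _Atomic_compare_exchange_strong_ll_seq_cst helpers with direct _InterlockedCompareExchange64 calls.
// A compact standalone sketch of that sentinel-based caching idiom (hypothetical names, not the STL's
// code): LLONG_MAX marks "not yet computed", and the compare-exchange publishes the first computed
// value exactly once without a lock.
#include <intrin.h>
#include <climits>

long long expensive_init() noexcept; // assumed to return the same value on every call

alignas(8) static long long cached = LLONG_MAX; // LLONG_MAX is the "unset" sentinel

long long get_cached() noexcept {
    // A compare-exchange with equal comparand and exchange is an atomic 64-bit read, even on 32-bit x86.
    long long seen = _InterlockedCompareExchange64(&cached, LLONG_MAX, LLONG_MAX);
    if (seen != LLONG_MAX) {
        return seen; // fast path: a value was already published
    }
    const long long fresh = expensive_init();
    // Publish only if still unset; losing the race is harmless because every thread computes the same value.
    seen = _InterlockedCompareExchange64(&cached, fresh, LLONG_MAX);
    return seen == LLONG_MAX ? fresh : seen;
}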
7 changes: 5 additions & 2 deletions stl/inc/memory
@@ -514,8 +514,11 @@ public:

bool _Incref_nz() noexcept { // increment use count if not zero, return true if successful
auto& _Volatile_uses = reinterpret_cast<volatile long&>(_Uses);
long _Count = _ISO_VOLATILE_LOAD32(_Volatile_uses);

#ifdef _M_CEE_PURE
long _Count = *_Atomic_address_as<const long>(&_Volatile_uses);
#else
long _Count = __iso_volatile_load32(reinterpret_cast<volatile int*>(&_Volatile_uses));
#endif
while (_Count != 0) {
const long _Old_value = _INTRIN_RELAXED(_InterlockedCompareExchange)(&_Volatile_uses, _Count + 1, _Count);
if (_Old_value == _Count) {
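
// Reference only, not part of the diff: the hunk above switches _Incref_nz from the
// _ISO_VOLATILE_LOAD32 macro to a direct __iso_volatile_load32 call (with an _M_CEE_PURE fallback).
// A portable sketch of the increment-if-not-zero loop that this function implements (hypothetical
// standalone form; the STL version calls the relaxed interlocked intrinsics directly).
#include <atomic>

// Raise the count only if it has not already dropped to zero; returns whether the increment happened.
bool incref_if_not_zero(std::atomic<long>& uses) noexcept {
    long count = uses.load(std::memory_order_relaxed); // cheap snapshot; may be stale
    while (count != 0) {
        // Try count -> count + 1; on failure, count is refreshed with the value actually observed.
        if (uses.compare_exchange_weak(count, count + 1, std::memory_order_relaxed)) {
            return true;
        }
    }
    return false; // the managed object already expired
}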