-
Notifications
You must be signed in to change notification settings - Fork 211
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow optimization and use fesetround(), fegetround() #642
Merged
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -106,28 +106,23 @@ | |
#pragma message("Macro name collisions may happen with unsupported compilers.") | ||
#endif | ||
|
||
|
||
#if defined(__GNUC__) && !defined(__clang__) | ||
#pragma push_macro("FORCE_INLINE_OPTNONE") | ||
#define FORCE_INLINE_OPTNONE static inline __attribute__((optimize("O0"))) | ||
#elif defined(__clang__) | ||
#pragma push_macro("FORCE_INLINE_OPTNONE") | ||
#define FORCE_INLINE_OPTNONE static inline __attribute__((optnone)) | ||
#else | ||
#define FORCE_INLINE_OPTNONE FORCE_INLINE | ||
#endif | ||
|
||
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 | ||
#warning "GCC versions earlier than 10 are not supported." | ||
#endif | ||
|
||
#ifdef __OPTIMIZE__ | ||
#warning \ | ||
"Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon." | ||
#endif | ||
|
||
/* C language does not allow initializing a variable with a function call. */ | ||
#ifdef __cplusplus | ||
#define _sse2neon_const static const | ||
#else | ||
#define _sse2neon_const const | ||
#endif | ||
|
||
#include <fenv.h> | ||
#include <stdint.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
|
@@ -604,8 +599,8 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d); | |
FORCE_INLINE __m128 _mm_ceil_ps(__m128); | ||
FORCE_INLINE __m128d _mm_floor_pd(__m128d); | ||
FORCE_INLINE __m128 _mm_floor_ps(__m128); | ||
FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d, int); | ||
FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128, int); | ||
FORCE_INLINE __m128d _mm_round_pd(__m128d, int); | ||
FORCE_INLINE __m128 _mm_round_ps(__m128, int); | ||
// SSE4.2 | ||
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); | ||
|
||
|
@@ -1846,25 +1841,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) | |
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE | ||
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) | ||
{ | ||
union { | ||
fpcr_bitfield field; | ||
#if defined(__aarch64__) || defined(_M_ARM64) | ||
uint64_t value; | ||
#else | ||
uint32_t value; | ||
#endif | ||
} r; | ||
|
||
#if defined(__aarch64__) || defined(_M_ARM64) | ||
r.value = _sse2neon_get_fpcr(); | ||
#else | ||
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ | ||
#endif | ||
|
||
if (r.field.bit22) { | ||
return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; | ||
} else { | ||
return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; | ||
switch (fegetround()) { | ||
case FE_TONEAREST: | ||
return _MM_ROUND_NEAREST; | ||
case FE_DOWNWARD: | ||
return _MM_ROUND_DOWN; | ||
case FE_UPWARD: | ||
return _MM_ROUND_UP; | ||
case FE_TOWARDZERO: | ||
return _MM_ROUND_TOWARD_ZERO; | ||
default: | ||
// fegetround() returns FE_TONEAREST, FE_DOWNWARD, FE_UPWARD, or | ||
// FE_TOWARDZERO on success. Any other value indicates an error, which we | ||
// treat as FE_TOWARDZERO (truncate) and report as _MM_ROUND_TOWARD_ZERO. | ||
return _MM_ROUND_TOWARD_ZERO; | ||
} | ||
} | ||
|
||
|
@@ -2458,46 +2448,28 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w) | |
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, | ||
// _MM_ROUND_TOWARD_ZERO | ||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE | ||
FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding) | ||
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) | ||
{ | ||
union { | ||
fpcr_bitfield field; | ||
#if defined(__aarch64__) || defined(_M_ARM64) | ||
uint64_t value; | ||
#else | ||
uint32_t value; | ||
#endif | ||
} r; | ||
|
||
#if defined(__aarch64__) || defined(_M_ARM64) | ||
r.value = _sse2neon_get_fpcr(); | ||
#else | ||
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ | ||
#endif | ||
|
||
switch (rounding) { | ||
case _MM_ROUND_TOWARD_ZERO: | ||
r.field.bit22 = 1; | ||
r.field.bit23 = 1; | ||
case _MM_ROUND_NEAREST: | ||
rounding = FE_TONEAREST; | ||
break; | ||
case _MM_ROUND_DOWN: | ||
r.field.bit22 = 0; | ||
r.field.bit23 = 1; | ||
rounding = FE_DOWNWARD; | ||
break; | ||
case _MM_ROUND_UP: | ||
r.field.bit22 = 1; | ||
r.field.bit23 = 0; | ||
rounding = FE_UPWARD; | ||
break; | ||
default: //_MM_ROUND_NEAREST | ||
r.field.bit22 = 0; | ||
r.field.bit23 = 0; | ||
case _MM_ROUND_TOWARD_ZERO: | ||
rounding = FE_TOWARDZERO; | ||
break; | ||
default: | ||
// rounding must be one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, | ||
// or _MM_ROUND_TOWARD_ZERO. Any other (invalid) value is treated as | ||
// FE_TOWARDZERO (truncate). | ||
rounding = FE_TOWARDZERO; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. same here |
||
} | ||
|
||
#if defined(__aarch64__) || defined(_M_ARM64) | ||
_sse2neon_set_fpcr(r.value); | ||
#else | ||
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ | ||
#endif | ||
fesetround(rounding); | ||
} | ||
|
||
// Copy single-precision (32-bit) floating-point element a to the lower element | ||
|
@@ -3899,7 +3871,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) | |
// Convert packed double-precision (64-bit) floating-point elements in a to | ||
// packed 32-bit integers, and store the results in dst. | ||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 | ||
FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) | ||
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) | ||
{ | ||
// vrnd32xq_f64 not supported on clang | ||
#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) | ||
|
@@ -3921,7 +3893,7 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) | |
// Convert packed double-precision (64-bit) floating-point elements in a to | ||
// packed 32-bit integers, and store the results in dst. | ||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 | ||
FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a) | ||
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) | ||
{ | ||
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); | ||
double d0, d1; | ||
|
@@ -4217,7 +4189,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) | |
// Convert packed double-precision (64-bit) floating-point elements in a to | ||
// packed 32-bit integers with truncation, and store the results in dst. | ||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 | ||
FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a) | ||
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) | ||
{ | ||
double a0, a1; | ||
a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); | ||
|
@@ -7559,7 +7531,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) | |
// the rounding parameter, and store the results as packed double-precision | ||
// floating-point elements in dst. | ||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd | ||
FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding) | ||
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) | ||
{ | ||
#if defined(__aarch64__) || defined(_M_ARM64) | ||
switch (rounding) { | ||
|
@@ -7628,7 +7600,7 @@ FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding) | |
// the rounding parameter, and store the results as packed single-precision | ||
// floating-point elements in dst. | ||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps | ||
FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128 a, int rounding) | ||
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) | ||
{ | ||
#if (defined(__aarch64__) || defined(_M_ARM64)) || \ | ||
defined(__ARM_FEATURE_DIRECTED_ROUNDING) | ||
|
@@ -9346,8 +9318,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) | |
#endif | ||
} | ||
|
||
FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode( | ||
unsigned int flag) | ||
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) | ||
{ | ||
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, | ||
// regardless of the value of the FZ bit. | ||
|
@@ -9419,7 +9390,6 @@ FORCE_INLINE uint64_t _rdtsc(void) | |
#if defined(__GNUC__) || defined(__clang__) | ||
#pragma pop_macro("ALIGN_STRUCT") | ||
#pragma pop_macro("FORCE_INLINE") | ||
#pragma pop_macro("FORCE_INLINE_OPTNONE") | ||
#endif | ||
|
||
#if defined(__GNUC__) && !defined(__clang__) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jserv I am not sure whether we have a better value to return in the error case, so I temporarily return `_MM_ROUND_TOWARD_ZERO` here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is reasonable, and you should explain more in comments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added comments for explaining the error case