-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] Align other variants to use void * as 512 variants. #66310
Conversation
For *_stream_* series intrinsics.
@llvm/pr-subscribers-clang ChangesFor *_stream_* series intrinsics.-- 10 Files Affected:
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index c45006193eddcc9..675a93bba1c8a4f 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -2979,7 +2979,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b) /// A pointer to the 32-byte aligned memory containing the vector to load. /// \returns A 256-bit integer vector loaded from memory. static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_stream_load_si256(__m256i const *__V) +_mm256_stream_load_si256(void const *__V) { typedef __v4di __v4di_aligned __attribute__((aligned(32))); return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index 94fac5e6c9da471..b796bb773ec11f0 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -3563,7 +3563,7 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) /// \param __b /// A 256-bit integer vector containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS -_mm256_stream_si256(__m256i *__a, __m256i __b) +_mm256_stream_si256(void *__a, __m256i __b) { typedef __v4di __v4di_aligned __attribute__((aligned(32))); __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); @@ -3583,7 +3583,7 @@ _mm256_stream_si256(__m256i *__a, __m256i __b) /// \param __b /// A 256-bit vector of [4 x double] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS -_mm256_stream_pd(double *__a, __m256d __b) +_mm256_stream_pd(void *__a, __m256d __b) { typedef __v4df __v4df_aligned __attribute__((aligned(32))); __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); @@ -3604,7 +3604,7 @@ _mm256_stream_pd(double *__a, __m256d __b) /// \param __a /// A 256-bit vector of [8 x float] containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS -_mm256_stream_ps(float *__p, __m256 __a) +_mm256_stream_ps(void *__p, __m256 __a) { typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 064d974936598f8..eacb0182614304d 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -3945,7 +3945,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, /// A pointer to the 128-bit aligned memory location used to store the value. /// \param __a /// A vector of [2 x double] containing the 64-bit values to be stored. -static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, +static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, __m128d __a) { __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p); } @@ -3963,7 +3963,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, /// A pointer to the 128-bit aligned memory location used to store the value. /// \param __a /// A 128-bit integer vector containing the values to be stored. -static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, +static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, __m128i __a) { __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p); } @@ -3983,7 +3983,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, /// A 32-bit integer containing the value to be stored. static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) - _mm_stream_si32(int *__p, int __a) { + _mm_stream_si32(void *__p, int __a) { __builtin_ia32_movnti(__p, __a); } @@ -4003,7 +4003,7 @@ static __inline__ void /// A 64-bit integer containing the value to be stored. static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) - _mm_stream_si64(long long *__p, long long __a) { + _mm_stream_si64(void *__p, long long __a) { __builtin_ia32_movnti64(__p, __a); } #endif diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index 16d8855a1c0b5d0..4e2eb46bb5421f2 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -645,7 +645,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1, /// \returns A 128-bit integer vector containing the data stored at the /// specified memory location. static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_stream_load_si128(__m128i const *__V) { +_mm_stream_load_si128(void const *__V) { return (__m128i)__builtin_nontemporal_load((const __v2di *)__V); } diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h index 80aa2a817f6afc1..10b6907ace07cc4 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -2140,7 +2140,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a) /// \param __a /// A 128-bit vector of [4 x float] containing the values to be moved. static __inline__ void __DEFAULT_FN_ATTRS -_mm_stream_ps(float *__p, __m128 __a) +_mm_stream_ps(void *__p, __m128 __a) { __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); } diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index b68d192051b9bf4..06d3c321dd89592 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1940,18 +1940,36 @@ void test_mm256_stream_pd(double* A, __m256d B) { _mm256_stream_pd(A, B); } +void test_mm256_stream_pd_void(void* A, __m256d B) { + // CHECK-LABEL: test_mm256_stream_pd_void + // CHECK: store <4 x double> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal + _mm256_stream_pd(A, B); +} + void test_mm256_stream_ps(float* A, __m256 B) { // CHECK-LABEL: test_mm256_stream_ps // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal _mm256_stream_ps(A, B); } +void test_mm256_stream_ps_void(void* A, __m256 B) { + // CHECK-LABEL: test_mm256_stream_ps_void + // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal + _mm256_stream_ps(A, B); +} + void test_mm256_stream_si256(__m256i* A, __m256i B) { // CHECK-LABEL: test_mm256_stream_si256 // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal _mm256_stream_si256(A, B); } +void test_mm256_stream_si256_void(void* A, __m256i B) { + // CHECK-LABEL: test_mm256_stream_si256_void + // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal + _mm256_stream_si256(A, B); +} + __m256d test_mm256_sub_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_sub_pd // CHECK: fsub <4 x double> diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 2750e1b227483ee..5b8c6ded7f216b7 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1223,6 +1223,12 @@ __m256i test_mm256_stream_load_si256(__m256i const *a) { return _mm256_stream_load_si256(a); } +__m256i test_mm256_stream_load_si256_const(void const *a) { + // CHECK-LABEL: test_mm256_stream_load_si256_const + // CHECK: load <4 x i64>, ptr %{{.*}}, align 32, !nontemporal + return _mm256_stream_load_si256(a); +} + __m256i test_mm256_sub_epi8(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_sub_epi8 // CHECK: sub <32 x i8> diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c index da40380926d2c8a..9c64d420f7cdf10 100644 --- a/clang/test/CodeGen/X86/sse-builtins.c +++ b/clang/test/CodeGen/X86/sse-builtins.c @@ -720,6 +720,12 @@ void test_mm_stream_ps(float*A, __m128 B) { _mm_stream_ps(A, B); } +void test_mm_stream_ps_2(void*A, __m128 B) { + // CHECK-LABEL: test_mm_stream_ps_2 + // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal + _mm_stream_ps(A, B); +} + __m128 test_mm_sub_ps(__m128 A, __m128 B) { // CHECK-LABEL: test_mm_sub_ps // CHECK: fsub <4 x float> diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c index 7c62a128c331fc5..7165d2791827cfc 100644 --- a/clang/test/CodeGen/X86/sse2-builtins.c +++ b/clang/test/CodeGen/X86/sse2-builtins.c @@ -1488,18 +1488,36 @@ void test_mm_stream_pd(double *A, __m128d B) { _mm_stream_pd(A, B); } +void test_mm_stream_pd_void(void *A, __m128d B) { + // CHECK-LABEL: test_mm_stream_pd_void + // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal + _mm_stream_pd(A, B); +} + void test_mm_stream_si32(int *A, int B) { // CHECK-LABEL: test_mm_stream_si32 // CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal _mm_stream_si32(A, B); } +void test_mm_stream_si32_void(void *A, int B) { + // CHECK-LABEL: test_mm_stream_si32_void + // CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal + _mm_stream_si32(A, B); +} + #ifdef __x86_64__ void test_mm_stream_si64(long long *A, long long B) { // X64-LABEL: test_mm_stream_si64 // X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal _mm_stream_si64(A, B); } + +void test_mm_stream_si64_void(void *A, long long B) { + // X64-LABEL: test_mm_stream_si64_void + // X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal + _mm_stream_si64(A, B); +} #endif void test_mm_stream_si128(__m128i *A, __m128i B) { @@ -1508,6 +1526,12 @@ void test_mm_stream_si128(__m128i *A, __m128i B) { _mm_stream_si128(A, B); } +void test_mm_stream_si128_void(void *A, __m128i B) { + // CHECK-LABEL: test_mm_stream_si128_void + // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal + _mm_stream_si128(A, B); +} + __m128i test_mm_sub_epi8(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_sub_epi8 // CHECK: sub <16 x i8> diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index fe59cbcaf1938c6..ad486a6d9950af6 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -358,6 +358,12 @@ __m128i test_mm_stream_load_si128(__m128i const *a) { return _mm_stream_load_si128(a); } +__m128i test_mm_stream_load_si128_void(void const *a) { + // CHECK-LABEL: test_mm_stream_load_si128_void + // CHECK: load <2 x i64>, ptr %{{.*}}, align 16, !nontemporal + return _mm_stream_load_si128(a); +} + int test_mm_test_all_ones(__m128i x) { // CHECK-LABEL: test_mm_test_all_ones // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) |
Here's the change for 512 variants before: https://reviews.llvm.org/D66786 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - please update the description to mention that you're updating the movnti i32/i64 scalar integer nt ops as well as the vector ops
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
For completeness the _mm_stream_sd / _mm_stream_ss SSE4A intrinsics still need updating as well: https://github.com/llvm/llvm-project/blob/59fbba94908f65eedb8bdd619e425bf97d84b2e3/clang/lib/Headers/ammintrin.h#L158C1-L158C14 |
Good catch. Seems like intrinsic guide missed these two. |
They're AMD specific so probably not covered |
I think this description has covered? May display wrong in the preview, highlighted here: |
I meant the patch title - I'm never quite sure how the title/description appears when these are squashed+merged |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
For stream series intrinsics