[X86] Align other variants to use void * as 512 variants. #66310

FreddyLeaf · 2023-09-14T01:31:44Z

For stream series intrinsics

For *_stream_* series intrinsics.

llvmbot · 2023-09-14T01:32:37Z

@llvm/pr-subscribers-clang

Changes

For *_stream_* series intrinsics.

--
Full diff: https://github.com/llvm/llvm-project/pull/66310.diff

10 Files Affected:

(modified) clang/lib/Headers/avx2intrin.h (+1-1)
(modified) clang/lib/Headers/avxintrin.h (+3-3)
(modified) clang/lib/Headers/emmintrin.h (+4-4)
(modified) clang/lib/Headers/smmintrin.h (+1-1)
(modified) clang/lib/Headers/xmmintrin.h (+1-1)
(modified) clang/test/CodeGen/X86/avx-builtins.c (+18)
(modified) clang/test/CodeGen/X86/avx2-builtins.c (+6)
(modified) clang/test/CodeGen/X86/sse-builtins.c (+6)
(modified) clang/test/CodeGen/X86/sse2-builtins.c (+24)
(modified) clang/test/CodeGen/X86/sse41-builtins.c (+6)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index c45006193eddcc9..675a93bba1c8a4f 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -2979,7 +2979,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
 ///    A pointer to the 32-byte aligned memory containing the vector to load.
 /// \returns A 256-bit integer vector loaded from memory.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(__m256i const *__V)
+_mm256_stream_load_si256(void const *__V)
 {
   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 94fac5e6c9da471..b796bb773ec11f0 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -3563,7 +3563,7 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 /// \param __b
 ///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_si256(__m256i *__a, __m256i __b)
+_mm256_stream_si256(void *__a, __m256i __b)
 {
   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
   __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
@@ -3583,7 +3583,7 @@ _mm256_stream_si256(__m256i *__a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [4 x double] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_pd(double *__a, __m256d __b)
+_mm256_stream_pd(void *__a, __m256d __b)
 {
   typedef __v4df __v4df_aligned __attribute__((aligned(32)));
   __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
@@ -3604,7 +3604,7 @@ _mm256_stream_pd(double *__a, __m256d __b)
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_ps(float *__p, __m256 __a)
+_mm256_stream_ps(void *__p, __m256 __a)
 {
   typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
   __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 064d974936598f8..eacb0182614304d 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -3945,7 +3945,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
 ///    A pointer to the 128-bit aligned memory location used to store the value.
 /// \param __a
 ///    A vector of [2 x double] containing the 64-bit values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
                                                         __m128d __a) {
   __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
 }
@@ -3963,7 +3963,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
 ///    A pointer to the 128-bit aligned memory location used to store the value.
 /// \param __a
 ///    A 128-bit integer vector containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
                                                            __m128i __a) {
   __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
 }
@@ -3983,7 +3983,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
 ///    A 32-bit integer containing the value to be stored.
 static __inline__ void
     __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si32(int *__p, int __a) {
+    _mm_stream_si32(void *__p, int __a) {
   __builtin_ia32_movnti(__p, __a);
 }
 
@@ -4003,7 +4003,7 @@ static __inline__ void
 ///    A 64-bit integer containing the value to be stored.
 static __inline__ void
     __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si64(long long *__p, long long __a) {
+    _mm_stream_si64(void *__p, long long __a) {
   __builtin_ia32_movnti64(__p, __a);
 }
 #endif
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 16d8855a1c0b5d0..4e2eb46bb5421f2 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -645,7 +645,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
 /// \returns A 128-bit integer vector containing the data stored at the
 ///    specified memory location.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_stream_load_si128(__m128i const *__V) {
+_mm_stream_load_si128(void const *__V) {
   return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
 }
 
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 80aa2a817f6afc1..10b6907ace07cc4 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -2140,7 +2140,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ps(float *__p, __m128 __a)
+_mm_stream_ps(void *__p, __m128 __a)
 {
   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
 }
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index b68d192051b9bf4..06d3c321dd89592 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1940,18 +1940,36 @@ void test_mm256_stream_pd(double* A, __m256d B) {
   _mm256_stream_pd(A, B);
 }
 
+void test_mm256_stream_pd_void(void* A, __m256d B) {
+  // CHECK-LABEL: test_mm256_stream_pd_void
+  // CHECK: store <4 x double> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
+  _mm256_stream_pd(A, B);
+}
+
 void test_mm256_stream_ps(float* A, __m256 B) {
   // CHECK-LABEL: test_mm256_stream_ps
   // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_ps(A, B);
 }
 
+void test_mm256_stream_ps_void(void* A, __m256 B) {
+  // CHECK-LABEL: test_mm256_stream_ps_void
+  // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
+  _mm256_stream_ps(A, B);
+}
+
 void test_mm256_stream_si256(__m256i* A, __m256i B) {
   // CHECK-LABEL: test_mm256_stream_si256
   // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_si256(A, B);
 }
 
+void test_mm256_stream_si256_void(void* A, __m256i B) {
+  // CHECK-LABEL: test_mm256_stream_si256_void
+  // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
+  _mm256_stream_si256(A, B);
+}
+
 __m256d test_mm256_sub_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_sub_pd
   // CHECK: fsub <4 x double>
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 2750e1b227483ee..5b8c6ded7f216b7 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1223,6 +1223,12 @@ __m256i test_mm256_stream_load_si256(__m256i const *a) {
   return _mm256_stream_load_si256(a);
 }
 
+__m256i test_mm256_stream_load_si256_const(void const *a) {
+  // CHECK-LABEL: test_mm256_stream_load_si256_const
+  // CHECK: load <4 x i64>, ptr %{{.*}}, align 32, !nontemporal
+  return _mm256_stream_load_si256(a);
+}
+
 __m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_sub_epi8
   // CHECK: sub <32 x i8>
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index da40380926d2c8a..9c64d420f7cdf10 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -720,6 +720,12 @@ void test_mm_stream_ps(float*A, __m128 B) {
   _mm_stream_ps(A, B);
 }
 
+void test_mm_stream_ps_2(void*A, __m128 B) {
+  // CHECK-LABEL: test_mm_stream_ps_2
+  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
+  _mm_stream_ps(A, B);
+}
+
 __m128 test_mm_sub_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_sub_ps
   // CHECK: fsub <4 x float>
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 7c62a128c331fc5..7165d2791827cfc 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1488,18 +1488,36 @@ void test_mm_stream_pd(double *A, __m128d B) {
   _mm_stream_pd(A, B);
 }
 
+void test_mm_stream_pd_void(void *A, __m128d B) {
+  // CHECK-LABEL: test_mm_stream_pd_void
+  // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
+  _mm_stream_pd(A, B);
+}
+
 void test_mm_stream_si32(int *A, int B) {
   // CHECK-LABEL: test_mm_stream_si32
   // CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
   _mm_stream_si32(A, B);
 }
 
+void test_mm_stream_si32_void(void *A, int B) {
+  // CHECK-LABEL: test_mm_stream_si32_void
+  // CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
+  _mm_stream_si32(A, B);
+}
+
 #ifdef __x86_64__
 void test_mm_stream_si64(long long *A, long long B) {
   // X64-LABEL: test_mm_stream_si64
   // X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
   _mm_stream_si64(A, B);
 }
+
+void test_mm_stream_si64_void(void *A, long long B) {
+  // X64-LABEL: test_mm_stream_si64_void
+  // X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
+  _mm_stream_si64(A, B);
+}
 #endif
 
 void test_mm_stream_si128(__m128i *A, __m128i B) {
@@ -1508,6 +1526,12 @@ void test_mm_stream_si128(__m128i *A, __m128i B) {
   _mm_stream_si128(A, B);
 }
 
+void test_mm_stream_si128_void(void *A, __m128i B) {
+  // CHECK-LABEL: test_mm_stream_si128_void
+  // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
+  _mm_stream_si128(A, B);
+}
+
 __m128i test_mm_sub_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sub_epi8
   // CHECK: sub <16 x i8>
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index fe59cbcaf1938c6..ad486a6d9950af6 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -358,6 +358,12 @@ __m128i test_mm_stream_load_si128(__m128i const *a) {
   return _mm_stream_load_si128(a);
 }
 
+__m128i test_mm_stream_load_si128_void(void const *a) {
+  // CHECK-LABEL: test_mm_stream_load_si128_void
+  // CHECK: load <2 x i64>, ptr %{{.*}}, align 16, !nontemporal
+  return _mm_stream_load_si128(a);
+}
+
 int test_mm_test_all_ones(__m128i x) {
   // CHECK-LABEL: test_mm_test_all_ones
   // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})

FreddyLeaf · 2023-09-14T01:36:21Z

Here's the change for 512 variants before: https://reviews.llvm.org/D66786

phoebewang

LGTM.

clang/test/CodeGen/X86/sse41-builtins.c

clang/lib/Headers/smmintrin.h

clang/lib/Headers/avx2intrin.h

clang/test/CodeGen/X86/avx2-builtins.c

RKSimon

LGTM - please update the description to mention that you're updating the movnti i32/i64 scalar integer nt ops as well as the vector ops

KanRobert

LGTM

RKSimon · 2023-09-20T09:15:17Z

For completeness the _mm_stream_sd / _mm_stream_ss SSE4A intrinsics still need updating as well: https://github.com/llvm/llvm-project/blob/59fbba94908f65eedb8bdd619e425bf97d84b2e3/clang/lib/Headers/ammintrin.h#L158C1-L158C14

FreddyLeaf · 2023-09-20T09:23:12Z

For completeness the _mm_stream_sd / _mm_stream_ss SSE4A intrinsics still need updating as well: https://github.com/llvm/llvm-project/blob/59fbba94908f65eedb8bdd619e425bf97d84b2e3/clang/lib/Headers/ammintrin.h#L158C1-L158C14

Good catch. Seems like intrinsic guide missed these two.

RKSimon · 2023-09-20T09:35:14Z

They're AMD specific so probably not covered

FreddyLeaf · 2023-09-20T12:08:50Z

LGTM - please update the description to mention that you're updating the movnti i32/i64 scalar integer nt ops as well as the vector ops

I think this description has covered? May display wrong in the preview, highlighted here:
For *_stream_* series intrinsics

RKSimon · 2023-09-20T12:36:11Z

I meant the patch title - I'm never quite sure how the title/description appears when these are squashed+merged

RKSimon

LGTM - cheers

[X86] Align 128/256 variants to use void * as 512 variants.

21157a0

For *_stream_* series intrinsics.

FreddyLeaf requested a review from a team as a code owner September 14, 2023 01:31

llvmbot added clang Clang issues not falling into any other category backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics labels Sep 14, 2023

FreddyLeaf requested review from yubingex007-a11y, RKSimon, phoebewang and topperc September 14, 2023 01:32

FreddyLeaf requested a review from KanRobert September 14, 2023 01:37

phoebewang approved these changes Sep 14, 2023

View reviewed changes

clang/test/CodeGen/X86/sse41-builtins.c Outdated Show resolved Hide resolved

clang/lib/Headers/smmintrin.h Outdated Show resolved Hide resolved

clang/lib/Headers/avx2intrin.h Outdated Show resolved Hide resolved

RKSimon reviewed Sep 14, 2023

View reviewed changes

clang/test/CodeGen/X86/avx2-builtins.c Outdated Show resolved Hide resolved

Address comments.

3e463d3

RKSimon approved these changes Sep 19, 2023

View reviewed changes

KanRobert approved these changes Sep 19, 2023

View reviewed changes

Add extra cast for _mm_stream_si64/32 due to lit fails.

24e1765

FreddyLeaf requested review from KanRobert, phoebewang and RKSimon September 20, 2023 02:34

Address comments

36f1f86

RKSimon approved these changes Sep 20, 2023

View reviewed changes

FreddyLeaf changed the title ~~[X86] Align 128/256 variants to use void * as 512 variants.~~ [X86] Align other variants to use void * as 512 variants. Sep 20, 2023

FreddyLeaf merged commit 632d13c into llvm:main Sep 20, 2023

FreddyLeaf deleted the stream_void branch September 20, 2023 12:59

This was referenced Sep 20, 2023

sve gather scatter offset sinking #66932

Closed

sve gather scatter offset sinking #66994

Closed

kstoimenov mentioned this pull request Sep 22, 2023

Add memcpm test kstoimenov/llvm-project#13

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[X86] Align other variants to use void * as 512 variants. #66310

[X86] Align other variants to use void * as 512 variants. #66310

FreddyLeaf commented Sep 14, 2023 •

edited

Loading

llvmbot commented Sep 14, 2023

FreddyLeaf commented Sep 14, 2023

phoebewang left a comment

RKSimon left a comment

KanRobert left a comment

RKSimon commented Sep 20, 2023

FreddyLeaf commented Sep 20, 2023

RKSimon commented Sep 20, 2023

FreddyLeaf commented Sep 20, 2023

RKSimon commented Sep 20, 2023

RKSimon left a comment

[X86] Align other variants to use void * as 512 variants. #66310

[X86] Align other variants to use void * as 512 variants. #66310

Conversation

FreddyLeaf commented Sep 14, 2023 • edited Loading

llvmbot commented Sep 14, 2023

FreddyLeaf commented Sep 14, 2023

phoebewang left a comment

Choose a reason for hiding this comment

RKSimon left a comment

Choose a reason for hiding this comment

KanRobert left a comment

Choose a reason for hiding this comment

RKSimon commented Sep 20, 2023

FreddyLeaf commented Sep 20, 2023

RKSimon commented Sep 20, 2023

FreddyLeaf commented Sep 20, 2023

RKSimon commented Sep 20, 2023

RKSimon left a comment

Choose a reason for hiding this comment

FreddyLeaf commented Sep 14, 2023 •

edited

Loading