From 8a410f013d784cd61e79df678f86608ca5d920d2 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Tue, 9 Jan 2018 09:49:08 -0600
Subject: [PATCH] Migrate `_mm_add_ss` to `__m128` (#265)

This commit starts the migration towards Intel's types one intrinsic at a time,
starting with `_mm_add_ss`. This is mostly just to get a feel for what the tests
will start to look like.
---
 coresimd/src/x86/i586/sse.rs | 55 +++++++++++++++++++++++++-----------
 coresimd/src/x86/mod.rs      |  5 ++++
 stdsimd-verify/src/lib.rs    |  1 +
 3 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/coresimd/src/x86/i586/sse.rs b/coresimd/src/x86/i586/sse.rs
index d3325701c602f..db44daea2da5d 100644
--- a/coresimd/src/x86/i586/sse.rs
+++ b/coresimd/src/x86/i586/sse.rs
@@ -6,6 +6,7 @@ use core::ptr;
 use simd_llvm::simd_shuffle4;
 use v128::*;
 use v64::*;
+use x86::__m128;
 
 #[cfg(test)]
 use stdsimd_test::assert_instr;
@@ -15,7 +16,7 @@ use stdsimd_test::assert_instr;
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(addss))]
-pub unsafe fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
+pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
     addss(a, b)
 }
 
@@ -720,13 +721,13 @@ pub unsafe fn _mm_set_ps1(a: f32) -> f32x4 {
 /// Alternatively:
 ///
 /// ```text
-/// assert_eq!(f32x4::new(a, b, c, d), _mm_set_ps(d, c, b, a));
+/// let v = _mm_set_ps(d, c, b, a);
 /// ```
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(unpcklps))]
-pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> f32x4 {
-    f32x4::new(d, c, b, a)
+pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
+    __m128(d, c, b, a)
 }
 
 /// Construct a `f32x4` from four floating point values lowest to highest.
@@ -1600,7 +1601,7 @@ pub unsafe fn _MM_TRANSPOSE4_PS(
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.sse.add.ss"]
-    fn addss(a: f32x4, b: f32x4) -> f32x4;
+    fn addss(a: __m128, b: __m128) -> __m128;
     #[link_name = "llvm.x86.sse.sub.ss"]
     fn subss(a: f32x4, b: f32x4) -> f32x4;
     #[link_name = "llvm.x86.sse.mul.ss"]
@@ -1697,11 +1698,26 @@ pub unsafe fn _mm_stream_pi(mem_addr: *mut __m64, a: __m64) {
 
 #[cfg(test)]
 mod tests {
-    use v128::*;
     use x86::i586::sse;
+    use super::*;
     use stdsimd_test::simd_test;
     use test::black_box; // Used to inhibit constant-folding.
 
+    #[target_feature = "+sse"]
+    unsafe fn assert_eq_m128(a: __m128, b: __m128) {
+        use std::mem;
+        let r = _mm_cmpeq_ps(mem::transmute(a), mem::transmute(b));
+        if _mm_movemask_ps(r) != 0b1111 {
+            panic!("{:?} != {:?}", a, b);
+        }
+    }
+
+    #[target_feature = "+sse"]
+    unsafe fn get_m128(a: __m128, idx: usize) -> f32 {
+        union A { a: __m128, b: [f32; 4] };
+        mem::transmute::<__m128, A>(a).b[idx]
+    }
+
     #[simd_test = "sse"]
     unsafe fn _mm_add_ps() {
         let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
@@ -1711,11 +1727,11 @@ mod tests {
     }
 
     #[simd_test = "sse"]
-    unsafe fn _mm_add_ss() {
-        let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
-        let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
-        let r = sse::_mm_add_ss(a, b);
-        assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0));
+    unsafe fn test_mm_add_ss() {
+        let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_add_ss(a, b);
+        assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
     }
 
     #[simd_test = "sse"]
@@ -2220,7 +2236,8 @@ mod tests {
         assert_eq!(rd, ed);
     }
 
-    unsafe fn _mm_cmpeq_ps() {
+    #[simd_test = "sse"]
+    unsafe fn test_mm_cmpeq_ps() {
         use std::mem::transmute;
         use std::f32::NAN;
 
@@ -2234,6 +2251,7 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "sse"]
     unsafe fn _mm_cmplt_ps() {
         use std::mem::transmute;
         use std::f32::NAN;
@@ -2876,7 +2894,7 @@ mod tests {
     }
 
     #[simd_test = "sse"]
-    pub unsafe fn _mm_cvtss_f32() {
+    pub unsafe fn test_mm_cvtss_f32() {
         let a = f32x4::new(312.0134, 5.0, 6.0, 7.0);
         assert_eq!(sse::_mm_cvtss_f32(a), 312.0134);
     }
@@ -2896,14 +2914,17 @@ mod tests {
     }
 
     #[simd_test = "sse"]
-    unsafe fn _mm_set_ps() {
-        let r = sse::_mm_set_ps(
+    unsafe fn test_mm_set_ps() {
+        let r = _mm_set_ps(
             black_box(1.0),
             black_box(2.0),
             black_box(3.0),
             black_box(4.0),
         );
-        assert_eq!(r, f32x4::new(4.0, 3.0, 2.0, 1.0));
+        assert_eq!(get_m128(r, 0), 4.0);
+        assert_eq!(get_m128(r, 1), 3.0);
+        assert_eq!(get_m128(r, 2), 2.0);
+        assert_eq!(get_m128(r, 3), 1.0);
     }
 
     #[simd_test = "sse"]
@@ -3196,7 +3217,7 @@ mod tests {
     }
 
     #[simd_test = "sse"]
-    unsafe fn _mm_movemask_ps() {
+    unsafe fn test_mm_movemask_ps() {
         let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
         assert_eq!(r, 0b0101);
 
diff --git a/coresimd/src/x86/mod.rs b/coresimd/src/x86/mod.rs
index 57e1f219ce4c7..6b82095ea2d3c 100644
--- a/coresimd/src/x86/mod.rs
+++ b/coresimd/src/x86/mod.rs
@@ -3,6 +3,11 @@
 #[macro_use]
 mod macros;
 
+#[repr(simd)]
+#[derive(Clone, Copy, Debug)]
+#[allow(non_camel_case_types)]
+pub struct __m128(f32, f32, f32, f32);
+
 mod i386;
 pub use self::i386::*;
 
diff --git a/stdsimd-verify/src/lib.rs b/stdsimd-verify/src/lib.rs
index 1ca833452a166..4519bce26e031 100644
--- a/stdsimd-verify/src/lib.rs
+++ b/stdsimd-verify/src/lib.rs
@@ -97,6 +97,7 @@ pub fn x86_functions(input: TokenStream) -> TokenStream {
 fn to_type(t: &syn::Type) -> Tokens {
     match *t {
         syn::Type::Path(ref p) => match extract_path_ident(&p.path).as_ref() {
+            "__m128" => my_quote! { &F32x4 },
             "__m128i" => my_quote! { &I8x16 },
             "__m256i" => my_quote! { &I8x32 },
             "__m64" => my_quote! { &I8x8 },