From 8b32e9d914dbe9f95f74fca35ebb8d0cf332e1b8 Mon Sep 17 00:00:00 2001
From: Matevz Tadel <mtadel@ucsd.edu>
Date: Thu, 20 Sep 2018 16:33:18 -0700
Subject: [PATCH 1/3] Add support for AVX2

Some timing measurements:

Single thread, 100 evs, build time only:

AVX-512  5.188  5.188 5.191 => 5.19
AVX2     6.143  6.154 6.130 => 6.14
AVX      7.882  7.881 7.879 => 7.88

32 / 16, 5000 evs, wall time:

AVX-512  20.584 20.542      => 20.6
AVX2     19.919 19.844      => 19.9
AVX      23.448 23.573      => 23.5

64 / 16, 5000 evs, wall time:

AVX-512  16.777 16.900      => 16.8
AVX2     16.244 16.255      => 16.2
AVX      18.910 18.942      => 18.9

128 / 32, 5000 evs, wall time:

AVX-512  29.902 27.163      => 28.5
AVX2     23.919 24.730      => 24.3
AVX      28.804 28.669      => 28.7
---
 Config.h                    |  2 ++
 Makefile                    |  3 +++
 Makefile.config             |  9 +++++++--
 Matriplex/MatriplexCommon.h | 17 +++++++++++++++--
 4 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/Config.h b/Config.h
index 091d3b27227d1..4c52874fc7b1e 100644
--- a/Config.h
+++ b/Config.h
@@ -406,6 +406,8 @@ namespace Config
       #define MPT_SIZE 16
     #elif defined USE_CUDA
       #define MPT_SIZE 8
+    #elif defined(__AVX__) || defined(__AVX2__)
+      #define MPT_SIZE 8
     #else
       #define MPT_SIZE 8
     #endif
diff --git a/Makefile b/Makefile
index 897c0997a4d32..2f9fb23ce1d0b 100644
--- a/Makefile
+++ b/Makefile
@@ -94,3 +94,6 @@ endif
 
 echo:
 	-echo CXX = ${CXX}
+
+echo_cc_defs:
+	${CXX} -dM -E -mavx2 - < /dev/null
diff --git a/Makefile.config b/Makefile.config
index 6a154df194b44..1bdd6a186fc29 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -21,6 +21,8 @@
 #KNC_BUILD := 1
 # Define to build for AVX_512, the new mic (KNL) and latest generation Xeons.
 #AVX_512 := 1
+# Define to build for AVX2
+#AVX2    := 1
 
 # 0. Use gcc-5 from MacPorts on OSX
 # OSXGCC5    := 1
@@ -71,9 +73,12 @@ OPT := -g -O3
 # 4. Vectorization settings
 ifdef AVX_512
 VEC_GCC  := -mavx512f -mavx512cd # -march=native -fopt-info-vec -mavx
-VEC_ICC  :=  -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512
+VEC_ICC  := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512
+else ifdef AVX2
+VEC_GCC  := -mavx2
+VEC_ICC  := -mavx2
 else
-VEC_GCC  := -msse3 # -mavx # -fopt-info-vec-all
+VEC_GCC  := -mavx # -fopt-info-vec-all
 VEC_ICC  := -mavx
 endif
 VEC_MIC  := -mmic
diff --git a/Matriplex/MatriplexCommon.h b/Matriplex/MatriplexCommon.h
index 476d80a588232..7d87067e7da53 100644
--- a/Matriplex/MatriplexCommon.h
+++ b/Matriplex/MatriplexCommon.h
@@ -31,10 +31,23 @@
 
     #define LD(a, i)      _mm512_load_ps(&a[i*N+n])
     #define ST(a, i, r)   _mm512_store_ps(&a[i*N+n], r)
-    #define ADD(a, b)     _mm512_add_ps(a, b) 
+    #define ADD(a, b)     _mm512_add_ps(a, b)
     #define MUL(a, b)     _mm512_mul_ps(a, b)
     #define FMA(a, b, v)  _mm512_fmadd_ps(a, b, v)
 
+  #elif defined(__AVX2__)
+
+    typedef __m256 IntrVec_t;
+    #define MPLEX_INTRINSICS_WIDTH_BYTES  32
+    #define MPLEX_INTRINSICS_WIDTH_BITS  256
+    #define AVX_INTRINSICS
+
+    #define LD(a, i)      _mm256_load_ps(&a[i*N+n])
+    #define ST(a, i, r)   _mm256_store_ps(&a[i*N+n], r)
+    #define ADD(a, b)     _mm256_add_ps(a, b)
+    #define MUL(a, b)     _mm256_mul_ps(a, b)
+    #define FMA(a, b, v)  _mm256_fmadd_ps(a, b, v)
+
   #elif defined(__AVX__)
 
     typedef __m256 IntrVec_t;
@@ -44,7 +57,7 @@
 
     #define LD(a, i)      _mm256_load_ps(&a[i*N+n])
     #define ST(a, i, r)   _mm256_store_ps(&a[i*N+n], r)
-    #define ADD(a, b)     _mm256_add_ps(a, b) 
+    #define ADD(a, b)     _mm256_add_ps(a, b)
     #define MUL(a, b)     _mm256_mul_ps(a, b)
     // #define FMA(a, b, v)  { __m256 temp = _mm256_mul_ps(a, b); v = _mm256_add_ps(temp, v); }
     inline __m256 FMA(const __m256 &a, const __m256 &b, const __m256 &v)

From 0f9fa97dd5c6df73fcc40df76958f77841fb6b33 Mon Sep 17 00:00:00 2001
From: Matevz Tadel <mtadel@ucsd.edu>
Date: Tue, 25 Sep 2018 14:04:10 -0700
Subject: [PATCH 2/3] Implement SlurpIn with avx2 gather.

---
 Matriplex/Matriplex.h       | 20 ++++++++++++++++++++
 Matriplex/MatriplexCommon.h |  6 +++++-
 Matriplex/MatriplexSym.h    | 20 ++++++++++++++++++++
 mkFit/MkFinder.cc           | 26 +++++++++++++-------------
 mkFit/MkFinderFV.cc         |  6 ++----
 mkFit/MkFitter.cc           | 16 ++++++++--------
 6 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/Matriplex/Matriplex.h b/Matriplex/Matriplex.h
index 80b0d435ace51..d24c04276fc55 100644
--- a/Matriplex/Matriplex.h
+++ b/Matriplex/Matriplex.h
@@ -157,6 +157,26 @@ class Matriplex
    }
    */
 
+#elif defined(AVX2_INTRINSICS)
+
+   void SlurpIn(const char *arr, __m256i& vi, const int N_proc = N)
+   {
+      const __m256 src = { 0 };
+
+      __m256i k_master = _mm256_setr_epi32( -1, -1, -1, -1, -1, -1, -1, -1 );
+      int *kmp = (int*) & k_master;
+      for (int i = N_proc; i < N; ++i) kmp[i] = 0;
+
+      __m256i k = k_master;
+      for (int i = 0; i < kSize; ++i, arr += sizeof(T))
+      {
+         __m256 reg = _mm256_mask_i32gather_ps(src, (float*) arr, vi, (__m256) k, 1);
+         // Restore mask (docs say gather clears it but it doesn't seem to).
+         k = k_master;
+         _mm256_maskstore_ps((float*) &fArray[i*N], k, reg);
+      }
+   }
+
 #else
 
    void SlurpIn(const char *arr, int vi[N], const int N_proc = N)
diff --git a/Matriplex/MatriplexCommon.h b/Matriplex/MatriplexCommon.h
index 7d87067e7da53..11bad035842f5 100644
--- a/Matriplex/MatriplexCommon.h
+++ b/Matriplex/MatriplexCommon.h
@@ -28,6 +28,8 @@
     #define MPLEX_INTRINSICS_WIDTH_BYTES  64
     #define MPLEX_INTRINSICS_WIDTH_BITS  512
     #define MIC_INTRINSICS
+    #define GATHER_INTRINSICS
+    #define GATHER_IDX_LOAD(name, arr)  __m512i name = _mm512_load_epi32(arr);
 
     #define LD(a, i)      _mm512_load_ps(&a[i*N+n])
     #define ST(a, i, r)   _mm512_store_ps(&a[i*N+n], r)
@@ -40,7 +42,9 @@
     typedef __m256 IntrVec_t;
     #define MPLEX_INTRINSICS_WIDTH_BYTES  32
     #define MPLEX_INTRINSICS_WIDTH_BITS  256
-    #define AVX_INTRINSICS
+    #define AVX2_INTRINSICS
+    #define GATHER_INTRINSICS
+    #define GATHER_IDX_LOAD(name, arr)  __m256i name = _mm256_load_epi32(arr);
 
     #define LD(a, i)      _mm256_load_ps(&a[i*N+n])
     #define ST(a, i, r)   _mm256_store_ps(&a[i*N+n], r)
diff --git a/Matriplex/MatriplexSym.h b/Matriplex/MatriplexSym.h
index 87d08c37dc955..5238c5bc1af7d 100644
--- a/Matriplex/MatriplexSym.h
+++ b/Matriplex/MatriplexSym.h
@@ -177,6 +177,26 @@ class MatriplexSym
    }
    */
 
+#elif defined(AVX2_INTRINSICS)
+
+   void SlurpIn(const char *arr, __m256i& vi, const int N_proc = N)
+   {
+      const __m256 src = { 0 };
+
+      __m256i k_master = _mm256_setr_epi32( -1, -1, -1, -1, -1, -1, -1, -1 );
+      int *kmp = (int*) & k_master;
+      for (int i = N_proc; i < N; ++i) kmp[i] = 0;
+
+      __m256i k = k_master;
+      for (int i = 0; i < kSize; ++i, arr += sizeof(T))
+      {
+         __m256 reg = _mm256_mask_i32gather_ps(src, (float *) arr, vi, (__m256) k, 1);
+         // Restore mask (docs say gather clears it but it doesn't seem to).
+         k = k_master;
+         _mm256_maskstore_ps((float*) &fArray[i*N], k, reg);
+      }
+   }
+
 #else
 
    void SlurpIn(const char *arr, int vi[N], const int N_proc = N)
diff --git a/mkFit/MkFinder.cc b/mkFit/MkFinder.cc
index b37c40816636e..0e8bfeb9fe55e 100644
--- a/mkFit/MkFinder.cc
+++ b/mkFit/MkFinder.cc
@@ -468,8 +468,8 @@ void MkFinder::AddBestHit(const LayerOfHits &layer_of_hits, const int N_proc,
         idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
       }
     }
-#if defined(MIC_INTRINSICS)
-    __m512i vi = _mm512_load_epi32(idx);
+#if defined(GATHER_INTRINSICS)
+    GATHER_IDX_LOAD(vi, idx);
 #endif
 
 #ifndef NO_PREFETCH
@@ -499,7 +499,7 @@ void MkFinder::AddBestHit(const LayerOfHits &layer_of_hits, const int N_proc,
 
 #else //NO_GATHER
 
-#if defined(MIC_INTRINSICS)
+#if defined(GATHER_INTRINSICS)
     msErr.SlurpIn(varr + off_error, vi);
     msPar.SlurpIn(varr + off_param, vi);
 #else
@@ -672,8 +672,8 @@ void MkFinder::FindCandidates(const LayerOfHits &layer_of_hits,
 	idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
       }
     }
-#if defined(MIC_INTRINSICS)
-    __m512i vi = _mm512_load_epi32(idx);
+#if defined(GATHER_INTRINSICS)
+    GATHER_IDX_LOAD(vi, idx);
 #endif
 
     // Prefetch to L2 the hits we'll (probably) process after two loops iterations.
@@ -686,7 +686,7 @@ void MkFinder::FindCandidates(const LayerOfHits &layer_of_hits,
       }
     }
 
-#if defined(MIC_INTRINSICS)
+#if defined(GATHER_INTRINSICS)
     msErr.SlurpIn(varr + off_error, vi);
     msPar.SlurpIn(varr + off_param, vi);
 #else
@@ -851,8 +851,8 @@ void MkFinder::FindCandidatesCloneEngine(const LayerOfHits &layer_of_hits, CandC
         idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
       }
     }
-#if defined(MIC_INTRINSICS)
-    __m512i vi = _mm512_load_epi32(idx);
+#if defined(GATHER_INTRINSICS)
+    GATHER_IDX_LOAD(vi, idx);
 #endif
 
     // Prefetch to L2 the hits we'll (probably) process after two loops iterations.
@@ -865,7 +865,7 @@ void MkFinder::FindCandidatesCloneEngine(const LayerOfHits &layer_of_hits, CandC
       }
     }
 
-#if defined(MIC_INTRINSICS)
+#if defined(GATHER_INTRINSICS)
     msErr.SlurpIn(varr + off_error, vi);
     msPar.SlurpIn(varr + off_param, vi);
 #else
@@ -1054,8 +1054,8 @@ void MkFinder::BkFitInputTracks(TrackVec& cands, int beg, int end)
 
   Chi2.SetVal(0);
 
-#ifdef MIC_INTRINSICS
-  __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+  GATHER_IDX_LOAD(vi, idx);
   Err[iC].SlurpIn(varr + off_error, vi, N_proc);
   Par[iC].SlurpIn(varr + off_param, vi, N_proc);
 #else
@@ -1096,8 +1096,8 @@ void MkFinder::BkFitInputTracks(EventOfCombCandidates& eocss, int beg, int end)
 
   Chi2.SetVal(0);
 
-#ifdef MIC_INTRINSICS
-  __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+  GATHER_IDX_LOAD(vi, idx);
   Err[iC].SlurpIn(varr + off_error, vi, N_proc);
   Par[iC].SlurpIn(varr + off_param, vi, N_proc);
 #else
diff --git a/mkFit/MkFinderFV.cc b/mkFit/MkFinderFV.cc
index c57fcb6cddb50..d81fa2db2aa35 100644
--- a/mkFit/MkFinderFV.cc
+++ b/mkFit/MkFinderFV.cc
@@ -314,9 +314,6 @@ void MkFinderFV<nseeds, ncands>::FindCandidates(const LayerOfHits &layer_of_hits
         idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit);
       }
     }
-#if defined(MIC_INTRINSICS)
-    __m512i vi = _mm512_load_epi32(idx);
-#endif
 
     // Prefetch to L2 the hits we'll (probably) process after two loops iterations.
     // Ideally this would be initiated before coming here, for whole bunch_of_hits.m_hits vector.
@@ -328,7 +325,8 @@ void MkFinderFV<nseeds, ncands>::FindCandidates(const LayerOfHits &layer_of_hits
       }
     }
 
-#if defined(MIC_INTRINSICS)
+#if defined(GATHER_INTRINSICS)
+    GATHER_IDX_LOAD(vi, idx);
     msErr.SlurpIn(varr + off_error, vi);
     msPar.SlurpIn(varr + off_param, vi);
 #else
diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc
index 255e11d7b5cd2..5d17e8b5808d1 100644
--- a/mkFit/MkFitter.cc
+++ b/mkFit/MkFitter.cc
@@ -197,8 +197,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>&  tracks,
     Chi2(itrack, 0, 0) = trk.chi2();
   }
 
-#ifdef MIC_INTRINSICS
-  __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+  GATHER_IDX_LOAD(vi, idx);
   Err[iC].SlurpIn(varr + off_error, vi);
   Par[iC].SlurpIn(varr + off_param, vi);
 #else
@@ -228,8 +228,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>&  tracks,
       HoTArr[hi](itrack, 0, 0) = tracks[i].getHitOnTrack(hi);
     }
 
-#ifdef MIC_INTRINSICS
-    __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+    GATHER_IDX_LOAD(vi, idx);
     msErr[hi].SlurpIn(varr + off_error, vi);
     msPar[hi].SlurpIn(varr + off_param, vi);
 #else
@@ -388,8 +388,8 @@ void MkFitter::InputTracksForFit(const std::vector<Track>& tracks,
 
   // for ( ; itrack < NN; ++itrack)  {  idx[itrack] = idx[0];  }
 
-#ifdef MIC_INTRINSICS
-  __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+  GATHER_IDX_LOAD(vi, idx);
   Err[iC].SlurpIn(varr + off_error, vi, N_proc);
   Par[iC].SlurpIn(varr + off_param, vi, N_proc);
   for (int ll = 0; ll < Config::nLayers; ++ll)
@@ -440,8 +440,8 @@ void MkFitter::FitTracksWithInterSlurp(const std::vector<HitVec>& layersohits,
     }
     for (int i = N_proc; i < NN; ++i)  {  idx[i] = idx[0];  }
 
-#ifdef MIC_INTRINSICS
-    __m512i vi      = _mm512_load_epi32(idx);
+#ifdef GATHER_INTRINSICS
+    GATHER_IDX_LOAD(vi, idx);
     msPar[0].SlurpIn(varr + off_param, vi, N_proc);
     msErr[0].SlurpIn(varr + off_error, vi, N_proc);
 #else

From fff2432b49ee4d20ecc8fcd2aeecd3aece25ec9e Mon Sep 17 00:00:00 2001
From: Matevz Tadel <mtadel@ucsd.edu>
Date: Mon, 1 Oct 2018 16:26:50 -0700
Subject: [PATCH 3/3] Use Steve's trick to set SlurpIn AVX2 mask (but use
 integer cmp).

---
 Matriplex/Matriplex.h    | 8 ++++----
 Matriplex/MatriplexSym.h | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Matriplex/Matriplex.h b/Matriplex/Matriplex.h
index d24c04276fc55..64b1f39b9f52c 100644
--- a/Matriplex/Matriplex.h
+++ b/Matriplex/Matriplex.h
@@ -163,11 +163,11 @@ class Matriplex
    {
       const __m256 src = { 0 };
 
-      __m256i k_master = _mm256_setr_epi32( -1, -1, -1, -1, -1, -1, -1, -1 );
-      int *kmp = (int*) & k_master;
-      for (int i = N_proc; i < N; ++i) kmp[i] = 0;
+      __m256i k        = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
+      __m256i k_sel    = _mm256_set1_epi32(N_proc);
+      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
 
-      __m256i k = k_master;
+      k = k_master;
       for (int i = 0; i < kSize; ++i, arr += sizeof(T))
       {
          __m256 reg = _mm256_mask_i32gather_ps(src, (float*) arr, vi, (__m256) k, 1);
diff --git a/Matriplex/MatriplexSym.h b/Matriplex/MatriplexSym.h
index 5238c5bc1af7d..11297506f3a87 100644
--- a/Matriplex/MatriplexSym.h
+++ b/Matriplex/MatriplexSym.h
@@ -183,11 +183,11 @@ class MatriplexSym
    {
       const __m256 src = { 0 };
 
-      __m256i k_master = _mm256_setr_epi32( -1, -1, -1, -1, -1, -1, -1, -1 );
-      int *kmp = (int*) & k_master;
-      for (int i = N_proc; i < N; ++i) kmp[i] = 0;
+      __m256i k        = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 );
+      __m256i k_sel    = _mm256_set1_epi32(N_proc);
+      __m256i k_master = _mm256_cmpgt_epi32(k_sel, k);
 
-      __m256i k = k_master;
+      k = k_master;
       for (int i = 0; i < kSize; ++i, arr += sizeof(T))
       {
          __m256 reg = _mm256_mask_i32gather_ps(src, (float *) arr, vi, (__m256) k, 1);