From 8b32e9d914dbe9f95f74fca35ebb8d0cf332e1b8 Mon Sep 17 00:00:00 2001 From: Matevz Tadel Date: Thu, 20 Sep 2018 16:33:18 -0700 Subject: [PATCH 1/3] Add support for AVX2 Some timing measurements: Single thread, 100 evs, build time only: AVX-512 5.188 5.188 5.191 => 5.19 AVX2 6.143 6.154 6.130 => 6.14 AVX 7.882 7.881 7.879 => 7.88 32 / 16, 5000 evs, wall time: AVX-512 20.584 20.542 => 20.6 AVX2 19.919 19.844 => 19.9 AVX 23.448 23.573 => 23.5 64 / 16, 5000 evs, wall time: AVX-512 16.777 16.900 => 16.8 AVX2 16.244 16.255 => 16.2 AVX 18.910 18.942 => 18.9 128 / 32, 5000 evs, wall time: AVX-512 29.902 27.163 => 28.5 AVX2 23.919 24.730 => 24.3 AVX 28.804 28.669 => 28.7 --- Config.h | 2 ++ Makefile | 3 +++ Makefile.config | 9 +++++++-- Matriplex/MatriplexCommon.h | 17 +++++++++++++++-- 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/Config.h b/Config.h index 091d3b27227d1..4c52874fc7b1e 100644 --- a/Config.h +++ b/Config.h @@ -406,6 +406,8 @@ namespace Config #define MPT_SIZE 16 #elif defined USE_CUDA #define MPT_SIZE 8 + #elif defined(__AVX__) || defined(__AVX2__) + #define MPT_SIZE 8 #else #define MPT_SIZE 8 #endif diff --git a/Makefile b/Makefile index 897c0997a4d32..2f9fb23ce1d0b 100644 --- a/Makefile +++ b/Makefile @@ -94,3 +94,6 @@ endif echo: -echo CXX = ${CXX} + +echo_cc_defs: + ${CXX} -dM -E -mavx2 - < /dev/null diff --git a/Makefile.config b/Makefile.config index 6a154df194b44..1bdd6a186fc29 100644 --- a/Makefile.config +++ b/Makefile.config @@ -21,6 +21,8 @@ #KNC_BUILD := 1 # Define to build for AVX_512, the new mic (KNL) and latest generation Xeons. #AVX_512 := 1 +# Define to build for AVX2 +#AVX2 := 1 # 0. Use gcc-5 from MacPorts on OSX # OSXGCC5 := 1 @@ -71,9 +73,12 @@ OPT := -g -O3 # 4. Vectorization settings ifdef AVX_512 VEC_GCC := -mavx512f -mavx512cd # -march=native -fopt-info-vec -mavx -VEC_ICC := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512 +VEC_ICC := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512 +else ifdef AVX2 +VEC_GCC := -mavx2 +VEC_ICC := -mavx2 else -VEC_GCC := -msse3 # -mavx # -fopt-info-vec-all +VEC_GCC := -mavx # -fopt-info-vec-all VEC_ICC := -mavx endif VEC_MIC := -mmic diff --git a/Matriplex/MatriplexCommon.h b/Matriplex/MatriplexCommon.h index 476d80a588232..7d87067e7da53 100644 --- a/Matriplex/MatriplexCommon.h +++ b/Matriplex/MatriplexCommon.h @@ -31,10 +31,23 @@ #define LD(a, i) _mm512_load_ps(&a[i*N+n]) #define ST(a, i, r) _mm512_store_ps(&a[i*N+n], r) - #define ADD(a, b) _mm512_add_ps(a, b) + #define ADD(a, b) _mm512_add_ps(a, b) #define MUL(a, b) _mm512_mul_ps(a, b) #define FMA(a, b, v) _mm512_fmadd_ps(a, b, v) + #elif defined(__AVX2__) + + typedef __m256 IntrVec_t; + #define MPLEX_INTRINSICS_WIDTH_BYTES 32 + #define MPLEX_INTRINSICS_WIDTH_BITS 256 + #define AVX_INTRINSICS + + #define LD(a, i) _mm256_load_ps(&a[i*N+n]) + #define ST(a, i, r) _mm256_store_ps(&a[i*N+n], r) + #define ADD(a, b) _mm256_add_ps(a, b) + #define MUL(a, b) _mm256_mul_ps(a, b) + #define FMA(a, b, v) _mm256_fmadd_ps(a, b, v) + #elif defined(__AVX__) typedef __m256 IntrVec_t; @@ -44,7 +57,7 @@ #define LD(a, i) _mm256_load_ps(&a[i*N+n]) #define ST(a, i, r) _mm256_store_ps(&a[i*N+n], r) - #define ADD(a, b) _mm256_add_ps(a, b) + #define ADD(a, b) _mm256_add_ps(a, b) #define MUL(a, b) _mm256_mul_ps(a, b) // #define FMA(a, b, v) { __m256 temp = _mm256_mul_ps(a, b); v = _mm256_add_ps(temp, v); } inline __m256 FMA(const __m256 &a, const __m256 &b, const __m256 &v) From 0f9fa97dd5c6df73fcc40df76958f77841fb6b33 Mon Sep 17 00:00:00 2001 From: Matevz Tadel Date: Tue, 25 Sep 2018 14:04:10 -0700 Subject: [PATCH 2/3] Implement SlurpIn with avx2 gather. --- Matriplex/Matriplex.h | 20 ++++++++++++++++++++ Matriplex/MatriplexCommon.h | 6 +++++- Matriplex/MatriplexSym.h | 20 ++++++++++++++++++++ mkFit/MkFinder.cc | 26 +++++++++++++------------- mkFit/MkFinderFV.cc | 6 ++---- mkFit/MkFitter.cc | 16 ++++++++-------- 6 files changed, 68 insertions(+), 26 deletions(-) diff --git a/Matriplex/Matriplex.h b/Matriplex/Matriplex.h index 80b0d435ace51..d24c04276fc55 100644 --- a/Matriplex/Matriplex.h +++ b/Matriplex/Matriplex.h @@ -157,6 +157,26 @@ class Matriplex } */ +#elif defined(AVX2_INTRINSICS) + + void SlurpIn(const char *arr, __m256i& vi, const int N_proc = N) + { + const __m256 src = { 0 }; + + __m256i k_master = _mm256_setr_epi32( -1, -1, -1, -1, -1, -1, -1, -1 ); + int *kmp = (int*) & k_master; + for (int i = N_proc; i < N; ++i) kmp[i] = 0; + + __m256i k = k_master; + for (int i = 0; i < kSize; ++i, arr += sizeof(T)) + { + __m256 reg = _mm256_mask_i32gather_ps(src, (float*) arr, vi, (__m256) k, 1); + // Restore mask (docs say gather clears it but it doesn't seem to). + k = k_master; + _mm256_maskstore_ps((float*) &fArray[i*N], k, reg); + } + } + #else void SlurpIn(const char *arr, int vi[N], const int N_proc = N) diff --git a/Matriplex/MatriplexCommon.h b/Matriplex/MatriplexCommon.h index 7d87067e7da53..11bad035842f5 100644 --- a/Matriplex/MatriplexCommon.h +++ b/Matriplex/MatriplexCommon.h @@ -28,6 +28,8 @@ #define MPLEX_INTRINSICS_WIDTH_BYTES 64 #define MPLEX_INTRINSICS_WIDTH_BITS 512 #define MIC_INTRINSICS + #define GATHER_INTRINSICS + #define GATHER_IDX_LOAD(name, arr) __m512i name = _mm512_load_epi32(arr); #define LD(a, i) _mm512_load_ps(&a[i*N+n]) #define ST(a, i, r) _mm512_store_ps(&a[i*N+n], r) @@ -40,7 +42,9 @@ typedef __m256 IntrVec_t; #define MPLEX_INTRINSICS_WIDTH_BYTES 32 #define MPLEX_INTRINSICS_WIDTH_BITS 256 - #define AVX_INTRINSICS + #define AVX2_INTRINSICS + #define GATHER_INTRINSICS + #define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_epi32(arr); #define LD(a, i) _mm256_load_ps(&a[i*N+n]) #define ST(a, i, r) _mm256_store_ps(&a[i*N+n], r) diff --git a/Matriplex/MatriplexSym.h b/Matriplex/MatriplexSym.h index 87d08c37dc955..5238c5bc1af7d 100644 --- a/Matriplex/MatriplexSym.h +++ b/Matriplex/MatriplexSym.h @@ -177,6 +177,26 @@ class MatriplexSym } */ +#elif defined(AVX2_INTRINSICS) + + void SlurpIn(const char *arr, __m256i& vi, const int N_proc = N) + { + const __m256 src = { 0 }; + + __m256i k_master = _mm256_setr_epi32( -1, -1, -1, -1, -1, -1, -1, -1 ); + int *kmp = (int*) & k_master; + for (int i = N_proc; i < N; ++i) kmp[i] = 0; + + __m256i k = k_master; + for (int i = 0; i < kSize; ++i, arr += sizeof(T)) + { + __m256 reg = _mm256_mask_i32gather_ps(src, (float *) arr, vi, (__m256) k, 1); + // Restore mask (docs say gather clears it but it doesn't seem to). + k = k_master; + _mm256_maskstore_ps((float*) &fArray[i*N], k, reg); + } + } + #else void SlurpIn(const char *arr, int vi[N], const int N_proc = N) diff --git a/mkFit/MkFinder.cc b/mkFit/MkFinder.cc index b37c40816636e..0e8bfeb9fe55e 100644 --- a/mkFit/MkFinder.cc +++ b/mkFit/MkFinder.cc @@ -468,8 +468,8 @@ void MkFinder::AddBestHit(const LayerOfHits &layer_of_hits, const int N_proc, idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit); } } -#if defined(MIC_INTRINSICS) - __m512i vi = _mm512_load_epi32(idx); +#if defined(GATHER_INTRINSICS) + GATHER_IDX_LOAD(vi, idx); #endif #ifndef NO_PREFETCH @@ -499,7 +499,7 @@ void MkFinder::AddBestHit(const LayerOfHits &layer_of_hits, const int N_proc, #else //NO_GATHER -#if defined(MIC_INTRINSICS) +#if defined(GATHER_INTRINSICS) msErr.SlurpIn(varr + off_error, vi); msPar.SlurpIn(varr + off_param, vi); #else @@ -672,8 +672,8 @@ void MkFinder::FindCandidates(const LayerOfHits &layer_of_hits, idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit); } } -#if defined(MIC_INTRINSICS) - __m512i vi = _mm512_load_epi32(idx); +#if defined(GATHER_INTRINSICS) + GATHER_IDX_LOAD(vi, idx); #endif // Prefetch to L2 the hits we'll (probably) process after two loops iterations. @@ -686,7 +686,7 @@ void MkFinder::FindCandidates(const LayerOfHits &layer_of_hits, } } -#if defined(MIC_INTRINSICS) +#if defined(GATHER_INTRINSICS) msErr.SlurpIn(varr + off_error, vi); msPar.SlurpIn(varr + off_param, vi); #else @@ -851,8 +851,8 @@ void MkFinder::FindCandidatesCloneEngine(const LayerOfHits &layer_of_hits, CandC idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit); } } -#if defined(MIC_INTRINSICS) - __m512i vi = _mm512_load_epi32(idx); +#if defined(GATHER_INTRINSICS) + GATHER_IDX_LOAD(vi, idx); #endif // Prefetch to L2 the hits we'll (probably) process after two loops iterations. @@ -865,7 +865,7 @@ void MkFinder::FindCandidatesCloneEngine(const LayerOfHits &layer_of_hits, CandC } } -#if defined(MIC_INTRINSICS) +#if defined(GATHER_INTRINSICS) msErr.SlurpIn(varr + off_error, vi); msPar.SlurpIn(varr + off_param, vi); #else @@ -1054,8 +1054,8 @@ void MkFinder::BkFitInputTracks(TrackVec& cands, int beg, int end) Chi2.SetVal(0); -#ifdef MIC_INTRINSICS - __m512i vi = _mm512_load_epi32(idx); +#ifdef GATHER_INTRINSICS + GATHER_IDX_LOAD(vi, idx); Err[iC].SlurpIn(varr + off_error, vi, N_proc); Par[iC].SlurpIn(varr + off_param, vi, N_proc); #else @@ -1096,8 +1096,8 @@ void MkFinder::BkFitInputTracks(EventOfCombCandidates& eocss, int beg, int end) Chi2.SetVal(0); -#ifdef MIC_INTRINSICS - __m512i vi = _mm512_load_epi32(idx); +#ifdef GATHER_INTRINSICS + GATHER_IDX_LOAD(vi, idx); Err[iC].SlurpIn(varr + off_error, vi, N_proc); Par[iC].SlurpIn(varr + off_param, vi, N_proc); #else diff --git a/mkFit/MkFinderFV.cc b/mkFit/MkFinderFV.cc index c57fcb6cddb50..d81fa2db2aa35 100644 --- a/mkFit/MkFinderFV.cc +++ b/mkFit/MkFinderFV.cc @@ -314,9 +314,6 @@ void MkFinderFV::FindCandidates(const LayerOfHits &layer_of_hits idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit); } } -#if defined(MIC_INTRINSICS) - __m512i vi = _mm512_load_epi32(idx); -#endif // Prefetch to L2 the hits we'll (probably) process after two loops iterations. // Ideally this would be initiated before coming here, for whole bunch_of_hits.m_hits vector. @@ -328,7 +325,8 @@ void MkFinderFV::FindCandidates(const LayerOfHits &layer_of_hits } } -#if defined(MIC_INTRINSICS) +#if defined(GATHER_INTRINSICS) + GATHER_IDX_LOAD(vi, idx); msErr.SlurpIn(varr + off_error, vi); msPar.SlurpIn(varr + off_param, vi); #else diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc index 255e11d7b5cd2..5d17e8b5808d1 100644 --- a/mkFit/MkFitter.cc +++ b/mkFit/MkFitter.cc @@ -197,8 +197,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, Chi2(itrack, 0, 0) = trk.chi2(); } -#ifdef MIC_INTRINSICS - __m512i vi = _mm512_load_epi32(idx); +#ifdef GATHER_INTRINSICS + GATHER_IDX_LOAD(vi, idx); Err[iC].SlurpIn(varr + off_error, vi); Par[iC].SlurpIn(varr + off_param, vi); #else @@ -228,8 +228,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, HoTArr[hi](itrack, 0, 0) = tracks[i].getHitOnTrack(hi); } -#ifdef MIC_INTRINSICS - __m512i vi = _mm512_load_epi32(idx); +#ifdef GATHER_INTRINSICS + GATHER_IDX_LOAD(vi, idx); msErr[hi].SlurpIn(varr + off_error, vi); msPar[hi].SlurpIn(varr + off_param, vi); #else @@ -388,8 +388,8 @@ void MkFitter::InputTracksForFit(const std::vector& tracks, // for ( ; itrack < NN; ++itrack) { idx[itrack] = idx[0]; } -#ifdef MIC_INTRINSICS - __m512i vi = _mm512_load_epi32(idx); +#ifdef GATHER_INTRINSICS + GATHER_IDX_LOAD(vi, idx); Err[iC].SlurpIn(varr + off_error, vi, N_proc); Par[iC].SlurpIn(varr + off_param, vi, N_proc); for (int ll = 0; ll < Config::nLayers; ++ll) @@ -440,8 +440,8 @@ void MkFitter::FitTracksWithInterSlurp(const std::vector& layersohits, } for (int i = N_proc; i < NN; ++i) { idx[i] = idx[0]; } -#ifdef MIC_INTRINSICS - __m512i vi = _mm512_load_epi32(idx); +#ifdef GATHER_INTRINSICS + GATHER_IDX_LOAD(vi, idx); msPar[0].SlurpIn(varr + off_param, vi, N_proc); msErr[0].SlurpIn(varr + off_error, vi, N_proc); #else From fff2432b49ee4d20ecc8fcd2aeecd3aece25ec9e Mon Sep 17 00:00:00 2001 From: Matevz Tadel Date: Mon, 1 Oct 2018 16:26:50 -0700 Subject: [PATCH 3/3] Use Steve's trick to set SlurpIn AVX2 mask (but use integer cmp). --- Matriplex/Matriplex.h | 8 ++++---- Matriplex/MatriplexSym.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Matriplex/Matriplex.h b/Matriplex/Matriplex.h index d24c04276fc55..64b1f39b9f52c 100644 --- a/Matriplex/Matriplex.h +++ b/Matriplex/Matriplex.h @@ -163,11 +163,11 @@ class Matriplex { const __m256 src = { 0 }; - __m256i k_master = _mm256_setr_epi32( -1, -1, -1, -1, -1, -1, -1, -1 ); - int *kmp = (int*) & k_master; - for (int i = N_proc; i < N; ++i) kmp[i] = 0; + __m256i k = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 ); + __m256i k_sel = _mm256_set1_epi32(N_proc); + __m256i k_master = _mm256_cmpgt_epi32(k_sel, k); - __m256i k = k_master; + k = k_master; for (int i = 0; i < kSize; ++i, arr += sizeof(T)) { __m256 reg = _mm256_mask_i32gather_ps(src, (float*) arr, vi, (__m256) k, 1); diff --git a/Matriplex/MatriplexSym.h b/Matriplex/MatriplexSym.h index 5238c5bc1af7d..11297506f3a87 100644 --- a/Matriplex/MatriplexSym.h +++ b/Matriplex/MatriplexSym.h @@ -183,11 +183,11 @@ class MatriplexSym { const __m256 src = { 0 }; - __m256i k_master = _mm256_setr_epi32( -1, -1, -1, -1, -1, -1, -1, -1 ); - int *kmp = (int*) & k_master; - for (int i = N_proc; i < N; ++i) kmp[i] = 0; + __m256i k = _mm256_setr_epi32( 0, 1, 2, 3, 4, 5, 6, 7 ); + __m256i k_sel = _mm256_set1_epi32(N_proc); + __m256i k_master = _mm256_cmpgt_epi32(k_sel, k); - __m256i k = k_master; + k = k_master; for (int i = 0; i < kSize; ++i, arr += sizeof(T)) { __m256 reg = _mm256_mask_i32gather_ps(src, (float *) arr, vi, (__m256) k, 1);