From 87d43b95295890220eafda3e0c63369e69325662 Mon Sep 17 00:00:00 2001 From: Jim Borden Date: Thu, 15 Feb 2024 14:05:04 -0800 Subject: [PATCH] Fix AVX2 build on Windows (#3238) Summary: Tested with both MSVC (with /openmp:llvm) and clang-cl (no particular extra flags needed). This PR is separated into two commits (three after I found out that lines need to be 80 chars or less): 1. Changes needed for clang-cl (and probably stock clang too) 2. Changes needed for MSVC So FAISS can decide either to require using LLVM for Windows (not a hard thing to do these days since it is fully supported inside Visual Studio) and discarding the second commits, or taking them all and documenting the need to use /openmp:llvm Closes https://github.com/facebookresearch/faiss/issues/3193 Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3238 Reviewed By: mdouze Differential Revision: D53479325 Pulled By: algoriddle fbshipit-source-id: e8628f44626b6f49c5d9d7f259a9e3061cfe5568 --- faiss/impl/LocalSearchQuantizer.cpp | 4 +++- faiss/impl/platform_macros.h | 9 +++++++++ faiss/utils/distances.cpp | 8 ++++---- faiss/utils/distances_fused/simdlib_based.cpp | 2 +- faiss/utils/distances_simd.cpp | 18 +++++++++--------- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/faiss/impl/LocalSearchQuantizer.cpp b/faiss/impl/LocalSearchQuantizer.cpp index abbfe74901..8da989a9a4 100644 --- a/faiss/impl/LocalSearchQuantizer.cpp +++ b/faiss/impl/LocalSearchQuantizer.cpp @@ -628,7 +628,9 @@ void LocalSearchQuantizer::icm_encode_step( { size_t binary_idx = (other_m + 1) * M * K * K + m * K * K + code2 * K + code; - _mm_prefetch(binaries + binary_idx, _MM_HINT_T0); + _mm_prefetch( + (const char*)(binaries + binary_idx), + _MM_HINT_T0); } } #endif diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h index aeafb9531a..2aecc51222 100644 --- a/faiss/impl/platform_macros.h +++ b/faiss/impl/platform_macros.h @@ -40,11 +40,13 @@ #include +#ifndef __clang__ inline int __builtin_ctzll(uint64_t x) { unsigned long ret; _BitScanForward64(&ret, x); return (int)ret; } +#endif // cudatoolkit provides __builtin_ctz for NVCC >= 11.0 #if !defined(__CUDACC__) || __CUDACC_VER_MAJOR__ < 11 @@ -55,13 +57,20 @@ inline int __builtin_ctz(unsigned long x) { } #endif +#ifndef __clang__ inline int __builtin_clzll(uint64_t x) { return (int)__lzcnt64(x); } +#endif #define __builtin_popcount __popcnt #define __builtin_popcountl __popcnt64 +#ifndef __clang__ +#define __m128i_u __m128i +#define __m256i_u __m256i +#endif + // MSVC does not define __SSEx__, and _M_IX86_FP is only defined on 32-bit // processors cf. // https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros diff --git a/faiss/utils/distances.cpp b/faiss/utils/distances.cpp index ebc3329c28..82bc164ae1 100644 --- a/faiss/utils/distances.cpp +++ b/faiss/utils/distances.cpp @@ -417,8 +417,8 @@ void exhaustive_L2sqr_blas_cmax_avx2( for (int64_t i = i0; i < i1; i++) { float* ip_line = ip_block.get() + (i - i0) * (j1 - j0); - _mm_prefetch(ip_line, _MM_HINT_NTA); - _mm_prefetch(ip_line + 16, _MM_HINT_NTA); + _mm_prefetch((const char*)ip_line, _MM_HINT_NTA); + _mm_prefetch((const char*)(ip_line + 16), _MM_HINT_NTA); // constant const __m256 mul_minus2 = _mm256_set1_ps(-2); @@ -445,8 +445,8 @@ void exhaustive_L2sqr_blas_cmax_avx2( // process 16 elements per loop for (; idx_j < (count / 16) * 16; idx_j += 16, ip_line += 16) { - _mm_prefetch(ip_line + 32, _MM_HINT_NTA); - _mm_prefetch(ip_line + 48, _MM_HINT_NTA); + _mm_prefetch((const char*)(ip_line + 32), _MM_HINT_NTA); + _mm_prefetch((const char*)(ip_line + 48), _MM_HINT_NTA); // load values for norms const __m256 y_norm_0 = diff --git a/faiss/utils/distances_fused/simdlib_based.cpp b/faiss/utils/distances_fused/simdlib_based.cpp index 31239e866b..309fb72118 100644 --- a/faiss/utils/distances_fused/simdlib_based.cpp +++ b/faiss/utils/distances_fused/simdlib_based.cpp @@ -73,7 +73,7 @@ void kernel( // prefetch the next point #if defined(__AVX2__) - _mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA); + _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA); #endif // load a single point from x diff --git a/faiss/utils/distances_simd.cpp b/faiss/utils/distances_simd.cpp index d74ca664be..323859f43b 100644 --- a/faiss/utils/distances_simd.cpp +++ b/faiss/utils/distances_simd.cpp @@ -439,14 +439,14 @@ void fvec_op_ny_D2( if (ny8 > 0) { // process 8 D2-vectors per loop. - _mm_prefetch(y, _MM_HINT_T0); - _mm_prefetch(y + 16, _MM_HINT_T0); + _mm_prefetch((const char*)y, _MM_HINT_T0); + _mm_prefetch((const char*)(y + 16), _MM_HINT_T0); const __m256 m0 = _mm256_set1_ps(x[0]); const __m256 m1 = _mm256_set1_ps(x[1]); for (i = 0; i < ny8 * 8; i += 8) { - _mm_prefetch(y + 32, _MM_HINT_T0); + _mm_prefetch((const char*)(y + 32), _MM_HINT_T0); // load 8x2 matrix and transpose it in registers. // the typical bottleneck is memory access, so @@ -496,14 +496,14 @@ void fvec_op_ny_D2( if (ny8 > 0) { // process 8 D2-vectors per loop. - _mm_prefetch(y, _MM_HINT_T0); - _mm_prefetch(y + 16, _MM_HINT_T0); + _mm_prefetch((const char*)y, _MM_HINT_T0); + _mm_prefetch((const char*)(y + 16), _MM_HINT_T0); const __m256 m0 = _mm256_set1_ps(x[0]); const __m256 m1 = _mm256_set1_ps(x[1]); for (i = 0; i < ny8 * 8; i += 8) { - _mm_prefetch(y + 32, _MM_HINT_T0); + _mm_prefetch((const char*)(y + 32), _MM_HINT_T0); // load 8x2 matrix and transpose it in registers. // the typical bottleneck is memory access, so @@ -1084,8 +1084,8 @@ size_t fvec_L2sqr_ny_nearest_D2( // process 8 D2-vectors per loop. const size_t ny8 = ny / 8; if (ny8 > 0) { - _mm_prefetch(y, _MM_HINT_T0); - _mm_prefetch(y + 16, _MM_HINT_T0); + _mm_prefetch((const char*)y, _MM_HINT_T0); + _mm_prefetch((const char*)(y + 16), _MM_HINT_T0); // track min distance and the closest vector independently // for each of 8 AVX2 components. @@ -1100,7 +1100,7 @@ size_t fvec_L2sqr_ny_nearest_D2( const __m256 m1 = _mm256_set1_ps(x[1]); for (; i < ny8 * 8; i += 8) { - _mm_prefetch(y + 32, _MM_HINT_T0); + _mm_prefetch((const char*)(y + 32), _MM_HINT_T0); __m256 v0; __m256 v1;